diff --git a/docs/source/options/stream_descriptors.rst b/docs/source/options/stream_descriptors.rst
index 71677ec650..992bffda77 100644
--- a/docs/source/options/stream_descriptors.rst
+++ b/docs/source/options/stream_descriptors.rst
@@ -64,4 +64,5 @@ These are the available fields:
     stream.
 
 .. include:: /options/drm_stream_descriptors.rst
+.. include:: /options/dash_stream_descriptors.rst
 .. include:: /options/hls_stream_descriptors.rst
diff --git a/packager/media/base/language_utils.cc b/packager/media/base/language_utils.cc
index 5860d3b5df..7f8e443d6f 100644
--- a/packager/media/base/language_utils.cc
+++ b/packager/media/base/language_utils.cc
@@ -110,7 +110,7 @@ std::string LanguageToShortestForm(const std::string& language) {
 
   // This could happen legitimately for languages which have no 2-letter code,
   // but that would imply that the input language code is a 3-letter code.
-  DCHECK_EQ(3u, main_language.size());
+  DCHECK_EQ(3u, main_language.size()) << main_language;
   return main_language + subtag;
 }
 
diff --git a/packager/media/formats/mp2t/es_parser_h26x.cc b/packager/media/formats/mp2t/es_parser_h26x.cc
index 9588fa7d67..14cbf69fd0 100644
--- a/packager/media/formats/mp2t/es_parser_h26x.cc
+++ b/packager/media/formats/mp2t/es_parser_h26x.cc
@@ -62,6 +62,14 @@ bool EsParserH26x::Parse(const uint8_t* buf,
     // Link the end of the byte queue with the incoming timing descriptor.
     timing_desc_list_.push_back(
         std::pair<int64_t, TimingDesc>(es_queue_->tail(), timing_desc));
+
+    // Warns if there is a large number of cached timestamps, which should be 1
+    // or 2 if everything works as expected.
+    const size_t kWarningSize =
+        24;  // An arbitrary number (it is 1 second for a fps of 24).
+    LOG_IF(WARNING, timing_desc_list_.size() >= kWarningSize)
+        << "Unusually large number of cached timestamps ("
+        << timing_desc_list_.size() << ").";
   }
 
   // Add the incoming bytes to the ES queue.
@@ -209,8 +217,10 @@ bool EsParserH26x::ParseInternal() {
     // AUD shall be the first NAL unit if present. There shall be at most one
     // AUD in any access unit. We can emit the current access unit which shall
     // not contain the AUD.
-    if (nalu.is_aud())
-      return EmitCurrentAccessUnit();
+    if (nalu.is_aud()) {
+      RCHECK(EmitCurrentAccessUnit());
+      continue;
+    }
 
     // We can only determine if the current access unit ends after seeing
     // another VCL NAL unit.
@@ -281,7 +291,9 @@ bool EsParserH26x::EmitFrame(int64_t access_unit_pos,
 
   // Emit a frame.
   DVLOG(LOG_LEVEL_ES) << "Emit frame: stream_pos=" << access_unit_pos
-                      << " size=" << access_unit_size;
+                      << " size=" << access_unit_size << " pts "
+                      << current_timing_desc.pts << " timing_desc_list size "
+                      << timing_desc_list_.size();
   int es_size;
   const uint8_t* es;
   es_queue_->PeekAt(access_unit_pos, &es, &es_size);
diff --git a/packager/media/formats/mp2t/es_parser_h26x_unittest.cc b/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
index df07c1891e..cd2aebe9ef 100644
--- a/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
+++ b/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
@@ -163,6 +163,11 @@ class EsParserH26xTest : public testing::Test {
                const H26xNaluType* types,
                size_t types_count);
 
+  // Returns AnnexB samples data; stores NAL unit samples data in |samples_|.
+  std::vector<std::vector<uint8_t>> BuildSamplesData(Nalu::CodecType codec_type,
+                                                     const H26xNaluType* types,
+                                                     size_t types_count);
+
   void EmitSample(uint32_t pid, const std::shared_ptr<MediaSample>& sample) {
     size_t sample_id = sample_count_;
     sample_count_++;
@@ -186,27 +191,29 @@ class EsParserH26xTest : public testing::Test {
   bool has_stream_info_;
 };
 
-void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
-                               const H26xNaluType* types,
-                               size_t types_count) {
-  // Duration of one 25fps video frame in 90KHz clock units.
-  const uint32_t kMpegTicksPerFrame = 3600;
+// Returns AnnexB samples data and stores NAL Unit samples data in |samples_|,
+// which is what will be returned from |EsParser|.
+std::vector<std::vector<uint8_t>> EsParserH26xTest::BuildSamplesData(
+    Nalu::CodecType codec_type,
+    const H26xNaluType* types,
+    size_t types_count) {
+  std::vector<std::vector<uint8_t>> samples_data;
+
   const uint8_t kStartCode[] = {0x00, 0x00, 0x01};
 
-  TestableEsParser es_parser(
-      codec_type,
-      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
-      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
-
   bool seen_key_frame = false;
-  std::vector<uint8_t> cur_sample_data;
-  ASSERT_EQ(kSeparator, types[0]);
+  std::vector<uint8_t> nal_unit_sample_data;
+  std::vector<uint8_t> annex_b_sample_data;
+  CHECK_EQ(kSeparator, types[0]);
   for (size_t k = 1; k < types_count; k++) {
     if (types[k] == kSeparator) {
       // We should not be emitting samples until we see a key frame.
       if (seen_key_frame)
-        samples_.push_back(cur_sample_data);
-      cur_sample_data.clear();
+        samples_.push_back(nal_unit_sample_data);
+      if (!annex_b_sample_data.empty())
+        samples_data.push_back(annex_b_sample_data);
+      nal_unit_sample_data.clear();
+      annex_b_sample_data.clear();
     } else {
       if (codec_type == Nalu::kH264) {
         if (types[k] == kH264VclKeyFrame)
@@ -218,34 +225,56 @@ void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
 
       std::vector<uint8_t> es_data =
           CreateNalu(codec_type, types[k], static_cast<uint8_t>(k));
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
-      cur_sample_data.insert(cur_sample_data.end(), es_data.begin(),
-                             es_data.end());
+
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
+      nal_unit_sample_data.insert(nal_unit_sample_data.end(), es_data.begin(),
+                                  es_data.end());
+
       es_data.insert(es_data.begin(), kStartCode,
                      kStartCode + arraysize(kStartCode));
-
-      const int64_t pts = k * kMpegTicksPerFrame;
-      const int64_t dts = k * kMpegTicksPerFrame;
-      // This may process the previous sample; but since we don't know whether
-      // we are at the end yet, this will not process the current sample until
-      // later.
-      size_t offset = 0;
-      size_t size = 1;
-      while (offset < es_data.size()) {
-        // Insert the data in parts to test partial data searches.
-        size = std::min(size + 1, es_data.size() - offset);
-        ASSERT_TRUE(es_parser.Parse(&es_data[offset], static_cast<int>(size),
-                                    pts, dts));
-        offset += size;
-      }
+      annex_b_sample_data.insert(annex_b_sample_data.end(), es_data.begin(),
+                                 es_data.end());
     }
   }
   if (seen_key_frame)
-    samples_.push_back(cur_sample_data);
+    samples_.push_back(nal_unit_sample_data);
+  if (!annex_b_sample_data.empty())
+    samples_data.push_back(annex_b_sample_data);
 
+  return samples_data;
+}
+
+void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
+                               const H26xNaluType* types,
+                               size_t types_count) {
+  // Duration of one 25fps video frame in 90KHz clock units.
+  const uint32_t kMpegTicksPerFrame = 3600;
+
+  TestableEsParser es_parser(
+      codec_type,
+      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
+      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
+
+  int64_t timestamp = 0;
+  for (const auto& sample_data :
+       BuildSamplesData(codec_type, types, types_count)) {
+    // This may process the previous sample; but since we don't know whether
+    // we are at the end yet, this will not process the current sample until
+    // later.
+    size_t offset = 0;
+    size_t size = 1;
+    while (offset < sample_data.size()) {
+      // Insert the data in parts to test partial data searches.
+      size = std::min(size + 1, sample_data.size() - offset);
+      ASSERT_TRUE(es_parser.Parse(&sample_data[offset], static_cast<int>(size),
+                                  timestamp, timestamp));
+      offset += size;
+    }
+    timestamp += kMpegTicksPerFrame;
+  }
   es_parser.Flush();
 }
 
@@ -350,6 +379,55 @@ TEST_F(EsParserH26xTest, H264BasicSupport) {
   EXPECT_TRUE(has_stream_info_);
 }
 
+// This is not compliant with the H264 spec, but VLC generates streams like
+// this. See https://github.com/google/shaka-packager/issues/526 for details.
+TEST_F(EsParserH26xTest, H264AudInAccessUnit) {
+  // clang-format off
+  const H26xNaluType kData[] = {
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+  };
+  // clang-format on
+
+  TestableEsParser es_parser(
+      Nalu::kH264,
+      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
+      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
+
+  size_t sample_index = 0;
+  for (const auto& sample_data :
+       BuildSamplesData(Nalu::kH264, kData, arraysize(kData))) {
+    // Duration of one 25fps video frame in 90KHz clock units.
+    const uint32_t kMpegTicksPerFrame = 3600;
+    const int64_t timestamp = kMpegTicksPerFrame * sample_index;
+    ASSERT_TRUE(es_parser.Parse(sample_data.data(),
+                                static_cast<int>(sample_data.size()), timestamp,
+                                timestamp));
+    sample_index++;
+
+    // The number of emitted samples may be less than the number of samples that
+    // are pushed to the EsParser since samples could be cached internally
+    // before being emitted.
+    // The delay is at most 2 in our current implementation.
+    const size_t kExpectedMaxDelay = 2;
+    EXPECT_NEAR(sample_index, sample_count_, kExpectedMaxDelay);
+  }
+
+  es_parser.Flush();
+  EXPECT_EQ(sample_index, sample_count_);
+  EXPECT_TRUE(has_stream_info_);
+}
+
 TEST_F(EsParserH26xTest, H264DeterminesAccessUnitsWithoutAUD) {
   const H26xNaluType kData[] = {
     kSeparator, kH264Sps, kH264VclKeyFrame,