From 3f7ecd4e29a474a9d0e218ce6af99ee14523d06e Mon Sep 17 00:00:00 2001
From: KongQun Yang <kqyang@google.com>
Date: Mon, 10 Dec 2018 14:12:01 -0800
Subject: [PATCH] Workaround extra AUD in the access unit

VLC seems to generate access units with extra AUDs. In #526, the below
sequence is seen:

  AUD | SPS | PPS | SPS | PPS | AUD | SEI | SEI | SEI | IDR_SLICE

Previously we exited early upon seeing an AUD, which resulted in delayed
processing of the access unit.

The behavior is changed to continue processing the following NAL units
to work around the content issue.

Closes #526.

Change-Id: I80f571c0711c6db1337eb393fce36fae5432b6c5
---
 docs/source/options/stream_descriptors.rst    |   1 +
 packager/media/base/language_utils.cc         |   2 +-
 packager/media/formats/mp2t/es_parser_h26x.cc |  18 ++-
 .../formats/mp2t/es_parser_h26x_unittest.cc   | 150 +++++++++++++-----
 4 files changed, 131 insertions(+), 40 deletions(-)

diff --git a/docs/source/options/stream_descriptors.rst b/docs/source/options/stream_descriptors.rst
index 71677ec650..992bffda77 100644
--- a/docs/source/options/stream_descriptors.rst
+++ b/docs/source/options/stream_descriptors.rst
@@ -64,4 +64,5 @@ These are the available fields:
     stream.
 
 .. include:: /options/drm_stream_descriptors.rst
+.. include:: /options/dash_stream_descriptors.rst
 .. include:: /options/hls_stream_descriptors.rst
diff --git a/packager/media/base/language_utils.cc b/packager/media/base/language_utils.cc
index 5860d3b5df..7f8e443d6f 100644
--- a/packager/media/base/language_utils.cc
+++ b/packager/media/base/language_utils.cc
@@ -110,7 +110,7 @@ std::string LanguageToShortestForm(const std::string& language) {
 
   // This could happen legitimately for languages which have no 2-letter code,
   // but that would imply that the input language code is a 3-letter code.
-  DCHECK_EQ(3u, main_language.size());
+  DCHECK_EQ(3u, main_language.size()) << main_language;
   return main_language + subtag;
 }
 
diff --git a/packager/media/formats/mp2t/es_parser_h26x.cc b/packager/media/formats/mp2t/es_parser_h26x.cc
index 9588fa7d67..14cbf69fd0 100644
--- a/packager/media/formats/mp2t/es_parser_h26x.cc
+++ b/packager/media/formats/mp2t/es_parser_h26x.cc
@@ -62,6 +62,14 @@ bool EsParserH26x::Parse(const uint8_t* buf,
     // Link the end of the byte queue with the incoming timing descriptor.
     timing_desc_list_.push_back(
         std::pair<int64_t, TimingDesc>(es_queue_->tail(), timing_desc));
+
+    // Warn if there is a large number of cached timestamps; there should be
+    // only 1 or 2 if everything works as expected.
+    const size_t kWarningSize =
+        24;  // An arbitrary number (it is 1 second for a fps of 24).
+    LOG_IF(WARNING, timing_desc_list_.size() >= kWarningSize)
+        << "Unusually large number of cached timestamps ("
+        << timing_desc_list_.size() << ").";
   }
 
   // Add the incoming bytes to the ES queue.
@@ -209,8 +217,10 @@ bool EsParserH26x::ParseInternal() {
     // AUD shall be the first NAL unit if present. There shall be at most one
     // AUD in any access unit. We can emit the current access unit which shall
     // not contain the AUD.
-    if (nalu.is_aud())
-      return EmitCurrentAccessUnit();
+    if (nalu.is_aud()) {
+      RCHECK(EmitCurrentAccessUnit());
+      continue;
+    }
 
     // We can only determine if the current access unit ends after seeing
     // another VCL NAL unit.
@@ -281,7 +291,9 @@ bool EsParserH26x::EmitFrame(int64_t access_unit_pos,
 
   // Emit a frame.
   DVLOG(LOG_LEVEL_ES) << "Emit frame: stream_pos=" << access_unit_pos
-                      << " size=" << access_unit_size;
+                      << " size=" << access_unit_size << " pts "
+                      << current_timing_desc.pts << " timing_desc_list size "
+                      << timing_desc_list_.size();
   int es_size;
   const uint8_t* es;
   es_queue_->PeekAt(access_unit_pos, &es, &es_size);
diff --git a/packager/media/formats/mp2t/es_parser_h26x_unittest.cc b/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
index df07c1891e..cd2aebe9ef 100644
--- a/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
+++ b/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
@@ -163,6 +163,11 @@ class EsParserH26xTest : public testing::Test {
                const H26xNaluType* types,
                size_t types_count);
 
+  // Returns the vector of Annex B samples data built from |types|.
+  std::vector<std::vector<uint8_t>> BuildSamplesData(Nalu::CodecType codec_type,
+                                                     const H26xNaluType* types,
+                                                     size_t types_count);
+
   void EmitSample(uint32_t pid, const std::shared_ptr<MediaSample>& sample) {
     size_t sample_id = sample_count_;
     sample_count_++;
@@ -186,27 +191,29 @@ class EsParserH26xTest : public testing::Test {
   bool has_stream_info_;
 };
 
-void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
-                               const H26xNaluType* types,
-                               size_t types_count) {
-  // Duration of one 25fps video frame in 90KHz clock units.
-  const uint32_t kMpegTicksPerFrame = 3600;
+// Returns the Annex B samples data and stores the NAL unit samples data in
+// |samples_|, which is what will be returned from |EsParser|.
+std::vector<std::vector<uint8_t>> EsParserH26xTest::BuildSamplesData(
+    Nalu::CodecType codec_type,
+    const H26xNaluType* types,
+    size_t types_count) {
+  std::vector<std::vector<uint8_t>> samples_data;
+
   const uint8_t kStartCode[] = {0x00, 0x00, 0x01};
 
-  TestableEsParser es_parser(
-      codec_type,
-      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
-      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
-
   bool seen_key_frame = false;
-  std::vector<uint8_t> cur_sample_data;
-  ASSERT_EQ(kSeparator, types[0]);
+  std::vector<uint8_t> nal_unit_sample_data;
+  std::vector<uint8_t> annex_b_sample_data;
+  CHECK_EQ(kSeparator, types[0]);
   for (size_t k = 1; k < types_count; k++) {
     if (types[k] == kSeparator) {
       // We should not be emitting samples until we see a key frame.
       if (seen_key_frame)
-        samples_.push_back(cur_sample_data);
-      cur_sample_data.clear();
+        samples_.push_back(nal_unit_sample_data);
+      if (!annex_b_sample_data.empty())
+        samples_data.push_back(annex_b_sample_data);
+      nal_unit_sample_data.clear();
+      annex_b_sample_data.clear();
     } else {
       if (codec_type == Nalu::kH264) {
         if (types[k] == kH264VclKeyFrame)
@@ -218,34 +225,56 @@ void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
 
       std::vector<uint8_t> es_data =
           CreateNalu(codec_type, types[k], static_cast<uint8_t>(k));
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
-      cur_sample_data.insert(cur_sample_data.end(), es_data.begin(),
-                             es_data.end());
+
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
+      nal_unit_sample_data.insert(nal_unit_sample_data.end(), es_data.begin(),
+                                  es_data.end());
+
       es_data.insert(es_data.begin(), kStartCode,
                      kStartCode + arraysize(kStartCode));
-
-      const int64_t pts = k * kMpegTicksPerFrame;
-      const int64_t dts = k * kMpegTicksPerFrame;
-      // This may process the previous sample; but since we don't know whether
-      // we are at the end yet, this will not process the current sample until
-      // later.
-      size_t offset = 0;
-      size_t size = 1;
-      while (offset < es_data.size()) {
-        // Insert the data in parts to test partial data searches.
-        size = std::min(size + 1, es_data.size() - offset);
-        ASSERT_TRUE(es_parser.Parse(&es_data[offset], static_cast<int>(size),
-                                    pts, dts));
-        offset += size;
-      }
+      annex_b_sample_data.insert(annex_b_sample_data.end(), es_data.begin(),
+                                 es_data.end());
     }
   }
   if (seen_key_frame)
-    samples_.push_back(cur_sample_data);
+    samples_.push_back(nal_unit_sample_data);
+  if (!annex_b_sample_data.empty())
+    samples_data.push_back(annex_b_sample_data);
 
+  return samples_data;
+}
+
+void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
+                               const H26xNaluType* types,
+                               size_t types_count) {
+  // Duration of one 25fps video frame in 90KHz clock units.
+  const uint32_t kMpegTicksPerFrame = 3600;
+
+  TestableEsParser es_parser(
+      codec_type,
+      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
+      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
+
+  int64_t timestamp = 0;
+  for (const auto& sample_data :
+       BuildSamplesData(codec_type, types, types_count)) {
+    // This may process the previous sample; but since we don't know whether
+    // we are at the end yet, this will not process the current sample until
+    // later.
+    size_t offset = 0;
+    size_t size = 1;
+    while (offset < sample_data.size()) {
+      // Insert the data in parts to test partial data searches.
+      size = std::min(size + 1, sample_data.size() - offset);
+      ASSERT_TRUE(es_parser.Parse(&sample_data[offset], static_cast<int>(size),
+                                  timestamp, timestamp));
+      offset += size;
+    }
+    timestamp += kMpegTicksPerFrame;
+  }
   es_parser.Flush();
 }
 
@@ -350,6 +379,55 @@ TEST_F(EsParserH26xTest, H264BasicSupport) {
   EXPECT_TRUE(has_stream_info_);
 }
 
+// This is not compliant with the H264 spec, but VLC generates streams like
+// this. See https://github.com/google/shaka-packager/issues/526 for details.
+TEST_F(EsParserH26xTest, H264AudInAccessUnit) {
+  // clang-format off
+  const H26xNaluType kData[] = {
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+  };
+  // clang-format on
+
+  TestableEsParser es_parser(
+      Nalu::kH264,
+      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
+      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
+
+  size_t sample_index = 0;
+  for (const auto& sample_data :
+       BuildSamplesData(Nalu::kH264, kData, arraysize(kData))) {
+    // Duration of one 25fps video frame in 90KHz clock units.
+    const uint32_t kMpegTicksPerFrame = 3600;
+    const int64_t timestamp = kMpegTicksPerFrame * sample_index;
+    ASSERT_TRUE(es_parser.Parse(sample_data.data(),
+                                static_cast<int>(sample_data.size()), timestamp,
+                                timestamp));
+    sample_index++;
+
+    // The number of emitted samples may be less than the number of samples
+    // pushed to the EsParser, since samples can be cached internally before
+    // being emitted.
+    // The delay is at most 2 samples in our current implementation.
+    const size_t kExpectedMaxDelay = 2;
+    EXPECT_NEAR(sample_index, sample_count_, kExpectedMaxDelay);
+  }
+
+  es_parser.Flush();
+  EXPECT_EQ(sample_index, sample_count_);
+  EXPECT_TRUE(has_stream_info_);
+}
+
 TEST_F(EsParserH26xTest, H264DeterminesAccessUnitsWithoutAUD) {
   const H26xNaluType kData[] = {
     kSeparator, kH264Sps, kH264VclKeyFrame,