Work around extra AUD in the access unit

VLC seems to generate access units with extra AUDs. In #526, the following sequence is seen: AUD | SPS | PPS | SPS | PPS | AUD | SEI | SEI | SEI | IDR_SLICE. Previously we exited early when seeing an AUD, which resulted in delayed processing of the access unit. The behavior is changed to continue processing the following NAL units to work around the content issue. Closes #526. Change-Id: I80f571c0711c6db1337eb393fce36fae5432b6c5
2018-12-10 14:12:01 -08:00 · 2018-12-10 14:12:01 -08:00 · 3f7ecd4e29
parent 89611a526b
commit 3f7ecd4e29
4 changed files with 131 additions and 40 deletions
--- a/docs/source/options/stream_descriptors.rst
+++ b/docs/source/options/stream_descriptors.rst
@ -64,4 +64,5 @@ These are the available fields:
    stream.

 .. include:: /options/drm_stream_descriptors.rst
+.. include:: /options/dash_stream_descriptors.rst
 .. include:: /options/hls_stream_descriptors.rst
--- a/packager/media/base/language_utils.cc
+++ b/packager/media/base/language_utils.cc
@ -110,7 +110,7 @@ std::string LanguageToShortestForm(const std::string& language) {

  // This could happen legitimately for languages which have no 2-letter code,
  // but that would imply that the input language code is a 3-letter code.
-  DCHECK_EQ(3u, main_language.size());
+  DCHECK_EQ(3u, main_language.size()) << main_language;
  return main_language + subtag;
 }

--- a/packager/media/formats/mp2t/es_parser_h26x.cc
+++ b/packager/media/formats/mp2t/es_parser_h26x.cc
@ -62,6 +62,14 @@ bool EsParserH26x::Parse(const uint8_t* buf,
    // Link the end of the byte queue with the incoming timing descriptor.
    timing_desc_list_.push_back(
        std::pair<int64_t, TimingDesc>(es_queue_->tail(), timing_desc));
+
+    // Warn if there is a large number of cached timestamps, which should be 1
+    // or 2 if everything works as expected.
+    const size_t kWarningSize =
+        24;  // An arbitrary number (it is 1 second for a fps of 24).
+    LOG_IF(WARNING, timing_desc_list_.size() >= kWarningSize)
+        << "Unusually large number of cached timestamps ("
+        << timing_desc_list_.size() << ").";
  }

  // Add the incoming bytes to the ES queue.
@ -209,8 +217,10 @@ bool EsParserH26x::ParseInternal() {
    // AUD shall be the first NAL unit if present. There shall be at most one
    // AUD in any access unit. We can emit the current access unit which shall
    // not contain the AUD.
-    if (nalu.is_aud())
-      return EmitCurrentAccessUnit();
+    if (nalu.is_aud()) {
+      RCHECK(EmitCurrentAccessUnit());
+      continue;
+    }

    // We can only determine if the current access unit ends after seeing
    // another VCL NAL unit.
@ -281,7 +291,9 @@ bool EsParserH26x::EmitFrame(int64_t access_unit_pos,

  // Emit a frame.
  DVLOG(LOG_LEVEL_ES) << "Emit frame: stream_pos=" << access_unit_pos
-                      << " size=" << access_unit_size;
+                      << " size=" << access_unit_size << " pts "
+                      << current_timing_desc.pts << " timing_desc_list size "
+                      << timing_desc_list_.size();
  int es_size;
  const uint8_t* es;
  es_queue_->PeekAt(access_unit_pos, &es, &es_size);
--- a/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
+++ b/packager/media/formats/mp2t/es_parser_h26x_unittest.cc
@ -163,6 +163,11 @@ class EsParserH26xTest : public testing::Test {
               const H26xNaluType* types,
               size_t types_count);

+  // Returns the vector of AnnexB sample data built from |types|.
+  std::vector<std::vector<uint8_t>> BuildSamplesData(Nalu::CodecType codec_type,
+                                                     const H26xNaluType* types,
+                                                     size_t types_count);
+
  void EmitSample(uint32_t pid, const std::shared_ptr<MediaSample>& sample) {
    size_t sample_id = sample_count_;
    sample_count_++;
@ -186,27 +191,29 @@ class EsParserH26xTest : public testing::Test {
  bool has_stream_info_;
 };

-void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
-                               const H26xNaluType* types,
-                               size_t types_count) {
-  // Duration of one 25fps video frame in 90KHz clock units.
-  const uint32_t kMpegTicksPerFrame = 3600;
+// Returns the AnnexB sample data and stores the NAL unit sample data in
+// |samples_|, which is what will be returned from |EsParser|.
+std::vector<std::vector<uint8_t>> EsParserH26xTest::BuildSamplesData(
+    Nalu::CodecType codec_type,
+    const H26xNaluType* types,
+    size_t types_count) {
+  std::vector<std::vector<uint8_t>> samples_data;
+
  const uint8_t kStartCode[] = {0x00, 0x00, 0x01};

-  TestableEsParser es_parser(
-      codec_type,
-      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
-      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
-
  bool seen_key_frame = false;
-  std::vector<uint8_t> cur_sample_data;
-  ASSERT_EQ(kSeparator, types[0]);
+  std::vector<uint8_t> nal_unit_sample_data;
+  std::vector<uint8_t> annex_b_sample_data;
+  CHECK_EQ(kSeparator, types[0]);
  for (size_t k = 1; k < types_count; k++) {
    if (types[k] == kSeparator) {
      // We should not be emitting samples until we see a key frame.
      if (seen_key_frame)
-        samples_.push_back(cur_sample_data);
-      cur_sample_data.clear();
+        samples_.push_back(nal_unit_sample_data);
+      if (!annex_b_sample_data.empty())
+        samples_data.push_back(annex_b_sample_data);
+      nal_unit_sample_data.clear();
+      annex_b_sample_data.clear();
    } else {
      if (codec_type == Nalu::kH264) {
        if (types[k] == kH264VclKeyFrame)
@ -218,34 +225,56 @@ void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,

      std::vector<uint8_t> es_data =
          CreateNalu(codec_type, types[k], static_cast<uint8_t>(k));
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(0);
-      cur_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
-      cur_sample_data.insert(cur_sample_data.end(), es_data.begin(),
-                             es_data.end());
+
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(0);
+      nal_unit_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
+      nal_unit_sample_data.insert(nal_unit_sample_data.end(), es_data.begin(),
+                                  es_data.end());
+
      es_data.insert(es_data.begin(), kStartCode,
                     kStartCode + arraysize(kStartCode));
-
-      const int64_t pts = k * kMpegTicksPerFrame;
-      const int64_t dts = k * kMpegTicksPerFrame;
-      // This may process the previous sample; but since we don't know whether
-      // we are at the end yet, this will not process the current sample until
-      // later.
-      size_t offset = 0;
-      size_t size = 1;
-      while (offset < es_data.size()) {
-        // Insert the data in parts to test partial data searches.
-        size = std::min(size + 1, es_data.size() - offset);
-        ASSERT_TRUE(es_parser.Parse(&es_data[offset], static_cast<int>(size),
-                                    pts, dts));
-        offset += size;
-      }
+      annex_b_sample_data.insert(annex_b_sample_data.end(), es_data.begin(),
+                                 es_data.end());
    }
  }
  if (seen_key_frame)
-    samples_.push_back(cur_sample_data);
+    samples_.push_back(nal_unit_sample_data);
+  if (!annex_b_sample_data.empty())
+    samples_data.push_back(annex_b_sample_data);

+  return samples_data;
+}
+
+void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
+                               const H26xNaluType* types,
+                               size_t types_count) {
+  // Duration of one 25fps video frame in 90KHz clock units.
+  const uint32_t kMpegTicksPerFrame = 3600;
+
+  TestableEsParser es_parser(
+      codec_type,
+      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
+      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
+
+  int64_t timestamp = 0;
+  for (const auto& sample_data :
+       BuildSamplesData(codec_type, types, types_count)) {
+    // This may process the previous sample; but since we don't know whether
+    // we are at the end yet, this will not process the current sample until
+    // later.
+    size_t offset = 0;
+    size_t size = 1;
+    while (offset < sample_data.size()) {
+      // Insert the data in parts to test partial data searches.
+      size = std::min(size + 1, sample_data.size() - offset);
+      ASSERT_TRUE(es_parser.Parse(&sample_data[offset], static_cast<int>(size),
+                                  timestamp, timestamp));
+      offset += size;
+    }
+    timestamp += kMpegTicksPerFrame;
+  }
  es_parser.Flush();
 }

@ -350,6 +379,55 @@ TEST_F(EsParserH26xTest, H264BasicSupport) {
  EXPECT_TRUE(has_stream_info_);
 }

+// This is not compliant with the H.264 spec, but VLC generates streams like
+// this. See https://github.com/google/shaka-packager/issues/526 for details.
+TEST_F(EsParserH26xTest, H264AudInAccessUnit) {
+  // clang-format off
+  const H26xNaluType kData[] = {
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
+    kSeparator, kH264Aud, kH264Vcl,
+    kSeparator, kH264Aud, kH264Vcl,
+  };
+  // clang-format on
+
+  TestableEsParser es_parser(
+      Nalu::kH264,
+      base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
+      base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
+
+  size_t sample_index = 0;
+  for (const auto& sample_data :
+       BuildSamplesData(Nalu::kH264, kData, arraysize(kData))) {
+    // Duration of one 25fps video frame in 90KHz clock units.
+    const uint32_t kMpegTicksPerFrame = 3600;
+    const int64_t timestamp = kMpegTicksPerFrame * sample_index;
+    ASSERT_TRUE(es_parser.Parse(sample_data.data(),
+                                static_cast<int>(sample_data.size()), timestamp,
+                                timestamp));
+    sample_index++;
+
+    // The number of emitted samples can be less than the number of samples
+    // pushed to the EsParser, since samples could be cached internally before
+    // being emitted.
+    // The delay is at most 2 in our current implementation.
+    const size_t kExpectedMaxDelay = 2;
+    EXPECT_NEAR(sample_index, sample_count_, kExpectedMaxDelay);
+  }
+
+  es_parser.Flush();
+  EXPECT_EQ(sample_index, sample_count_);
+  EXPECT_TRUE(has_stream_info_);
+}
+
 TEST_F(EsParserH26xTest, H264DeterminesAccessUnitsWithoutAUD) {
  const H26xNaluType kData[] = {
    kSeparator, kH264Sps, kH264VclKeyFrame,