Work around extra AUDs in the access unit

VLC seems to generate access units with extra AUDs. In #526, the below
sequence is seen:

  AUD | SPS | PPS | SPS | PPS | AUD | SEI | SEI | SEI | IDR_SLICE

Previously we exited early upon seeing an AUD, which resulted in delayed
processing of the access unit.

The behavior is changed to continue processing the following NAL units
to work around the content issue.

Closes #526.

Change-Id: I80f571c0711c6db1337eb393fce36fae5432b6c5
This commit is contained in:
KongQun Yang 2018-12-10 14:12:01 -08:00
parent 89611a526b
commit 3f7ecd4e29
4 changed files with 131 additions and 40 deletions

View File

@ -64,4 +64,5 @@ These are the available fields:
stream. stream.
.. include:: /options/drm_stream_descriptors.rst .. include:: /options/drm_stream_descriptors.rst
.. include:: /options/dash_stream_descriptors.rst
.. include:: /options/hls_stream_descriptors.rst .. include:: /options/hls_stream_descriptors.rst

View File

@ -110,7 +110,7 @@ std::string LanguageToShortestForm(const std::string& language) {
// This could happen legitimately for languages which have no 2-letter code, // This could happen legitimately for languages which have no 2-letter code,
// but that would imply that the input language code is a 3-letter code. // but that would imply that the input language code is a 3-letter code.
DCHECK_EQ(3u, main_language.size()); DCHECK_EQ(3u, main_language.size()) << main_language;
return main_language + subtag; return main_language + subtag;
} }

View File

@ -62,6 +62,14 @@ bool EsParserH26x::Parse(const uint8_t* buf,
// Link the end of the byte queue with the incoming timing descriptor. // Link the end of the byte queue with the incoming timing descriptor.
timing_desc_list_.push_back( timing_desc_list_.push_back(
std::pair<int64_t, TimingDesc>(es_queue_->tail(), timing_desc)); std::pair<int64_t, TimingDesc>(es_queue_->tail(), timing_desc));
// Warns if there are a large number of cached timestamps, which should be 1
// or 2 if everything works as expected.
const size_t kWarningSize =
24; // An arbitrary number (it is 1 second for a fps of 24).
LOG_IF(WARNING, timing_desc_list_.size() >= kWarningSize)
<< "Unusually large number of cached timestamps ("
<< timing_desc_list_.size() << ").";
} }
// Add the incoming bytes to the ES queue. // Add the incoming bytes to the ES queue.
@ -209,8 +217,10 @@ bool EsParserH26x::ParseInternal() {
// AUD shall be the first NAL unit if present. There shall be at most one // AUD shall be the first NAL unit if present. There shall be at most one
// AUD in any access unit. We can emit the current access unit which shall // AUD in any access unit. We can emit the current access unit which shall
// not contain the AUD. // not contain the AUD.
if (nalu.is_aud()) if (nalu.is_aud()) {
return EmitCurrentAccessUnit(); RCHECK(EmitCurrentAccessUnit());
continue;
}
// We can only determine if the current access unit ends after seeing // We can only determine if the current access unit ends after seeing
// another VCL NAL unit. // another VCL NAL unit.
@ -281,7 +291,9 @@ bool EsParserH26x::EmitFrame(int64_t access_unit_pos,
// Emit a frame. // Emit a frame.
DVLOG(LOG_LEVEL_ES) << "Emit frame: stream_pos=" << access_unit_pos DVLOG(LOG_LEVEL_ES) << "Emit frame: stream_pos=" << access_unit_pos
<< " size=" << access_unit_size; << " size=" << access_unit_size << " pts "
<< current_timing_desc.pts << " timing_desc_list size "
<< timing_desc_list_.size();
int es_size; int es_size;
const uint8_t* es; const uint8_t* es;
es_queue_->PeekAt(access_unit_pos, &es, &es_size); es_queue_->PeekAt(access_unit_pos, &es, &es_size);

View File

@ -163,6 +163,11 @@ class EsParserH26xTest : public testing::Test {
const H26xNaluType* types, const H26xNaluType* types,
size_t types_count); size_t types_count);
// Returns the vector of samples data.
std::vector<std::vector<uint8_t>> BuildSamplesData(Nalu::CodecType codec_type,
const H26xNaluType* types,
size_t types_count);
void EmitSample(uint32_t pid, const std::shared_ptr<MediaSample>& sample) { void EmitSample(uint32_t pid, const std::shared_ptr<MediaSample>& sample) {
size_t sample_id = sample_count_; size_t sample_id = sample_count_;
sample_count_++; sample_count_++;
@ -186,27 +191,29 @@ class EsParserH26xTest : public testing::Test {
bool has_stream_info_; bool has_stream_info_;
}; };
void EsParserH26xTest::RunTest(Nalu::CodecType codec_type, // Return AnnexB samples data and stores NAL Unit samples data in |samples_|,
// which is what will be returned from |EsParser|.
std::vector<std::vector<uint8_t>> EsParserH26xTest::BuildSamplesData(
Nalu::CodecType codec_type,
const H26xNaluType* types, const H26xNaluType* types,
size_t types_count) { size_t types_count) {
// Duration of one 25fps video frame in 90KHz clock units. std::vector<std::vector<uint8_t>> samples_data;
const uint32_t kMpegTicksPerFrame = 3600;
const uint8_t kStartCode[] = {0x00, 0x00, 0x01}; const uint8_t kStartCode[] = {0x00, 0x00, 0x01};
TestableEsParser es_parser(
codec_type,
base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
bool seen_key_frame = false; bool seen_key_frame = false;
std::vector<uint8_t> cur_sample_data; std::vector<uint8_t> nal_unit_sample_data;
ASSERT_EQ(kSeparator, types[0]); std::vector<uint8_t> annex_b_sample_data;
CHECK_EQ(kSeparator, types[0]);
for (size_t k = 1; k < types_count; k++) { for (size_t k = 1; k < types_count; k++) {
if (types[k] == kSeparator) { if (types[k] == kSeparator) {
// We should not be emitting samples until we see a key frame. // We should not be emitting samples until we see a key frame.
if (seen_key_frame) if (seen_key_frame)
samples_.push_back(cur_sample_data); samples_.push_back(nal_unit_sample_data);
cur_sample_data.clear(); if (!annex_b_sample_data.empty())
samples_data.push_back(annex_b_sample_data);
nal_unit_sample_data.clear();
annex_b_sample_data.clear();
} else { } else {
if (codec_type == Nalu::kH264) { if (codec_type == Nalu::kH264) {
if (types[k] == kH264VclKeyFrame) if (types[k] == kH264VclKeyFrame)
@ -218,34 +225,56 @@ void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
std::vector<uint8_t> es_data = std::vector<uint8_t> es_data =
CreateNalu(codec_type, types[k], static_cast<uint8_t>(k)); CreateNalu(codec_type, types[k], static_cast<uint8_t>(k));
cur_sample_data.push_back(0);
cur_sample_data.push_back(0); nal_unit_sample_data.push_back(0);
cur_sample_data.push_back(0); nal_unit_sample_data.push_back(0);
cur_sample_data.push_back(static_cast<uint8_t>(es_data.size())); nal_unit_sample_data.push_back(0);
cur_sample_data.insert(cur_sample_data.end(), es_data.begin(), nal_unit_sample_data.push_back(static_cast<uint8_t>(es_data.size()));
nal_unit_sample_data.insert(nal_unit_sample_data.end(), es_data.begin(),
es_data.end()); es_data.end());
es_data.insert(es_data.begin(), kStartCode, es_data.insert(es_data.begin(), kStartCode,
kStartCode + arraysize(kStartCode)); kStartCode + arraysize(kStartCode));
annex_b_sample_data.insert(annex_b_sample_data.end(), es_data.begin(),
es_data.end());
}
}
if (seen_key_frame)
samples_.push_back(nal_unit_sample_data);
if (!annex_b_sample_data.empty())
samples_data.push_back(annex_b_sample_data);
const int64_t pts = k * kMpegTicksPerFrame; return samples_data;
const int64_t dts = k * kMpegTicksPerFrame; }
void EsParserH26xTest::RunTest(Nalu::CodecType codec_type,
const H26xNaluType* types,
size_t types_count) {
// Duration of one 25fps video frame in 90KHz clock units.
const uint32_t kMpegTicksPerFrame = 3600;
TestableEsParser es_parser(
codec_type,
base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
int64_t timestamp = 0;
for (const auto& sample_data :
BuildSamplesData(codec_type, types, types_count)) {
// This may process the previous sample; but since we don't know whether // This may process the previous sample; but since we don't know whether
// we are at the end yet, this will not process the current sample until // we are at the end yet, this will not process the current sample until
// later. // later.
size_t offset = 0; size_t offset = 0;
size_t size = 1; size_t size = 1;
while (offset < es_data.size()) { while (offset < sample_data.size()) {
// Insert the data in parts to test partial data searches. // Insert the data in parts to test partial data searches.
size = std::min(size + 1, es_data.size() - offset); size = std::min(size + 1, sample_data.size() - offset);
ASSERT_TRUE(es_parser.Parse(&es_data[offset], static_cast<int>(size), ASSERT_TRUE(es_parser.Parse(&sample_data[offset], static_cast<int>(size),
pts, dts)); timestamp, timestamp));
offset += size; offset += size;
} }
timestamp += kMpegTicksPerFrame;
} }
}
if (seen_key_frame)
samples_.push_back(cur_sample_data);
es_parser.Flush(); es_parser.Flush();
} }
@ -350,6 +379,55 @@ TEST_F(EsParserH26xTest, H264BasicSupport) {
EXPECT_TRUE(has_stream_info_); EXPECT_TRUE(has_stream_info_);
} }
// Verifies that access units containing an extra AUD are still emitted
// promptly. Such streams are not compliant with the H.264 spec, but VLC
// generates them. See https://github.com/google/shaka-packager/issues/526 for
// details.
TEST_F(EsParserH26xTest, H264AudInAccessUnit) {
// Each kSeparator starts a new sample. The key-frame samples carry a second
// AUD between the SPS and the VCL NAL unit, which is the non-compliant case
// under test.
// clang-format off
const H26xNaluType kData[] = {
kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Sps, kH264Aud, kH264VclKeyFrame,
kSeparator, kH264Aud, kH264Vcl,
kSeparator, kH264Aud, kH264Vcl,
};
// clang-format on
// The parser reports configs and samples back to this fixture through the
// NewVideoConfig and EmitSample callbacks.
TestableEsParser es_parser(
Nalu::kH264,
base::Bind(&EsParserH26xTest::NewVideoConfig, base::Unretained(this)),
base::Bind(&EsParserH26xTest::EmitSample, base::Unretained(this)));
// Number of samples pushed to the parser so far.
size_t sample_index = 0;
for (const auto& sample_data :
BuildSamplesData(Nalu::kH264, kData, arraysize(kData))) {
// Duration of one 25fps video frame in 90KHz clock units.
const uint32_t kMpegTicksPerFrame = 3600;
// The same value is passed for both timestamp arguments of Parse().
const int64_t timestamp = kMpegTicksPerFrame * sample_index;
// Each sample is pushed to the parser whole, in a single Parse() call.
ASSERT_TRUE(es_parser.Parse(sample_data.data(),
static_cast<int>(sample_data.size()), timestamp,
timestamp));
sample_index++;
// The number of emitted samples may lag behind the number of samples
// pushed to the EsParser, since samples could be cached internally before
// being emitted.
// The delay is at most 2 in our current implementation.
const size_t kExpectedMaxDelay = 2;
EXPECT_NEAR(sample_index, sample_count_, kExpectedMaxDelay);
}
// After flushing, every pushed sample is expected to have been emitted.
es_parser.Flush();
EXPECT_EQ(sample_index, sample_count_);
EXPECT_TRUE(has_stream_info_);
}
TEST_F(EsParserH26xTest, H264DeterminesAccessUnitsWithoutAUD) { TEST_F(EsParserH26xTest, H264DeterminesAccessUnitsWithoutAUD) {
const H26xNaluType kData[] = { const H26xNaluType kData[] = {
kSeparator, kH264Sps, kH264VclKeyFrame, kSeparator, kH264Sps, kH264VclKeyFrame,