From af7d6a792174a48cd8f033add7683c47de5012a1 Mon Sep 17 00:00:00 2001 From: KongQun Yang Date: Mon, 14 Dec 2015 12:33:18 -0800 Subject: [PATCH] Estimate duration of last sample in cluster from next cluster Change-Id: I7dbc4045d366bbfb0c12f9652ffe97b8fcf447cf --- .../media/formats/webm/webm_cluster_parser.cc | 60 +++---- .../media/formats/webm/webm_cluster_parser.h | 10 +- .../webm/webm_cluster_parser_unittest.cc | 161 ++++++++---------- .../media/formats/webm/webm_media_parser.cc | 2 +- 4 files changed, 98 insertions(+), 135 deletions(-) diff --git a/packager/media/formats/webm/webm_cluster_parser.cc b/packager/media/formats/webm/webm_cluster_parser.cc index 7d4e547170..fdf8a93238 100644 --- a/packager/media/formats/webm/webm_cluster_parser.cc +++ b/packager/media/formats/webm/webm_cluster_parser.cc @@ -93,6 +93,13 @@ void WebMClusterParser::Reset() { ResetTextTracks(); } +void WebMClusterParser::Flush() { + // Estimate the duration of the last frame if necessary. + audio_.ApplyDurationEstimateIfNeeded(); + video_.ApplyDurationEstimateIfNeeded(); + Reset(); +} + int WebMClusterParser::Parse(const uint8_t* buf, int size) { int result = parser_.Parse(buf, size); @@ -103,9 +110,6 @@ int WebMClusterParser::Parse(const uint8_t* buf, int size) { cluster_ended_ = parser_.IsParsingComplete(); if (cluster_ended_) { - audio_.ApplyDurationEstimateIfNeeded(); - video_.ApplyDurationEstimateIfNeeded(); - // If there were no buffers in this cluster, set the cluster start time to // be the |cluster_timecode_|. if (cluster_start_time_ == kNoTimestamp) { @@ -155,7 +159,7 @@ int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) { static const uint8_t kTocConfigMask = 0xf8; static const uint8_t kTocFrameCountCodeMask = 0x03; static const uint8_t kFrameCountMask = 0x3f; - static const int64_t kPacketDurationMax = 120; + static const int64_t kPacketDurationMaxMs = 120000; if (size < 1) { LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) @@ -209,14 +213,14 @@ int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) { DCHECK_GT(frame_count, 0); int64_t duration = kOpusFrameDurationsMu[opusConfig] * frame_count; - if (duration > kPacketDurationMax) { + if (duration > kPacketDurationMaxMs * 1000) { // Intentionally allowing packet to pass through for now. Decoder should // either handle or fail gracefully. LOG as breadcrumbs in case // things go sideways. LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "Warning, demuxed Opus packet with encoded duration: " - << duration << "ms. Should be no greater than " - << kPacketDurationMax << "ms."; + << duration / 1000 << "ms. Should be no greater than " + << kPacketDurationMaxMs << "ms."; } return duration; @@ -496,13 +500,11 @@ bool WebMClusterParser::OnBlock(bool is_simple_block, // TrackEntry->DefaultDuration when available. This layering violation is a // workaround for http://crbug.com/396634, decreasing the likelihood of // fall-back to rough estimation techniques for Blocks that lack a - // BlockDuration at the end of a cluster. Cross cluster durations are not - // feasible given flexibility of cluster ordering and MSE APIs. Duration - // estimation may still apply in cases of encryption and codecs for which - // we do not extract encoded duration. Within a cluster, estimates are applied - // as Block Timecode deltas, or once the whole cluster is parsed in the case - // of the last Block in the cluster. See Track::EmitBuffer and - // ApplyDurationEstimateIfNeeded(). + // BlockDuration at the end of a cluster. Duration estimation may still apply + // in cases of encryption and codecs for which we do not extract encoded + // duration. Estimates are applied as Block Timecode deltas, or once the whole + // stream is parsed in the case of the last Block in the stream. See + // Track::EmitBuffer and ApplyDurationEstimateIfNeeded(). if (encoded_duration != kNoTimestamp) { DCHECK(encoded_duration != kInfiniteDuration); DCHECK(encoded_duration > 0); @@ -518,9 +520,9 @@ bool WebMClusterParser::OnBlock(bool is_simple_block, const auto kWarnDurationDiff = timecode_multiplier_ * 2; if (duration_difference > kWarnDurationDiff) { LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) - << "BlockDuration (" << block_duration_time_delta + << "BlockDuration (" << block_duration_time_delta / 1000 << "ms) differs significantly from encoded duration (" - << encoded_duration << "ms)."; + << encoded_duration / 1000 << "ms)."; } } } else if (block_duration_time_delta != kNoTimestamp) { @@ -589,16 +591,8 @@ void WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() { int64_t estimated_duration = GetDurationEstimate(); last_added_buffer_missing_duration_->set_duration(estimated_duration); - if (is_video_) { - // Exposing estimation so splicing/overlap frame processing can make - // informed decisions downstream. - // TODO(kqyang): Should we wait for the next cluster to set the duration? - // last_added_buffer_missing_duration_->set_is_duration_estimated(true); - } - LIMITED_LOG(INFO, num_duration_estimates_, kMaxDurationEstimateLogs) - << "Estimating WebM block duration to be " - << estimated_duration + << "Estimating WebM block duration to be " << estimated_duration / 1000 << "ms for the last (Simple)Block in the Cluster for this Track. Use " "BlockGroups with BlockDurations at the end of each Track in a " "Cluster to avoid estimation."; @@ -653,25 +647,15 @@ bool WebMClusterParser::Track::EmitBufferHelp( return false; } - // The estimated frame duration is the minimum (for audio) or the maximum - // (for video) non-zero duration since the last initialization segment. The - // minimum is used for audio to ensure frame durations aren't overestimated, - // triggering unnecessary frame splicing. For video, splicing does not apply, - // so maximum is used and overlap is simply resolved by showing the - // later of the overlapping frames at its given PTS, effectively trimming down - // the over-estimated duration of the previous frame. - // TODO: Use max for audio and disable splicing whenever estimated buffers are - // encountered. + // The estimated frame duration is the maximum non-zero duration since the + // last initialization segment. if (duration > 0) { int64_t orig_duration_estimate = estimated_next_frame_duration_; if (estimated_next_frame_duration_ == kNoTimestamp) { estimated_next_frame_duration_ = duration; - } else if (is_video_) { - estimated_next_frame_duration_ = - std::max(duration, estimated_next_frame_duration_); } else { estimated_next_frame_duration_ = - std::min(duration, estimated_next_frame_duration_); + std::max(duration, estimated_next_frame_duration_); } if (orig_duration_estimate != estimated_next_frame_duration_) { diff --git a/packager/media/formats/webm/webm_cluster_parser.h b/packager/media/formats/webm/webm_cluster_parser.h index cc7524af59..2f73b524f9 100644 --- a/packager/media/formats/webm/webm_cluster_parser.h +++ b/packager/media/formats/webm/webm_cluster_parser.h @@ -102,10 +102,8 @@ class WebMClusterParser : public WebMParserClient { int64_t default_duration_; // If kNoTimestamp, then a default value will be used. This estimate is the - // maximum (for video), or minimum (for audio) duration seen so far for this - // track, and is used only if |default_duration_| is kNoTimestamp. - // TODO: Use maximum for audio too, adding checks to disable splicing when - // these estimates are observed in SourceBufferStream. + // maximum duration seen so far for this track, and is used only if + // |default_duration_| is kNoTimestamp. int64_t estimated_next_frame_duration_; MediaParser::NewSampleCB new_sample_cb_; @@ -130,6 +128,10 @@ class WebMClusterParser : public WebMParserClient { /// Resets the parser state so it can accept a new cluster. void Reset(); + /// Flush data currently in the parser and reset the parser so it can accept a + /// new cluster. + void Flush(); + /// Parses a WebM cluster element in |buf|. /// @return -1 if the parse fails. /// @return 0 if more data is needed. diff --git a/packager/media/formats/webm/webm_cluster_parser_unittest.cc b/packager/media/formats/webm/webm_cluster_parser_unittest.cc index 7f9111197e..83f91a186f 100644 --- a/packager/media/formats/webm/webm_cluster_parser_unittest.cc +++ b/packager/media/formats/webm/webm_cluster_parser_unittest.cc @@ -417,8 +417,8 @@ TEST_F(WebMClusterParserTest, TracksWithSampleMissingDuration) { {kAudioTrackNum, 36, kTestAudioFrameDefaultDurationInMs, true, NULL, 0}, {kVideoTrackNum, 33, 33, true, NULL, 0}, {kAudioTrackNum, 70, kTestAudioFrameDefaultDurationInMs, true, NULL, 0}, - {kVideoTrackNum, 66, kExpectedVideoEstimationInMs, true, NULL, 0}, {kAudioTrackNum, 83, kTestAudioFrameDefaultDurationInMs, true, NULL, 0}, + {kVideoTrackNum, 66, kExpectedVideoEstimationInMs, true, NULL, 0}, }; const int kExpectedBuffersOnPartialCluster[] = { 0, // Video simple block without DefaultDuration should be held back @@ -429,46 +429,32 @@ TEST_F(WebMClusterParserTest, TracksWithSampleMissingDuration) { 5, // 3rd audio ready 6, // 2nd video emitted, 3rd video held back with no duration 7, // 4th audio ready - 9, // Cluster end emits all buffers and 3rd video's duration is estimated + 8, // 5th audio ready }; ASSERT_EQ(arraysize(kBlockInfo), arraysize(kExpectedBuffersOnPartialCluster)); int block_count = arraysize(kBlockInfo); - // Iteratively create a cluster containing the first N+1 blocks and parse all - // but the last byte of the cluster (except when N==|block_count|, just parse - // the whole cluster). Verify that the corresponding entry in + // Iteratively create a cluster containing the first N+1 blocks and parse the + // cluster. Verify that the corresponding entry in // |kExpectedBuffersOnPartialCluster| identifies the exact subset of // |kBlockInfo| returned by the parser. for (int i = 0; i < block_count; ++i) { - if (i > 0) - parser_->Reset(); - // Since we don't know exactly the offsets of each block in the full - // cluster, build a cluster with exactly one additional block so that - // parse of all but one byte should deterministically parse all but the - // last full block. Don't |exceed block_count| blocks though. - int blocks_in_cluster = std::min(i + 2, block_count); - scoped_ptr cluster(CreateCluster(0, kBlockInfo, - blocks_in_cluster)); - // Parse all but the last byte unless we need to parse the full cluster. - bool parse_full_cluster = i == (block_count - 1); + parser_->Reset(); - int result = parser_->Parse(cluster->data(), parse_full_cluster ? - cluster->size() : cluster->size() - 1); - if (parse_full_cluster) { - DVLOG(1) << "Verifying parse result of full cluster of " - << blocks_in_cluster << " blocks"; - EXPECT_EQ(cluster->size(), result); - } else { - DVLOG(1) << "Verifying parse result of cluster of " - << blocks_in_cluster << " blocks with last block incomplete"; - EXPECT_GT(cluster->size(), result); - EXPECT_LT(0, result); - } + const int blocks_in_cluster = i + 1; + scoped_ptr cluster( + CreateCluster(0, kBlockInfo, blocks_in_cluster)); + EXPECT_EQ(cluster->size(), + parser_->Parse(cluster->data(), cluster->size())); EXPECT_TRUE( VerifyBuffers(kExpectedBlockInfo, kExpectedBuffersOnPartialCluster[i])); } + + // The last (3rd) video is emitted on flush with duration estimated. + parser_->Flush(); + EXPECT_TRUE(VerifyBuffers(&kExpectedBlockInfo[block_count - 1], 1)); } TEST_F(WebMClusterParserTest, Reset) { @@ -611,6 +597,7 @@ TEST_F(WebMClusterParserTest, IgnoredTracks) { int result = parser_->Parse(cluster->data(), cluster->size()); EXPECT_EQ(cluster->size(), result); + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kOutputBlockInfo, output_block_count)); } @@ -640,6 +627,7 @@ TEST_F(WebMClusterParserTest, ParseTextTracks) { int result = parser_->Parse(cluster->data(), cluster->size()); EXPECT_EQ(cluster->size(), result); + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kInputBlockInfo, input_block_count)); } @@ -718,6 +706,7 @@ TEST_F(WebMClusterParserTest, ParseEncryptedBlock) { int result = parser_->Parse(cluster->data(), cluster->size()); EXPECT_EQ(cluster->size(), result); + parser_->Flush(); ASSERT_EQ(1UL, video_buffers_.size()); scoped_refptr buffer = video_buffers_[0]; VerifyEncryptedBuffer(buffer); @@ -811,59 +800,51 @@ TEST_F(WebMClusterParserTest, ParseWithoutAnyDurationsSimpleBlocks) { InSequence s; // Absent DefaultDuration information, SimpleBlock durations are derived from - // inter-buffer track timestamp delta if within the cluster. Duration for the - // last block in a cluster is estimated independently for each track in the - // cluster. For video tracks we use the maximum seen so far. For audio we use - // the the minimum. - // TODO: Move audio over to use the maximum. + // inter-buffer track timestamp delta either within or across clusters. + // Duration for the last block is estimated independently for each track when + // Flush() is called. We use the maximum seen so far for estimation. - const int kExpectedAudioEstimationInMs = 22; - const int kExpectedVideoEstimationInMs = 34; const BlockInfo kBlockInfo1[] = { {kAudioTrackNum, 0, 23, true, NULL, 0}, {kAudioTrackNum, 23, 22, true, NULL, 0}, {kVideoTrackNum, 33, 33, true, NULL, 0}, {kAudioTrackNum, 45, 23, true, NULL, 0}, {kVideoTrackNum, 66, 34, true, NULL, 0}, - {kAudioTrackNum, 68, kExpectedAudioEstimationInMs, true, NULL, 0}, - {kVideoTrackNum, 100, kExpectedVideoEstimationInMs, true, NULL, 0}, + {kAudioTrackNum, 68, 24, true, NULL, 0}, + {kVideoTrackNum, 100, 35, true, NULL, 0}, }; int block_count1 = arraysize(kBlockInfo1); scoped_ptr cluster1(CreateCluster(0, kBlockInfo1, block_count1)); - // Send slightly less than the first full cluster so all but the last video - // block is parsed. Verify the last fully parsed audio and video buffer are - // both missing from the result (parser should hold them aside for duration - // estimation prior to end of cluster detection in the absence of - // DefaultDurations.) - int result = parser_->Parse(cluster1->data(), cluster1->size() - 1); - EXPECT_GT(result, 0); - EXPECT_LT(result, cluster1->size()); + // Verify the last fully parsed audio and video buffer are both missing from + // the result (parser should hold them aside for duration estimation until + // Flush() called in the absence of DefaultDurations). + EXPECT_EQ(cluster1->size(), + parser_->Parse(cluster1->data(), cluster1->size())); EXPECT_EQ(3UL, audio_buffers_.size()); - EXPECT_EQ(1UL, video_buffers_.size()); - ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 3)); - - parser_->Reset(); - - // Now parse the full first cluster and verify all the blocks are parsed. - result = parser_->Parse(cluster1->data(), cluster1->size()); - EXPECT_EQ(cluster1->size(), result); - ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1)); + EXPECT_EQ(2UL, video_buffers_.size()); + ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 2)); // Verify that the estimated frame duration is tracked across clusters for // each track. + const int kExpectedAudioEstimationInMs = 24; + const int kExpectedVideoEstimationInMs = 35; const BlockInfo kBlockInfo2[] = { - // Estimate carries over across clusters - {kAudioTrackNum, 200, kExpectedAudioEstimationInMs, true, NULL, 0}, - // Estimate carries over across clusters - {kVideoTrackNum, 201, kExpectedVideoEstimationInMs, true, NULL, 0}, + {kAudioTrackNum, 92, kExpectedAudioEstimationInMs, true, NULL, 0}, + {kVideoTrackNum, 135, kExpectedVideoEstimationInMs, true, NULL, 0}, }; int block_count2 = arraysize(kBlockInfo2); scoped_ptr cluster2(CreateCluster(0, kBlockInfo2, block_count2)); - result = parser_->Parse(cluster2->data(), cluster2->size()); - EXPECT_EQ(cluster2->size(), result); + EXPECT_EQ(cluster2->size(), + parser_->Parse(cluster2->data(), cluster2->size())); + + // Verify that remaining blocks of cluster1 are emitted. + ASSERT_TRUE(VerifyBuffers(&kBlockInfo1[block_count1 - 2], 2)); + + // Now flush and verify blocks in cluster2 are emitted. + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kBlockInfo2, block_count2)); } @@ -871,57 +852,51 @@ TEST_F(WebMClusterParserTest, ParseWithoutAnyDurationsBlockGroups) { InSequence s; // Absent DefaultDuration and BlockDuration information, BlockGroup block - // durations are derived from inter-buffer track timestamp delta if within the - // cluster. Duration for the last block in a cluster is estimated - // independently for each track in the cluster. For video tracks we use the - // maximum seen so far. For audio we use the the minimum. - // TODO: Move audio over to use the maximum. + // durations are derived from inter-buffer track timestamp delta either within + // or across clusters. Duration for the last block is estimated independently + // for each track when Flush() is called. We use the maximum seen so far. - const int kExpectedAudioEstimationInMs = 22; - const int kExpectedVideoEstimationInMs = 34; const BlockInfo kBlockInfo1[] = { {kAudioTrackNum, 0, -23, false, NULL, 0}, {kAudioTrackNum, 23, -22, false, NULL, 0}, {kVideoTrackNum, 33, -33, false, NULL, 0}, {kAudioTrackNum, 45, -23, false, NULL, 0}, {kVideoTrackNum, 66, -34, false, NULL, 0}, - {kAudioTrackNum, 68, -kExpectedAudioEstimationInMs, false, NULL, 0}, - {kVideoTrackNum, 100, -kExpectedVideoEstimationInMs, false, NULL, 0}, + {kAudioTrackNum, 68, -24, false, NULL, 0}, + {kVideoTrackNum, 100, -35, false, NULL, 0}, }; int block_count1 = arraysize(kBlockInfo1); scoped_ptr cluster1(CreateCluster(0, kBlockInfo1, block_count1)); - // Send slightly less than the first full cluster so all but the last video - // block is parsed. Verify the last fully parsed audio and video buffer are - // both missing from the result (parser should hold them aside for duration - // estimation prior to end of cluster detection in the absence of - // DefaultDurations.) - int result = parser_->Parse(cluster1->data(), cluster1->size() - 1); - EXPECT_GT(result, 0); - EXPECT_LT(result, cluster1->size()); + // Verify the last fully parsed audio and video buffer are both missing from + // the result (parser should hold them aside for duration estimation until + // Flush() called in the absence of DefaultDurations). + EXPECT_EQ(cluster1->size(), + parser_->Parse(cluster1->data(), cluster1->size())); EXPECT_EQ(3UL, audio_buffers_.size()); - EXPECT_EQ(1UL, video_buffers_.size()); - ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 3)); - - parser_->Reset(); - - // Now parse the full first cluster and verify all the blocks are parsed. - result = parser_->Parse(cluster1->data(), cluster1->size()); - EXPECT_EQ(cluster1->size(), result); - ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1)); + EXPECT_EQ(2UL, video_buffers_.size()); + ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 2)); // Verify that the estimated frame duration is tracked across clusters for // each track. + const int kExpectedAudioEstimationInMs = 24; + const int kExpectedVideoEstimationInMs = 35; const BlockInfo kBlockInfo2[] = { - {kAudioTrackNum, 200, -kExpectedAudioEstimationInMs, false, NULL, 0}, - {kVideoTrackNum, 201, -kExpectedVideoEstimationInMs, false, NULL, 0}, + {kAudioTrackNum, 92, -kExpectedAudioEstimationInMs, false, NULL, 0}, + {kVideoTrackNum, 135, -kExpectedVideoEstimationInMs, false, NULL, 0}, }; int block_count2 = arraysize(kBlockInfo2); scoped_ptr cluster2(CreateCluster(0, kBlockInfo2, block_count2)); - result = parser_->Parse(cluster2->data(), cluster2->size()); - EXPECT_EQ(cluster2->size(), result); + EXPECT_EQ(cluster2->size(), + parser_->Parse(cluster2->data(), cluster2->size())); + + // Verify that remaining blocks of cluster1 are emitted. + ASSERT_TRUE(VerifyBuffers(&kBlockInfo1[block_count1 - 2], 2)); + + // Now flush and verify blocks in cluster2 are emitted. + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kBlockInfo2, block_count2)); } @@ -958,13 +933,13 @@ TEST_F(WebMClusterParserTest, int result = parser_->Parse(cluster->data(), cluster->size() - 1); EXPECT_GT(result, 0); EXPECT_LT(result, cluster->size()); + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count - 1)); - parser_->Reset(); - // Now parse a whole cluster to verify that all the blocks will get parsed. result = parser_->Parse(cluster->data(), cluster->size()); EXPECT_EQ(cluster->size(), result); + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count)); } @@ -988,6 +963,7 @@ TEST_F(WebMClusterParserTest, scoped_ptr cluster(CreateCluster(0, kBlockInfo, block_count)); int result = parser_->Parse(cluster->data(), cluster->size()); EXPECT_EQ(cluster->size(), result); + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count)); } @@ -1004,6 +980,7 @@ TEST_F(WebMClusterParserTest, scoped_ptr cluster(CreateCluster(0, kBlockInfo, block_count)); int result = parser_->Parse(cluster->data(), cluster->size()); EXPECT_EQ(cluster->size(), result); + parser_->Flush(); ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count)); } diff --git a/packager/media/formats/webm/webm_media_parser.cc b/packager/media/formats/webm/webm_media_parser.cc index 25297e39b6..63c2d6e7bd 100644 --- a/packager/media/formats/webm/webm_media_parser.cc +++ b/packager/media/formats/webm/webm_media_parser.cc @@ -44,7 +44,7 @@ void WebMMediaParser::Flush() { byte_queue_.Reset(); if (cluster_parser_) - cluster_parser_->Reset(); + cluster_parser_->Flush(); if (state_ == kParsingClusters) { ChangeState(kParsingHeaders); }