Estimate duration of last sample in cluster from next cluster

Change-Id: I7dbc4045d366bbfb0c12f9652ffe97b8fcf447cf
This commit is contained in:
KongQun Yang 2015-12-14 12:33:18 -08:00
parent 07378e806c
commit af7d6a7921
4 changed files with 98 additions and 135 deletions

View File

@ -93,6 +93,13 @@ void WebMClusterParser::Reset() {
ResetTextTracks();
}
void WebMClusterParser::Flush() {
// Estimate the duration of the last frame if necessary.
audio_.ApplyDurationEstimateIfNeeded();
video_.ApplyDurationEstimateIfNeeded();
Reset();
}
int WebMClusterParser::Parse(const uint8_t* buf, int size) {
int result = parser_.Parse(buf, size);
@ -103,9 +110,6 @@ int WebMClusterParser::Parse(const uint8_t* buf, int size) {
cluster_ended_ = parser_.IsParsingComplete();
if (cluster_ended_) {
audio_.ApplyDurationEstimateIfNeeded();
video_.ApplyDurationEstimateIfNeeded();
// If there were no buffers in this cluster, set the cluster start time to
// be the |cluster_timecode_|.
if (cluster_start_time_ == kNoTimestamp) {
@ -155,7 +159,7 @@ int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) {
static const uint8_t kTocConfigMask = 0xf8;
static const uint8_t kTocFrameCountCodeMask = 0x03;
static const uint8_t kFrameCountMask = 0x3f;
static const int64_t kPacketDurationMax = 120;
static const int64_t kPacketDurationMaxMs = 120000;
if (size < 1) {
LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
@ -209,14 +213,14 @@ int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) {
DCHECK_GT(frame_count, 0);
int64_t duration = kOpusFrameDurationsMu[opusConfig] * frame_count;
if (duration > kPacketDurationMax) {
if (duration > kPacketDurationMaxMs * 1000) {
// Intentionally allowing packet to pass through for now. Decoder should
// either handle or fail gracefully. LOG as breadcrumbs in case
// things go sideways.
LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
<< "Warning, demuxed Opus packet with encoded duration: "
<< duration << "ms. Should be no greater than "
<< kPacketDurationMax << "ms.";
<< duration / 1000 << "ms. Should be no greater than "
<< kPacketDurationMaxMs << "ms.";
}
return duration;
@ -496,13 +500,11 @@ bool WebMClusterParser::OnBlock(bool is_simple_block,
// TrackEntry->DefaultDuration when available. This layering violation is a
// workaround for http://crbug.com/396634, decreasing the likelihood of
// fall-back to rough estimation techniques for Blocks that lack a
// BlockDuration at the end of a cluster. Cross cluster durations are not
// feasible given flexibility of cluster ordering and MSE APIs. Duration
// estimation may still apply in cases of encryption and codecs for which
// we do not extract encoded duration. Within a cluster, estimates are applied
// as Block Timecode deltas, or once the whole cluster is parsed in the case
// of the last Block in the cluster. See Track::EmitBuffer and
// ApplyDurationEstimateIfNeeded().
// BlockDuration at the end of a cluster. Duration estimation may still apply
// in cases of encryption and codecs for which we do not extract encoded
// duration. Estimates are applied as Block Timecode deltas, or once the whole
// stream is parsed in the case of the last Block in the stream. See
// Track::EmitBuffer and ApplyDurationEstimateIfNeeded().
if (encoded_duration != kNoTimestamp) {
DCHECK(encoded_duration != kInfiniteDuration);
DCHECK(encoded_duration > 0);
@ -518,9 +520,9 @@ bool WebMClusterParser::OnBlock(bool is_simple_block,
const auto kWarnDurationDiff = timecode_multiplier_ * 2;
if (duration_difference > kWarnDurationDiff) {
LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
<< "BlockDuration (" << block_duration_time_delta
<< "BlockDuration (" << block_duration_time_delta / 1000
<< "ms) differs significantly from encoded duration ("
<< encoded_duration << "ms).";
<< encoded_duration / 1000 << "ms).";
}
}
} else if (block_duration_time_delta != kNoTimestamp) {
@ -589,16 +591,8 @@ void WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() {
int64_t estimated_duration = GetDurationEstimate();
last_added_buffer_missing_duration_->set_duration(estimated_duration);
if (is_video_) {
// Exposing estimation so splicing/overlap frame processing can make
// informed decisions downstream.
// TODO(kqyang): Should we wait for the next cluster to set the duration?
// last_added_buffer_missing_duration_->set_is_duration_estimated(true);
}
LIMITED_LOG(INFO, num_duration_estimates_, kMaxDurationEstimateLogs)
<< "Estimating WebM block duration to be "
<< estimated_duration
<< "Estimating WebM block duration to be " << estimated_duration / 1000
<< "ms for the last (Simple)Block in the Cluster for this Track. Use "
"BlockGroups with BlockDurations at the end of each Track in a "
"Cluster to avoid estimation.";
@ -653,25 +647,15 @@ bool WebMClusterParser::Track::EmitBufferHelp(
return false;
}
// The estimated frame duration is the minimum (for audio) or the maximum
// (for video) non-zero duration since the last initialization segment. The
// minimum is used for audio to ensure frame durations aren't overestimated,
// triggering unnecessary frame splicing. For video, splicing does not apply,
// so maximum is used and overlap is simply resolved by showing the
// later of the overlapping frames at its given PTS, effectively trimming down
// the over-estimated duration of the previous frame.
// TODO: Use max for audio and disable splicing whenever estimated buffers are
// encountered.
// The estimated frame duration is the maximum non-zero duration since the
// last initialization segment.
if (duration > 0) {
int64_t orig_duration_estimate = estimated_next_frame_duration_;
if (estimated_next_frame_duration_ == kNoTimestamp) {
estimated_next_frame_duration_ = duration;
} else if (is_video_) {
estimated_next_frame_duration_ =
std::max(duration, estimated_next_frame_duration_);
} else {
estimated_next_frame_duration_ =
std::min(duration, estimated_next_frame_duration_);
std::max(duration, estimated_next_frame_duration_);
}
if (orig_duration_estimate != estimated_next_frame_duration_) {

View File

@ -102,10 +102,8 @@ class WebMClusterParser : public WebMParserClient {
int64_t default_duration_;
// If kNoTimestamp, then a default value will be used. This estimate is the
// maximum (for video), or minimum (for audio) duration seen so far for this
// track, and is used only if |default_duration_| is kNoTimestamp.
// TODO: Use maximum for audio too, adding checks to disable splicing when
// these estimates are observed in SourceBufferStream.
// maximum duration seen so far for this track, and is used only if
// |default_duration_| is kNoTimestamp.
int64_t estimated_next_frame_duration_;
MediaParser::NewSampleCB new_sample_cb_;
@ -130,6 +128,10 @@ class WebMClusterParser : public WebMParserClient {
/// Resets the parser state so it can accept a new cluster.
void Reset();
/// Flush data currently in the parser and reset the parser so it can accept a
/// new cluster.
void Flush();
/// Parses a WebM cluster element in |buf|.
/// @return -1 if the parse fails.
/// @return 0 if more data is needed.

View File

@ -417,8 +417,8 @@ TEST_F(WebMClusterParserTest, TracksWithSampleMissingDuration) {
{kAudioTrackNum, 36, kTestAudioFrameDefaultDurationInMs, true, NULL, 0},
{kVideoTrackNum, 33, 33, true, NULL, 0},
{kAudioTrackNum, 70, kTestAudioFrameDefaultDurationInMs, true, NULL, 0},
{kVideoTrackNum, 66, kExpectedVideoEstimationInMs, true, NULL, 0},
{kAudioTrackNum, 83, kTestAudioFrameDefaultDurationInMs, true, NULL, 0},
{kVideoTrackNum, 66, kExpectedVideoEstimationInMs, true, NULL, 0},
};
const int kExpectedBuffersOnPartialCluster[] = {
0, // Video simple block without DefaultDuration should be held back
@ -429,46 +429,32 @@ TEST_F(WebMClusterParserTest, TracksWithSampleMissingDuration) {
5, // 3rd audio ready
6, // 2nd video emitted, 3rd video held back with no duration
7, // 4th audio ready
9, // Cluster end emits all buffers and 3rd video's duration is estimated
8, // 5th audio ready
};
ASSERT_EQ(arraysize(kBlockInfo), arraysize(kExpectedBuffersOnPartialCluster));
int block_count = arraysize(kBlockInfo);
// Iteratively create a cluster containing the first N+1 blocks and parse all
// but the last byte of the cluster (except when N==|block_count|, just parse
// the whole cluster). Verify that the corresponding entry in
// Iteratively create a cluster containing the first N+1 blocks and parse the
// cluster. Verify that the corresponding entry in
// |kExpectedBuffersOnPartialCluster| identifies the exact subset of
// |kBlockInfo| returned by the parser.
for (int i = 0; i < block_count; ++i) {
if (i > 0)
parser_->Reset();
// Since we don't know exactly the offsets of each block in the full
// cluster, build a cluster with exactly one additional block so that
// parse of all but one byte should deterministically parse all but the
// last full block. Don't |exceed block_count| blocks though.
int blocks_in_cluster = std::min(i + 2, block_count);
scoped_ptr<Cluster> cluster(CreateCluster(0, kBlockInfo,
blocks_in_cluster));
// Parse all but the last byte unless we need to parse the full cluster.
bool parse_full_cluster = i == (block_count - 1);
int result = parser_->Parse(cluster->data(), parse_full_cluster ?
cluster->size() : cluster->size() - 1);
if (parse_full_cluster) {
DVLOG(1) << "Verifying parse result of full cluster of "
<< blocks_in_cluster << " blocks";
EXPECT_EQ(cluster->size(), result);
} else {
DVLOG(1) << "Verifying parse result of cluster of "
<< blocks_in_cluster << " blocks with last block incomplete";
EXPECT_GT(cluster->size(), result);
EXPECT_LT(0, result);
}
const int blocks_in_cluster = i + 1;
scoped_ptr<Cluster> cluster(
CreateCluster(0, kBlockInfo, blocks_in_cluster));
EXPECT_EQ(cluster->size(),
parser_->Parse(cluster->data(), cluster->size()));
EXPECT_TRUE(
VerifyBuffers(kExpectedBlockInfo, kExpectedBuffersOnPartialCluster[i]));
}
// The last (3rd) video is emitted on flush with duration estimated.
parser_->Flush();
EXPECT_TRUE(VerifyBuffers(&kExpectedBlockInfo[block_count - 1], 1));
}
TEST_F(WebMClusterParserTest, Reset) {
@ -611,6 +597,7 @@ TEST_F(WebMClusterParserTest, IgnoredTracks) {
int result = parser_->Parse(cluster->data(), cluster->size());
EXPECT_EQ(cluster->size(), result);
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kOutputBlockInfo, output_block_count));
}
@ -640,6 +627,7 @@ TEST_F(WebMClusterParserTest, ParseTextTracks) {
int result = parser_->Parse(cluster->data(), cluster->size());
EXPECT_EQ(cluster->size(), result);
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kInputBlockInfo, input_block_count));
}
@ -718,6 +706,7 @@ TEST_F(WebMClusterParserTest, ParseEncryptedBlock) {
int result = parser_->Parse(cluster->data(), cluster->size());
EXPECT_EQ(cluster->size(), result);
parser_->Flush();
ASSERT_EQ(1UL, video_buffers_.size());
scoped_refptr<MediaSample> buffer = video_buffers_[0];
VerifyEncryptedBuffer(buffer);
@ -811,59 +800,51 @@ TEST_F(WebMClusterParserTest, ParseWithoutAnyDurationsSimpleBlocks) {
InSequence s;
// Absent DefaultDuration information, SimpleBlock durations are derived from
// inter-buffer track timestamp delta if within the cluster. Duration for the
// last block in a cluster is estimated independently for each track in the
// cluster. For video tracks we use the maximum seen so far. For audio we use
the minimum.
// TODO: Move audio over to use the maximum.
// inter-buffer track timestamp delta either within or across clusters.
// Duration for the last block is estimated independently for each track when
// Flush() is called. We use the maximum seen so far for estimation.
const int kExpectedAudioEstimationInMs = 22;
const int kExpectedVideoEstimationInMs = 34;
const BlockInfo kBlockInfo1[] = {
{kAudioTrackNum, 0, 23, true, NULL, 0},
{kAudioTrackNum, 23, 22, true, NULL, 0},
{kVideoTrackNum, 33, 33, true, NULL, 0},
{kAudioTrackNum, 45, 23, true, NULL, 0},
{kVideoTrackNum, 66, 34, true, NULL, 0},
{kAudioTrackNum, 68, kExpectedAudioEstimationInMs, true, NULL, 0},
{kVideoTrackNum, 100, kExpectedVideoEstimationInMs, true, NULL, 0},
{kAudioTrackNum, 68, 24, true, NULL, 0},
{kVideoTrackNum, 100, 35, true, NULL, 0},
};
int block_count1 = arraysize(kBlockInfo1);
scoped_ptr<Cluster> cluster1(CreateCluster(0, kBlockInfo1, block_count1));
// Send slightly less than the first full cluster so all but the last video
// block is parsed. Verify the last fully parsed audio and video buffer are
// both missing from the result (parser should hold them aside for duration
// estimation prior to end of cluster detection in the absence of
// DefaultDurations.)
int result = parser_->Parse(cluster1->data(), cluster1->size() - 1);
EXPECT_GT(result, 0);
EXPECT_LT(result, cluster1->size());
// Verify the last fully parsed audio and video buffer are both missing from
// the result (parser should hold them aside for duration estimation until
// Flush() called in the absence of DefaultDurations).
EXPECT_EQ(cluster1->size(),
parser_->Parse(cluster1->data(), cluster1->size()));
EXPECT_EQ(3UL, audio_buffers_.size());
EXPECT_EQ(1UL, video_buffers_.size());
ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 3));
parser_->Reset();
// Now parse the full first cluster and verify all the blocks are parsed.
result = parser_->Parse(cluster1->data(), cluster1->size());
EXPECT_EQ(cluster1->size(), result);
ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1));
EXPECT_EQ(2UL, video_buffers_.size());
ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 2));
// Verify that the estimated frame duration is tracked across clusters for
// each track.
const int kExpectedAudioEstimationInMs = 24;
const int kExpectedVideoEstimationInMs = 35;
const BlockInfo kBlockInfo2[] = {
// Estimate carries over across clusters
{kAudioTrackNum, 200, kExpectedAudioEstimationInMs, true, NULL, 0},
// Estimate carries over across clusters
{kVideoTrackNum, 201, kExpectedVideoEstimationInMs, true, NULL, 0},
{kAudioTrackNum, 92, kExpectedAudioEstimationInMs, true, NULL, 0},
{kVideoTrackNum, 135, kExpectedVideoEstimationInMs, true, NULL, 0},
};
int block_count2 = arraysize(kBlockInfo2);
scoped_ptr<Cluster> cluster2(CreateCluster(0, kBlockInfo2, block_count2));
result = parser_->Parse(cluster2->data(), cluster2->size());
EXPECT_EQ(cluster2->size(), result);
EXPECT_EQ(cluster2->size(),
parser_->Parse(cluster2->data(), cluster2->size()));
// Verify that remaining blocks of cluster1 are emitted.
ASSERT_TRUE(VerifyBuffers(&kBlockInfo1[block_count1 - 2], 2));
// Now flush and verify blocks in cluster2 are emitted.
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kBlockInfo2, block_count2));
}
@ -871,57 +852,51 @@ TEST_F(WebMClusterParserTest, ParseWithoutAnyDurationsBlockGroups) {
InSequence s;
// Absent DefaultDuration and BlockDuration information, BlockGroup block
// durations are derived from inter-buffer track timestamp delta if within the
// cluster. Duration for the last block in a cluster is estimated
// independently for each track in the cluster. For video tracks we use the
maximum seen so far. For audio we use the minimum.
// TODO: Move audio over to use the maximum.
// durations are derived from inter-buffer track timestamp delta either within
// or across clusters. Duration for the last block is estimated independently
// for each track when Flush() is called. We use the maximum seen so far.
const int kExpectedAudioEstimationInMs = 22;
const int kExpectedVideoEstimationInMs = 34;
const BlockInfo kBlockInfo1[] = {
{kAudioTrackNum, 0, -23, false, NULL, 0},
{kAudioTrackNum, 23, -22, false, NULL, 0},
{kVideoTrackNum, 33, -33, false, NULL, 0},
{kAudioTrackNum, 45, -23, false, NULL, 0},
{kVideoTrackNum, 66, -34, false, NULL, 0},
{kAudioTrackNum, 68, -kExpectedAudioEstimationInMs, false, NULL, 0},
{kVideoTrackNum, 100, -kExpectedVideoEstimationInMs, false, NULL, 0},
{kAudioTrackNum, 68, -24, false, NULL, 0},
{kVideoTrackNum, 100, -35, false, NULL, 0},
};
int block_count1 = arraysize(kBlockInfo1);
scoped_ptr<Cluster> cluster1(CreateCluster(0, kBlockInfo1, block_count1));
// Send slightly less than the first full cluster so all but the last video
// block is parsed. Verify the last fully parsed audio and video buffer are
// both missing from the result (parser should hold them aside for duration
// estimation prior to end of cluster detection in the absence of
// DefaultDurations.)
int result = parser_->Parse(cluster1->data(), cluster1->size() - 1);
EXPECT_GT(result, 0);
EXPECT_LT(result, cluster1->size());
// Verify the last fully parsed audio and video buffer are both missing from
// the result (parser should hold them aside for duration estimation until
// Flush() called in the absence of DefaultDurations).
EXPECT_EQ(cluster1->size(),
parser_->Parse(cluster1->data(), cluster1->size()));
EXPECT_EQ(3UL, audio_buffers_.size());
EXPECT_EQ(1UL, video_buffers_.size());
ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 3));
parser_->Reset();
// Now parse the full first cluster and verify all the blocks are parsed.
result = parser_->Parse(cluster1->data(), cluster1->size());
EXPECT_EQ(cluster1->size(), result);
ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1));
EXPECT_EQ(2UL, video_buffers_.size());
ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 2));
// Verify that the estimated frame duration is tracked across clusters for
// each track.
const int kExpectedAudioEstimationInMs = 24;
const int kExpectedVideoEstimationInMs = 35;
const BlockInfo kBlockInfo2[] = {
{kAudioTrackNum, 200, -kExpectedAudioEstimationInMs, false, NULL, 0},
{kVideoTrackNum, 201, -kExpectedVideoEstimationInMs, false, NULL, 0},
{kAudioTrackNum, 92, -kExpectedAudioEstimationInMs, false, NULL, 0},
{kVideoTrackNum, 135, -kExpectedVideoEstimationInMs, false, NULL, 0},
};
int block_count2 = arraysize(kBlockInfo2);
scoped_ptr<Cluster> cluster2(CreateCluster(0, kBlockInfo2, block_count2));
result = parser_->Parse(cluster2->data(), cluster2->size());
EXPECT_EQ(cluster2->size(), result);
EXPECT_EQ(cluster2->size(),
parser_->Parse(cluster2->data(), cluster2->size()));
// Verify that remaining blocks of cluster1 are emitted.
ASSERT_TRUE(VerifyBuffers(&kBlockInfo1[block_count1 - 2], 2));
// Now flush and verify blocks in cluster2 are emitted.
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kBlockInfo2, block_count2));
}
@ -958,13 +933,13 @@ TEST_F(WebMClusterParserTest,
int result = parser_->Parse(cluster->data(), cluster->size() - 1);
EXPECT_GT(result, 0);
EXPECT_LT(result, cluster->size());
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count - 1));
parser_->Reset();
// Now parse a whole cluster to verify that all the blocks will get parsed.
result = parser_->Parse(cluster->data(), cluster->size());
EXPECT_EQ(cluster->size(), result);
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count));
}
@ -988,6 +963,7 @@ TEST_F(WebMClusterParserTest,
scoped_ptr<Cluster> cluster(CreateCluster(0, kBlockInfo, block_count));
int result = parser_->Parse(cluster->data(), cluster->size());
EXPECT_EQ(cluster->size(), result);
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count));
}
@ -1004,6 +980,7 @@ TEST_F(WebMClusterParserTest,
scoped_ptr<Cluster> cluster(CreateCluster(0, kBlockInfo, block_count));
int result = parser_->Parse(cluster->data(), cluster->size());
EXPECT_EQ(cluster->size(), result);
parser_->Flush();
ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count));
}

View File

@ -44,7 +44,7 @@ void WebMMediaParser::Flush() {
byte_queue_.Reset();
if (cluster_parser_)
cluster_parser_->Reset();
cluster_parser_->Flush();
if (state_ == kParsingClusters) {
ChangeState(kParsingHeaders);
}