Estimate duration of last sample in cluster from next cluster

Change-Id: I7dbc4045d366bbfb0c12f9652ffe97b8fcf447cf
parent 07378e806c
commit af7d6a7921
@@ -93,6 +93,13 @@ void WebMClusterParser::Reset() {
   ResetTextTracks();
 }
 
+void WebMClusterParser::Flush() {
+  // Estimate the duration of the last frame if necessary.
+  audio_.ApplyDurationEstimateIfNeeded();
+  video_.ApplyDurationEstimateIfNeeded();
+  Reset();
+}
+
 int WebMClusterParser::Parse(const uint8_t* buf, int size) {
   int result = parser_.Parse(buf, size);
 
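The caller-facing effect of the new Flush() is easiest to see as a small driver loop. The sketch below is illustrative only and not code from this change; the helper name and loop structure are assumptions, while Parse() and Flush() are the entry points added or used in this diff (Parse returns -1 on failure and 0 when it needs more data, per the header comments further down).

    #include <stdint.h>

    // Hypothetical driver: feed cluster data to Parse(), then call Flush()
    // once the stream ends so the held-back last sample gets an estimated
    // duration. Assumes the real WebMClusterParser header is available.
    void DemuxWebMClusters(WebMClusterParser* parser,
                           const uint8_t* data, int size) {
      int offset = 0;
      while (offset < size) {
        int bytes = parser->Parse(data + offset, size - offset);
        if (bytes < 0)
          return;  // Parse error.
        if (bytes == 0)
          break;   // The parser needs more data.
        offset += bytes;
      }
      // With this change, the final sample is emitted here, not at cluster end.
      parser->Flush();
    }

The unit-test changes below follow the same pattern: parser_->Parse(cluster->data(), cluster->size()) followed by parser_->Flush() before verifying buffers.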
@@ -103,9 +110,6 @@ int WebMClusterParser::Parse(const uint8_t* buf, int size) {
 
   cluster_ended_ = parser_.IsParsingComplete();
   if (cluster_ended_) {
-    audio_.ApplyDurationEstimateIfNeeded();
-    video_.ApplyDurationEstimateIfNeeded();
-
     // If there were no buffers in this cluster, set the cluster start time to
     // be the |cluster_timecode_|.
     if (cluster_start_time_ == kNoTimestamp) {
@@ -155,7 +159,7 @@ int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) {
   static const uint8_t kTocConfigMask = 0xf8;
   static const uint8_t kTocFrameCountCodeMask = 0x03;
   static const uint8_t kFrameCountMask = 0x3f;
-  static const int64_t kPacketDurationMax = 120;
+  static const int64_t kPacketDurationMaxMs = 120000;
 
   if (size < 1) {
     LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
@@ -209,14 +213,14 @@ int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) {
   DCHECK_GT(frame_count, 0);
   int64_t duration = kOpusFrameDurationsMu[opusConfig] * frame_count;
 
-  if (duration > kPacketDurationMax) {
+  if (duration > kPacketDurationMaxMs * 1000) {
     // Intentionally allowing packet to pass through for now. Decoder should
     // either handle or fail gracefully. LOG as breadcrumbs in case
     // things go sideways.
     LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
         << "Warning, demuxed Opus packet with encoded duration: "
-        << duration << "ms. Should be no greater than "
-        << kPacketDurationMax << "ms.";
+        << duration / 1000 << "ms. Should be no greater than "
+        << kPacketDurationMaxMs << "ms.";
   }
 
   return duration;
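For context on the constants above: ReadOpusDuration derives a packet's duration from the Opus TOC byte (RFC 6716), and the new / 1000 factors suggest durations here are carried in microseconds while the warnings report milliseconds. The sketch below is a standalone illustration of that TOC decoding; the function name and the duration table are assumptions mirroring kOpusFrameDurationsMu, not code copied from this change.

    #include <stdint.h>

    // Standard Opus frame durations per TOC config (RFC 6716), in microseconds.
    static const int64_t kFrameDurationsMu[] = {
        10000, 20000, 40000, 60000,   // SILK NB
        10000, 20000, 40000, 60000,   // SILK MB
        10000, 20000, 40000, 60000,   // SILK WB
        10000, 20000,                 // Hybrid SWB
        10000, 20000,                 // Hybrid FB
        2500,  5000,  10000, 20000,   // CELT NB
        2500,  5000,  10000, 20000,   // CELT WB
        2500,  5000,  10000, 20000,   // CELT SWB
        2500,  5000,  10000, 20000};  // CELT FB

    // Returns the packet duration in microseconds, or -1 for malformed input.
    int64_t OpusPacketDurationMu(const uint8_t* data, int size) {
      if (size < 1)
        return -1;
      const int config = data[0] >> 3;              // Top 5 bits (0xf8 mask).
      const int frame_count_code = data[0] & 0x03;  // Bottom 2 bits.
      int frame_count = 0;
      switch (frame_count_code) {
        case 0: frame_count = 1; break;
        case 1:
        case 2: frame_count = 2; break;
        case 3:
          if (size < 2)
            return -1;
          frame_count = data[1] & 0x3f;             // Frame count byte (0x3f mask).
          break;
      }
      if (frame_count == 0)
        return -1;
      return kFrameDurationsMu[config] * frame_count;
    }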
@@ -496,13 +500,11 @@ bool WebMClusterParser::OnBlock(bool is_simple_block,
   // TrackEntry->DefaultDuration when available. This layering violation is a
   // workaround for http://crbug.com/396634, decreasing the likelihood of
   // fall-back to rough estimation techniques for Blocks that lack a
-  // BlockDuration at the end of a cluster. Cross cluster durations are not
-  // feasible given flexibility of cluster ordering and MSE APIs. Duration
-  // estimation may still apply in cases of encryption and codecs for which
-  // we do not extract encoded duration. Within a cluster, estimates are applied
-  // as Block Timecode deltas, or once the whole cluster is parsed in the case
-  // of the last Block in the cluster. See Track::EmitBuffer and
-  // ApplyDurationEstimateIfNeeded().
+  // BlockDuration at the end of a cluster. Duration estimation may still apply
+  // in cases of encryption and codecs for which we do not extract encoded
+  // duration. Estimates are applied as Block Timecode deltas, or once the whole
+  // stream is parsed in the case of the last Block in the stream. See
+  // Track::EmitBuffer and ApplyDurationEstimateIfNeeded().
   if (encoded_duration != kNoTimestamp) {
     DCHECK(encoded_duration != kInfiniteDuration);
     DCHECK(encoded_duration > 0);
@@ -518,9 +520,9 @@ bool WebMClusterParser::OnBlock(bool is_simple_block,
       const auto kWarnDurationDiff = timecode_multiplier_ * 2;
       if (duration_difference > kWarnDurationDiff) {
         LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
-            << "BlockDuration (" << block_duration_time_delta
+            << "BlockDuration (" << block_duration_time_delta / 1000
             << "ms) differs significantly from encoded duration ("
-            << encoded_duration << "ms).";
+            << encoded_duration / 1000 << "ms).";
       }
     }
   } else if (block_duration_time_delta != kNoTimestamp) {
@@ -589,16 +591,8 @@ void WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() {
   int64_t estimated_duration = GetDurationEstimate();
   last_added_buffer_missing_duration_->set_duration(estimated_duration);
 
-  if (is_video_) {
-    // Exposing estimation so splicing/overlap frame processing can make
-    // informed decisions downstream.
-    // TODO(kqyang): Should we wait for the next cluster to set the duration?
-    // last_added_buffer_missing_duration_->set_is_duration_estimated(true);
-  }
-
   LIMITED_LOG(INFO, num_duration_estimates_, kMaxDurationEstimateLogs)
-      << "Estimating WebM block duration to be "
-      << estimated_duration
+      << "Estimating WebM block duration to be " << estimated_duration / 1000
       << "ms for the last (Simple)Block in the Cluster for this Track. Use "
          "BlockGroups with BlockDurations at the end of each Track in a "
          "Cluster to avoid estimation.";
@@ -653,25 +647,15 @@ bool WebMClusterParser::Track::EmitBufferHelp(
     return false;
   }
 
-  // The estimated frame duration is the minimum (for audio) or the maximum
-  // (for video) non-zero duration since the last initialization segment. The
-  // minimum is used for audio to ensure frame durations aren't overestimated,
-  // triggering unnecessary frame splicing. For video, splicing does not apply,
-  // so maximum is used and overlap is simply resolved by showing the
-  // later of the overlapping frames at its given PTS, effectively trimming down
-  // the over-estimated duration of the previous frame.
-  // TODO: Use max for audio and disable splicing whenever estimated buffers are
-  // encountered.
+  // The estimated frame duration is the maximum non-zero duration since the
+  // last initialization segment.
   if (duration > 0) {
     int64_t orig_duration_estimate = estimated_next_frame_duration_;
     if (estimated_next_frame_duration_ == kNoTimestamp) {
       estimated_next_frame_duration_ = duration;
-    } else if (is_video_) {
-      estimated_next_frame_duration_ =
-          std::max(duration, estimated_next_frame_duration_);
     } else {
       estimated_next_frame_duration_ =
-          std::min(duration, estimated_next_frame_duration_);
+          std::max(duration, estimated_next_frame_duration_);
     }
 
     if (orig_duration_estimate != estimated_next_frame_duration_) {
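With the is_video_ branch gone, the estimation rule is the same for both tracks: keep the largest non-zero duration seen since the last initialization segment and apply it to the buffer whose duration is unknown. A minimal standalone sketch of that rule follows; the function name and the kUnknown sentinel are illustrative stand-ins for the real members and kNoTimestamp.

    #include <algorithm>
    #include <stdint.h>

    const int64_t kUnknown = -1;  // Stand-in for kNoTimestamp.

    // Mirrors the post-change logic in Track::EmitBufferHelp: the running
    // estimate is the maximum non-zero duration observed so far.
    int64_t UpdateDurationEstimate(int64_t current_estimate, int64_t duration) {
      if (duration <= 0)
        return current_estimate;  // Ignore unknown or zero durations.
      if (current_estimate == kUnknown)
        return duration;          // First observation seeds the estimate.
      return std::max(duration, current_estimate);
    }

Before this change, audio used std::min with a splicing rationale spelled out in the deleted comment; the change drops that distinction and both tracks now use the maximum.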
@@ -102,10 +102,8 @@ class WebMClusterParser : public WebMParserClient {
     int64_t default_duration_;
 
     // If kNoTimestamp, then a default value will be used. This estimate is the
-    // maximum (for video), or minimum (for audio) duration seen so far for this
-    // track, and is used only if |default_duration_| is kNoTimestamp.
-    // TODO: Use maximum for audio too, adding checks to disable splicing when
-    // these estimates are observed in SourceBufferStream.
+    // maximum duration seen so far for this track, and is used only if
+    // |default_duration_| is kNoTimestamp.
     int64_t estimated_next_frame_duration_;
 
     MediaParser::NewSampleCB new_sample_cb_;
@@ -130,6 +128,10 @@ class WebMClusterParser : public WebMParserClient {
   /// Resets the parser state so it can accept a new cluster.
   void Reset();
 
+  /// Flush data currently in the parser and reset the parser so it can accept a
+  /// new cluster.
+  void Flush();
+
   /// Parses a WebM cluster element in |buf|.
   /// @return -1 if the parse fails.
   /// @return 0 if more data is needed.
@@ -417,8 +417,8 @@ TEST_F(WebMClusterParserTest, TracksWithSampleMissingDuration) {
       {kAudioTrackNum, 36, kTestAudioFrameDefaultDurationInMs, true, NULL, 0},
       {kVideoTrackNum, 33, 33, true, NULL, 0},
       {kAudioTrackNum, 70, kTestAudioFrameDefaultDurationInMs, true, NULL, 0},
-      {kVideoTrackNum, 66, kExpectedVideoEstimationInMs, true, NULL, 0},
       {kAudioTrackNum, 83, kTestAudioFrameDefaultDurationInMs, true, NULL, 0},
+      {kVideoTrackNum, 66, kExpectedVideoEstimationInMs, true, NULL, 0},
   };
   const int kExpectedBuffersOnPartialCluster[] = {
       0,  // Video simple block without DefaultDuration should be held back
@@ -429,46 +429,32 @@ TEST_F(WebMClusterParserTest, TracksWithSampleMissingDuration) {
      5,  // 3rd audio ready
      6,  // 2nd video emitted, 3rd video held back with no duration
      7,  // 4th audio ready
-     9,  // Cluster end emits all buffers and 3rd video's duration is estimated
+     8,  // 5th audio ready
  };
 
  ASSERT_EQ(arraysize(kBlockInfo), arraysize(kExpectedBuffersOnPartialCluster));
  int block_count = arraysize(kBlockInfo);
 
-  // Iteratively create a cluster containing the first N+1 blocks and parse all
-  // but the last byte of the cluster (except when N==|block_count|, just parse
-  // the whole cluster). Verify that the corresponding entry in
+  // Iteratively create a cluster containing the first N+1 blocks and parse the
+  // cluster. Verify that the corresponding entry in
   // |kExpectedBuffersOnPartialCluster| identifies the exact subset of
   // |kBlockInfo| returned by the parser.
   for (int i = 0; i < block_count; ++i) {
-    if (i > 0)
     parser_->Reset();
-    // Since we don't know exactly the offsets of each block in the full
-    // cluster, build a cluster with exactly one additional block so that
-    // parse of all but one byte should deterministically parse all but the
-    // last full block. Don't exceed |block_count| blocks though.
-    int blocks_in_cluster = std::min(i + 2, block_count);
-    scoped_ptr<Cluster> cluster(CreateCluster(0, kBlockInfo,
-                                              blocks_in_cluster));
-    // Parse all but the last byte unless we need to parse the full cluster.
-    bool parse_full_cluster = i == (block_count - 1);
-
-    int result = parser_->Parse(cluster->data(), parse_full_cluster ?
-                                cluster->size() : cluster->size() - 1);
-    if (parse_full_cluster) {
-      DVLOG(1) << "Verifying parse result of full cluster of "
-               << blocks_in_cluster << " blocks";
-      EXPECT_EQ(cluster->size(), result);
-    } else {
-      DVLOG(1) << "Verifying parse result of cluster of "
-               << blocks_in_cluster << " blocks with last block incomplete";
-      EXPECT_GT(cluster->size(), result);
-      EXPECT_LT(0, result);
-    }
-
+    const int blocks_in_cluster = i + 1;
+    scoped_ptr<Cluster> cluster(
+        CreateCluster(0, kBlockInfo, blocks_in_cluster));
+
+    EXPECT_EQ(cluster->size(),
+              parser_->Parse(cluster->data(), cluster->size()));
     EXPECT_TRUE(
         VerifyBuffers(kExpectedBlockInfo, kExpectedBuffersOnPartialCluster[i]));
   }
+
+  // The last (3rd) video is emitted on flush with duration estimated.
+  parser_->Flush();
+  EXPECT_TRUE(VerifyBuffers(&kExpectedBlockInfo[block_count - 1], 1));
 }
 
 TEST_F(WebMClusterParserTest, Reset) {
@@ -611,6 +597,7 @@ TEST_F(WebMClusterParserTest, IgnoredTracks) {
 
   int result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kOutputBlockInfo, output_block_count));
 }
 
@@ -640,6 +627,7 @@ TEST_F(WebMClusterParserTest, ParseTextTracks) {
 
   int result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kInputBlockInfo, input_block_count));
 }
 
@@ -718,6 +706,7 @@ TEST_F(WebMClusterParserTest, ParseEncryptedBlock) {
 
   int result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
+  parser_->Flush();
   ASSERT_EQ(1UL, video_buffers_.size());
   scoped_refptr<MediaSample> buffer = video_buffers_[0];
   VerifyEncryptedBuffer(buffer);
@@ -811,59 +800,51 @@ TEST_F(WebMClusterParserTest, ParseWithoutAnyDurationsSimpleBlocks) {
   InSequence s;
 
   // Absent DefaultDuration information, SimpleBlock durations are derived from
-  // inter-buffer track timestamp delta if within the cluster. Duration for the
-  // last block in a cluster is estimated independently for each track in the
-  // cluster. For video tracks we use the maximum seen so far. For audio we use
-  // the the minimum.
-  // TODO: Move audio over to use the maximum.
+  // inter-buffer track timestamp delta either within or across clusters.
+  // Duration for the last block is estimated independently for each track when
+  // Flush() is called. We use the maximum seen so far for estimation.
 
-  const int kExpectedAudioEstimationInMs = 22;
-  const int kExpectedVideoEstimationInMs = 34;
   const BlockInfo kBlockInfo1[] = {
       {kAudioTrackNum, 0, 23, true, NULL, 0},
       {kAudioTrackNum, 23, 22, true, NULL, 0},
       {kVideoTrackNum, 33, 33, true, NULL, 0},
       {kAudioTrackNum, 45, 23, true, NULL, 0},
       {kVideoTrackNum, 66, 34, true, NULL, 0},
-      {kAudioTrackNum, 68, kExpectedAudioEstimationInMs, true, NULL, 0},
-      {kVideoTrackNum, 100, kExpectedVideoEstimationInMs, true, NULL, 0},
+      {kAudioTrackNum, 68, 24, true, NULL, 0},
+      {kVideoTrackNum, 100, 35, true, NULL, 0},
   };
 
   int block_count1 = arraysize(kBlockInfo1);
   scoped_ptr<Cluster> cluster1(CreateCluster(0, kBlockInfo1, block_count1));
 
-  // Send slightly less than the first full cluster so all but the last video
-  // block is parsed. Verify the last fully parsed audio and video buffer are
-  // both missing from the result (parser should hold them aside for duration
-  // estimation prior to end of cluster detection in the absence of
-  // DefaultDurations.)
-  int result = parser_->Parse(cluster1->data(), cluster1->size() - 1);
-  EXPECT_GT(result, 0);
-  EXPECT_LT(result, cluster1->size());
+  // Verify the last fully parsed audio and video buffer are both missing from
+  // the result (parser should hold them aside for duration estimation until
+  // Flush() called in the absence of DefaultDurations).
+  EXPECT_EQ(cluster1->size(),
+            parser_->Parse(cluster1->data(), cluster1->size()));
   EXPECT_EQ(3UL, audio_buffers_.size());
-  EXPECT_EQ(1UL, video_buffers_.size());
-  ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 3));
-
-  parser_->Reset();
-
-  // Now parse the full first cluster and verify all the blocks are parsed.
-  result = parser_->Parse(cluster1->data(), cluster1->size());
-  EXPECT_EQ(cluster1->size(), result);
-  ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1));
+  EXPECT_EQ(2UL, video_buffers_.size());
+  ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 2));
 
   // Verify that the estimated frame duration is tracked across clusters for
   // each track.
+  const int kExpectedAudioEstimationInMs = 24;
+  const int kExpectedVideoEstimationInMs = 35;
   const BlockInfo kBlockInfo2[] = {
-      // Estimate carries over across clusters
-      {kAudioTrackNum, 200, kExpectedAudioEstimationInMs, true, NULL, 0},
-      // Estimate carries over across clusters
-      {kVideoTrackNum, 201, kExpectedVideoEstimationInMs, true, NULL, 0},
+      {kAudioTrackNum, 92, kExpectedAudioEstimationInMs, true, NULL, 0},
+      {kVideoTrackNum, 135, kExpectedVideoEstimationInMs, true, NULL, 0},
   };
 
   int block_count2 = arraysize(kBlockInfo2);
   scoped_ptr<Cluster> cluster2(CreateCluster(0, kBlockInfo2, block_count2));
-  result = parser_->Parse(cluster2->data(), cluster2->size());
-  EXPECT_EQ(cluster2->size(), result);
+  EXPECT_EQ(cluster2->size(),
+            parser_->Parse(cluster2->data(), cluster2->size()));
+
+  // Verify that remaining blocks of cluster1 are emitted.
+  ASSERT_TRUE(VerifyBuffers(&kBlockInfo1[block_count1 - 2], 2));
+
+  // Now flush and verify blocks in cluster2 are emitted.
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kBlockInfo2, block_count2));
 }
 
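As a cross-check, the new expected values follow directly from the block timestamps above (arithmetic derived from the test data, not text in the change): audio blocks arrive at 0, 23, 45 and 68 ms in cluster 1 and at 92 ms in cluster 2, giving successive deltas of 23, 22, 23 and 24 ms, so the held-back final audio block receives the running maximum of 24 ms (kExpectedAudioEstimationInMs = 24); video blocks at 33, 66, 100 and 135 ms give deltas of 33, 34 and 35 ms, so the final video block receives 35 ms (kExpectedVideoEstimationInMs = 35).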
@@ -871,57 +852,51 @@ TEST_F(WebMClusterParserTest, ParseWithoutAnyDurationsBlockGroups) {
   InSequence s;
 
   // Absent DefaultDuration and BlockDuration information, BlockGroup block
-  // durations are derived from inter-buffer track timestamp delta if within the
-  // cluster. Duration for the last block in a cluster is estimated
-  // independently for each track in the cluster. For video tracks we use the
-  // maximum seen so far. For audio we use the the minimum.
-  // TODO: Move audio over to use the maximum.
+  // durations are derived from inter-buffer track timestamp delta either within
+  // or across clusters. Duration for the last block is estimated independently
+  // for each track when Flush() is called. We use the maximum seen so far.
 
-  const int kExpectedAudioEstimationInMs = 22;
-  const int kExpectedVideoEstimationInMs = 34;
   const BlockInfo kBlockInfo1[] = {
       {kAudioTrackNum, 0, -23, false, NULL, 0},
       {kAudioTrackNum, 23, -22, false, NULL, 0},
       {kVideoTrackNum, 33, -33, false, NULL, 0},
       {kAudioTrackNum, 45, -23, false, NULL, 0},
       {kVideoTrackNum, 66, -34, false, NULL, 0},
-      {kAudioTrackNum, 68, -kExpectedAudioEstimationInMs, false, NULL, 0},
-      {kVideoTrackNum, 100, -kExpectedVideoEstimationInMs, false, NULL, 0},
+      {kAudioTrackNum, 68, -24, false, NULL, 0},
+      {kVideoTrackNum, 100, -35, false, NULL, 0},
   };
 
   int block_count1 = arraysize(kBlockInfo1);
   scoped_ptr<Cluster> cluster1(CreateCluster(0, kBlockInfo1, block_count1));
 
-  // Send slightly less than the first full cluster so all but the last video
-  // block is parsed. Verify the last fully parsed audio and video buffer are
-  // both missing from the result (parser should hold them aside for duration
-  // estimation prior to end of cluster detection in the absence of
-  // DefaultDurations.)
-  int result = parser_->Parse(cluster1->data(), cluster1->size() - 1);
-  EXPECT_GT(result, 0);
-  EXPECT_LT(result, cluster1->size());
+  // Verify the last fully parsed audio and video buffer are both missing from
+  // the result (parser should hold them aside for duration estimation until
+  // Flush() called in the absence of DefaultDurations).
+  EXPECT_EQ(cluster1->size(),
+            parser_->Parse(cluster1->data(), cluster1->size()));
   EXPECT_EQ(3UL, audio_buffers_.size());
-  EXPECT_EQ(1UL, video_buffers_.size());
-  ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 3));
-
-  parser_->Reset();
-
-  // Now parse the full first cluster and verify all the blocks are parsed.
-  result = parser_->Parse(cluster1->data(), cluster1->size());
-  EXPECT_EQ(cluster1->size(), result);
-  ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1));
+  EXPECT_EQ(2UL, video_buffers_.size());
+  ASSERT_TRUE(VerifyBuffers(kBlockInfo1, block_count1 - 2));
 
   // Verify that the estimated frame duration is tracked across clusters for
   // each track.
+  const int kExpectedAudioEstimationInMs = 24;
+  const int kExpectedVideoEstimationInMs = 35;
   const BlockInfo kBlockInfo2[] = {
-      {kAudioTrackNum, 200, -kExpectedAudioEstimationInMs, false, NULL, 0},
-      {kVideoTrackNum, 201, -kExpectedVideoEstimationInMs, false, NULL, 0},
+      {kAudioTrackNum, 92, -kExpectedAudioEstimationInMs, false, NULL, 0},
+      {kVideoTrackNum, 135, -kExpectedVideoEstimationInMs, false, NULL, 0},
   };
 
   int block_count2 = arraysize(kBlockInfo2);
   scoped_ptr<Cluster> cluster2(CreateCluster(0, kBlockInfo2, block_count2));
-  result = parser_->Parse(cluster2->data(), cluster2->size());
-  EXPECT_EQ(cluster2->size(), result);
+  EXPECT_EQ(cluster2->size(),
+            parser_->Parse(cluster2->data(), cluster2->size()));
+
+  // Verify that remaining blocks of cluster1 are emitted.
+  ASSERT_TRUE(VerifyBuffers(&kBlockInfo1[block_count1 - 2], 2));
+
+  // Now flush and verify blocks in cluster2 are emitted.
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kBlockInfo2, block_count2));
 }
 
@@ -958,13 +933,13 @@ TEST_F(WebMClusterParserTest,
   int result = parser_->Parse(cluster->data(), cluster->size() - 1);
   EXPECT_GT(result, 0);
   EXPECT_LT(result, cluster->size());
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count - 1));
 
-  parser_->Reset();
-
   // Now parse a whole cluster to verify that all the blocks will get parsed.
   result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count));
 }
 
@@ -988,6 +963,7 @@ TEST_F(WebMClusterParserTest,
   scoped_ptr<Cluster> cluster(CreateCluster(0, kBlockInfo, block_count));
   int result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count));
 }
 
@@ -1004,6 +980,7 @@ TEST_F(WebMClusterParserTest,
   scoped_ptr<Cluster> cluster(CreateCluster(0, kBlockInfo, block_count));
   int result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
+  parser_->Flush();
   ASSERT_TRUE(VerifyBuffers(kBlockInfo, block_count));
 }
 
@@ -44,7 +44,7 @@ void WebMMediaParser::Flush() {
 
   byte_queue_.Reset();
   if (cluster_parser_)
-    cluster_parser_->Reset();
+    cluster_parser_->Flush();
   if (state_ == kParsingClusters) {
     ChangeState(kParsingHeaders);
   }