Make WebVttMediaParser use WebVttSampleConverter

- WebVttMediaParser uses WebVttSampleConverter to generate non-overlapping media samples. - The media samples contain ISO BMFF boxes. - Add kCodecWebVtt to signal that the media is WebVTT and that the samples will be in ISO BMFF boxes. Change-Id: I639902cdba7b04af75428bc20622e26b8203cfb2
2017-02-14 13:40:09 -08:00 · 2017-02-14 13:40:09 -08:00 · a3ce51785a
parent 924d6d4693
commit a3ce51785a
12 changed files with 273 additions and 288 deletions
--- a/packager/media/base/stream_info.h
+++ b/packager/media/base/stream_info.h
@ -51,6 +51,7 @@ enum Codec {
  kCodecAudioMaxPlusOne,

  kCodecText = 300,
+  kCodecWebVtt = kCodecText,
 };

 /// Abstract class holds stream information.
--- a/packager/media/base/text_stream_info.cc
+++ b/packager/media/base/text_stream_info.cc
@ -11,10 +11,11 @@ namespace media {

 TextStreamInfo::TextStreamInfo(int track_id, uint32_t time_scale,
                               uint64_t duration,
+                               Codec codec,
                               const std::string& codec_string,
                               const std::string& codec_config, uint16_t width,
                               uint16_t height, const std::string& language)
-    : StreamInfo(kStreamText, track_id, time_scale, duration, kCodecText,
+    : StreamInfo(kStreamText, track_id, time_scale, duration, codec,
                 codec_string,
                 reinterpret_cast<const uint8_t*>(codec_config.data()),
                 codec_config.size(), language, false),
--- a/packager/media/base/text_stream_info.h
+++ b/packager/media/base/text_stream_info.h
@ -20,7 +20,8 @@ class TextStreamInfo : public StreamInfo {
  /// @param track_id is the track ID of this stream.
  /// @param time_scale is the time scale of this stream.
  /// @param duration is the duration of this stream.
-  /// @param codec_string is the codec.
+  /// @param codec is the media codec.
+  /// @param codec_string is the codec in string format.
  /// @param codec_config is configuration for this text stream. This could be
  ///        the metadata that applies to all the samples of this stream. This
  ///        may be empty.
@ -28,6 +29,7 @@ class TextStreamInfo : public StreamInfo {
  /// @param height of the text. This may be 0.
  /// @param language is the language of this stream. This may be empty.
  TextStreamInfo(int track_id, uint32_t time_scale, uint64_t duration,
+                 Codec codec,
                 const std::string& codec_string,
                 const std::string& codec_config, uint16_t width,
                 uint16_t height, const std::string& language);
--- a/packager/media/formats/mp2t/pes_packet_generator_unittest.cc
+++ b/packager/media/formats/mp2t/pes_packet_generator_unittest.cc
@ -247,9 +247,9 @@ TEST_F(PesPacketGeneratorTest, InitializeAudioNonAac) {

 // Text is not supported yet.
 TEST_F(PesPacketGeneratorTest, InitializeTextInfo) {
-  std::shared_ptr<TextStreamInfo> stream_info(
-      new TextStreamInfo(kTrackId, kTimeScale, kDuration, kCodecString,
-                         std::string(), kWidth, kHeight, kLanguage));
+  std::shared_ptr<TextStreamInfo> stream_info(new TextStreamInfo(
+      kTrackId, kTimeScale, kDuration, kCodecText, kCodecString, std::string(),
+      kWidth, kHeight, kLanguage));
  EXPECT_FALSE(generator_.Initialize(*stream_info));
 }

--- a/packager/media/formats/webvtt/cue.cc
+++ b/packager/media/formats/webvtt/cue.cc
@ -8,53 +8,5 @@ namespace media {
 Cue::Cue() : start_time(0), duration(0) {}
 Cue::~Cue() {}

-// Mapping:
-// comment --> side data (and side data only sample)
-// settings --> side data
-// start_time --> pts
-std::shared_ptr<MediaSample> CueToMediaSample(const Cue& cue) {
-  const bool kKeyFrame = true;
-  if (!cue.comment.empty()) {
-    const std::string comment = base::JoinString(cue.comment, "\n");
-    return MediaSample::FromMetadata(
-        reinterpret_cast<const uint8_t*>(comment.data()), comment.size());
-  }
-
-  const std::string payload = base::JoinString(cue.payload, "\n");
-  std::shared_ptr<MediaSample> media_sample = MediaSample::CopyFrom(
-      reinterpret_cast<const uint8_t*>(payload.data()), payload.size(),
-      reinterpret_cast<const uint8_t*>(cue.settings.data()),
-      cue.settings.size(), !kKeyFrame);
-
-  media_sample->set_config_id(cue.identifier);
-  media_sample->set_pts(cue.start_time);
-  media_sample->set_duration(cue.duration);
-  return media_sample;
-}
-
-// TODO(rkuroiwa): Cue gets converted to MediaSample in WebVttMediaParser and
-// then back to Cue in the muxer. Consider making MediaSample a protobuf or make
-// Cue a protobuf and (ab)use MediaSample::data() to store serialized Cue.
-Cue MediaSampleToCue(const MediaSample& sample) {
-  Cue cue;
-  if (sample.data_size() == 0) {
-    std::string comment(sample.side_data(),
-                        sample.side_data() + sample.side_data_size());
-    cue.comment.push_back(comment);
-    return cue;
-  }
-
-  std::string payload(sample.data(), sample.data() + sample.data_size());
-  cue.payload.push_back(payload);
-  cue.identifier.assign(sample.config_id());
-  cue.start_time = sample.pts();
-  cue.duration = sample.duration();
-  if (sample.side_data_size() != 0) {
-    cue.settings.assign(sample.side_data(),
-                        sample.side_data() + sample.side_data_size());
-  }
-  return cue;
-}
-
 }  // namespace media
 }  // namespace shaka
--- a/packager/media/formats/webvtt/cue.h
+++ b/packager/media/formats/webvtt/cue.h
@ -1,3 +1,6 @@
+#ifndef PACKAGER_MEDIA_FORMATS_WEBVTT_CUE_H_
+#define PACKAGER_MEDIA_FORMATS_WEBVTT_CUE_H_
+
 #include <stdint.h>

 #include <memory>
@ -20,19 +23,13 @@ struct Cue {
  uint64_t start_time;
  uint64_t duration;
  std::string settings;
-  std::vector<std::string> payload;
-  std::vector<std::string> comment;
+
+  // |payload| and |comment| may have trailing "\n" characters.
+  std::string payload;
+  std::string comment;
 };

-/// Convert Cue to MediaSample.
-/// @param cue data.
-/// @return @a cue converted to a MediaSample.
-std::shared_ptr<MediaSample> CueToMediaSample(const Cue& cue);
-
-/// Convert MediaSample to Cue.
-/// @param sample to be converted.
-/// @return @a sample converted to Cue.
-Cue MediaSampleToCue(const MediaSample& sample);
-
 }  // namespace media
 }  // namespace shaka
+
+#endif  // PACKAGER_MEDIA_FORMATS_WEBVTT_CUE_H_
--- a/packager/media/formats/webvtt/webvtt_media_parser.cc
+++ b/packager/media/formats/webvtt/webvtt_media_parser.cc
@ -22,6 +22,8 @@ namespace media {

 namespace {

+const bool kFlush = true;
+
 // There's only one track in a WebVTT file.
 const int kTrackId = 0;

@ -186,7 +188,8 @@ bool ParseTimingAndSettingsLine(const std::string& line,

 }  // namespace

-WebVttMediaParser::WebVttMediaParser() : state_(kHeader) {}
+WebVttMediaParser::WebVttMediaParser()
+    : state_(kHeader), sample_converter_(new WebVttSampleConverter()) {}
 WebVttMediaParser::~WebVttMediaParser() {}

 void WebVttMediaParser::Init(const InitCB& init_cb,
@ -205,17 +208,20 @@ bool WebVttMediaParser::Flush() {
    // If it was in the middle of the payload and the stream finished, then this
    // is an end of the payload. The rest of the data is part of the payload.
    if (state_ == kCuePayload) {
-      current_cue_.payload.push_back(data_);
+      current_cue_.payload += data_ + "\n";
    } else {
-      current_cue_.comment.push_back(data_);
+      current_cue_.comment += data_ + "\n";
    }
    data_.clear();
  }

-  bool result = new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_));
-  current_cue_ = Cue();
+  if (!ProcessCurrentCue(kFlush)) {
+    state_ = kParseError;
+    return false;
+  }
+
  state_ = kCueIdentifierOrTimingOrComment;
-  return result;
+  return true;
 }

 bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
@ -265,8 +271,11 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
          // There is no one metadata to determine what the language is. Parts
          // of the text may be annotated as some specific language.
          const char kLanguage[] = "";
+
+          const char kWebVttCodecString[] = "wvtt";
          streams.emplace_back(
-              new TextStreamInfo(kTrackId, kTimescale, kDuration, "wvtt",
+              new TextStreamInfo(kTrackId, kTimescale, kDuration,
+                                 kCodecWebVtt, kWebVttCodecString,
                                 base::JoinString(header_, "\n"),
                                 0,  // Not necessary.
                                 0,
@ -291,7 +300,7 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
          if (base::StartsWith(line, "NOTE",
                               base::CompareCase::INSENSITIVE_ASCII)) {
            state_ = kComment;
-            current_cue_.comment.push_back(line);
+            current_cue_.comment += line + "\n";
          } else {
            // A cue can start from a cue identifier.
            // https://w3c.github.io/webvtt/#webvtt-cue-identifier
@ -322,29 +331,27 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
      case kCuePayload: {
        if (line.empty()) {
          state_ = kCueIdentifierOrTimingOrComment;
-          if (!new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_))) {
+          if (!ProcessCurrentCue(!kFlush)) {
            state_ = kParseError;
            return false;
          }
-          current_cue_ = Cue();
          break;
        }

-        current_cue_.payload.push_back(line);
+        current_cue_.payload += line + "\n";
        break;
      }
      case kComment: {
        if (line.empty()) {
          state_ = kCueIdentifierOrTimingOrComment;
-          if (!new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_))) {
+          if (!ProcessCurrentCue(!kFlush)) {
            state_ = kParseError;
            return false;
          }
-          current_cue_ = Cue();
          break;
        }

-        current_cue_.comment.push_back(line);
+        current_cue_.comment += line + "\n";
        break;
      }
      case kParseError:
@ -356,5 +363,25 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
  return true;
 }

+void WebVttMediaParser::InjectWebVttSampleConvertForTesting(
+    std::unique_ptr<WebVttSampleConverter> converter) {
+  sample_converter_ = std::move(converter);
+}
+
+bool WebVttMediaParser::ProcessCurrentCue(bool flush) {
+  sample_converter_->PushCue(current_cue_);
+  current_cue_ = Cue();
+  if (flush)
+    sample_converter_->Flush();
+
+  while (sample_converter_->ReadySamplesSize() > 0) {
+    if (!new_sample_cb_.Run(kTrackId, sample_converter_->PopSample())) {
+      LOG(ERROR) << "New sample callback failed.";
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace media
 }  // namespace shaka
--- a/packager/media/formats/webvtt/webvtt_media_parser.h
+++ b/packager/media/formats/webvtt/webvtt_media_parser.h
@ -8,12 +8,15 @@
 #define MEDIA_FORMATS_WEBVTT_WEBVTT_MEDIA_PARSER_H_

 #include <stdint.h>
+
+#include <memory>
 #include <string>
 #include <vector>

 #include "packager/base/compiler_specific.h"
 #include "packager/media/base/media_parser.h"
 #include "packager/media/formats/webvtt/cue.h"
+#include "packager/media/formats/webvtt/webvtt_sample_converter.h"

 namespace shaka {
 namespace media {
@ -34,6 +37,9 @@ class WebVttMediaParser : public MediaParser {
  bool Parse(const uint8_t* buf, int size) override WARN_UNUSED_RESULT;
  /// @}

+  void InjectWebVttSampleConvertForTesting(
+      std::unique_ptr<WebVttSampleConverter> converter);
+
 private:
  enum WebVttReadingState {
    kHeader,
@ -45,6 +51,11 @@ class WebVttMediaParser : public MediaParser {
    kParseError,
  };

+  // Sends current cue to sample converter, and dispatches any ready samples to
+  // the callback.
+  // current_cue_ is always cleared.
+  bool ProcessCurrentCue(bool flush);
+
  InitCB init_cb_;
  NewSampleCB new_sample_cb_;

@ -62,6 +73,8 @@ class WebVttMediaParser : public MediaParser {

  Cue current_cue_;

+  std::unique_ptr<WebVttSampleConverter> sample_converter_;
+
  DISALLOW_COPY_AND_ASSIGN(WebVttMediaParser);
 };

--- a/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
+++ b/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
@ -8,13 +8,30 @@
 #include <gtest/gtest.h>

 #include "packager/base/bind.h"
+#include "packager/base/strings/string_number_conversions.h"
 #include "packager/media/base/media_sample.h"
 #include "packager/media/base/stream_info.h"
+#include "packager/media/formats/mp4/box_definitions.h"
 #include "packager/media/formats/webvtt/webvtt_media_parser.h"

 namespace shaka {
 namespace media {

+using mp4::VTTCueBox;
+
+namespace {
+// Data is a vector and must not be empty.
+MATCHER_P3(MatchesStartTimeEndTimeAndData, start_time, end_time, data, "") {
+  *result_listener << "which is (" << arg->pts() << ", "
+                   << (arg->pts() + arg->duration()) << ", "
+                   << base::HexEncode(arg->data(), arg->data_size()) << ")";
+  return arg->pts() == start_time &&
+         (arg->pts() + arg->duration() == end_time) &&
+         arg->data_size() == data.size() &&
+         (memcmp(&data[0], arg->data(), arg->data_size()) == 0);
+}
+}  // namespace
+
 typedef testing::MockFunction<void(
    const std::vector<std::shared_ptr<StreamInfo>>& stream_info)>
    MockInitCallback;
@ -22,15 +39,13 @@ typedef testing::MockFunction<
    bool(uint32_t track_id, const std::shared_ptr<MediaSample>& media_sample)>
    MockNewSampleCallback;

-using testing::_;
+using testing::AtLeast;
 using testing::InSequence;
 using testing::Return;
+using testing::_;

 class WebVttMediaParserTest : public ::testing::Test {
 public:
-  WebVttMediaParserTest() {}
-  ~WebVttMediaParserTest() override {}
-
  void InitializeParser() {
    parser_.Init(
        base::Bind(&MockInitCallback::Call, base::Unretained(&init_callback_)),
@ -51,13 +66,21 @@ TEST_F(WebVttMediaParserTest, Init) {

 TEST_F(WebVttMediaParserTest, ParseOneCue) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+
+  VTTCueBox cue_box;
+  cue_box.cue_payload.cue_text = "subtitle";
+  std::vector<uint8_t> expected;
+  AppendBoxToVector(&cue_box, &expected);
+
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeEndTimeAndData(60000, 3600000, expected)))
+      .WillOnce(Return(true));

  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
      "00:01:00.000 --> 01:00:00.000\n"
-      "subtitle";
+      "subtitle\n";
  InitializeParser();
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
                            arraysize(kWebVtt) - 1));
@ -82,20 +105,63 @@ TEST_F(WebVttMediaParserTest, DifferentLineBreaks) {
  EXPECT_TRUE(parser_.Flush());
 }

-TEST_F(WebVttMediaParserTest, ParseMultpleCues) {
+// Verify that a typical case with multiple cues works.
+TEST_F(WebVttMediaParserTest, ParseMultipleCues) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _))
-      .Times(2)
-      .WillRepeatedly(Return(true));
+
+
+  VTTCueBox first_cue_box;
+  first_cue_box.cue_payload.cue_text = "subtitle";
+
+  VTTCueBox second_cue_data;
+  second_cue_data.cue_payload.cue_text = "more subtitle";
+
+  VTTCueBox third_cue_data;
+  third_cue_data.cue_payload.cue_text = "more text";
+
+  std::vector<uint8_t> expected;
+  AppendBoxToVector(&first_cue_box, &expected);
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeEndTimeAndData(1000, 2321, expected)))
+      .WillOnce(Return(true));
+
+  expected.clear();
+  AppendBoxToVector(&first_cue_box, &expected);
+  AppendBoxToVector(&second_cue_data, &expected);
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeEndTimeAndData(2321, 5200, expected)))
+      .WillOnce(Return(true));
+
+  expected.clear();
+  AppendBoxToVector(&second_cue_data, &expected);
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeEndTimeAndData(5200, 5800, expected)))
+      .WillOnce(Return(true));
+
+  expected.clear();
+  AppendBoxToVector(&second_cue_data, &expected);
+  AppendBoxToVector(&third_cue_data, &expected);
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeEndTimeAndData(5800, 7000, expected)))
+      .WillOnce(Return(true));
+
+  expected.clear();
+  AppendBoxToVector(&third_cue_data, &expected);
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeEndTimeAndData(7000, 8000, expected)))
+      .WillOnce(Return(true));

  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
-      "00:01:00.000 --> 01:00:00.000\n"
+      "00:00:01.000 --> 00:00:05.200\n"
      "subtitle\n"
      "\n"
-      "02:01:00.000 --> 02:02:00.000\n"
-      "more subtitle";
+      "00:00:02.321 --> 00:00:07.000\n"
+      "more subtitle\n"
+      "\n"
+      "00:00:05.800 --> 00:00:08.000\n"
+      "more text\n" ;
  InitializeParser();
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
                            arraysize(kWebVtt) - 1));
@ -112,9 +178,8 @@ MATCHER_P2(MatchesStartTimeAndDuration, start_time, duration, "") {
 TEST_F(WebVttMediaParserTest, VerifyTimingParsing) {
  EXPECT_CALL(init_callback_, Call(_));
  EXPECT_CALL(new_sample_callback_,
-              Call(_, MatchesStartTimeAndDuration(61004, 204088)))
+              Call(_, MatchesStartTimeAndDuration(61004u, 204088u)))
      .WillOnce(Return(true));
-
  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
@ -159,48 +224,15 @@ TEST_F(WebVttMediaParserTest, SpacesInTimestamp) {
                    arraysize(kSpacesInTimestamp) - 1));
 }

-MATCHER_P(MatchesPayload, data, "") {
-  std::vector<uint8_t> arg_data(arg->data(), arg->data() + arg->data_size());
-  return arg_data == data;
-}
-
-TEST_F(WebVttMediaParserTest, VerifyCuePayload) {
-  const char kExpectedPayload1[] = "subtitle";
-  const char kExpectedPayload2[] = "hello";
-  std::vector<uint8_t> expected_payload(
-      kExpectedPayload1, kExpectedPayload1 + arraysize(kExpectedPayload1) - 1);
-
-  InSequence s;
-  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, MatchesPayload(expected_payload)))
-      .WillOnce(Return(true));
-
-  expected_payload.assign(kExpectedPayload2,
-                          kExpectedPayload2 + arraysize(kExpectedPayload2) - 1);
-  EXPECT_CALL(new_sample_callback_, Call(_, MatchesPayload(expected_payload)))
-      .WillOnce(Return(true));
-
-  const char kWebVtt[] =
-      "WEBVTT\n"
-      "\n"
-      "00:01:01.004 --> 00:01:22.088\n"
-      "subtitle\n"
-      "\n"
-      "02:06:00.000 --> 02:30:02.006\n"
-      "hello";
-
-  InitializeParser();
-  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
-                            arraysize(kWebVtt) - 1));
-
-  EXPECT_TRUE(parser_.Flush());
+MATCHER_P(MatchesPayload, payload, "") {
+  return arg.payload.front() == std::string(payload);
 }

 // Verify that a sample can be created from multiple calls to Parse(), i.e. one
 // Parse() is not a full sample.
 TEST_F(WebVttMediaParserTest, PartialParse) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);

  const char kWebVtt[] =
      "WEBVTT\n"
@ -210,7 +242,8 @@ TEST_F(WebVttMediaParserTest, PartialParse) {
  InitializeParser();
  // Pass in the first 8 bytes, i.e. right before the first cue.
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt), 8));
-  // Pass in the rest of the cue.
+
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt) + 8,
                            arraysize(kWebVtt) - 1 - 8));

@ -221,6 +254,7 @@ TEST_F(WebVttMediaParserTest, PartialParse) {
 TEST_F(WebVttMediaParserTest, BadMetadataHeader) {
  EXPECT_CALL(init_callback_, Call(_)).Times(0);
  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
+
  const char kBadWebVtt[] =
      "WEBVTT\n"
      "00:01:01.004 --> 00:04:25.092\n";
@ -230,12 +264,8 @@ TEST_F(WebVttMediaParserTest, BadMetadataHeader) {
  EXPECT_TRUE(parser_.Flush());
 }

-MATCHER_P(MatchesComment, comment, "") {
-  std::vector<uint8_t> arg_comment(arg->side_data(),
-                                   arg->side_data() + arg->side_data_size());
-  return arg_comment == comment;
-}
-
+// TODO(rkuroiwa): WebVttSampleConverter doesn't handle comments yet. Once it's
+// implemented, this should verify that the comment is in the sample.
 // Verify that comment is parsed.
 TEST_F(WebVttMediaParserTest, Comment) {
  const char kExpectedComment[] = "NOTE This is a comment";
@ -243,8 +273,6 @@ TEST_F(WebVttMediaParserTest, Comment) {
      kExpectedComment, kExpectedComment + arraysize(kExpectedComment) - 1);

  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, MatchesComment(expected_comment)))
-      .WillOnce(Return(true));

  const char kWebVtt[] =
      "WEBVTT\n"
@ -260,7 +288,6 @@ TEST_F(WebVttMediaParserTest, Comment) {
 // Verify that comment with --> is rejected.
 TEST_F(WebVttMediaParserTest, BadComment) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);

  const char kWebVtt[] =
      "WEBVTT\n"
--- a/packager/media/formats/webvtt/webvtt_sample_converter.cc
+++ b/packager/media/formats/webvtt/webvtt_sample_converter.cc
@ -22,7 +22,7 @@ namespace media {
 namespace {

 std::shared_ptr<MediaSample> CreateEmptyCueSample(uint64_t start_time,
-                                                uint64_t end_time) {
+                                                  uint64_t end_time) {
  DCHECK_GT(end_time, start_time);
  mp4::VTTEmptyCueBox empty_cue_box;

@ -36,6 +36,15 @@ std::shared_ptr<MediaSample> CreateEmptyCueSample(uint64_t start_time,
  return empty_cue_sample;
 }

+void StripTrailingNewlines(const std::string& input, std::string* output) {
+  const size_t found = input.find_last_not_of('\n');
+  if (found != std::string::npos) {
+    *output = input.substr(0, found + 1);  // Keep through last non-'\n' char.
+  } else {
+    output->clear();  // |input| is empty or only '\n': nothing remains.
+  }
+}
+
 mp4::VTTCueBox CueBoxFromCue(const Cue& cue) {
  mp4::VTTCueBox cue_box;
  if (!cue.identifier.empty()) {
@ -46,7 +55,7 @@ mp4::VTTCueBox CueBoxFromCue(const Cue& cue) {
    cue_box.cue_settings.settings = cue.settings;
  }

-  cue_box.cue_payload.cue_text = cue.payload.front();
+  StripTrailingNewlines(cue.payload, &cue_box.cue_payload.cue_text);
  return cue_box;
 }

@ -127,19 +136,18 @@ WebVttSampleConverter::~WebVttSampleConverter() {}

 // Note that this |sample| is either a cue or a comment. It does not have any
 // info on whether the next cue is overlapping or not.
-void WebVttSampleConverter::PushSample(std::shared_ptr<MediaSample> sample) {
-  if (sample->data_size() == 0u) {
+void WebVttSampleConverter::PushCue(const Cue& cue) {
+  if (!cue.comment.empty()) {
    // A comment. Put it in the buffer and skip.
    mp4::VTTAdditionalTextBox comment;
-    comment.cue_additional_text.assign(
-        sample->side_data(), sample->side_data() + sample->side_data_size());
+    StripTrailingNewlines(cue.comment, &comment.cue_additional_text);
    additional_texts_.push_back(comment);
    // TODO(rkuriowa): Handle comments as samples.

    return;
  }

-  cues_.push_back(MediaSampleToCue(*sample));
+  cues_.push_back(cue);
  if (cues_.size() == 1) {
    // Cannot make a decision with just one sample. Cache it and wait for
    // another one.
--- a/packager/media/formats/webvtt/webvtt_sample_converter.h
+++ b/packager/media/formats/webvtt/webvtt_sample_converter.h
@ -51,28 +51,29 @@ void AppendBoxToVector(mp4::Box* box, std::vector<uint8_t>* output_vector);
 ///\n
 /// This class buffers the samples that are passed to AddSample() and creates
 /// more samples as necessary.
+/// Methods are virtual only for mocking, not intended for inheritance.
 class WebVttSampleConverter {
 public:
  WebVttSampleConverter();
-  ~WebVttSampleConverter();
+  virtual ~WebVttSampleConverter();

-  /// Add a sample.
-  /// @param sample is the sample to be added. It should contain one VTT cue.
-  void PushSample(std::shared_ptr<MediaSample> sample);
+  /// Add a WebVTT cue.
+  /// @param cue is a WebVTT cue; a non-empty |comment| marks it as a comment.
+  virtual void PushCue(const Cue& cue);

  /// Process all the buffered samples.
  /// This finalizes the object and further calls to PushSample() may result in
  /// an undefined behavior.
-  void Flush();
+  virtual void Flush();

  /// @return The number of samples that are processed and ready to be popped.
-  size_t ReadySamplesSize();
+  virtual size_t ReadySamplesSize();

  /// Returns a MediaSample that is non-overlapping with the previous samples
  /// that it has output. The data in the sample is one or more ISO-BMFF boxes
  /// for the duration of the sample.
  /// @return The first sample that is ready to be processed.
-  std::shared_ptr<MediaSample> PopSample();
+  virtual std::shared_ptr<MediaSample> PopSample();

 private:
  // Handle |cues_| except the last item, and create samples from them.
--- a/packager/media/formats/webvtt/webvtt_sample_converter_unittest.cc
+++ b/packager/media/formats/webvtt/webvtt_sample_converter_unittest.cc
@ -82,23 +82,18 @@ TEST_F(WebVttFragmenterTest, AppendBoxToVector) {
 //   |-- cue2 --|

 TEST_F(WebVttFragmenterTest, NoOverlapContiguous) {
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(0);
-  sample1->set_dts(0);
-  sample1->set_duration(2000);
+  Cue cue1;
+  cue1.payload = kCueMessage1;
+  cue1.start_time = 0;
+  cue1.duration = 2000;
+  webvtt_sample_converter_.PushCue(cue1);

-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
+  cue2.payload = kCueMessage2;
+  cue2.start_time = 2000;
+  cue2.duration = 1000;

-  std::shared_ptr<MediaSample> sample2 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
-                            arraysize(kCueMessage2) - 1, true);
-  sample2->set_pts(2000);
-  sample2->set_dts(2000);
-  sample2->set_duration(1000);
-
-  webvtt_sample_converter_.PushSample(sample2);
+  webvtt_sample_converter_.PushCue(cue2);
  webvtt_sample_converter_.Flush();
  EXPECT_EQ(2u, webvtt_sample_converter_.ReadySamplesSize());

@ -119,23 +114,18 @@ TEST_F(WebVttFragmenterTest, NoOverlapContiguous) {

 // Verify that if is a gap, then a sample is created for the gap.
 TEST_F(WebVttFragmenterTest, Gap) {
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(0);
-  sample1->set_dts(0);
-  sample1->set_duration(1000);
+  Cue cue1;
+  cue1.payload = kCueMessage1;
+  cue1.start_time = 0;
+  cue1.duration = 1000;
+  webvtt_sample_converter_.PushCue(cue1);

-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
+  cue2.payload = kCueMessage2;
+  cue2.start_time = 2000;
+  cue2.duration = 1000;
+  webvtt_sample_converter_.PushCue(cue2);

-  std::shared_ptr<MediaSample> sample2 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
-                            arraysize(kCueMessage2) - 1, true);
-  sample2->set_pts(2000);
-  sample2->set_dts(2000);
-  sample2->set_duration(1000);
-
-  webvtt_sample_converter_.PushSample(sample2);
  EXPECT_EQ(2u, webvtt_sample_converter_.ReadySamplesSize());

  webvtt_sample_converter_.Flush();
@ -165,30 +155,23 @@ TEST_F(WebVttFragmenterTest, Gap) {
 // The previous cue always ends before the current cue ends.
 // Cues are overlapping, no samples should be created in PushSample().
 TEST_F(WebVttFragmenterTest, OverlappingCuesSequential) {
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(0);
-  sample1->set_dts(0);
-  sample1->set_duration(2000);
+  Cue cue1;
+  cue1.payload = kCueMessage1;
+  cue1.start_time = 0;
+  cue1.duration = 2000;
+  webvtt_sample_converter_.PushCue(cue1);

-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
+  cue2.payload = kCueMessage2;
+  cue2.start_time = 1000;
+  cue2.duration = 2000;
+  webvtt_sample_converter_.PushCue(cue2);

-  std::shared_ptr<MediaSample> sample2 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
-                            arraysize(kCueMessage2) - 1, true);
-  sample2->set_pts(1000);
-  sample2->set_dts(1000);
-  sample2->set_duration(2000);
-  webvtt_sample_converter_.PushSample(sample2);
-
-  std::shared_ptr<MediaSample> sample3 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage3),
-                            arraysize(kCueMessage3) - 1, true);
-  sample3->set_pts(1500);
-  sample3->set_dts(1500);
-  sample3->set_duration(4000);
-  webvtt_sample_converter_.PushSample(sample3);
+  Cue cue3;
+  cue3.payload = kCueMessage3;
+  cue3.start_time = 1500;
+  cue3.duration = 4000;
+  webvtt_sample_converter_.PushCue(cue3);

  webvtt_sample_converter_.Flush();
  // There should be 5 samples for [0,1000], [1000,1500], [1500,2000],
@ -232,38 +215,29 @@ TEST_F(WebVttFragmenterTest, OverlappingCuesSequential) {
 }

 TEST_F(WebVttFragmenterTest, OverlappingLongCue) {
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(0);
-  sample1->set_dts(0);
-  sample1->set_duration(10000);
+  Cue cue1;
+  cue1.payload = kCueMessage1;
+  cue1.start_time = 0;
+  cue1.duration = 10000;
+  webvtt_sample_converter_.PushCue(cue1);

-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
+  cue2.payload = kCueMessage2;
+  cue2.start_time = 1000;
+  cue2.duration = 5000;
+  webvtt_sample_converter_.PushCue(cue2);

-  std::shared_ptr<MediaSample> sample2 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
-                            arraysize(kCueMessage2) - 1, true);
-  sample2->set_pts(1000);
-  sample2->set_dts(1000);
-  sample2->set_duration(5000);
-  webvtt_sample_converter_.PushSample(sample2);
+  Cue cue3;
+  cue3.payload = kCueMessage3;
+  cue3.start_time = 2000;
+  cue3.duration = 1000;
+  webvtt_sample_converter_.PushCue(cue3);

-  std::shared_ptr<MediaSample> sample3 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage3),
-                            arraysize(kCueMessage3) - 1, true);
-  sample3->set_pts(2000);
-  sample3->set_dts(2000);
-  sample3->set_duration(1000);
-  webvtt_sample_converter_.PushSample(sample3);
-
-  std::shared_ptr<MediaSample> sample4 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage4),
-                            arraysize(kCueMessage4) - 1, true);
-  sample4->set_pts(8000);
-  sample4->set_dts(8000);
-  sample4->set_duration(1000);
-  webvtt_sample_converter_.PushSample(sample4);
+  Cue cue4;
+  cue4.payload = kCueMessage4;
+  cue4.start_time = 8000;
+  cue4.duration = 1000;
+  webvtt_sample_converter_.PushCue(cue4);
  webvtt_sample_converter_.Flush();

  // There should be 7 samples for [0,1000], [1000,2000], [2000,3000],
@ -320,13 +294,11 @@ TEST_F(WebVttFragmenterTest, OverlappingLongCue) {
 }

 TEST_F(WebVttFragmenterTest, GapAtBeginning) {
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(1200);
-  sample1->set_dts(1200);
-  sample1->set_duration(2000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue;
+  cue.payload = kCueMessage1;
+  cue.start_time = 1200;
+  cue.duration = 2000;
+  webvtt_sample_converter_.PushCue(cue);

  webvtt_sample_converter_.Flush();
  EXPECT_EQ(1u, webvtt_sample_converter_.ReadySamplesSize());
@ -340,24 +312,18 @@ TEST_F(WebVttFragmenterTest, GapAtBeginning) {
 }

 TEST_F(WebVttFragmenterTest, SameStartTime) {
-  // TODO(rkuroiwa): This should be std::shared_ptr if this is applied on HEAD.
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(0);
-  sample1->set_dts(0);
-  sample1->set_duration(2000);
+  Cue cue1;
+  cue1.payload = kCueMessage1;
+  cue1.start_time = 0;
+  cue1.duration = 2000;
+  webvtt_sample_converter_.PushCue(cue1);

-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
+  cue2.payload = kCueMessage2;
+  cue2.start_time = 0;
+  cue2.duration = 1500;
+  webvtt_sample_converter_.PushCue(cue2);

-  std::shared_ptr<MediaSample> sample2 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
-                            arraysize(kCueMessage2) - 1, true);
-  sample2->set_pts(0);
-  sample2->set_dts(0);
-  sample2->set_duration(1500);
-
-  webvtt_sample_converter_.PushSample(sample2);
  webvtt_sample_converter_.Flush();
  EXPECT_EQ(2u, webvtt_sample_converter_.ReadySamplesSize());

@ -380,39 +346,29 @@ TEST_F(WebVttFragmenterTest, SameStartTime) {

 // This test is a combination of the test cases above.
 TEST_F(WebVttFragmenterTest, MoreCases) {
-  std::shared_ptr<MediaSample> sample1 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
-                            arraysize(kCueMessage1) - 1, true);
-  sample1->set_pts(0);
-  sample1->set_dts(0);
-  sample1->set_duration(2000);
+  Cue cue1;
+  cue1.payload = kCueMessage1;
+  cue1.start_time = 0;
+  cue1.duration = 2000;
+  webvtt_sample_converter_.PushCue(cue1);

-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
+  cue2.payload = kCueMessage2;
+  cue2.start_time = 100;
+  cue2.duration = 100;
+  webvtt_sample_converter_.PushCue(cue2);

-  std::shared_ptr<MediaSample> sample2 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
-                            arraysize(kCueMessage2) - 1, true);
-  sample2->set_pts(100);
-  sample2->set_dts(100);
-  sample2->set_duration(100);
+  Cue cue3;
+  cue3.payload = kCueMessage3;
+  cue3.start_time = 1500;
+  cue3.duration = 1000;
+  webvtt_sample_converter_.PushCue(cue3);

-  webvtt_sample_converter_.PushSample(sample2);
-
-  std::shared_ptr<MediaSample> sample3 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage3),
-                            arraysize(kCueMessage3) - 1, true);
-  sample3->set_pts(1500);
-  sample3->set_dts(1500);
-  sample3->set_duration(1000);
-  webvtt_sample_converter_.PushSample(sample3);
-
-  std::shared_ptr<MediaSample> sample4 =
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage4),
-                            arraysize(kCueMessage4) - 1, true);
-  sample4->set_pts(1500);
-  sample4->set_dts(1500);
-  sample4->set_duration(800);
-  webvtt_sample_converter_.PushSample(sample4);
+  Cue cue4;
+  cue4.payload = kCueMessage4;
+  cue4.start_time = 1500;
+  cue4.duration = 800;
+  webvtt_sample_converter_.PushCue(cue4);

  webvtt_sample_converter_.Flush();
  EXPECT_EQ(6u, webvtt_sample_converter_.ReadySamplesSize());