Make WebVttMediaParser use WebVttSampleConverter

- WebVttMediaParser uses WebVttSampleConverter to generate non-overlapping media samples. - The media samples contain ISO BMFF boxes. - Add kCodecWebVtt to signal that the media is webvtt and the samples will be in ISO BMFF boxes. Change-Id: I639902cdba7b04af75428bc20622e26b8203cfb2
2017-02-14 13:40:09 -08:00 · 2017-02-14 13:40:09 -08:00 · a3ce51785a
parent 924d6d4693
commit a3ce51785a
12 changed files with 273 additions and 288 deletions
--- a/packager/media/base/stream_info.h
+++ b/packager/media/base/stream_info.h
@ -51,6 +51,7 @@ enum Codec {
  kCodecAudioMaxPlusOne,
  kCodecText = 300,
  kCodecWebVtt = kCodecText,
 };
 /// Abstract class holds stream information.
--- a/packager/media/base/text_stream_info.cc
+++ b/packager/media/base/text_stream_info.cc
@ -11,10 +11,11 @@ namespace media {
 TextStreamInfo::TextStreamInfo(int track_id, uint32_t time_scale,
                               uint64_t duration,
                               Codec codec,
                               const std::string& codec_string,
                               const std::string& codec_config, uint16_t width,
                               uint16_t height, const std::string& language)
-    : StreamInfo(kStreamText, track_id, time_scale, duration, kCodecText,
+    : StreamInfo(kStreamText, track_id, time_scale, duration, codec,
                 codec_string,
                 reinterpret_cast<const uint8_t*>(codec_config.data()),
                 codec_config.size(), language, false),
--- a/packager/media/base/text_stream_info.h
+++ b/packager/media/base/text_stream_info.h
@ -20,7 +20,8 @@ class TextStreamInfo : public StreamInfo {
  /// @param track_id is the track ID of this stream.
  /// @param time_scale is the time scale of this stream.
  /// @param duration is the duration of this stream.
-  /// @param codec_string is the codec.
+  /// @param codec is the media codec.
  /// @param codec_string is the codec in string format.
  /// @param codec_config is configuration for this text stream. This could be
  ///        the metadata that applies to all the samples of this stream. This
  ///        may be empty.
@ -28,6 +29,7 @@ class TextStreamInfo : public StreamInfo {
  /// @param height of the text. This may be 0.
  /// @param language is the language of this stream. This may be empty.
  TextStreamInfo(int track_id, uint32_t time_scale, uint64_t duration,
                 Codec codec,
                 const std::string& codec_string,
                 const std::string& codec_config, uint16_t width,
                 uint16_t height, const std::string& language);
--- a/packager/media/formats/mp2t/pes_packet_generator_unittest.cc
+++ b/packager/media/formats/mp2t/pes_packet_generator_unittest.cc
@ -247,9 +247,9 @@ TEST_F(PesPacketGeneratorTest, InitializeAudioNonAac) {
 // Text is not supported yet.
 TEST_F(PesPacketGeneratorTest, InitializeTextInfo) {
-  std::shared_ptr<TextStreamInfo> stream_info(
+  std::shared_ptr<TextStreamInfo> stream_info(new TextStreamInfo(
-      new TextStreamInfo(kTrackId, kTimeScale, kDuration, kCodecString,
+      kTrackId, kTimeScale, kDuration, kCodecText, kCodecString, std::string(),
-                         std::string(), kWidth, kHeight, kLanguage));
+      kWidth, kHeight, kLanguage));
  EXPECT_FALSE(generator_.Initialize(*stream_info));
 }
--- a/packager/media/formats/webvtt/cue.cc
+++ b/packager/media/formats/webvtt/cue.cc
@ -8,53 +8,5 @@ namespace media {
 Cue::Cue() : start_time(0), duration(0) {}
 Cue::~Cue() {}
 // Mapping:
 // comment --> side data (and side data only sample)
 // settings --> side data
 // start_time --> pts
 // Converts |cue| into a MediaSample.
 // - If |cue.comment| is non-empty, the comment lines are joined with "\n" and
 //   returned via MediaSample::FromMetadata(); payload, settings and timing
 //   are not consulted in that case.
 // - Otherwise the payload lines are joined with "\n" and passed as the first
 //   buffer to MediaSample::CopyFrom(), |cue.settings| as the second buffer,
 //   and the final flag is false (!kKeyFrame). The identifier, start time and
 //   duration are copied to config_id, pts and duration respectively.
 std::shared_ptr<MediaSample> CueToMediaSample(const Cue& cue) {
  const bool kKeyFrame = true;
  // A cue that carries a comment is emitted as a metadata-only sample.
  if (!cue.comment.empty()) {
    const std::string comment = base::JoinString(cue.comment, "\n");
    return MediaSample::FromMetadata(
        reinterpret_cast<const uint8_t*>(comment.data()), comment.size());
  }
  const std::string payload = base::JoinString(cue.payload, "\n");
  // First buffer is the payload text, second buffer is the settings string.
  std::shared_ptr<MediaSample> media_sample = MediaSample::CopyFrom(
      reinterpret_cast<const uint8_t*>(payload.data()), payload.size(),
      reinterpret_cast<const uint8_t*>(cue.settings.data()),
      cue.settings.size(), !kKeyFrame);
  media_sample->set_config_id(cue.identifier);
  media_sample->set_pts(cue.start_time);
  media_sample->set_duration(cue.duration);
  return media_sample;
 }
 // TODO(rkuroiwa): Cue gets converted to MediaSample in WebVttMediaParser and
 // then back to Cue in the muxer. Consider making MediaSample a protobuf or make
 // Cue a protobuf and (ab)use MediaSample::data() to store serialized Cue.
 // Converts |sample| back into a Cue.
 // - A sample whose data_size() is 0 is treated as a comment: its side data is
 //   stored as a single entry in |comment| and nothing else is filled in.
 // - Otherwise the sample data becomes a single |payload| entry, and
 //   config_id, pts and duration are copied to identifier, start_time and
 //   duration. Non-empty side data is copied to |settings|.
 Cue MediaSampleToCue(const MediaSample& sample) {
  Cue cue;
  // No data bytes: this sample only carries a comment in its side data.
  if (sample.data_size() == 0) {
    std::string comment(sample.side_data(),
                        sample.side_data() + sample.side_data_size());
    cue.comment.push_back(comment);
    return cue;
  }
  std::string payload(sample.data(), sample.data() + sample.data_size());
  cue.payload.push_back(payload);
  cue.identifier.assign(sample.config_id());
  cue.start_time = sample.pts();
  cue.duration = sample.duration();
  // Settings are optional; leave |cue.settings| empty when there are none.
  if (sample.side_data_size() != 0) {
    cue.settings.assign(sample.side_data(),
                        sample.side_data() + sample.side_data_size());
  }
  return cue;
 }
 }  // namespace media
 }  // namespace shaka
--- a/packager/media/formats/webvtt/cue.h
+++ b/packager/media/formats/webvtt/cue.h
@ -1,3 +1,6 @@
 #ifndef PACKAGER_MEDIA_FORMATS_WEBVTT_CUE_H_
 #define PACKAGER_MEDIA_FORMATS_WEBVTT_CUE_H_
 #include <stdint.h>
 #include <memory>
@ -20,19 +23,13 @@ struct Cue {
  uint64_t start_time;
  uint64_t duration;
  std::string settings;
-  std::vector<std::string> payload;
+
-  std::vector<std::string> comment;
+  // |payload| and |comment| may have trailing "\n" character.
  std::string payload;
  std::string comment;
 };
 /// Convert Cue to MediaSample.
 /// @param cue data.
 /// @return @a cue converted to a MediaSample.
 std::shared_ptr<MediaSample> CueToMediaSample(const Cue& cue);
 /// Convert MediaSample to Cue.
 /// @param sample to be converted.
 /// @return @a sample converted to Cue.
 Cue MediaSampleToCue(const MediaSample& sample);
 }  // namespace media
 }  // namespace shaka
 #endif  // PACKAGER_MEDIA_FORMATS_WEBVTT_CUE_H_
--- a/packager/media/formats/webvtt/webvtt_media_parser.cc
+++ b/packager/media/formats/webvtt/webvtt_media_parser.cc
@ -22,6 +22,8 @@ namespace media {
 namespace {
 const bool kFlush = true;
 // There's only one track in a WebVTT file.
 const int kTrackId = 0;
@ -186,7 +188,8 @@ bool ParseTimingAndSettingsLine(const std::string& line,
 }  // namespace
-WebVttMediaParser::WebVttMediaParser() : state_(kHeader) {}
+WebVttMediaParser::WebVttMediaParser()
    : state_(kHeader), sample_converter_(new WebVttSampleConverter()) {}
 WebVttMediaParser::~WebVttMediaParser() {}
 void WebVttMediaParser::Init(const InitCB& init_cb,
@ -205,17 +208,20 @@ bool WebVttMediaParser::Flush() {
    // If it was in the middle of the payload and the stream finished, then this
    // is an end of the payload. The rest of the data is part of the payload.
    if (state_ == kCuePayload) {
-      current_cue_.payload.push_back(data_);
+      current_cue_.payload += data_ + "\n";
    } else {
-      current_cue_.comment.push_back(data_);
+      current_cue_.comment += data_ + "\n";
    }
    data_.clear();
  }
-  bool result = new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_));
+  if (!ProcessCurrentCue(kFlush)) {
-  current_cue_ = Cue();
+    state_ = kParseError;
    return false;
  }
  state_ = kCueIdentifierOrTimingOrComment;
-  return result;
+  return true;
 }
 bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
@ -265,8 +271,11 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
          // No single piece of metadata determines the language. Parts of the
          // text may be annotated as some specific language.
          const char kLanguage[] = "";
          const char kWebVttCodecString[] = "wvtt";
          streams.emplace_back(
-              new TextStreamInfo(kTrackId, kTimescale, kDuration, "wvtt",
+              new TextStreamInfo(kTrackId, kTimescale, kDuration,
                                 kCodecWebVtt, kWebVttCodecString,
                                 base::JoinString(header_, "\n"),
                                 0,  // Not necessary.
                                 0,
@ -291,7 +300,7 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
          if (base::StartsWith(line, "NOTE",
                               base::CompareCase::INSENSITIVE_ASCII)) {
            state_ = kComment;
-            current_cue_.comment.push_back(line);
+            current_cue_.comment += line + "\n";
          } else {
            // A cue can start from a cue identifier.
            // https://w3c.github.io/webvtt/#webvtt-cue-identifier
@ -322,29 +331,27 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
      case kCuePayload: {
        if (line.empty()) {
          state_ = kCueIdentifierOrTimingOrComment;
-          if (!new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_))) {
+          if (!ProcessCurrentCue(!kFlush)) {
            state_ = kParseError;
            return false;
          }
          current_cue_ = Cue();
          break;
        }
-        current_cue_.payload.push_back(line);
+        current_cue_.payload += line + "\n";
        break;
      }
      case kComment: {
        if (line.empty()) {
          state_ = kCueIdentifierOrTimingOrComment;
-          if (!new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_))) {
+          if (!ProcessCurrentCue(!kFlush)) {
            state_ = kParseError;
            return false;
          }
          current_cue_ = Cue();
          break;
        }
-        current_cue_.comment.push_back(line);
+        current_cue_.comment += line + "\n";
        break;
      }
      case kParseError:
@ -356,5 +363,25 @@ bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
  return true;
 }
 // Test-only hook: replaces |sample_converter_| with |converter|. Ownership of
 // |converter| is transferred to the parser; the previously held converter is
 // released by the unique_ptr assignment.
 void WebVttMediaParser::InjectWebVttSampleConvertForTesting(
    std::unique_ptr<WebVttSampleConverter> converter) {
  sample_converter_ = std::move(converter);
 }
 // Hands |current_cue_| to the sample converter, resets |current_cue_|, and
 // forwards every sample the converter has ready to |new_sample_cb_|.
 // When |flush| is true the converter is flushed before draining.
 // Returns false as soon as the callback rejects a sample; true otherwise.
 bool WebVttMediaParser::ProcessCurrentCue(bool flush) {
  sample_converter_->PushCue(current_cue_);
  // The cue now lives in the converter; start the next one from scratch.
  current_cue_ = Cue();

  if (flush) {
    sample_converter_->Flush();
  }

  // Drain the converter one sample at a time until it reports none ready.
  for (;;) {
    if (sample_converter_->ReadySamplesSize() == 0) {
      return true;
    }
    const bool accepted =
        new_sample_cb_.Run(kTrackId, sample_converter_->PopSample());
    if (accepted) {
      continue;
    }
    LOG(ERROR) << "New sample callback failed.";
    return false;
  }
 }
 }  // namespace media
 }  // namespace shaka
--- a/packager/media/formats/webvtt/webvtt_media_parser.h
+++ b/packager/media/formats/webvtt/webvtt_media_parser.h
@ -8,12 +8,15 @@
 #define MEDIA_FORMATS_WEBVTT_WEBVTT_MEDIA_PARSER_H_
 #include <stdint.h>
 #include <memory>
 #include <string>
 #include <vector>
 #include "packager/base/compiler_specific.h"
 #include "packager/media/base/media_parser.h"
 #include "packager/media/formats/webvtt/cue.h"
 #include "packager/media/formats/webvtt/webvtt_sample_converter.h"
 namespace shaka {
 namespace media {
@ -34,6 +37,9 @@ class WebVttMediaParser : public MediaParser {
  bool Parse(const uint8_t* buf, int size) override WARN_UNUSED_RESULT;
  /// @}
  void InjectWebVttSampleConvertForTesting(
      std::unique_ptr<WebVttSampleConverter> converter);
 private:
  enum WebVttReadingState {
    kHeader,
@ -45,6 +51,11 @@ class WebVttMediaParser : public MediaParser {
    kParseError,
  };
  // Sends current cue to sample converter, and dispatches any ready samples to
  // the callback.
  // current_cue_ is always cleared.
  bool ProcessCurrentCue(bool flush);
  InitCB init_cb_;
  NewSampleCB new_sample_cb_;
@ -62,6 +73,8 @@ class WebVttMediaParser : public MediaParser {
  Cue current_cue_;
  std::unique_ptr<WebVttSampleConverter> sample_converter_;
  DISALLOW_COPY_AND_ASSIGN(WebVttMediaParser);
 };
--- a/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
+++ b/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
@ -8,13 +8,30 @@
 #include <gtest/gtest.h>
 #include "packager/base/bind.h"
 #include "packager/base/strings/string_number_conversions.h"
 #include "packager/media/base/media_sample.h"
 #include "packager/media/base/stream_info.h"
 #include "packager/media/formats/mp4/box_definitions.h"
 #include "packager/media/formats/webvtt/webvtt_media_parser.h"
 namespace shaka {
 namespace media {
 using mp4::VTTCueBox;
 namespace {
 // Matches a sample (accessed through ->) whose pts() equals |start_time|,
 // whose pts() + duration() equals |end_time|, and whose data bytes are
 // identical to |data|.
 // |data| is a vector and must not be empty, since &data[0] is taken for the
 // byte comparison.
 MATCHER_P3(MatchesStartTimeEndTimeAndData, start_time, end_time, data, "") {
  // Print the actual (start, end, hex-encoded data) to help diagnose a
  // mismatch.
  *result_listener << "which is (" << arg->pts() << ", "
                   << (arg->pts() + arg->duration()) << ", "
                   << base::HexEncode(arg->data(), arg->data_size()) << ")";
  // Sizes are compared before memcmp so the comparison never reads past the
  // end of |data|.
  return arg->pts() == start_time &&
         (arg->pts() + arg->duration() == end_time) &&
         arg->data_size() == data.size() &&
         (memcmp(&data[0], arg->data(), arg->data_size()) == 0);
 }
 }  // namespace
 typedef testing::MockFunction<void(
    const std::vector<std::shared_ptr<StreamInfo>>& stream_info)>
    MockInitCallback;
@ -22,15 +39,13 @@ typedef testing::MockFunction<
    bool(uint32_t track_id, const std::shared_ptr<MediaSample>& media_sample)>
    MockNewSampleCallback;
-using testing::_;
+using testing::AtLeast;
 using testing::InSequence;
 using testing::Return;
 using testing::_;
 class WebVttMediaParserTest : public ::testing::Test {
 public:
  WebVttMediaParserTest() {}
  ~WebVttMediaParserTest() override {}
  void InitializeParser() {
    parser_.Init(
        base::Bind(&MockInitCallback::Call, base::Unretained(&init_callback_)),
@ -51,13 +66,21 @@ TEST_F(WebVttMediaParserTest, Init) {
 TEST_F(WebVttMediaParserTest, ParseOneCue) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+
  VTTCueBox cue_box;
  cue_box.cue_payload.cue_text = "subtitle";
  std::vector<uint8_t> expected;
  AppendBoxToVector(&cue_box, &expected);
  EXPECT_CALL(new_sample_callback_,
              Call(_, MatchesStartTimeEndTimeAndData(60000, 3600000, expected)))
      .WillOnce(Return(true));
  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
      "00:01:00.000 --> 01:00:00.000\n"
-      "subtitle";
+      "subtitle\n";
  InitializeParser();
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
                            arraysize(kWebVtt) - 1));
@ -82,20 +105,63 @@ TEST_F(WebVttMediaParserTest, DifferentLineBreaks) {
  EXPECT_TRUE(parser_.Flush());
 }
-TEST_F(WebVttMediaParserTest, ParseMultpleCues) {
+// Verify that a typical case with multiple cues works.
 TEST_F(WebVttMediaParserTest, ParseMultipleCues) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _))
+
-      .Times(2)
+
-      .WillRepeatedly(Return(true));
+  VTTCueBox first_cue_box;
  first_cue_box.cue_payload.cue_text = "subtitle";
  VTTCueBox second_cue_data;
  second_cue_data.cue_payload.cue_text = "more subtitle";
  VTTCueBox third_cue_data;
  third_cue_data.cue_payload.cue_text = "more text";
  std::vector<uint8_t> expected;
  AppendBoxToVector(&first_cue_box, &expected);
  EXPECT_CALL(new_sample_callback_,
              Call(_, MatchesStartTimeEndTimeAndData(1000, 2321, expected)))
      .WillOnce(Return(true));
  expected.clear();
  AppendBoxToVector(&first_cue_box, &expected);
  AppendBoxToVector(&second_cue_data, &expected);
  EXPECT_CALL(new_sample_callback_,
              Call(_, MatchesStartTimeEndTimeAndData(2321, 5200, expected)))
      .WillOnce(Return(true));
  expected.clear();
  AppendBoxToVector(&second_cue_data, &expected);
  EXPECT_CALL(new_sample_callback_,
              Call(_, MatchesStartTimeEndTimeAndData(5200, 5800, expected)))
      .WillOnce(Return(true));
  expected.clear();
  AppendBoxToVector(&second_cue_data, &expected);
  AppendBoxToVector(&third_cue_data, &expected);
  EXPECT_CALL(new_sample_callback_,
              Call(_, MatchesStartTimeEndTimeAndData(5800, 7000, expected)))
      .WillOnce(Return(true));
  expected.clear();
  AppendBoxToVector(&third_cue_data, &expected);
  EXPECT_CALL(new_sample_callback_,
              Call(_, MatchesStartTimeEndTimeAndData(7000, 8000, expected)))
      .WillOnce(Return(true));
  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
-      "00:01:00.000 --> 01:00:00.000\n"
+      "00:00:01.000 --> 00:00:05.200\n"
      "subtitle\n"
      "\n"
-      "02:01:00.000 --> 02:02:00.000\n"
+      "00:00:02.321 --> 00:00:07.000\n"
-      "more subtitle";
+      "more subtitle\n"
      "\n"
      "00:00:05.800 --> 00:00:08.000\n"
      "more text\n" ;
  InitializeParser();
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
                            arraysize(kWebVtt) - 1));
@ -112,9 +178,8 @@ MATCHER_P2(MatchesStartTimeAndDuration, start_time, duration, "") {
 TEST_F(WebVttMediaParserTest, VerifyTimingParsing) {
  EXPECT_CALL(init_callback_, Call(_));
  EXPECT_CALL(new_sample_callback_,
-              Call(_, MatchesStartTimeAndDuration(61004, 204088)))
+              Call(_, MatchesStartTimeAndDuration(61004u, 204088u)))
      .WillOnce(Return(true));
  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
@ -159,48 +224,15 @@ TEST_F(WebVttMediaParserTest, SpacesInTimestamp) {
                    arraysize(kSpacesInTimestamp) - 1));
 }
-MATCHER_P(MatchesPayload, data, "") {
+MATCHER_P(MatchesPayload, payload, "") {
-  std::vector<uint8_t> arg_data(arg->data(), arg->data() + arg->data_size());
+  return arg.payload.front() == std::string(payload);
  return arg_data == data;
 }
 TEST_F(WebVttMediaParserTest, VerifyCuePayload) {
  const char kExpectedPayload1[] = "subtitle";
  const char kExpectedPayload2[] = "hello";
  std::vector<uint8_t> expected_payload(
      kExpectedPayload1, kExpectedPayload1 + arraysize(kExpectedPayload1) - 1);
  InSequence s;
  EXPECT_CALL(init_callback_, Call(_));
  EXPECT_CALL(new_sample_callback_, Call(_, MatchesPayload(expected_payload)))
      .WillOnce(Return(true));
  expected_payload.assign(kExpectedPayload2,
                          kExpectedPayload2 + arraysize(kExpectedPayload2) - 1);
  EXPECT_CALL(new_sample_callback_, Call(_, MatchesPayload(expected_payload)))
      .WillOnce(Return(true));
  const char kWebVtt[] =
      "WEBVTT\n"
      "\n"
      "00:01:01.004 --> 00:01:22.088\n"
      "subtitle\n"
      "\n"
      "02:06:00.000 --> 02:30:02.006\n"
      "hello";
  InitializeParser();
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
                            arraysize(kWebVtt) - 1));
  EXPECT_TRUE(parser_.Flush());
 }
 // Verify that a sample can be created from multiple calls to Parse(), i.e. one
 // Parse() is not a full sample.
 TEST_F(WebVttMediaParserTest, PartialParse) {
  EXPECT_CALL(init_callback_, Call(_));
-  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
  const char kWebVtt[] =
      "WEBVTT\n"
@ -210,7 +242,8 @@ TEST_F(WebVttMediaParserTest, PartialParse) {
  InitializeParser();
  // Pass in the first 8 bytes, i.e. right before the first cue.
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt), 8));
-  // Pass in the rest of the cue.
+
  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt) + 8,
                            arraysize(kWebVtt) - 1 - 8));
@ -221,6 +254,7 @@ TEST_F(WebVttMediaParserTest, PartialParse) {
 TEST_F(WebVttMediaParserTest, BadMetadataHeader) {
  EXPECT_CALL(init_callback_, Call(_)).Times(0);
  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
  const char kBadWebVtt[] =
      "WEBVTT\n"
      "00:01:01.004 --> 00:04:25.092\n";
@ -230,12 +264,8 @@ TEST_F(WebVttMediaParserTest, BadMetadataHeader) {
  EXPECT_TRUE(parser_.Flush());
 }
-MATCHER_P(MatchesComment, comment, "") {
+// TODO(rkuroiwa): WebVttSampleConverter doesn't handle comments yet. Once it's
-  std::vector<uint8_t> arg_comment(arg->side_data(),
+// implemented, this should verify that comment is in the sample.
                                   arg->side_data() + arg->side_data_size());
  return arg_comment == comment;
 }
 // Verify that comment is parsed.
 TEST_F(WebVttMediaParserTest, Comment) {
  const char kExpectedComment[] = "NOTE This is a comment";
@ -243,8 +273,6 @@ TEST_F(WebVttMediaParserTest, Comment) {
      kExpectedComment, kExpectedComment + arraysize(kExpectedComment) - 1);
  EXPECT_CALL(init_callback_, Call(_));
  EXPECT_CALL(new_sample_callback_, Call(_, MatchesComment(expected_comment)))
      .WillOnce(Return(true));
  const char kWebVtt[] =
      "WEBVTT\n"
@ -260,7 +288,6 @@ TEST_F(WebVttMediaParserTest, Comment) {
 // Verify that comment with --> is rejected.
 TEST_F(WebVttMediaParserTest, BadComment) {
  EXPECT_CALL(init_callback_, Call(_));
  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
  const char kWebVtt[] =
      "WEBVTT\n"
--- a/packager/media/formats/webvtt/webvtt_sample_converter.cc
+++ b/packager/media/formats/webvtt/webvtt_sample_converter.cc
@ -22,7 +22,7 @@ namespace media {
 namespace {
 std::shared_ptr<MediaSample> CreateEmptyCueSample(uint64_t start_time,
-                                                uint64_t end_time) {
+                                                  uint64_t end_time) {
  DCHECK_GT(end_time, start_time);
  mp4::VTTEmptyCueBox empty_cue_box;
@ -36,6 +36,15 @@ std::shared_ptr<MediaSample> CreateEmptyCueSample(uint64_t start_time,
  return empty_cue_sample;
 }
 // Copies |input| into |*output| with every trailing '\n' removed.
 // Newlines that are followed by other characters are preserved. If |input|
 // is empty or consists solely of '\n' characters, |*output| becomes empty.
 // @param input is the text to strip.
 // @param output receives the stripped text; must not be null.
 void StripTrailingNewlines(const std::string& input, std::string* output) {
  const size_t last_kept = input.find_last_not_of('\n');
  if (last_kept == std::string::npos) {
    // Nothing but newlines (or nothing at all): the stripped result is empty.
    // Copying |input| here would leave the newlines in place.
    output->clear();
    return;
  }
  *output = input.substr(0, last_kept + 1);
 }
 mp4::VTTCueBox CueBoxFromCue(const Cue& cue) {
  mp4::VTTCueBox cue_box;
  if (!cue.identifier.empty()) {
@ -46,7 +55,7 @@ mp4::VTTCueBox CueBoxFromCue(const Cue& cue) {
    cue_box.cue_settings.settings = cue.settings;
  }
-  cue_box.cue_payload.cue_text = cue.payload.front();
+  StripTrailingNewlines(cue.payload, &cue_box.cue_payload.cue_text);
  return cue_box;
 }
@ -127,19 +136,18 @@ WebVttSampleConverter::~WebVttSampleConverter() {}
 // Note that |cue| is either a cue or a comment. It does not have any
 // info on whether the next cue is overlapping or not.
-void WebVttSampleConverter::PushSample(std::shared_ptr<MediaSample> sample) {
+void WebVttSampleConverter::PushCue(const Cue& cue) {
-  if (sample->data_size() == 0u) {
+  if (!cue.comment.empty()) {
    // A comment. Put it in the buffer and skip.
    mp4::VTTAdditionalTextBox comment;
-    comment.cue_additional_text.assign(
+    StripTrailingNewlines(cue.comment, &comment.cue_additional_text);
        sample->side_data(), sample->side_data() + sample->side_data_size());
    additional_texts_.push_back(comment);
    // TODO(rkuroiwa): Handle comments as samples.
    return;
  }
-  cues_.push_back(MediaSampleToCue(*sample));
+  cues_.push_back(cue);
  if (cues_.size() == 1) {
    // Cannot make a decision with just one sample. Cache it and wait for
    // another one.
--- a/packager/media/formats/webvtt/webvtt_sample_converter.h
+++ b/packager/media/formats/webvtt/webvtt_sample_converter.h
@ -51,28 +51,29 @@ void AppendBoxToVector(mp4::Box* box, std::vector<uint8_t>* output_vector);
 ///\n
 /// This class buffers the cues that are passed to PushCue() and creates
 /// more samples as necessary.
 /// Methods are virtual only for mocking, not intended for inheritance.
 class WebVttSampleConverter {
 public:
  WebVttSampleConverter();
-  ~WebVttSampleConverter();
+  virtual ~WebVttSampleConverter();
-  /// Add a sample.
+  /// Add a webvtt cue.
-  /// @param sample is the sample to be added. It should contain one VTT cue.
+  /// @param cue is a webvtt cue.
-  void PushSample(std::shared_ptr<MediaSample> sample);
+  virtual void PushCue(const Cue& cue);
  /// Process all the buffered samples.
  /// This finalizes the object and further calls to PushCue() may result in
  /// undefined behavior.
-  void Flush();
+  virtual void Flush();
  /// @return The number of samples that are processed and ready to be popped.
-  size_t ReadySamplesSize();
+  virtual size_t ReadySamplesSize();
  /// Returns a MediaSample that is non-overlapping with the previous samples
  /// that it has output. The data in the sample is one or more ISO-BMFF boxes
  /// for the duration of the sample.
  /// @return The first sample that is ready to be processed.
-  std::shared_ptr<MediaSample> PopSample();
+  virtual std::shared_ptr<MediaSample> PopSample();
 private:
  // Handle |cues_| except the last item, and create samples from them.
--- a/packager/media/formats/webvtt/webvtt_sample_converter_unittest.cc
+++ b/packager/media/formats/webvtt/webvtt_sample_converter_unittest.cc
@ -82,23 +82,18 @@ TEST_F(WebVttFragmenterTest, AppendBoxToVector) {
 //   |-- cue2 --|
 TEST_F(WebVttFragmenterTest, NoOverlapContiguous) {
-  std::shared_ptr<MediaSample> sample1 =
+  Cue cue1;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue1.payload = kCueMessage1;
-                            arraysize(kCueMessage1) - 1, true);
+  cue1.start_time = 0;
-  sample1->set_pts(0);
+  cue1.duration = 2000;
-  sample1->set_dts(0);
+  webvtt_sample_converter_.PushCue(cue1);
  sample1->set_duration(2000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
  cue2.payload = kCueMessage2;
  cue2.start_time = 2000;
  cue2.duration = 1000;
-  std::shared_ptr<MediaSample> sample2 =
+  webvtt_sample_converter_.PushCue(cue2);
      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
                            arraysize(kCueMessage2) - 1, true);
  sample2->set_pts(2000);
  sample2->set_dts(2000);
  sample2->set_duration(1000);
  webvtt_sample_converter_.PushSample(sample2);
  webvtt_sample_converter_.Flush();
  EXPECT_EQ(2u, webvtt_sample_converter_.ReadySamplesSize());
@ -119,23 +114,18 @@ TEST_F(WebVttFragmenterTest, NoOverlapContiguous) {
 // Verify that if there is a gap, then a sample is created for the gap.
 TEST_F(WebVttFragmenterTest, Gap) {
-  std::shared_ptr<MediaSample> sample1 =
+  Cue cue1;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue1.payload = kCueMessage1;
-                            arraysize(kCueMessage1) - 1, true);
+  cue1.start_time = 0;
-  sample1->set_pts(0);
+  cue1.duration = 1000;
-  sample1->set_dts(0);
+  webvtt_sample_converter_.PushCue(cue1);
  sample1->set_duration(1000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
  cue2.payload = kCueMessage2;
  cue2.start_time = 2000;
  cue2.duration = 1000;
  webvtt_sample_converter_.PushCue(cue2);
  std::shared_ptr<MediaSample> sample2 =
      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
                            arraysize(kCueMessage2) - 1, true);
  sample2->set_pts(2000);
  sample2->set_dts(2000);
  sample2->set_duration(1000);
  webvtt_sample_converter_.PushSample(sample2);
  EXPECT_EQ(2u, webvtt_sample_converter_.ReadySamplesSize());
  webvtt_sample_converter_.Flush();
@ -165,30 +155,23 @@ TEST_F(WebVttFragmenterTest, Gap) {
 // The previous cue always ends before the current cue ends.
 // Cues are overlapping, no samples should be created in PushCue().
 TEST_F(WebVttFragmenterTest, OverlappingCuesSequential) {
-  std::shared_ptr<MediaSample> sample1 =
+  Cue cue1;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue1.payload = kCueMessage1;
-                            arraysize(kCueMessage1) - 1, true);
+  cue1.start_time = 0;
-  sample1->set_pts(0);
+  cue1.duration = 2000;
-  sample1->set_dts(0);
+  webvtt_sample_converter_.PushCue(cue1);
  sample1->set_duration(2000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
  cue2.payload = kCueMessage2;
  cue2.start_time = 1000;
  cue2.duration = 2000;
  webvtt_sample_converter_.PushCue(cue2);
-  std::shared_ptr<MediaSample> sample2 =
+  Cue cue3;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
+  cue3.payload = kCueMessage3;
-                            arraysize(kCueMessage2) - 1, true);
+  cue3.start_time = 1500;
-  sample2->set_pts(1000);
+  cue3.duration = 4000;
-  sample2->set_dts(1000);
+  webvtt_sample_converter_.PushCue(cue3);
  sample2->set_duration(2000);
  webvtt_sample_converter_.PushSample(sample2);
  std::shared_ptr<MediaSample> sample3 =
      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage3),
                            arraysize(kCueMessage3) - 1, true);
  sample3->set_pts(1500);
  sample3->set_dts(1500);
  sample3->set_duration(4000);
  webvtt_sample_converter_.PushSample(sample3);
  webvtt_sample_converter_.Flush();
  // There should be 5 samples for [0,1000], [1000,1500], [1500,2000],
@ -232,38 +215,29 @@ TEST_F(WebVttFragmenterTest, OverlappingCuesSequential) {
 }
 TEST_F(WebVttFragmenterTest, OverlappingLongCue) {
-  std::shared_ptr<MediaSample> sample1 =
+  Cue cue1;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue1.payload = kCueMessage1;
-                            arraysize(kCueMessage1) - 1, true);
+  cue1.start_time = 0;
-  sample1->set_pts(0);
+  cue1.duration = 10000;
-  sample1->set_dts(0);
+  webvtt_sample_converter_.PushCue(cue1);
  sample1->set_duration(10000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
  cue2.payload = kCueMessage2;
  cue2.start_time = 1000;
  cue2.duration = 5000;
  webvtt_sample_converter_.PushCue(cue2);
-  std::shared_ptr<MediaSample> sample2 =
+  Cue cue3;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
+  cue3.payload = kCueMessage3;
-                            arraysize(kCueMessage2) - 1, true);
+  cue3.start_time = 2000;
-  sample2->set_pts(1000);
+  cue3.duration = 1000;
-  sample2->set_dts(1000);
+  webvtt_sample_converter_.PushCue(cue3);
  sample2->set_duration(5000);
  webvtt_sample_converter_.PushSample(sample2);
-  std::shared_ptr<MediaSample> sample3 =
+  Cue cue4;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage3),
+  cue4.payload = kCueMessage4;
-                            arraysize(kCueMessage3) - 1, true);
+  cue4.start_time = 8000;
-  sample3->set_pts(2000);
+  cue4.duration = 1000;
-  sample3->set_dts(2000);
+  webvtt_sample_converter_.PushCue(cue4);
  sample3->set_duration(1000);
  webvtt_sample_converter_.PushSample(sample3);
  std::shared_ptr<MediaSample> sample4 =
      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage4),
                            arraysize(kCueMessage4) - 1, true);
  sample4->set_pts(8000);
  sample4->set_dts(8000);
  sample4->set_duration(1000);
  webvtt_sample_converter_.PushSample(sample4);
  webvtt_sample_converter_.Flush();
  // There should be 7 samples for [0,1000], [1000,2000], [2000,3000],
@ -320,13 +294,11 @@ TEST_F(WebVttFragmenterTest, OverlappingLongCue) {
 }
 TEST_F(WebVttFragmenterTest, GapAtBeginning) {
-  std::shared_ptr<MediaSample> sample1 =
+  Cue cue;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue.payload = kCueMessage1;
-                            arraysize(kCueMessage1) - 1, true);
+  cue.start_time = 1200;
-  sample1->set_pts(1200);
+  cue.duration = 2000;
-  sample1->set_dts(1200);
+  webvtt_sample_converter_.PushCue(cue);
  sample1->set_duration(2000);
  webvtt_sample_converter_.PushSample(sample1);
  webvtt_sample_converter_.Flush();
  EXPECT_EQ(1u, webvtt_sample_converter_.ReadySamplesSize());
@ -340,24 +312,18 @@ TEST_F(WebVttFragmenterTest, GapAtBeginning) {
 }
 TEST_F(WebVttFragmenterTest, SameStartTime) {
-  // TODO(rkuroiwa): This should be std::shared_ptr if this is applied on HEAD.
+  Cue cue1;
-  std::shared_ptr<MediaSample> sample1 =
+  cue1.payload = kCueMessage1;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue1.start_time = 0;
-                            arraysize(kCueMessage1) - 1, true);
+  cue1.duration = 2000;
-  sample1->set_pts(0);
+  webvtt_sample_converter_.PushCue(cue1);
  sample1->set_dts(0);
  sample1->set_duration(2000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
  cue2.payload = kCueMessage2;
  cue2.start_time = 0;
  cue2.duration = 1500;
  webvtt_sample_converter_.PushCue(cue2);
  std::shared_ptr<MediaSample> sample2 =
      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
                            arraysize(kCueMessage2) - 1, true);
  sample2->set_pts(0);
  sample2->set_dts(0);
  sample2->set_duration(1500);
  webvtt_sample_converter_.PushSample(sample2);
  webvtt_sample_converter_.Flush();
  EXPECT_EQ(2u, webvtt_sample_converter_.ReadySamplesSize());
@ -380,39 +346,29 @@ TEST_F(WebVttFragmenterTest, SameStartTime) {
 // This test is a combination of the test cases above.
 TEST_F(WebVttFragmenterTest, MoreCases) {
-  std::shared_ptr<MediaSample> sample1 =
+  Cue cue1;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage1),
+  cue1.payload = kCueMessage1;
-                            arraysize(kCueMessage1) - 1, true);
+  cue1.start_time = 0;
-  sample1->set_pts(0);
+  cue1.duration = 2000;
-  sample1->set_dts(0);
+  webvtt_sample_converter_.PushCue(cue1);
  sample1->set_duration(2000);
-  webvtt_sample_converter_.PushSample(sample1);
+  Cue cue2;
  cue2.payload = kCueMessage2;
  cue2.start_time = 100;
  cue2.duration = 100;
  webvtt_sample_converter_.PushCue(cue2);
-  std::shared_ptr<MediaSample> sample2 =
+  Cue cue3;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage2),
+  cue3.payload = kCueMessage3;
-                            arraysize(kCueMessage2) - 1, true);
+  cue3.start_time = 1500;
-  sample2->set_pts(100);
+  cue3.duration = 1000;
-  sample2->set_dts(100);
+  webvtt_sample_converter_.PushCue(cue3);
  sample2->set_duration(100);
-  webvtt_sample_converter_.PushSample(sample2);
+  Cue cue4;
-
+  cue4.payload = kCueMessage4;
-  std::shared_ptr<MediaSample> sample3 =
+  cue4.start_time = 1500;
-      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage3),
+  cue4.duration = 800;
-                            arraysize(kCueMessage3) - 1, true);
+  webvtt_sample_converter_.PushCue(cue4);
  sample3->set_pts(1500);
  sample3->set_dts(1500);
  sample3->set_duration(1000);
  webvtt_sample_converter_.PushSample(sample3);
  std::shared_ptr<MediaSample> sample4 =
      MediaSample::CopyFrom(reinterpret_cast<const uint8_t*>(kCueMessage4),
                            arraysize(kCueMessage4) - 1, true);
  sample4->set_pts(1500);
  sample4->set_dts(1500);
  sample4->set_duration(800);
  webvtt_sample_converter_.PushSample(sample4);
  webvtt_sample_converter_.Flush();
  EXPECT_EQ(6u, webvtt_sample_converter_.ReadySamplesSize());