From 1cdce293431c12f646f6cb185b5b06f6400a2352 Mon Sep 17 00:00:00 2001
From: Rintaro Kuroiwa <rkuroiwa@google.com>
Date: Thu, 19 Nov 2015 15:58:29 -0800
Subject: [PATCH] WebVttMediaParser and TextStreamInfo

- Add WebVttMediaParser, which parses text WebVTT input and creates
  MediaSamples from it.
- Add TextStreamInfo, the StreamInfo for text streams.

Change-Id: Ia7bb7474df7f15e454e887b8c291fdfdc3195e46
---
 packager/media/base/demuxer.cc                |   4 +
 packager/media/base/media_base.gyp            |   4 +-
 packager/media/base/media_sample.cc           |   9 +-
 packager/media/base/media_sample.h            |  21 +-
 packager/media/base/text_stream_info.cc       |  39 ++
 packager/media/base/text_stream_info.h        |  57 +++
 packager/media/formats/webvtt/webvtt.gyp      |  38 ++
 .../formats/webvtt/webvtt_media_parser.cc     | 380 ++++++++++++++++++
 .../formats/webvtt/webvtt_media_parser.h      |  84 ++++
 .../webvtt/webvtt_media_parser_unittest.cc    | 318 +++++++++++++++
 packager/packager.gyp                         |   2 +
 11 files changed, 950 insertions(+), 6 deletions(-)
 create mode 100644 packager/media/base/text_stream_info.cc
 create mode 100644 packager/media/base/text_stream_info.h
 create mode 100644 packager/media/formats/webvtt/webvtt.gyp
 create mode 100644 packager/media/formats/webvtt/webvtt_media_parser.cc
 create mode 100644 packager/media/formats/webvtt/webvtt_media_parser.h
 create mode 100644 packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
diff --git a/packager/media/base/demuxer.cc b/packager/media/base/demuxer.cc
index 00a2ff7dff..0979f497c7 100644
--- a/packager/media/base/demuxer.cc
+++ b/packager/media/base/demuxer.cc
@@ -18,6 +18,7 @@
 #include "packager/media/formats/mp2t/mp2t_media_parser.h"
 #include "packager/media/formats/mp4/mp4_media_parser.h"
 #include "packager/media/formats/webm/webm_media_parser.h"
+#include "packager/media/formats/webvtt/webvtt_media_parser.h"
 #include "packager/media/formats/wvm/wvm_media_parser.h"
 
 namespace {
@@ -87,6 +88,9 @@ Status Demuxer::Initialize() {
     case CONTAINER_WEBM:
       parser_.reset(new WebMMediaParser());
       break;
+    case CONTAINER_WEBVTT:
+      parser_.reset(new WebVttMediaParser());
+      break;
     default:
       NOTIMPLEMENTED();
       return Status(error::UNIMPLEMENTED, "Container not supported.");
diff --git a/packager/media/base/media_base.gyp b/packager/media/base/media_base.gyp
index 44b38ee551..caf5e093e3 100644
--- a/packager/media/base/media_base.gyp
+++ b/packager/media/base/media_base.gyp
@@ -67,10 +67,12 @@
         'status.h',
         'stream_info.cc',
         'stream_info.h',
+        'text_stream_info.cc',
+        'text_stream_info.h',
         'text_track.h',
-        'timestamp.h',
         'text_track_config.cc',
         'text_track_config.h',
+        'timestamp.h',
         'video_stream_info.cc',
         'video_stream_info.h',
         'widevine_key_source.cc',
diff --git a/packager/media/base/media_sample.cc b/packager/media/base/media_sample.cc
index c16cef8f4c..119115f5ff 100644
--- a/packager/media/base/media_sample.cc
+++ b/packager/media/base/media_sample.cc
@@ -26,8 +26,6 @@ MediaSample::MediaSample(const uint8_t* data,
       is_encrypted_(false) {
   if (!data) {
     CHECK_EQ(size, 0u);
-    CHECK(!side_data);
-    return;
   }
 
   data_.assign(data, data + size);
@@ -65,6 +63,13 @@ scoped_refptr<MediaSample> MediaSample::CopyFrom(const uint8_t* data,
       data, data_size, side_data, side_data_size, is_key_frame));
 }
 
+// static
+scoped_refptr<MediaSample> MediaSample::FromMetadata(const uint8_t* metadata,
+                                                     size_t metadata_size) {
+  return make_scoped_refptr(
+      new MediaSample(nullptr, 0, metadata, metadata_size, false));
+}
+
 // static
 scoped_refptr<MediaSample> MediaSample::CreateEmptyMediaSample() {
   MediaSample* media_sample = new MediaSample();
diff --git a/packager/media/base/media_sample.h b/packager/media/base/media_sample.h
index f3983e9e4d..f0c61cf659 100644
--- a/packager/media/base/media_sample.h
+++ b/packager/media/base/media_sample.h
@@ -38,7 +38,6 @@ class MediaSample : public base::RefCountedThreadSafe<MediaSample> {
   ///        Must not be NULL.
   /// @param size indicates sample size in bytes. Must not be negative.
   /// @param side_data_size indicates additional sample data size in bytes.
-  ///        Must not be negative.
   /// @param is_key_frame indicates whether the sample is a key frame.
   static scoped_refptr<MediaSample> CopyFrom(const uint8_t* data,
                                              size_t size,
@@ -46,6 +45,15 @@ class MediaSample : public base::RefCountedThreadSafe<MediaSample> {
                                              size_t side_data_size,
                                              bool is_key_frame);
 
+  /// Create a MediaSample object from metadata.
+  /// Unlike other factory methods, this cannot be a key frame. It must be only
+  /// for metadata.
+  /// @param metadata points to the buffer containing metadata.
+  ///        Must not be NULL.
+  /// @param metadata_size is the size of metadata in bytes.
+  static scoped_refptr<MediaSample> FromMetadata(const uint8_t* metadata,
+                                                 size_t metadata_size);
+
   /// Create a MediaSample object with default members.
   static scoped_refptr<MediaSample> CreateEmptyMediaSample();
 
@@ -103,12 +111,10 @@ class MediaSample : public base::RefCountedThreadSafe<MediaSample> {
   }
 
   const uint8_t* side_data() const {
-    DCHECK(!end_of_stream());
     return &side_data_[0];
   }
 
   size_t side_data_size() const {
-    DCHECK(!end_of_stream());
     return side_data_.size();
   }
 
@@ -127,6 +133,11 @@ class MediaSample : public base::RefCountedThreadSafe<MediaSample> {
   // If there's no data in this buffer, it represents end of stream.
   bool end_of_stream() const { return data_.size() == 0; }
 
+  const std::string& config_id() const { return config_id_; }
+  void set_config_id(const std::string& config_id) {
+    config_id_ = config_id;
+  }
+
   /// @return a human-readable string describing |*this|.
   std::string ToString() const;
 
@@ -160,6 +171,10 @@ class MediaSample : public base::RefCountedThreadSafe<MediaSample> {
   // Not used by mp4 and other containers.
   std::vector<uint8_t> side_data_;
 
+  // Text specific fields.
+  // For now this is the cue identifier for WebVTT.
+  std::string config_id_;
+
   DISALLOW_COPY_AND_ASSIGN(MediaSample);
 };
 
diff --git a/packager/media/base/text_stream_info.cc b/packager/media/base/text_stream_info.cc
new file mode 100644
index 0000000000..59509996c9
--- /dev/null
+++ b/packager/media/base/text_stream_info.cc
@@ -0,0 +1,39 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#include "packager/media/base/text_stream_info.h"
+
+namespace edash_packager {
+namespace media {
+
+TextStreamInfo::TextStreamInfo(int track_id,
+                               uint32_t time_scale,
+                               uint64_t duration,
+                               const std::string& codec_string,
+                               const std::string& language,
+                               const std::string& extra_data,
+                               uint16_t width,
+                               uint16_t height)
+    : StreamInfo(kStreamText,
+                 track_id,
+                 time_scale,
+                 duration,
+                 codec_string,
+                 language,
+                 reinterpret_cast<const uint8_t*>(extra_data.data()),
+                 extra_data.size(),
+                 false),
+      width_(width),
+      height_(height) {}
+
+TextStreamInfo::~TextStreamInfo() {}
+
+bool TextStreamInfo::IsValidConfig() const {
+  return true;
+}
+
+}  // namespace media
+}  // namespace edash_packager
diff --git a/packager/media/base/text_stream_info.h b/packager/media/base/text_stream_info.h
new file mode 100644
index 0000000000..e968de0798
--- /dev/null
+++ b/packager/media/base/text_stream_info.h
@@ -0,0 +1,57 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#ifndef PACKAGER_MEDIA_BASE_TEXT_STREAM_INFO_H_
+#define PACKAGER_MEDIA_BASE_TEXT_STREAM_INFO_H_
+
+#include "packager/media/base/stream_info.h"
+
+#include <string>
+
+namespace edash_packager {
+namespace media {
+
+class TextStreamInfo : public StreamInfo {
+ public:
+  /// No encryption supported.
+  /// @param track_id is the track ID of this stream.
+  /// @param time_scale is the time scale of this stream.
+  /// @param duration is the duration of this stream.
+  /// @param codec_string is the codec.
+  /// @param language is the language of this stream. This may be empty.
+  /// @param extra_data is extra data for this text stream. This could be the
+  ///        metadata that applies to all the samples of this stream. This may
+  ///        be empty.
+  /// @param width of the text. This may be 0.
+  /// @param height of the text. This may be 0.
+  TextStreamInfo(int track_id,
+                 uint32_t time_scale,
+                 uint64_t duration,
+                 const std::string& codec_string,
+                 const std::string& language,
+                 const std::string& extra_data,
+                 uint16_t width,
+                 uint16_t height);
+
+  bool IsValidConfig() const override;
+
+  uint16_t width() const { return width_; }
+  uint16_t height() const { return height_; }
+
+ protected:
+  ~TextStreamInfo() override;
+
+ private:
+  uint16_t width_;
+  uint16_t height_;
+
+  // Allow copying. This is very light weight.
+};
+
+}  // namespace media
+}  // namespace edash_packager
+
+#endif  // PACKAGER_MEDIA_BASE_TEXT_STREAM_INFO_H_
diff --git a/packager/media/formats/webvtt/webvtt.gyp b/packager/media/formats/webvtt/webvtt.gyp
new file mode 100644
index 0000000000..c2f88bde8a
--- /dev/null
+++ b/packager/media/formats/webvtt/webvtt.gyp
@@ -0,0 +1,38 @@
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+{
+  'includes': [
+    '../../../common.gypi',
+  ],
+  'targets': [
+    {
+      'target_name': 'webvtt',
+      'type': '<(component)',
+      'sources': [
+        'webvtt_media_parser.cc',
+        'webvtt_media_parser.h',
+      ],
+      'dependencies': [
+        '../../../base/base.gyp:base',
+        '../../base/media_base.gyp:base',
+      ],
+    },
+    {
+      'target_name': 'webvtt_unittest',
+      'type': '<(gtest_target_type)',
+      'sources': [
+        'webvtt_media_parser_unittest.cc',
+      ],
+      'dependencies': [
+        '../../../testing/gmock.gyp:gmock',
+        '../../../testing/gtest.gyp:gtest',
+        '../../test/media_test.gyp:media_test_support',
+        'webvtt',
+      ]
+    },
+  ],
+}
diff --git a/packager/media/formats/webvtt/webvtt_media_parser.cc b/packager/media/formats/webvtt/webvtt_media_parser.cc
new file mode 100644
index 0000000000..45f731f0c4
--- /dev/null
+++ b/packager/media/formats/webvtt/webvtt_media_parser.cc
@@ -0,0 +1,380 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#include "packager/media/formats/webvtt/webvtt_media_parser.h"
+
+#include <string>
+#include <vector>
+
+#include "packager/base/logging.h"
+#include "packager/base/strings/string_number_conversions.h"
+#include "packager/base/strings/string_split.h"
+#include "packager/base/strings/string_util.h"
+#include "packager/media/base/media_sample.h"
+#include "packager/media/base/text_stream_info.h"
+
+namespace edash_packager {
+namespace media {
+
+namespace {
+
+// There's only one track in a WebVTT file.
+const int kTrackId = 0;
+
+const char kCR = 0x0D;
+const char kLF = 0x0A;
+
+// Pops the first line (terminated by LF, CR or CRLF) off the front of |data|
+// and stores it in |line| without the line break. Returns false and leaves
+// both arguments unmodified if |data| contains no line break.
+bool ReadLine(std::string* data, std::string* line) {
+  if (data->size() == 0) {
+    return false;
+  }
+  size_t string_position = 0;
+  // Length of the line break mark. 1 for LF and CR, 2 for CRLF.
+  int line_break_length = 1;
+  bool found_line_break = false;
+  while (string_position < data->size()) {
+    if (data->at(string_position) == kLF) {
+      found_line_break = true;
+      break;
+    }
+
+    if (data->at(string_position) == kCR) {
+      found_line_break = true;
+      if (string_position + 1 >= data->size())
+        break;  // CR is the last buffered byte: treated as a lone CR.
+
+      if (data->at(string_position + 1) == kLF)
+        line_break_length = 2;
+      break;
+    }
+
+    ++string_position;
+  }
+
+  if (!found_line_break)
+    return false;
+
+  *line = data->substr(0, string_position);
+  data->erase(0, string_position + line_break_length);
+  return true;
+}
+
+bool TimestampToMilliseconds(const std::string& original_str,
+                             uint64_t* time_ms) {
+  // Parses "[hh+:]mm:ss.ttt" into |time_ms| milliseconds; false if malformed.
+  const size_t kMinimalHoursLength = 2;
+  const size_t kMinutesLength = 2;
+  const size_t kSecondsLength = 2;
+  const size_t kMillisecondsLength = 3;
+  // +2 for a colon and a dot for splitting minutes and seconds AND seconds and
+  // milliseconds, respectively.
+  const size_t kMinimalLength =
+      kMinutesLength + kSecondsLength + kMillisecondsLength + 2;
+
+  base::StringPiece str(original_str);
+  if (str.size() < kMinimalLength)
+    return false;
+  // base::StringToInt() tolerates a leading sign; a timestamp must not.
+  for (size_t i = 0; i < str.size(); ++i) {
+    if ((str[i] < '0' || str[i] > '9') && str[i] != ':' && str[i] != '.')
+      return false;
+  }
+
+  int hours = 0;
+  int minutes = 0;
+  int seconds = 0;
+  int milliseconds = 0;
+  size_t str_index = 0;
+  if (str.size() > kMinimalLength) {
+    // -1 for excluding colon for splitting hours and minutes.
+    const size_t hours_length = str.size() - kMinimalLength - 1;
+    if (hours_length < kMinimalHoursLength)
+      return false;
+    if (!base::StringToInt(str.substr(0, hours_length), &hours))
+      return false;
+    str_index += hours_length;
+    if (str[str_index] != ':')
+      return false;
+    ++str_index;
+  }
+  DCHECK_EQ(str.size() - str_index, kMinimalLength);
+
+  if (!base::StringToInt(str.substr(str_index, kMinutesLength), &minutes))
+    return false;
+  if (minutes < 0 || minutes > 59)
+    return false;
+  str_index += kMinutesLength;
+  if (str[str_index] != ':')
+    return false;
+  ++str_index;
+
+  if (!base::StringToInt(str.substr(str_index, kSecondsLength), &seconds))
+    return false;
+  if (seconds < 0 || seconds > 59)
+    return false;
+  str_index += kSecondsLength;
+  if (str[str_index] != '.')
+    return false;
+  ++str_index;
+
+  if (!base::StringToInt(str.substr(str_index, kMillisecondsLength),
+                         &milliseconds)) {
+    return false;
+  }
+  if (milliseconds < 0 || milliseconds > 999)
+    return false;
+  DCHECK_EQ(str.size(), str_index + kMillisecondsLength);
+  // Accumulate in 64 bits; |hours| * 3600000 can overflow an int.
+  *time_ms = static_cast<uint64_t>(milliseconds) +
+             static_cast<uint64_t>(seconds) * 1000 +
+             static_cast<uint64_t>(minutes) * 60 * 1000 +
+             static_cast<uint64_t>(hours) * 60 * 60 * 1000;
+  return true;
+}
+
+// Parses a cue timing line ("start --> end [settings]"). |start_time| and
+// |duration| are 0 and |settings| is empty if the line fails to parse.
+bool ParseTimingAndSettingsLine(const std::string& line,
+                                uint64_t* start_time,
+                                uint64_t* duration,
+                                std::string* settings) {
+  *start_time = 0;
+  *duration = 0;
+  settings->clear();
+  std::vector<std::string> entries = base::SplitString(
+      line, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
+  if (entries.size() < 3) {
+    LOG(ERROR) << "Not enough tokens to be a timing " << line;
+    return false;
+  }
+  if (entries[1] != "-->") {
+    LOG(ERROR) << "Cannot find an arrow at the right place " << line;
+    return false;
+  }
+
+  uint64_t begin_ms = 0;
+  if (!TimestampToMilliseconds(entries[0], &begin_ms)) {
+    LOG(ERROR) << "Failed to parse " << entries[0] << " in " << line;
+    return false;
+  }
+  uint64_t end_ms = 0;
+  if (!TimestampToMilliseconds(entries[2], &end_ms)) {
+    LOG(ERROR) << "Failed to parse " << entries[2] << " in " << line;
+    return false;
+  }
+  // An end time before the start time would wrap the unsigned duration.
+  if (end_ms < begin_ms) {
+    LOG(ERROR) << "End time is before start time in " << line;
+    return false;
+  }
+  *start_time = begin_ms;
+  *duration = end_ms - begin_ms;
+  entries.erase(entries.begin(), entries.begin() + 3);
+  *settings = base::JoinString(entries, " ");
+  return true;
+}
+
+// Converts |cue| to a MediaSample. A cue with a comment becomes a side-data
+// only sample holding the comment. Otherwise: payload --> data, settings -->
+// side data, identifier --> config_id, start_time --> pts and duration -->
+// duration. Multi-line comments and payloads are joined with "\n".
+scoped_refptr<MediaSample> CueToMediaSample(const Cue& cue) {
+  const bool kKeyFrame = true;
+  if (!cue.comment.empty()) {
+    const std::string comment = base::JoinString(cue.comment, "\n");
+    return MediaSample::FromMetadata(
+        reinterpret_cast<const uint8_t*>(comment.data()), comment.size());
+  }
+
+  const std::string payload = base::JoinString(cue.payload, "\n");
+  scoped_refptr<MediaSample> media_sample = MediaSample::CopyFrom(
+      reinterpret_cast<const uint8_t*>(payload.data()),
+      payload.size(),
+      reinterpret_cast<const uint8_t*>(cue.settings.data()),
+      cue.settings.size(),
+      !kKeyFrame);
+
+  media_sample->set_config_id(cue.identifier);
+  media_sample->set_pts(cue.start_time);
+  media_sample->set_duration(cue.duration);
+  return media_sample;
+}
+
+}  // namespace
+
+Cue::Cue() : start_time(0), duration(0) {}
+Cue::~Cue() {}
+
+WebVttMediaParser::WebVttMediaParser() : state_(kHeader) {}
+WebVttMediaParser::~WebVttMediaParser() {}
+
+void WebVttMediaParser::Init(const InitCB& init_cb,
+                             const NewSampleCB& new_sample_cb,
+                             KeySource* decryption_key_source) {
+  init_cb_ = init_cb;
+  new_sample_cb_ = new_sample_cb;
+}
+
+void WebVttMediaParser::Flush() {
+  // If not in one of these states just be ready for more data.
+  if (state_ != kCuePayload && state_ != kComment)
+    return;
+
+  if (!data_.empty()) {
+    // If it was in the middle of the payload and the stream finished, then this
+    // is an end of the payload. The rest of the data is part of the payload.
+    if (state_ == kCuePayload) {
+      current_cue_.payload.push_back(data_);
+    } else {
+      current_cue_.comment.push_back(data_);
+    }
+    data_.clear();
+  }
+
+  new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_));
+  current_cue_ = Cue();
+  state_ = kCueIdentifierOrTimingOrComment;
+}
+
+bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
+  if (state_ == kParseError) {
+    LOG(WARNING) << "The parser is in an error state, ignoring input.";
+    return false;
+  }
+
+  data_.insert(data_.end(), buf, buf + size);
+
+  std::string line;
+  while (ReadLine(&data_, &line)) {
+    // Only kCueIdentifierOrTimingOrComment and kCueTiming states accept -->.
+    // Error otherwise.
+    const bool has_arrow = line.find("-->") != std::string::npos;
+    if (state_ == kCueTiming) {
+      if (!has_arrow) {
+        LOG(ERROR) << "Expected --> in: " << line;
+        state_ = kParseError;
+        return false;
+      }
+    } else if (state_ != kCueIdentifierOrTimingOrComment) {
+      if (has_arrow) {
+        LOG(ERROR) << "Unexpected --> in " << line;
+        state_ = kParseError;
+        return false;
+      }
+    }
+
+    switch (state_) {
+      case kHeader:
+        // No check. This should be WEBVTT when this object was created.
+        header_.push_back(line);
+        state_ = kMetadata;
+        break;
+      case kMetadata: {
+        if (line.empty()) {
+          std::vector<scoped_refptr<StreamInfo> > streams;
+          // The resolution of timings are in milliseconds.
+          const int kTimescale = 1000;
+
+          // The duration passed here is not very important. Also the whole file
+          // must be read before determining the real duration which doesn't
+          // work nicely with the current demuxer.
+          const int kDuration = 0;
+
+          // There is no one metadata to determine what the language is. Parts
+          // of the text may be annotated as some specific language.
+          const char kLanguage[] = "";
+          streams.push_back(new TextStreamInfo(
+              kTrackId,
+              kTimescale,
+              kDuration,
+              "wvtt",
+              kLanguage,
+              base::JoinString(header_, "\n"),
+              0,         // Not necessary.
+              0));       // Not necessary.
+
+          init_cb_.Run(streams);
+          state_ = kCueIdentifierOrTimingOrComment;
+          break;
+        }
+
+        header_.push_back(line);
+        break;
+      }
+      case kCueIdentifierOrTimingOrComment: {
+        // Note that there can be one or more line breaks before a cue starts;
+        // skip this line.
+        // Or the file could end without a new cue.
+        if (line.empty())
+          break;
+
+        if (!has_arrow) {
+          if (base::StartsWith(line, "NOTE",
+                                      base::CompareCase::INSENSITIVE_ASCII)) {
+            state_ = kComment;
+            current_cue_.comment.push_back(line);
+          } else {
+            // A cue can start from a cue identifier.
+            // https://w3c.github.io/webvtt/#webvtt-cue-identifier
+            current_cue_.identifier = line;
+            // The next line must be a timing.
+            state_ = kCueTiming;
+          }
+          break;
+        }
+
+        // No break statement if the line has an arrow; it should be a WebVTT
+        // timing, so fall thru. Setting state_ to kCueTiming so that the state
+        // always matches the case.
+        state_ = kCueTiming;
+      }
+      case kCueTiming: {
+        DCHECK(has_arrow);
+        if (!ParseTimingAndSettingsLine(line, &current_cue_.start_time,
+                                        &current_cue_.duration,
+                                        &current_cue_.settings)) {
+          state_ = kParseError;
+          return false;
+        }
+        state_ = kCuePayload;
+        break;
+      }
+      case kCuePayload: {
+        if (line.empty()) {
+          state_ = kCueIdentifierOrTimingOrComment;
+          new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_));
+          current_cue_ = Cue();
+          break;
+        }
+
+        current_cue_.payload.push_back(line);
+        break;
+      }
+      case kComment: {
+        if (line.empty()) {
+          state_ = kCueIdentifierOrTimingOrComment;
+          new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_));
+          current_cue_ = Cue();
+          break;
+        }
+
+        current_cue_.comment.push_back(line);
+        break;
+      }
+      case kParseError:
+        NOTREACHED();
+        return false;
+    }
+  }
+
+  return true;
+}
+
+}  // namespace media
+}  // namespace edash_packager
diff --git a/packager/media/formats/webvtt/webvtt_media_parser.h b/packager/media/formats/webvtt/webvtt_media_parser.h
new file mode 100644
index 0000000000..49446ba653
--- /dev/null
+++ b/packager/media/formats/webvtt/webvtt_media_parser.h
@@ -0,0 +1,84 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#ifndef MEDIA_FORMATS_WEBVTT_WEBVTT_MEDIA_PARSER_H_
+#define MEDIA_FORMATS_WEBVTT_WEBVTT_MEDIA_PARSER_H_
+
+#include "packager/media/base/media_parser.h"
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace edash_packager {
+namespace media {
+
+// A parsed WebVTT cue or comment block. If |comment| is not empty, then this
+// is metadata and the other fields must be empty.
+// Fields that can span multiple lines are vectors with one string per line.
+struct Cue {
+  Cue();
+  ~Cue();
+
+  std::string identifier;  // Optional cue identifier line.
+  uint64_t start_time;  // In milliseconds.
+  uint64_t duration;  // In milliseconds (end time - start time).
+  std::string settings;  // Cue settings that follow the timing, if any.
+  std::vector<std::string> payload;
+  std::vector<std::string> comment;
+};
+
+// WebVTT parser.
+// The input may not be encrypted so decryption_key_source is ignored.
+class WebVttMediaParser : public MediaParser {
+ public:
+  WebVttMediaParser();
+  ~WebVttMediaParser() override;
+
+  /// @name MediaParser implementation overrides.
+  /// @{
+  void Init(const InitCB& init_cb,
+            const NewSampleCB& new_sample_cb,
+            KeySource* decryption_key_source) override;
+  void Flush() override;
+  bool Parse(const uint8_t* buf, int size) override;
+  /// @}
+
+ private:
+  enum WebVttReadingState {
+    kHeader,  // Expecting the first line of the file.
+    kMetadata,  // Expecting header lines, terminated by an empty line.
+    kCueIdentifierOrTimingOrComment,  // Expecting the start of a new block.
+    kCueTiming,  // Expecting a timing line.
+    kCuePayload,  // Expecting payload lines, terminated by an empty line.
+    kComment,  // Expecting comment lines, terminated by an empty line.
+    kParseError,  // A parse error occurred; further input is rejected.
+  };
+
+  InitCB init_cb_;
+  NewSampleCB new_sample_cb_;
+
+  // All the unprocessed data passed to this parser.
+  std::string data_;
+
+  // The WEBVTT text + metadata header (global settings) for this webvtt.
+  // One element per line.
+  std::vector<std::string> header_;
+
+  // This is set to what the parser is expecting. For example, if the parser is
+  // expecting a kCueTiming, then the next line that it parses must be a
+  // WebVTT timing line, i.e. a line containing "-->".
+  WebVttReadingState state_;
+
+  Cue current_cue_;  // The cue or comment block currently being assembled.
+
+  DISALLOW_COPY_AND_ASSIGN(WebVttMediaParser);
+};
+
+}  // namespace media
+}  // namespace edash_packager
+
+#endif  // MEDIA_FORMATS_WEBVTT_WEBVTT_MEDIA_PARSER_H_
diff --git a/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc b/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
new file mode 100644
index 0000000000..16b73bd9b5
--- /dev/null
+++ b/packager/media/formats/webvtt/webvtt_media_parser_unittest.cc
@@ -0,0 +1,318 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "packager/base/bind.h"
+#include "packager/media/base/media_sample.h"
+#include "packager/media/base/stream_info.h"
+#include "packager/media/formats/webvtt/webvtt_media_parser.h"
+
+namespace edash_packager {
+namespace media {
+
+typedef testing::MockFunction<void(const std::vector<scoped_refptr<StreamInfo>>&
+                                       stream_info)> MockInitCallback;
+typedef testing::MockFunction<bool(
+    uint32_t track_id,
+    const scoped_refptr<MediaSample>& media_sample)> MockNewSampleCallback;
+
+using testing::_;
+using testing::InSequence;
+using testing::Return;
+
+class WebVttMediaParserTest : public ::testing::Test {
+ public:
+  WebVttMediaParserTest() {}
+  ~WebVttMediaParserTest() override {}
+
+  void InitializeParser() {
+    parser_.Init(
+        base::Bind(&MockInitCallback::Call, base::Unretained(&init_callback_)),
+        base::Bind(&MockNewSampleCallback::Call,
+                   base::Unretained(&new_sample_callback_)),
+        nullptr);
+  }
+
+  MockInitCallback init_callback_;
+  MockNewSampleCallback new_sample_callback_;
+
+  WebVttMediaParser parser_;
+};
+
+TEST_F(WebVttMediaParserTest, Init) {
+  InitializeParser();
+}
+
+TEST_F(WebVttMediaParserTest, ParseOneCue) {
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "00:01:00.000 --> 01:00:00.000\n"
+      "subtitle";
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
+                            arraysize(kWebVtt) - 1));
+
+  parser_.Flush();
+}
+
+// Verify that different types of line breaks work.
+TEST_F(WebVttMediaParserTest, DifferentLineBreaks) {
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+
+  const char kWebVtt[] =
+      "WEBVTT\r\n"
+      "\r\n"
+      "00:01:00.000 --> 01:00:00.000\n"
+      "subtitle\r";
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
+                            arraysize(kWebVtt) - 1));
+
+  parser_.Flush();
+}
+
+TEST_F(WebVttMediaParserTest, ParseMultipleCues) {
+  // Two cues separated by an empty line must produce exactly two samples.
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_, Call(_, _))
+      .Times(2)
+      .WillRepeatedly(Return(true));
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "00:01:00.000 --> 01:00:00.000\n"
+      "subtitle\n"
+      "\n"
+      "02:01:00.000 --> 02:02:00.000\n"
+      "more subtitle";
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
+                            arraysize(kWebVtt) - 1));
+
+  parser_.Flush();
+}
+
+MATCHER_P2(MatchesStartTimeAndDuration, start_time, duration, "") {
+  return arg->pts() == start_time && arg->duration() == duration;
+}
+
+// Verify that the timing parsing is done correctly and gets the right start
+// time and duration in milliseconds.
+TEST_F(WebVttMediaParserTest, VerifyTimingParsing) {
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_,
+              Call(_, MatchesStartTimeAndDuration(61004, 204088)))
+      .WillOnce(Return(true));
+
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "00:01:01.004 --> 00:04:25.092\n"
+      "subtitle";
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
+                            arraysize(kWebVtt) - 1));
+
+  parser_.Flush();
+}
+
+// Expect parse failure if hour part of the timestamp is too short.
+TEST_F(WebVttMediaParserTest, MalformedHourTimestamp) {
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
+
+  const char kHourStringTooShort[] =
+      "WEBVTT\n"
+      "\n"
+      "0:01:01.004 --> 00:04:25.092\n"
+      "subtitle\n";
+  InitializeParser();
+
+  EXPECT_FALSE(
+      parser_.Parse(reinterpret_cast<const uint8_t*>(kHourStringTooShort),
+                    arraysize(kHourStringTooShort) - 1));
+}
+
+// Each component of the timestamp is correct but contains spaces.
+TEST_F(WebVttMediaParserTest, SpacesInTimestamp) {
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
+
+  const char kHourStringTooShort[] =
+      "WEBVTT\n"
+      "\n"
+      "0:01: 1.004 --> 0 :04:25.092\n"
+      "subtitle\n";
+  InitializeParser();
+
+  EXPECT_FALSE(
+      parser_.Parse(reinterpret_cast<const uint8_t*>(kHourStringTooShort),
+                    arraysize(kHourStringTooShort) - 1));
+}
+
+MATCHER_P(MatchesPayload, data, "") {
+  std::vector<uint8_t> arg_data(arg->data(), arg->data() + arg->data_size());
+  return arg_data == data;
+}
+
+TEST_F(WebVttMediaParserTest, VerifyCuePayload) {
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "00:01:01.004 --> 00:01:22.088\n"
+      "subtitle\n"
+      "\n"
+      "02:06:00.000 --> 02:30:02.006\n"
+      "hello";
+
+  const char kFirstPayload[] = "subtitle";
+  const char kSecondPayload[] = "hello";
+  const std::vector<uint8_t> first_payload(
+      kFirstPayload, kFirstPayload + arraysize(kFirstPayload) - 1);
+  const std::vector<uint8_t> second_payload(
+      kSecondPayload, kSecondPayload + arraysize(kSecondPayload) - 1);
+
+  // The cue payloads must be delivered in file order, after initialization.
+  InSequence s;
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_, Call(_, MatchesPayload(first_payload)))
+      .WillOnce(Return(true));
+  EXPECT_CALL(new_sample_callback_, Call(_, MatchesPayload(second_payload)))
+      .WillOnce(Return(true));
+
+  InitializeParser();
+  const uint8_t* const input = reinterpret_cast<const uint8_t*>(kWebVtt);
+  EXPECT_TRUE(parser_.Parse(input, arraysize(kWebVtt) - 1));
+
+  parser_.Flush();
+}
+
+// Verify that a sample spanning several Parse() calls is still produced, i.e.
+// a single Parse() call does not need to contain a complete sample.
+TEST_F(WebVttMediaParserTest, PartialParse) {
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "00:01:01.004 --> 00:04:25.092\n"
+      "subtitle";
+  const uint8_t* const input = reinterpret_cast<const uint8_t*>(kWebVtt);
+  // "WEBVTT\n\n" is 8 bytes, so the split falls right before the first cue.
+  const int kSplitOffset = 8;
+
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).WillOnce(Return(true));
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(input, kSplitOffset));
+  EXPECT_TRUE(parser_.Parse(input + kSplitOffset,
+                            arraysize(kWebVtt) - 1 - kSplitOffset));
+  parser_.Flush();
+}
+
+// Verify that a metadata header line containing "-->" is rejected.
+TEST_F(WebVttMediaParserTest, BadMetadataHeader) {
+  const char kBadWebVtt[] =
+      "WEBVTT\n"
+      "00:01:01.004 --> 00:04:25.092\n";
+  const uint8_t* const input = reinterpret_cast<const uint8_t*>(kBadWebVtt);
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
+  EXPECT_CALL(init_callback_, Call(_)).Times(0);
+  InitializeParser();
+  EXPECT_FALSE(parser_.Parse(input, arraysize(kBadWebVtt) - 1));
+  parser_.Flush();
+}
+
+MATCHER_P(MatchesComment, comment, "") {
+  // Compares the sample's side data against the expected comment bytes.
+  const uint8_t* const begin = arg->side_data();
+  return std::vector<uint8_t>(begin, begin + arg->side_data_size()) == comment;
+}
+
+// Verify that a NOTE comment block is parsed and reported with the sample.
+TEST_F(WebVttMediaParserTest, Comment) {
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "NOTE This is a comment\n";
+  const uint8_t* const input = reinterpret_cast<const uint8_t*>(kWebVtt);
+
+  // The expected side data is the comment line without its trailing newline.
+  const char kExpectedComment[] = "NOTE This is a comment";
+  const std::vector<uint8_t> expected_comment(
+      kExpectedComment, kExpectedComment + arraysize(kExpectedComment) - 1);
+
+  EXPECT_CALL(new_sample_callback_, Call(_, MatchesComment(expected_comment)))
+      .WillOnce(Return(true));
+  EXPECT_CALL(init_callback_, Call(_));
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(input, arraysize(kWebVtt) - 1));
+  parser_.Flush();
+}
+
+// Verify that a comment containing "-->" is rejected.
+TEST_F(WebVttMediaParserTest, BadComment) {
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "NOTE BAD Comment -->.\n";
+  const uint8_t* const input = reinterpret_cast<const uint8_t*>(kWebVtt);
+
+  // The init callback is still expected to run, but no sample may be emitted.
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
+  EXPECT_CALL(init_callback_, Call(_));
+  InitializeParser();
+  EXPECT_FALSE(parser_.Parse(input, arraysize(kWebVtt) - 1));
+  parser_.Flush();
+}
+
+MATCHER_P(HeaderMatches, header, "") {
+  // Only the first element of the callback argument is inspected.
+  return arg.at(0)->extra_data() == header;
+}
+
+// Verify that the WEBVTT magic string and the metadata header are reported.
+TEST_F(WebVttMediaParserTest, Header) {
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "Region: id=anything width=40%\n"
+      "\n"
+      "00:01:01.004 --> 00:04:25.092\n"
+      "subtitle";
+  const uint8_t* const input = reinterpret_cast<const uint8_t*>(kWebVtt);
+
+  // Expected extra data: the header lines, without the trailing newline.
+  const char kHeader[] = "WEBVTT\nRegion: id=anything width=40%";
+  const std::vector<uint8_t> expected_header(
+      kHeader, kHeader + arraysize(kHeader) - 1);
+  ON_CALL(new_sample_callback_, Call(_, _)).WillByDefault(Return(true));
+  EXPECT_CALL(init_callback_, Call(HeaderMatches(expected_header)));
+  InitializeParser();
+  EXPECT_TRUE(parser_.Parse(input, arraysize(kWebVtt) - 1));
+  parser_.Flush();
+}
+
+// Verify that if timing is not present after an identifier, the parser errors.
+TEST_F(WebVttMediaParserTest, NoTimingAfterIdentifier) {
+  EXPECT_CALL(init_callback_, Call(_));
+  EXPECT_CALL(new_sample_callback_, Call(_, _)).Times(0);
+
+  const char kWebVtt[] =
+      "WEBVTT\n"
+      "\n"
+      "anyid\n"
+      "00:12.000 00:13.000\n";  // This line doesn't have -->, so error.
+  InitializeParser();
+  EXPECT_FALSE(parser_.Parse(reinterpret_cast<const uint8_t*>(kWebVtt),
+                             arraysize(kWebVtt) - 1));
+  parser_.Flush();
+}
+
+}  // namespace media
+}  // namespace edash_packager
diff --git a/packager/packager.gyp b/packager/packager.gyp
index 6c77b0ee3c..d2b116baf7 100644
--- a/packager/packager.gyp
+++ b/packager/packager.gyp
@@ -41,6 +41,7 @@
         'media/formats/mp4/mp4.gyp:mp4',
         'media/formats/mpeg/mpeg.gyp:mpeg',
         'media/formats/webm/webm.gyp:webm',
+        'media/formats/webvtt/webvtt.gyp:webvtt',
         'media/formats/wvm/wvm.gyp:wvm',
         'mpd/mpd.gyp:mpd_builder',
         'third_party/boringssl/boringssl.gyp:boringssl',
@@ -75,6 +76,7 @@
         'media/formats/mp4/mp4.gyp:mp4',
         'media/formats/mpeg/mpeg.gyp:mpeg',
         'media/formats/webm/webm.gyp:webm',
+        'media/formats/webvtt/webvtt.gyp:webvtt',
         'media/formats/wvm/wvm.gyp:wvm',
         'media/test/media_test.gyp:media_test_support',
         'testing/gtest.gyp:gtest',