DASH Media Packaging SDK
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator
webvtt_media_parser.cc
1 // Copyright 2015 Google Inc. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file or at
5 // https://developers.google.com/open-source/licenses/bsd
6 
7 #include "packager/media/formats/webvtt/webvtt_media_parser.h"
8 
9 #include <string>
10 #include <vector>
11 
12 #include "packager/base/logging.h"
13 #include "packager/base/strings/string_number_conversions.h"
14 #include "packager/base/strings/string_split.h"
15 #include "packager/base/strings/string_util.h"
16 #include "packager/media/base/macros.h"
17 #include "packager/media/base/media_sample.h"
18 #include "packager/media/base/text_stream_info.h"
19 
20 namespace shaka {
21 namespace media {
22 
23 namespace {
24 
// A WebVTT file carries exactly one text track, so a single fixed id is used
// for every sample and for the stream info.
constexpr int kTrackId = 0;
27 
// Line terminator characters recognized by the parser: carriage return and
// line feed.
const char kCR = 0x0D;
const char kLF = 0x0A;

// Reads the first line from |data| and removes the line (including its line
// break) from |data|. A line ends at LF, CR or CRLF. Returns false, leaving
// both |data| and |line| untouched, if |data| contains no line break. On
// success |line| holds the content of the first line without the line break.
bool ReadLine(std::string* data, std::string* line) {
  const char kLineBreakChars[] = {kCR, kLF, '\0'};
  const size_t break_position = data->find_first_of(kLineBreakChars);
  if (break_position == std::string::npos)
    return false;

  // Length of the line break mark. 1 for LF and CR, 2 for CRLF. A CR that is
  // the very last buffered character is treated as a complete line break.
  size_t line_break_length = 1;
  if ((*data)[break_position] == kCR && break_position + 1 < data->size() &&
      (*data)[break_position + 1] == kLF) {
    line_break_length = 2;
  }

  *line = data->substr(0, break_position);
  data->erase(0, break_position + line_break_length);
  return true;
}
68 
// Parses exactly |length| characters of |str| starting at |offset| as an
// unsigned decimal number. Only the ASCII digits 0-9 are accepted (no sign, no
// whitespace). Returns false if the range is empty, falls outside |str|,
// contains a non-digit, or the value does not fit in uint64_t. |value| is only
// written on success.
bool ParseDigits(const std::string& str,
                 size_t offset,
                 size_t length,
                 uint64_t* value) {
  if (length == 0 || offset > str.size() || length > str.size() - offset)
    return false;

  const uint64_t kMaxValue = ~static_cast<uint64_t>(0);
  uint64_t parsed = 0;
  for (size_t i = offset; i < offset + length; ++i) {
    const char c = str[i];
    if (c < '0' || c > '9')
      return false;
    const uint64_t digit = static_cast<uint64_t>(c - '0');
    // Reject instead of wrapping around.
    if (parsed > (kMaxValue - digit) / 10)
      return false;
    parsed = parsed * 10 + digit;
  }
  *value = parsed;
  return true;
}

// Converts a WebVTT timestamp of the form "mm:ss.ttt" or "hh:mm:ss.ttt" (hours
// have two or more digits) into milliseconds. Minutes and seconds must be in
// the range 0-59. Returns false on any format error; |time_ms| is only written
// on success.
bool TimestampToMilliseconds(const std::string& original_str,
                             uint64_t* time_ms) {
  const size_t kMinimalHoursLength = 2;
  const size_t kMinutesLength = 2;
  const size_t kSecondsLength = 2;
  const size_t kMillisecondsLength = 3;

  // +2 for a colon and a dot for splitting minutes and seconds AND seconds and
  // milliseconds, respectively.
  const size_t kMinimalLength =
      kMinutesLength + kSecondsLength + kMillisecondsLength + 2;

  const uint64_t kMsPerSecond = 1000;
  const uint64_t kMsPerMinute = 60 * kMsPerSecond;
  const uint64_t kMsPerHour = 60 * kMsPerMinute;

  const std::string& str = original_str;
  if (str.size() < kMinimalLength)
    return false;

  uint64_t hours = 0;
  size_t str_index = 0;
  if (str.size() > kMinimalLength) {
    // Everything in front of the fixed-size "mm:ss.ttt" tail is the hours
    // field plus its separating colon.
    // -1 for excluding colon for splitting hours and minutes.
    const size_t hours_length = str.size() - kMinimalLength - 1;
    if (hours_length < kMinimalHoursLength)
      return false;
    if (!ParseDigits(str, 0, hours_length, &hours))
      return false;
    str_index = hours_length;

    if (str[str_index] != ':')
      return false;
    ++str_index;
  }

  // From here on exactly kMinimalLength characters remain, so the fixed
  // offsets below are always within bounds.
  uint64_t minutes = 0;
  if (!ParseDigits(str, str_index, kMinutesLength, &minutes) || minutes > 59)
    return false;
  str_index += kMinutesLength;
  if (str[str_index] != ':')
    return false;
  ++str_index;

  uint64_t seconds = 0;
  if (!ParseDigits(str, str_index, kSecondsLength, &seconds) || seconds > 59)
    return false;
  str_index += kSecondsLength;
  if (str[str_index] != '.')
    return false;
  ++str_index;

  // Three decimal digits can never exceed 999, so no range check is needed.
  uint64_t milliseconds = 0;
  if (!ParseDigits(str, str_index, kMillisecondsLength, &milliseconds))
    return false;

  const uint64_t below_hours_ms =
      milliseconds + seconds * kMsPerSecond + minutes * kMsPerMinute;

  // Guard the only multiplication that can overflow 64 bits.
  const uint64_t kMaxValue = ~static_cast<uint64_t>(0);
  if (hours > (kMaxValue - below_hours_ms) / kMsPerHour)
    return false;

  *time_ms = below_hours_ms + hours * kMsPerHour;
  return true;
}
144 
145 // Clears |settings| and 0s |start_time| and |duration| regardless of the
146 // parsing result.
147 bool ParseTimingAndSettingsLine(const std::string& line,
148  uint64_t* start_time,
149  uint64_t* duration,
150  std::string* settings) {
151  *start_time = 0;
152  *duration = 0;
153  settings->clear();
154  std::vector<std::string> entries = base::SplitString(
155  line, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
156  if (entries.size() < 3) {
157  // The timing is time1 --> time3 so if there aren't 3 entries, this is parse
158  // error.
159  LOG(ERROR) << "Not enough tokens to be a timing " << line;
160  return false;
161  }
162 
163  if (entries[1] != "-->") {
164  LOG(ERROR) << "Cannot find an arrow at the right place " << line;
165  return false;
166  }
167 
168  const std::string& start_time_str = entries[0];
169  if (!TimestampToMilliseconds(start_time_str, start_time)) {
170  LOG(ERROR) << "Failed to parse " << start_time_str << " in " << line;
171  return false;
172  }
173 
174  const std::string& end_time_str = entries[2];
175  uint64_t end_time = 0;
176  if (!TimestampToMilliseconds(end_time_str, &end_time)) {
177  LOG(ERROR) << "Failed to parse " << end_time_str << " in " << line;
178  return false;
179  }
180  *duration = end_time - *start_time;
181 
182  entries.erase(entries.begin(), entries.begin() + 3);
183  *settings = base::JoinString(entries, " ");
184  return true;
185 }
186 
187 } // namespace
188 
189 Cue::Cue() : start_time(0), duration(0) {}
190 Cue::~Cue() {}
191 
192 // Mapping:
193 // comment --> side data (and side data only sample)
194 // settings --> side data
195 // start_time --> pts
196 std::shared_ptr<MediaSample> CueToMediaSample(const Cue& cue) {
197  const bool kKeyFrame = true;
198  if (!cue.comment.empty()) {
199  const std::string comment = base::JoinString(cue.comment, "\n");
201  reinterpret_cast<const uint8_t*>(comment.data()), comment.size());
202  }
203 
204  const std::string payload = base::JoinString(cue.payload, "\n");
205  std::shared_ptr<MediaSample> media_sample = MediaSample::CopyFrom(
206  reinterpret_cast<const uint8_t*>(payload.data()), payload.size(),
207  reinterpret_cast<const uint8_t*>(cue.settings.data()),
208  cue.settings.size(), !kKeyFrame);
209 
210  media_sample->set_config_id(cue.identifier);
211  media_sample->set_pts(cue.start_time);
212  media_sample->set_duration(cue.duration);
213  return media_sample;
214 }
215 
216 // TODO(rkuroiwa): Cue gets converted to MediaSample in WebVttMediaParser and
217 // then back to Cue in the muxer. Consider making MediaSample a protobuf or make
218 // Cue a protobuf and (ab)use MediaSample::data() to store serialized Cue.
219 Cue MediaSampleToCue(const MediaSample& sample) {
220  Cue cue;
221  if (sample.data_size() == 0) {
222  std::string comment(sample.side_data(),
223  sample.side_data() + sample.side_data_size());
224  cue.comment.push_back(comment);
225  return cue;
226  }
227 
228  std::string payload(sample.data(), sample.data() + sample.data_size());
229  cue.payload.push_back(payload);
230  cue.identifier.assign(sample.config_id());
231  cue.start_time = sample.pts();
232  cue.duration = sample.duration();
233  if (sample.side_data_size() != 0) {
234  cue.settings.assign(sample.side_data(),
235  sample.side_data() + sample.side_data_size());
236  }
237  return cue;
238 }
239 
240 WebVttMediaParser::WebVttMediaParser() : state_(kHeader) {}
241 WebVttMediaParser::~WebVttMediaParser() {}
242 
243 void WebVttMediaParser::Init(const InitCB& init_cb,
244  const NewSampleCB& new_sample_cb,
245  KeySource* decryption_key_source) {
246  init_cb_ = init_cb;
247  new_sample_cb_ = new_sample_cb;
248 }
249 
251  // If not in one of these states just be ready for more data.
252  if (state_ != kCuePayload && state_ != kComment)
253  return true;
254 
255  if (!data_.empty()) {
256  // If it was in the middle of the payload and the stream finished, then this
257  // is an end of the payload. The rest of the data is part of the payload.
258  if (state_ == kCuePayload) {
259  current_cue_.payload.push_back(data_);
260  } else {
261  current_cue_.comment.push_back(data_);
262  }
263  data_.clear();
264  }
265 
266  bool result = new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_));
267  current_cue_ = Cue();
268  state_ = kCueIdentifierOrTimingOrComment;
269  return result;
270 }
271 
272 bool WebVttMediaParser::Parse(const uint8_t* buf, int size) {
273  if (state_ == kParseError) {
274  LOG(WARNING) << "The parser is in an error state, ignoring input.";
275  return false;
276  }
277 
278  data_.insert(data_.end(), buf, buf + size);
279 
280  std::string line;
281  while (ReadLine(&data_, &line)) {
282  // Only kCueIdentifierOrTimingOrComment and kCueTiming states accept -->.
283  // Error otherwise.
284  const bool has_arrow = line.find("-->") != std::string::npos;
285  if (state_ == kCueTiming) {
286  if (!has_arrow) {
287  LOG(ERROR) << "Expected --> in: " << line;
288  state_ = kParseError;
289  return false;
290  }
291  } else if (state_ != kCueIdentifierOrTimingOrComment) {
292  if (has_arrow) {
293  LOG(ERROR) << "Unexpected --> in " << line;
294  state_ = kParseError;
295  return false;
296  }
297  }
298 
299  switch (state_) {
300  case kHeader:
301  // No check. This should be WEBVTT when this object was created.
302  header_.push_back(line);
303  state_ = kMetadata;
304  break;
305  case kMetadata: {
306  if (line.empty()) {
307  std::vector<std::shared_ptr<StreamInfo>> streams;
308  // The resolution of timings are in milliseconds.
309  const int kTimescale = 1000;
310 
311  // The duration passed here is not very important. Also the whole file
312  // must be read before determining the real duration which doesn't
313  // work nicely with the current demuxer.
314  const int kDuration = 0;
315 
316  // There is no one metadata to determine what the language is. Parts
317  // of the text may be annotated as some specific language.
318  const char kLanguage[] = "";
319  streams.emplace_back(
320  new TextStreamInfo(kTrackId, kTimescale, kDuration, "wvtt",
321  base::JoinString(header_, "\n"),
322  0, // Not necessary.
323  0,
324  kLanguage)); // Not necessary.
325 
326  init_cb_.Run(streams);
327  state_ = kCueIdentifierOrTimingOrComment;
328  break;
329  }
330 
331  header_.push_back(line);
332  break;
333  }
334  case kCueIdentifierOrTimingOrComment: {
335  // Note that there can be one or more line breaks before a cue starts;
336  // skip this line.
337  // Or the file could end without a new cue.
338  if (line.empty())
339  break;
340 
341  if (!has_arrow) {
342  if (base::StartsWith(line, "NOTE",
343  base::CompareCase::INSENSITIVE_ASCII)) {
344  state_ = kComment;
345  current_cue_.comment.push_back(line);
346  } else {
347  // A cue can start from a cue identifier.
348  // https://w3c.github.io/webvtt/#webvtt-cue-identifier
349  current_cue_.identifier = line;
350  // The next line must be a timing.
351  state_ = kCueTiming;
352  }
353  break;
354  }
355 
356  // No break statement if the line has an arrow; it should be a WebVTT
357  // timing, so fall thru. Setting state_ to kCueTiming so that the state
358  // always matches the case.
359  state_ = kCueTiming;
360  FALLTHROUGH_INTENDED;
361  }
362  case kCueTiming: {
363  DCHECK(has_arrow);
364  if (!ParseTimingAndSettingsLine(line, &current_cue_.start_time,
365  &current_cue_.duration,
366  &current_cue_.settings)) {
367  state_ = kParseError;
368  return false;
369  }
370  state_ = kCuePayload;
371  break;
372  }
373  case kCuePayload: {
374  if (line.empty()) {
375  state_ = kCueIdentifierOrTimingOrComment;
376  if (!new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_))) {
377  state_ = kParseError;
378  return false;
379  }
380  current_cue_ = Cue();
381  break;
382  }
383 
384  current_cue_.payload.push_back(line);
385  break;
386  }
387  case kComment: {
388  if (line.empty()) {
389  state_ = kCueIdentifierOrTimingOrComment;
390  if (!new_sample_cb_.Run(kTrackId, CueToMediaSample(current_cue_))) {
391  state_ = kParseError;
392  return false;
393  }
394  current_cue_ = Cue();
395  break;
396  }
397 
398  current_cue_.comment.push_back(line);
399  break;
400  }
401  case kParseError:
402  NOTREACHED();
403  return false;
404  }
405  }
406 
407  return true;
408 }
409 
410 } // namespace media
411 } // namespace shaka
void Init(const InitCB &init_cb, const NewSampleCB &new_sample_cb, KeySource *decryption_key_source) override
base::Callback< bool(uint32_t track_id, const std::shared_ptr< MediaSample > &media_sample)> NewSampleCB
Definition: media_parser.h:43
bool Parse(const uint8_t *buf, int size) override WARN_UNUSED_RESULT
bool Flush() override WARN_UNUSED_RESULT
static std::shared_ptr< MediaSample > FromMetadata(const uint8_t *metadata, size_t metadata_size)
Definition: media_sample.cc:67
static std::shared_ptr< MediaSample > CopyFrom(const uint8_t *data, size_t size, bool is_key_frame)
Definition: media_sample.cc:45
KeySource is responsible for encryption key acquisition.
Definition: key_source.h:30