Shaka Packager SDK
webvtt_parser.cc
1 // Copyright 2017 Google Inc. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file or at
5 // https://developers.google.com/open-source/licenses/bsd
6 
7 #include "packager/media/formats/webvtt/webvtt_parser.h"
8 
9 #include <regex>
10 
11 #include "packager/base/logging.h"
12 #include "packager/base/strings/string_number_conversions.h"
13 #include "packager/base/strings/string_split.h"
14 #include "packager/base/strings/string_util.h"
15 #include "packager/media/base/text_stream_info.h"
16 #include "packager/media/formats/webvtt/webvtt_utils.h"
17 
18 namespace shaka {
19 namespace media {
20 namespace {
21 
// Index passed as the first argument when dispatching text samples.
constexpr uint64_t kStreamIndex = 0;
23 
// Renders the |size| lines starting at |block| as one delimited string, one
// line of the block per output line. Used when logging blocks.
std::string BlockToString(const std::string* block, size_t size) {
  std::string text(" --- BLOCK START ---\n");

  const std::string* const end = block + size;
  for (const std::string* line = block; line != end; ++line) {
    text += " ";
    text += *line;
    text += "\n";
  }

  text += " --- BLOCK END ---";
  return text;
}
37 
38 // Comments are just blocks that are preceded by a blank line, start with the
39 // word "NOTE" (followed by a space or newline), and end at the first blank
40 // line.
41 // SOURCE: https://www.w3.org/TR/webvtt1
42 bool IsLikelyNote(const std::string& line) {
43  return line == "NOTE" ||
44  base::StartsWith(line, "NOTE ", base::CompareCase::SENSITIVE) ||
45  base::StartsWith(line, "NOTE\t", base::CompareCase::SENSITIVE);
46 }
47 
// The cue timing line is the only place in a WEBVTT file where "-->" is
// allowed to appear, so any line containing it is treated as a likely cue
// timing line.
bool IsLikelyCueTiming(const std::string& line) {
  const std::string::size_type arrow = line.find("-->");
  return arrow != std::string::npos;
}
54 
// A WebVTT cue identifier is any sequence of one or more characters that
// contains neither the substring "-->" nor any LINE FEED (U+000A) or
// CARRIAGE RETURN (U+000D) characters.
// SOURCE: https://www.w3.org/TR/webvtt1/#webvtt-cue-identifier
//
// Only the "-->" rule is checked here: returns true when |line| does not
// contain "-->".
bool MaybeCueId(const std::string& line) {
  const bool has_arrow = line.find("-->") != std::string::npos;
  return !has_arrow;
}
63 
64 // Check to see if the block is likely a style block. Style blocks are
65 // identified as any block that starts with a line that only contains
66 // "STYLE".
67 // SOURCE: https://w3c.github.io/webvtt/#styling
68 bool IsLikelyStyle(const std::string& line) {
69  return base::TrimWhitespaceASCII(line, base::TRIM_TRAILING) == "STYLE";
70 }
71 
72 // Check to see if the block is likely a region block. Region blocks are
73 // identified as any block that starts with a line that only contains
74 // "REGION".
75 // SOURCE: https://w3c.github.io/webvtt/#webvtt-region
76 bool IsLikelyRegion(const std::string& line) {
77  return base::TrimWhitespaceASCII(line, base::TRIM_TRAILING) == "REGION";
78 }
79 
80 bool ParsePercent(const std::string& str, float* value) {
81  // https://www.w3.org/TR/webvtt1/#webvtt-percentage
82  // E.g. "4%" or "1.5%"
83  std::regex re(R"((\d+(?:\.\d+)?)%)");
84  std::smatch match;
85  if (!std::regex_match(str, match, re)) {
86  return false;
87  }
88 
89  double temp;
90  base::StringToDouble(match[1], &temp);
91  if (temp >= 100) {
92  return false;
93  }
94  *value = temp;
95  return true;
96 }
97 
98 bool ParseDoublePercent(const std::string& str, float* a, float* b) {
99  std::regex re(R"((\d+(?:\.\d+)?)%,(\d+(?:\.\d+)?)%)");
100  std::smatch match;
101  if (!std::regex_match(str, match, re)) {
102  return false;
103  }
104 
105  double tempA, tempB;
106  base::StringToDouble(match[1], &tempA);
107  base::StringToDouble(match[2], &tempB);
108  if (tempA >= 100 || tempB >= 100) {
109  return false;
110  }
111  *a = tempA;
112  *b = tempB;
113  return true;
114 }
115 
116 void ParseSettings(const std::string& id,
117  const std::string& value,
118  TextSettings* settings) {
119  // https://www.w3.org/TR/webvtt1/#ref-for-parse-the-webvtt-cue-settings-1
120  if (id == "region") {
121  settings->region = value;
122  } else if (id == "vertical") {
123  if (value == "rl") {
124  settings->writing_direction = WritingDirection::kVerticalGrowingLeft;
125  } else if (value == "lr") {
126  settings->writing_direction = WritingDirection::kVerticalGrowingRight;
127  } else {
128  LOG(WARNING) << "Invalid WebVTT vertical setting: " << value;
129  }
130  } else if (id == "line") {
131  const auto pos = value.find(',');
132  const std::string line = value.substr(0, pos);
133  const std::string align =
134  pos != std::string::npos ? value.substr(pos + 1) : "";
135  if (pos != std::string::npos) {
136  LOG(WARNING) << "WebVTT line alignment isn't supported";
137  }
138 
139  if (!line.empty() && line[line.size() - 1] == '%') {
140  float temp;
141  if (!ParsePercent(line, &temp)) {
142  LOG(WARNING) << "Invalid WebVTT line: " << value;
143  return;
144  }
145  settings->line.emplace(temp, TextUnitType::kPercent);
146  } else {
147  double temp;
148  if (!base::StringToDouble(line, &temp)) {
149  LOG(WARNING) << "Invalid WebVTT line: " << value;
150  return;
151  }
152  settings->line.emplace(temp, TextUnitType::kLines);
153  }
154  } else if (id == "position") {
155  const auto pos = value.find(',');
156  const std::string position = value.substr(0, pos);
157  const std::string align =
158  pos != std::string::npos ? value.substr(pos + 1) : "";
159  if (pos != std::string::npos) {
160  LOG(WARNING) << "WebVTT position alignment isn't supported";
161  }
162 
163  float temp;
164  if (ParsePercent(position, &temp)) {
165  settings->position.emplace(temp, TextUnitType::kPercent);
166  } else {
167  LOG(WARNING) << "Invalid WebVTT position: " << value;
168  }
169  } else if (id == "size") {
170  float temp;
171  if (ParsePercent(value, &temp)) {
172  settings->width.emplace(temp, TextUnitType::kPercent);
173  } else {
174  LOG(WARNING) << "Invalid WebVTT size: " << value;
175  }
176  } else if (id == "align") {
177  if (value == "start") {
178  settings->text_alignment = TextAlignment::kStart;
179  } else if (value == "center" || value == "middle") {
180  settings->text_alignment = TextAlignment::kCenter;
181  } else if (value == "end") {
182  settings->text_alignment = TextAlignment::kEnd;
183  } else if (value == "left") {
184  settings->text_alignment = TextAlignment::kLeft;
185  } else if (value == "right") {
186  settings->text_alignment = TextAlignment::kRight;
187  } else {
188  LOG(WARNING) << "Invalid WebVTT align: " << value;
189  }
190  } else {
191  LOG(WARNING) << "Unknown WebVTT setting: " << id;
192  }
193 }
194 
195 } // namespace
196 
197 WebVttParser::WebVttParser() {}
198 
199 void WebVttParser::Init(const InitCB& init_cb,
200  const NewMediaSampleCB& new_media_sample_cb,
201  const NewTextSampleCB& new_text_sample_cb,
202  KeySource* decryption_key_source) {
203  DCHECK(init_cb_.is_null());
204  DCHECK(!init_cb.is_null());
205  DCHECK(!new_text_sample_cb.is_null());
206  DCHECK(!decryption_key_source) << "Encrypted WebVTT not supported";
207 
208  init_cb_ = init_cb;
209  new_text_sample_cb_ = new_text_sample_cb;
210 }
211 
213  reader_.Flush();
214  return Parse();
215 }
216 
217 bool WebVttParser::Parse(const uint8_t* buf, int size) {
218  reader_.PushData(buf, size);
219  return Parse();
220 }
221 
222 bool WebVttParser::Parse() {
223  if (!initialized_) {
224  std::vector<std::string> block;
225  if (!reader_.Next(&block)) {
226  return true;
227  }
228 
229  // Check the header. It is possible for a 0xFEFF BOM to come before the
230  // header text.
231  if (block.size() != 1) {
232  LOG(ERROR) << "Failed to read WEBVTT header - "
233  << "block size should be 1 but was " << block.size() << ".";
234  return false;
235  }
236  if (block[0] != "WEBVTT" && block[0] != "\xEF\xBB\xBFWEBVTT") {
237  LOG(ERROR) << "Failed to read WEBVTT header - should be WEBVTT but was "
238  << block[0];
239  return false;
240  }
241  initialized_ = true;
242  }
243 
244  std::vector<std::string> block;
245  while (reader_.Next(&block)) {
246  if (!ParseBlock(block))
247  return false;
248  }
249  return true;
250 }
251 
252 bool WebVttParser::ParseBlock(const std::vector<std::string>& block) {
253  // NOTE
254  if (IsLikelyNote(block[0])) {
255  // We can safely ignore the whole block.
256  return true;
257  }
258 
259  // STYLE
260  if (IsLikelyStyle(block[0])) {
261  if (saw_cue_) {
262  LOG(WARNING)
263  << "Found style block after seeing cue. Ignoring style block";
264  } else {
265  for (size_t i = 1; i < block.size(); i++) {
266  if (!css_styles_.empty())
267  css_styles_ += "\n";
268  css_styles_ += block[i];
269  }
270  }
271  return true;
272  }
273 
274  // REGION
275  if (IsLikelyRegion(block[0])) {
276  if (saw_cue_) {
277  LOG(WARNING)
278  << "Found region block after seeing cue. Ignoring region block";
279  return true;
280  } else {
281  return ParseRegion(block);
282  }
283  }
284 
285  // CUE with ID
286  if (block.size() >= 2 && MaybeCueId(block[0]) &&
287  IsLikelyCueTiming(block[1]) && ParseCueWithId(block)) {
288  saw_cue_ = true;
289  return true;
290  }
291 
292  // CUE with no ID
293  if (IsLikelyCueTiming(block[0]) && ParseCueWithNoId(block)) {
294  saw_cue_ = true;
295  return true;
296  }
297 
298  LOG(ERROR) << "Failed to determine block classification:\n"
299  << BlockToString(block.data(), block.size());
300  return false;
301 }
302 
303 bool WebVttParser::ParseRegion(const std::vector<std::string>& block) {
304  TextRegion region;
305  std::string region_id;
306  // Fill in defaults. Some may already be this, but set them anyway.
307  // See https://www.w3.org/TR/webvtt1/#regions
308  region.width.value = 100;
309  region.width.type = TextUnitType::kPercent;
310  region.height.value = 3;
311  region.height.type = TextUnitType::kLines;
312  region.window_anchor_x.value = 0;
313  region.window_anchor_x.type = TextUnitType::kPercent;
314  region.window_anchor_y.value = 100;
315  region.window_anchor_y.type = TextUnitType::kPercent;
316  region.region_anchor_x.value = 0;
317  region.region_anchor_x.type = TextUnitType::kPercent;
318  region.region_anchor_y.value = 100;
319  region.region_anchor_y.type = TextUnitType::kPercent;
320 
321  bool first = true;
322  for (const auto& line : block) {
323  // First line is "REGION", skip.
324  if (first) {
325  first = false;
326  continue;
327  }
328 
329  base::StringPairs pairs;
330  if (!base::SplitStringIntoKeyValuePairs(line, ':', ' ', &pairs)) {
331  LOG(ERROR) << "Invalid WebVTT settings: " << line;
332  return false;
333  }
334  for (const auto& pair : pairs) {
335  const std::string& value = pair.second;
336  if (pair.first == "id") {
337  if (value.find("-->") != std::string::npos) {
338  LOG(ERROR) << "Invalid WebVTT REGION ID: " << value;
339  return false;
340  }
341  if (regions_.find(value) != regions_.end()) {
342  LOG(ERROR) << "Duplicate WebVTT REGION: " << value;
343  return false;
344  }
345  region_id = value;
346  } else if (pair.first == "width") {
347  if (!ParsePercent(value, &region.width.value)) {
348  LOG(ERROR) << "Invalid WebVTT REGION width: " << value;
349  return false;
350  }
351  } else if (pair.first == "lines") {
352  unsigned int temp;
353  if (!base::StringToUint(value, &temp)) {
354  LOG(ERROR) << "Invalid WebVTT REGION lines: " << value;
355  return false;
356  }
357  region.height.value = temp;
358  } else if (pair.first == "regionanchor") {
359  if (!ParseDoublePercent(value, &region.region_anchor_x.value,
360  &region.region_anchor_y.value)) {
361  LOG(ERROR) << "Invalid WebVTT REGION regionanchor: " << value;
362  return false;
363  }
364  } else if (pair.first == "viewportanchor") {
365  if (!ParseDoublePercent(value, &region.window_anchor_x.value,
366  &region.window_anchor_y.value)) {
367  LOG(ERROR) << "Invalid WebVTT REGION windowanchor: " << value;
368  return false;
369  }
370  } else if (pair.first == "scroll") {
371  if (value != "up") {
372  LOG(ERROR) << "Invalid WebVTT REGION scroll: " << value;
373  return false;
374  }
375  region.scroll = true;
376  } else {
377  LOG(ERROR) << "Unknown WebVTT REGION setting: " << pair.first;
378  return false;
379  }
380  }
381  }
382  if (region_id.empty()) {
383  LOG(ERROR) << "WebVTT REGION id is required";
384  return false;
385  }
386  regions_.insert(std::make_pair(region_id, std::move(region)));
387  return true;
388 }
389 
390 bool WebVttParser::ParseCueWithNoId(const std::vector<std::string>& block) {
391  return ParseCue("", block.data(), block.size());
392 }
393 
394 bool WebVttParser::ParseCueWithId(const std::vector<std::string>& block) {
395  return ParseCue(block[0], block.data() + 1, block.size() - 1);
396 }
397 
398 bool WebVttParser::ParseCue(const std::string& id,
399  const std::string* block,
400  size_t block_size) {
401  const std::vector<std::string> time_and_style = base::SplitString(
402  block[0], " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
403 
404  uint64_t start_time = 0;
405  uint64_t end_time = 0;
406 
407  const bool parsed_time =
408  time_and_style.size() >= 3 && time_and_style[1] == "-->" &&
409  WebVttTimestampToMs(time_and_style[0], &start_time) &&
410  WebVttTimestampToMs(time_and_style[2], &end_time);
411 
412  if (!parsed_time) {
413  LOG(ERROR) << "Could not parse start time, -->, and end time from "
414  << block[0];
415  return false;
416  }
417 
418  if (!stream_info_dispatched_)
419  DispatchTextStreamInfo();
420 
421  // According to the WebVTT spec end time must be greater than the start time
422  // of the cue. Since we are seeing content with invalid times in the field, we
423  // are going to drop the cue instead of failing to package.
424  //
425  // For more context see:
426  // - https://www.w3.org/TR/webvtt1/#webvtt-cue-timings
427  // - https://github.com/google/shaka-packager/issues/335
428  // - https://github.com/google/shaka-packager/issues/425
429  //
430  // Print a warning so that those packaging content can know that their
431  // content is not spec compliant.
432  if (end_time <= start_time) {
433  LOG(WARNING) << "WebVTT input is not spec compliant. Start time ("
434  << start_time << ") should be less than end time (" << end_time
435  << "). Skipping webvtt cue:"
436  << BlockToString(block, block_size);
437  return true;
438  }
439 
440  TextSettings settings;
441  for (size_t i = 3; i < time_and_style.size(); i++) {
442  const auto pos = time_and_style[i].find(':');
443  if (pos == std::string::npos) {
444  continue;
445  }
446 
447  const std::string key = time_and_style[i].substr(0, pos);
448  const std::string value = time_and_style[i].substr(pos + 1);
449  ParseSettings(key, value, &settings);
450  }
451 
452  // The rest of the block is the payload.
453  // TODO: Parse tags to support <b>, <i>, etc.
454  TextFragment body;
455  TextFragmentStyle no_styles;
456  for (size_t i = 1; i < block_size; i++) {
457  if (i > 1) {
458  body.sub_fragments.emplace_back(no_styles, /* newline= */ true);
459  }
460  body.sub_fragments.emplace_back(no_styles, block[i]);
461  }
462 
463  const auto sample =
464  std::make_shared<TextSample>(id, start_time, end_time, settings, body);
465  return new_text_sample_cb_.Run(kStreamIndex, sample);
466 }
467 
468 void WebVttParser::DispatchTextStreamInfo() {
469  stream_info_dispatched_ = true;
470 
471  const int kTrackId = 0;
472  // The resolution of timings are in milliseconds.
473  const int kTimescale = 1000;
474  // The duration passed here is not very important. Also the whole file
475  // must be read before determining the real duration which doesn't
476  // work nicely with the current demuxer.
477  const int kDuration = 0;
478  const char kWebVttCodecString[] = "wvtt";
479  const int64_t kNoWidth = 0;
480  const int64_t kNoHeight = 0;
481  // The language of the stream will be overwritten by the Demuxer later.
482  const char kNoLanguage[] = "";
483 
484  const auto stream = std::make_shared<TextStreamInfo>(
485  kTrackId, kTimescale, kDuration, kCodecWebVtt, kWebVttCodecString, "",
486  kNoWidth, kNoHeight, kNoLanguage);
487  stream->set_css_styles(css_styles_);
488  for (const auto& pair : regions_)
489  stream->AddRegion(pair.first, pair.second);
490 
491  std::vector<std::shared_ptr<StreamInfo>> streams{stream};
492  init_cb_.Run(streams);
493 }
494 
495 } // namespace media
496 } // namespace shaka
void PushData(const uint8_t *data, size_t data_size)
Pushes data onto the end of the buffer.
Definition: text_readers.cc:73
bool Next(std::vector< std::string > *out)
Definition: text_readers.cc:78
KeySource is responsible for encryption key acquisition.
Definition: key_source.h:51
base::Callback< bool(uint32_t track_id, std::shared_ptr< TextSample > text_sample)> NewTextSampleCB
Definition: media_parser.h:53
base::Callback< bool(uint32_t track_id, std::shared_ptr< MediaSample > media_sample)> NewMediaSampleCB
Definition: media_parser.h:44
base::Callback< void(const std::vector< std::shared_ptr< StreamInfo > > &stream_info)> InitCB
Definition: media_parser.h:35
void Init(const InitCB &init_cb, const NewMediaSampleCB &new_media_sample_cb, const NewTextSampleCB &new_text_sample_cb, KeySource *decryption_key_source) override
bool Parse(const uint8_t *buf, int size) override
All the methods that are virtual are virtual for mocking.