Shaka Packager SDK
webvtt_parser.cc
1 // Copyright 2017 Google Inc. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file or at
5 // https://developers.google.com/open-source/licenses/bsd
6 
7 #include "packager/media/formats/webvtt/webvtt_parser.h"
8 
9 #include "packager/base/logging.h"
10 #include "packager/base/strings/string_number_conversions.h"
11 #include "packager/base/strings/string_split.h"
12 #include "packager/base/strings/string_util.h"
13 #include "packager/media/base/text_stream_info.h"
14 #include "packager/media/formats/webvtt/webvtt_utils.h"
15 
16 namespace shaka {
17 namespace media {
18 namespace {
19 
// Index of the single text stream produced by this parser; passed along with
// every text sample handed to the new-text-sample callback.
constexpr uint64_t kStreamIndex = 0;
21 
// Renders the |size| lines starting at |block| as a delimited, multi-line
// string suitable for inclusion in log messages. Each line of the block is
// emitted on its own row between a start marker and an end marker.
std::string BlockToString(const std::string* block, size_t size) {
  std::string dump(" --- BLOCK START ---\n");

  const std::string* const end = block + size;
  for (const std::string* line = block; line != end; ++line) {
    dump += " ";
    dump += *line;
    dump += "\n";
  }

  dump += " --- BLOCK END ---";
  return dump;
}
35 
// A comment block starts with the word "NOTE", which must either be the whole
// line or be followed by a space or a tab. The block then runs until the first
// blank line, so recognizing its first line is enough to classify it.
// SOURCE: https://www.w3.org/TR/webvtt1
bool IsLikelyNote(const std::string& line) {
  static const char kKeyword[] = "NOTE";
  const size_t kKeywordLength = sizeof(kKeyword) - 1;

  // The line has to begin with the (case-sensitive) keyword.
  if (line.compare(0, kKeywordLength, kKeyword) != 0) {
    return false;
  }
  // The keyword on its own is a comment with no text.
  if (line.size() == kKeywordLength) {
    return true;
  }
  // Otherwise the keyword must be separated from the comment text.
  const char separator = line[kKeywordLength];
  return separator == ' ' || separator == '\t';
}
45 
// The cue timing line is the only place in a WEBVTT file where "-->" is
// allowed to appear, so any line containing that arrow is treated as a likely
// cue timing line.
bool IsLikelyCueTiming(const std::string& line) {
  const std::string::size_type arrow_pos = line.find("-->");
  const bool has_arrow = arrow_pos != std::string::npos;
  return has_arrow;
}
52 
// Per the WebVTT specification a cue identifier is a sequence of one or more
// characters that contains neither the substring "-->" nor any line feed or
// carriage return characters. This helper only checks for the absence of
// "-->" in |line|.
// SOURCE: https://www.w3.org/TR/webvtt1/#webvtt-cue-identifier
bool MaybeCueId(const std::string& line) {
  if (line.find("-->") != std::string::npos) {
    return false;
  }
  return true;
}
61 
62 // Check to see if the block is likely a style block. Style blocks are
63 // identified as any block that starts with a line that only contains
64 // "STYLE".
65 // SOURCE: https://w3c.github.io/webvtt/#styling
66 bool IsLikelyStyle(const std::string& line) {
67  return base::TrimWhitespaceASCII(line, base::TRIM_TRAILING) == "STYLE";
68 }
69 
70 // Check to see if the block is likely a region block. Region blocks are
71 // identified as any block that starts with a line that only contains
72 // "REGION".
73 // SOURCE: https://w3c.github.io/webvtt/#webvtt-region
74 bool IsLikelyRegion(const std::string& line) {
75  return base::TrimWhitespaceASCII(line, base::TRIM_TRAILING) == "REGION";
76 }
77 
78 bool ParsePercent(const std::string& str, float* value) {
79  // https://www.w3.org/TR/webvtt1/#webvtt-percentage
80  // E.g. "4%" or "1.5%"
81  if (str[str.size() - 1] != '%') {
82  return false;
83  }
84 
85  double temp;
86  if (!base::StringToDouble(str.substr(0, str.size() - 1), &temp) ||
87  temp >= 100) {
88  return false;
89  }
90  *value = temp;
91  return true;
92 }
93 
94 bool ParseDoublePercent(const std::string& str, float* a, float* b) {
95  auto percents = base::SplitString(str, ",", base::TRIM_WHITESPACE,
96  base::SPLIT_WANT_NONEMPTY);
97  if (percents.size() != 2) {
98  return false;
99  }
100  float temp_a, temp_b;
101  if (!ParsePercent(percents[0], &temp_a) ||
102  !ParsePercent(percents[1], &temp_b)) {
103  return false;
104  }
105  *a = temp_a;
106  *b = temp_b;
107  return true;
108 }
109 
110 void ParseSettings(const std::string& id,
111  const std::string& value,
112  TextSettings* settings) {
113  // https://www.w3.org/TR/webvtt1/#ref-for-parse-the-webvtt-cue-settings-1
114  if (id == "region") {
115  settings->region = value;
116  } else if (id == "vertical") {
117  if (value == "rl") {
118  settings->writing_direction = WritingDirection::kVerticalGrowingLeft;
119  } else if (value == "lr") {
120  settings->writing_direction = WritingDirection::kVerticalGrowingRight;
121  } else {
122  LOG(WARNING) << "Invalid WebVTT vertical setting: " << value;
123  }
124  } else if (id == "line") {
125  const auto pos = value.find(',');
126  const std::string line = value.substr(0, pos);
127  const std::string align =
128  pos != std::string::npos ? value.substr(pos + 1) : "";
129  if (pos != std::string::npos) {
130  LOG(WARNING) << "WebVTT line alignment isn't supported";
131  }
132 
133  if (!line.empty() && line[line.size() - 1] == '%') {
134  float temp;
135  if (!ParsePercent(line, &temp)) {
136  LOG(WARNING) << "Invalid WebVTT line: " << value;
137  return;
138  }
139  settings->line.emplace(temp, TextUnitType::kPercent);
140  } else {
141  double temp;
142  if (!base::StringToDouble(line, &temp)) {
143  LOG(WARNING) << "Invalid WebVTT line: " << value;
144  return;
145  }
146  settings->line.emplace(temp, TextUnitType::kLines);
147  }
148  } else if (id == "position") {
149  const auto pos = value.find(',');
150  const std::string position = value.substr(0, pos);
151  const std::string align =
152  pos != std::string::npos ? value.substr(pos + 1) : "";
153  if (pos != std::string::npos) {
154  LOG(WARNING) << "WebVTT position alignment isn't supported";
155  }
156 
157  float temp;
158  if (ParsePercent(position, &temp)) {
159  settings->position.emplace(temp, TextUnitType::kPercent);
160  } else {
161  LOG(WARNING) << "Invalid WebVTT position: " << value;
162  }
163  } else if (id == "size") {
164  float temp;
165  if (ParsePercent(value, &temp)) {
166  settings->width.emplace(temp, TextUnitType::kPercent);
167  } else {
168  LOG(WARNING) << "Invalid WebVTT size: " << value;
169  }
170  } else if (id == "align") {
171  if (value == "start") {
172  settings->text_alignment = TextAlignment::kStart;
173  } else if (value == "center" || value == "middle") {
174  settings->text_alignment = TextAlignment::kCenter;
175  } else if (value == "end") {
176  settings->text_alignment = TextAlignment::kEnd;
177  } else if (value == "left") {
178  settings->text_alignment = TextAlignment::kLeft;
179  } else if (value == "right") {
180  settings->text_alignment = TextAlignment::kRight;
181  } else {
182  LOG(WARNING) << "Invalid WebVTT align: " << value;
183  }
184  } else {
185  LOG(WARNING) << "Unknown WebVTT setting: " << id;
186  }
187 }
188 
189 } // namespace
190 
191 WebVttParser::WebVttParser() {}
192 
193 void WebVttParser::Init(const InitCB& init_cb,
194  const NewMediaSampleCB& new_media_sample_cb,
195  const NewTextSampleCB& new_text_sample_cb,
196  KeySource* decryption_key_source) {
197  DCHECK(init_cb_.is_null());
198  DCHECK(!init_cb.is_null());
199  DCHECK(!new_text_sample_cb.is_null());
200  DCHECK(!decryption_key_source) << "Encrypted WebVTT not supported";
201 
202  init_cb_ = init_cb;
203  new_text_sample_cb_ = new_text_sample_cb;
204 }
205 
207  reader_.Flush();
208  return Parse();
209 }
210 
211 bool WebVttParser::Parse(const uint8_t* buf, int size) {
212  reader_.PushData(buf, size);
213  return Parse();
214 }
215 
216 bool WebVttParser::Parse() {
217  if (!initialized_) {
218  std::vector<std::string> block;
219  if (!reader_.Next(&block)) {
220  return true;
221  }
222 
223  // Check the header. It is possible for a 0xFEFF BOM to come before the
224  // header text.
225  if (block.size() != 1) {
226  LOG(ERROR) << "Failed to read WEBVTT header - "
227  << "block size should be 1 but was " << block.size() << ".";
228  return false;
229  }
230  if (block[0] != "WEBVTT" && block[0] != "\xEF\xBB\xBFWEBVTT") {
231  LOG(ERROR) << "Failed to read WEBVTT header - should be WEBVTT but was "
232  << block[0];
233  return false;
234  }
235  initialized_ = true;
236  }
237 
238  std::vector<std::string> block;
239  while (reader_.Next(&block)) {
240  if (!ParseBlock(block))
241  return false;
242  }
243  return true;
244 }
245 
246 bool WebVttParser::ParseBlock(const std::vector<std::string>& block) {
247  // NOTE
248  if (IsLikelyNote(block[0])) {
249  // We can safely ignore the whole block.
250  return true;
251  }
252 
253  // STYLE
254  if (IsLikelyStyle(block[0])) {
255  if (saw_cue_) {
256  LOG(WARNING)
257  << "Found style block after seeing cue. Ignoring style block";
258  } else {
259  for (size_t i = 1; i < block.size(); i++) {
260  if (!css_styles_.empty())
261  css_styles_ += "\n";
262  css_styles_ += block[i];
263  }
264  }
265  return true;
266  }
267 
268  // REGION
269  if (IsLikelyRegion(block[0])) {
270  if (saw_cue_) {
271  LOG(WARNING)
272  << "Found region block after seeing cue. Ignoring region block";
273  return true;
274  } else {
275  return ParseRegion(block);
276  }
277  }
278 
279  // CUE with ID
280  if (block.size() >= 2 && MaybeCueId(block[0]) &&
281  IsLikelyCueTiming(block[1]) && ParseCueWithId(block)) {
282  saw_cue_ = true;
283  return true;
284  }
285 
286  // CUE with no ID
287  if (IsLikelyCueTiming(block[0]) && ParseCueWithNoId(block)) {
288  saw_cue_ = true;
289  return true;
290  }
291 
292  LOG(ERROR) << "Failed to determine block classification:\n"
293  << BlockToString(block.data(), block.size());
294  return false;
295 }
296 
297 bool WebVttParser::ParseRegion(const std::vector<std::string>& block) {
298  TextRegion region;
299  std::string region_id;
300  // Fill in defaults. Some may already be this, but set them anyway.
301  // See https://www.w3.org/TR/webvtt1/#regions
302  region.width.value = 100;
303  region.width.type = TextUnitType::kPercent;
304  region.height.value = 3;
305  region.height.type = TextUnitType::kLines;
306  region.window_anchor_x.value = 0;
307  region.window_anchor_x.type = TextUnitType::kPercent;
308  region.window_anchor_y.value = 100;
309  region.window_anchor_y.type = TextUnitType::kPercent;
310  region.region_anchor_x.value = 0;
311  region.region_anchor_x.type = TextUnitType::kPercent;
312  region.region_anchor_y.value = 100;
313  region.region_anchor_y.type = TextUnitType::kPercent;
314 
315  bool first = true;
316  for (const auto& line : block) {
317  // First line is "REGION", skip.
318  if (first) {
319  first = false;
320  continue;
321  }
322 
323  base::StringPairs pairs;
324  if (!base::SplitStringIntoKeyValuePairs(line, ':', ' ', &pairs)) {
325  LOG(ERROR) << "Invalid WebVTT settings: " << line;
326  return false;
327  }
328  for (const auto& pair : pairs) {
329  const std::string& value = pair.second;
330  if (pair.first == "id") {
331  if (value.find("-->") != std::string::npos) {
332  LOG(ERROR) << "Invalid WebVTT REGION ID: " << value;
333  return false;
334  }
335  if (regions_.find(value) != regions_.end()) {
336  LOG(ERROR) << "Duplicate WebVTT REGION: " << value;
337  return false;
338  }
339  region_id = value;
340  } else if (pair.first == "width") {
341  if (!ParsePercent(value, &region.width.value)) {
342  LOG(ERROR) << "Invalid WebVTT REGION width: " << value;
343  return false;
344  }
345  } else if (pair.first == "lines") {
346  unsigned int temp;
347  if (!base::StringToUint(value, &temp)) {
348  LOG(ERROR) << "Invalid WebVTT REGION lines: " << value;
349  return false;
350  }
351  region.height.value = temp;
352  } else if (pair.first == "regionanchor") {
353  if (!ParseDoublePercent(value, &region.region_anchor_x.value,
354  &region.region_anchor_y.value)) {
355  LOG(ERROR) << "Invalid WebVTT REGION regionanchor: " << value;
356  return false;
357  }
358  } else if (pair.first == "viewportanchor") {
359  if (!ParseDoublePercent(value, &region.window_anchor_x.value,
360  &region.window_anchor_y.value)) {
361  LOG(ERROR) << "Invalid WebVTT REGION windowanchor: " << value;
362  return false;
363  }
364  } else if (pair.first == "scroll") {
365  if (value != "up") {
366  LOG(ERROR) << "Invalid WebVTT REGION scroll: " << value;
367  return false;
368  }
369  region.scroll = true;
370  } else {
371  LOG(ERROR) << "Unknown WebVTT REGION setting: " << pair.first;
372  return false;
373  }
374  }
375  }
376  if (region_id.empty()) {
377  LOG(ERROR) << "WebVTT REGION id is required";
378  return false;
379  }
380  regions_.insert(std::make_pair(region_id, std::move(region)));
381  return true;
382 }
383 
384 bool WebVttParser::ParseCueWithNoId(const std::vector<std::string>& block) {
385  return ParseCue("", block.data(), block.size());
386 }
387 
388 bool WebVttParser::ParseCueWithId(const std::vector<std::string>& block) {
389  return ParseCue(block[0], block.data() + 1, block.size() - 1);
390 }
391 
392 bool WebVttParser::ParseCue(const std::string& id,
393  const std::string* block,
394  size_t block_size) {
395  const std::vector<std::string> time_and_style = base::SplitString(
396  block[0], " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
397 
398  uint64_t start_time = 0;
399  uint64_t end_time = 0;
400 
401  const bool parsed_time =
402  time_and_style.size() >= 3 && time_and_style[1] == "-->" &&
403  WebVttTimestampToMs(time_and_style[0], &start_time) &&
404  WebVttTimestampToMs(time_and_style[2], &end_time);
405 
406  if (!parsed_time) {
407  LOG(ERROR) << "Could not parse start time, -->, and end time from "
408  << block[0];
409  return false;
410  }
411 
412  if (!stream_info_dispatched_)
413  DispatchTextStreamInfo();
414 
415  // According to the WebVTT spec end time must be greater than the start time
416  // of the cue. Since we are seeing content with invalid times in the field, we
417  // are going to drop the cue instead of failing to package.
418  //
419  // For more context see:
420  // - https://www.w3.org/TR/webvtt1/#webvtt-cue-timings
421  // - https://github.com/google/shaka-packager/issues/335
422  // - https://github.com/google/shaka-packager/issues/425
423  //
424  // Print a warning so that those packaging content can know that their
425  // content is not spec compliant.
426  if (end_time <= start_time) {
427  LOG(WARNING) << "WebVTT input is not spec compliant. Start time ("
428  << start_time << ") should be less than end time (" << end_time
429  << "). Skipping webvtt cue:"
430  << BlockToString(block, block_size);
431  return true;
432  }
433 
434  TextSettings settings;
435  for (size_t i = 3; i < time_and_style.size(); i++) {
436  const auto pos = time_and_style[i].find(':');
437  if (pos == std::string::npos) {
438  continue;
439  }
440 
441  const std::string key = time_and_style[i].substr(0, pos);
442  const std::string value = time_and_style[i].substr(pos + 1);
443  ParseSettings(key, value, &settings);
444  }
445 
446  // The rest of the block is the payload.
447  // TODO: Parse tags to support <b>, <i>, etc.
448  TextFragment body;
449  TextFragmentStyle no_styles;
450  for (size_t i = 1; i < block_size; i++) {
451  if (i > 1) {
452  body.sub_fragments.emplace_back(no_styles, /* newline= */ true);
453  }
454  body.sub_fragments.emplace_back(no_styles, block[i]);
455  }
456 
457  const auto sample =
458  std::make_shared<TextSample>(id, start_time, end_time, settings, body);
459  return new_text_sample_cb_.Run(kStreamIndex, sample);
460 }
461 
462 void WebVttParser::DispatchTextStreamInfo() {
463  stream_info_dispatched_ = true;
464 
465  const int kTrackId = 0;
466  // The resolution of timings are in milliseconds.
467  const int kTimescale = 1000;
468  // The duration passed here is not very important. Also the whole file
469  // must be read before determining the real duration which doesn't
470  // work nicely with the current demuxer.
471  const int kDuration = 0;
472  const char kWebVttCodecString[] = "wvtt";
473  const int64_t kNoWidth = 0;
474  const int64_t kNoHeight = 0;
475  // The language of the stream will be overwritten by the Demuxer later.
476  const char kNoLanguage[] = "";
477 
478  const auto stream = std::make_shared<TextStreamInfo>(
479  kTrackId, kTimescale, kDuration, kCodecWebVtt, kWebVttCodecString, "",
480  kNoWidth, kNoHeight, kNoLanguage);
481  stream->set_css_styles(css_styles_);
482  for (const auto& pair : regions_)
483  stream->AddRegion(pair.first, pair.second);
484 
485  std::vector<std::shared_ptr<StreamInfo>> streams{stream};
486  init_cb_.Run(streams);
487 }
488 
489 } // namespace media
490 } // namespace shaka
shaka::media::BlockReader::Flush
void Flush()
Definition: text_readers.cc:103
shaka
All the methods that are virtual are virtual for mocking.
Definition: gflags_hex_bytes.cc:11
shaka::media::MediaParser::NewMediaSampleCB
base::Callback< bool(uint32_t track_id, std::shared_ptr< MediaSample > media_sample)> NewMediaSampleCB
Definition: media_parser.h:44
shaka::media::WebVttParser::Parse
bool Parse(const uint8_t *buf, int size) override
Definition: webvtt_parser.cc:211
shaka::media::KeySource
KeySource is responsible for encryption key acquisition.
Definition: key_source.h:51
shaka::media::MediaParser::NewTextSampleCB
base::Callback< bool(uint32_t track_id, std::shared_ptr< TextSample > text_sample)> NewTextSampleCB
Definition: media_parser.h:53
shaka::media::WebVttParser::Init
void Init(const InitCB &init_cb, const NewMediaSampleCB &new_media_sample_cb, const NewTextSampleCB &new_text_sample_cb, KeySource *decryption_key_source) override
Definition: webvtt_parser.cc:193
shaka::media::BlockReader::PushData
void PushData(const uint8_t *data, size_t data_size)
Pushes data onto the end of the buffer.
Definition: text_readers.cc:73
shaka::media::BlockReader::Next
bool Next(std::vector< std::string > *out)
Definition: text_readers.cc:78
shaka::media::WebVttParser::Flush
bool Flush() override
Definition: webvtt_parser.cc:206