DASH Media Packaging SDK
webm_cluster_parser.cc
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "packager/media/formats/webm/webm_cluster_parser.h"
6 
7 #include <algorithm>
8 #include <vector>
9 
10 #include "packager/base/logging.h"
11 #include "packager/base/sys_byteorder.h"
12 #include "packager/media/base/decrypt_config.h"
13 #include "packager/media/base/timestamp.h"
14 #include "packager/media/codecs/vp8_parser.h"
15 #include "packager/media/codecs/vp9_parser.h"
16 #include "packager/media/codecs/webvtt_util.h"
17 #include "packager/media/formats/webm/webm_constants.h"
18 #include "packager/media/formats/webm/webm_crypto_helpers.h"
19 #include "packager/media/formats/webm/webm_webvtt_parser.h"
20 
21 namespace shaka {
22 namespace media {
23 namespace {
24 
25 const int64_t kMicrosecondsPerMillisecond = 1000;
26 
27 } // namespace
28 
29 WebMClusterParser::WebMClusterParser(
30  int64_t timecode_scale,
31  scoped_refptr<AudioStreamInfo> audio_stream_info,
32  scoped_refptr<VideoStreamInfo> video_stream_info,
33  int64_t audio_default_duration,
34  int64_t video_default_duration,
35  const WebMTracksParser::TextTracks& text_tracks,
36  const std::set<int64_t>& ignored_tracks,
37  const std::string& audio_encryption_key_id,
38  const std::string& video_encryption_key_id,
39  const MediaParser::NewSampleCB& new_sample_cb,
40  const MediaParser::InitCB& init_cb,
41  KeySource* decryption_key_source)
42  : timecode_multiplier_(timecode_scale / 1000.0),
43  audio_stream_info_(audio_stream_info),
44  video_stream_info_(video_stream_info),
45  ignored_tracks_(ignored_tracks),
46  audio_encryption_key_id_(audio_encryption_key_id),
47  video_encryption_key_id_(video_encryption_key_id),
48  parser_(kWebMIdCluster, this),
49  initialized_(false),
50  init_cb_(init_cb),
51  cluster_start_time_(kNoTimestamp),
52  audio_(audio_stream_info ? audio_stream_info->track_id() : -1,
53  false,
54  audio_default_duration,
55  new_sample_cb),
56  video_(video_stream_info ? video_stream_info->track_id() : -1,
57  true,
58  video_default_duration,
59  new_sample_cb) {
60  if (decryption_key_source)
61  decryptor_source_.reset(new DecryptorSource(decryption_key_source));
62  for (WebMTracksParser::TextTracks::const_iterator it = text_tracks.begin();
63  it != text_tracks.end();
64  ++it) {
65  text_track_map_.insert(std::make_pair(
66  it->first, Track(it->first, false, kNoTimestamp, new_sample_cb)));
67  }
68 }
69 
70 WebMClusterParser::~WebMClusterParser() {}
71 
72 void WebMClusterParser::Reset() {
73  last_block_timecode_ = -1;
74  cluster_timecode_ = -1;
75  cluster_start_time_ = kNoTimestamp;
76  cluster_ended_ = false;
77  parser_.Reset();
78  audio_.Reset();
79  video_.Reset();
80  ResetTextTracks();
81 }
82 
83 bool WebMClusterParser::Flush() {
84  // Estimate the duration of the last frame if necessary.
85  bool audio_result = audio_.ApplyDurationEstimateIfNeeded();
86  bool video_result = video_.ApplyDurationEstimateIfNeeded();
87  Reset();
88  return audio_result && video_result;
89 }
90 
91 int WebMClusterParser::Parse(const uint8_t* buf, int size) {
92  int result = parser_.Parse(buf, size);
93 
94  if (result < 0) {
95  cluster_ended_ = false;
96  return result;
97  }
98 
99  cluster_ended_ = parser_.IsParsingComplete();
100  if (cluster_ended_) {
101  // If there were no buffers in this cluster, set the cluster start time to
102  // be the |cluster_timecode_|.
103  if (cluster_start_time_ == kNoTimestamp) {
104  // If the cluster did not even have a |cluster_timecode_|, signal parse
105  // error.
106  if (cluster_timecode_ < 0)
107  return -1;
108 
109  cluster_start_time_ = cluster_timecode_ * timecode_multiplier_;
110  }
111 
112  // Reset the parser if we're done parsing so that
113  // it is ready to accept another cluster on the next
114  // call.
115  parser_.Reset();
116 
117  last_block_timecode_ = -1;
118  cluster_timecode_ = -1;
119  }
120 
121  return result;
122 }
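// NOTE (editorial summary, not part of the original file): callers feed
// Parse() successive byte ranges of the stream. The return value is passed
// through from the underlying list parser: a negative value signals a parse
// error, otherwise it is the number of bytes consumed, so unconsumed bytes
// are re-offered on the next call. Once a whole Cluster has been consumed,
// |cluster_ended_| is set and the list parser is reset so the next call can
// begin a new Cluster; Flush() can then emit any final block whose duration
// had to be estimated.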
123 
124 WebMParserClient* WebMClusterParser::OnListStart(int id) {
125  if (id == kWebMIdCluster) {
126  cluster_timecode_ = -1;
127  cluster_start_time_ = kNoTimestamp;
128  } else if (id == kWebMIdBlockGroup) {
129  block_data_.reset();
130  block_data_size_ = -1;
131  block_duration_ = -1;
132  discard_padding_ = -1;
133  discard_padding_set_ = false;
134  reference_block_set_ = false;
135  } else if (id == kWebMIdBlockAdditions) {
136  block_add_id_ = -1;
137  block_additional_data_.reset();
138  block_additional_data_size_ = 0;
139  }
140 
141  return this;
142 }
143 
144 bool WebMClusterParser::OnListEnd(int id) {
145  if (id != kWebMIdBlockGroup)
146  return true;
147 
148  // Make sure the BlockGroup actually had a Block.
149  if (block_data_size_ == -1) {
150  LOG(ERROR) << "Block missing from BlockGroup.";
151  return false;
152  }
153 
154  bool result = ParseBlock(
155  false, block_data_.get(), block_data_size_, block_additional_data_.get(),
156  block_additional_data_size_, block_duration_,
157  discard_padding_set_ ? discard_padding_ : 0, reference_block_set_);
158  block_data_.reset();
159  block_data_size_ = -1;
160  block_duration_ = -1;
161  block_add_id_ = -1;
162  block_additional_data_.reset();
163  block_additional_data_size_ = 0;
164  discard_padding_ = -1;
165  discard_padding_set_ = false;
166  reference_block_set_ = false;
167  return result;
168 }
169 
170 bool WebMClusterParser::OnUInt(int id, int64_t val) {
171  int64_t* dst;
172  switch (id) {
173  case kWebMIdTimecode:
174  dst = &cluster_timecode_;
175  break;
176  case kWebMIdBlockDuration:
177  dst = &block_duration_;
178  break;
179  case kWebMIdBlockAddID:
180  dst = &block_add_id_;
181  break;
182  default:
183  return true;
184  }
185  if (*dst != -1)
186  return false;
187  *dst = val;
188  return true;
189 }
190 
191 bool WebMClusterParser::ParseBlock(bool is_simple_block,
192  const uint8_t* buf,
193  int size,
194  const uint8_t* additional,
195  int additional_size,
196  int duration,
197  int64_t discard_padding,
198  bool reference_block_set) {
199  if (size < 4)
200  return false;
201 
202  // Return an error if the trackNum > 127. We just aren't
203  // going to support large track numbers right now.
204  if (!(buf[0] & 0x80)) {
205  LOG(ERROR) << "TrackNumber over 127 not supported";
206  return false;
207  }
208 
209  int track_num = buf[0] & 0x7f;
210  int timecode = buf[1] << 8 | buf[2];
211  int flags = buf[3] & 0xff;
212  int lacing = (flags >> 1) & 0x3;
213 
214  if (lacing) {
215  LOG(ERROR) << "Lacing " << lacing << " is not supported yet.";
216  return false;
217  }
218 
219  // Sign extend negative timecode offsets.
220  if (timecode & 0x8000)
221  timecode |= ~0xffff;
222 
223  // The first bit of the flags is set when a SimpleBlock contains only
224  // keyframes. If this is a Block, then keyframe is inferred by the absence of
225  // the ReferenceBlock Element.
226  // http://www.matroska.org/technical/specs/index.html
227  bool is_key_frame =
228  is_simple_block ? (flags & 0x80) != 0 : !reference_block_set;
229 
230  const uint8_t* frame_data = buf + 4;
231  int frame_size = size - (frame_data - buf);
232  return OnBlock(is_simple_block, track_num, timecode, duration, frame_data,
233  frame_size, additional, additional_size, discard_padding,
234  is_key_frame);
235 }
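// Worked example for ParseBlock() (illustrative, not from the original
// source): a (Simple)Block whose first four bytes are 0x81 0x00 0x02 0x80
// parses to track_num = 1 (0x81 & 0x7f), a relative timecode of +2 ticks,
// no lacing ((0x80 >> 1) & 0x3 == 0), and flags 0x80, so for a SimpleBlock
// it is a keyframe. Relative timecodes with the high bit set (>= 0x8000)
// sign-extend to negative values and are later rejected by OnBlock(). With
// the WebM default TimecodeScale of 1,000,000 ns per tick,
// |timecode_multiplier_| is 1000, so this block lands 2000 microseconds
// (2 ms) after the cluster timecode.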
236 
237 bool WebMClusterParser::OnBinary(int id, const uint8_t* data, int size) {
238  switch (id) {
239  case kWebMIdSimpleBlock:
240  return ParseBlock(true, data, size, NULL, 0, -1, 0, false);
241 
242  case kWebMIdBlock:
243  if (block_data_) {
244  LOG(ERROR) << "More than 1 Block in a BlockGroup is not "
245  "supported.";
246  return false;
247  }
248  block_data_.reset(new uint8_t[size]);
249  memcpy(block_data_.get(), data, size);
250  block_data_size_ = size;
251  return true;
252 
253  case kWebMIdBlockAdditional: {
254  uint64_t block_add_id = base::HostToNet64(block_add_id_);
255  if (block_additional_data_) {
256  // TODO: Technically, more than 1 BlockAdditional is allowed as per
257  // matroska spec. But for now we don't have a use case to support
258  // parsing of such files. Take a look at this again when such a case
259  // arises.
260  LOG(ERROR) << "More than 1 BlockAdditional in a "
261  "BlockGroup is not supported.";
262  return false;
263  }
264  // First 8 bytes of side_data in DecoderBuffer is the BlockAddID
265  // element's value in Big Endian format. This is done to mimic ffmpeg
266  // demuxer's behavior.
267  block_additional_data_size_ = size + sizeof(block_add_id);
268  block_additional_data_.reset(new uint8_t[block_additional_data_size_]);
269  memcpy(block_additional_data_.get(), &block_add_id,
270  sizeof(block_add_id));
271  memcpy(block_additional_data_.get() + 8, data, size);
272  return true;
273  }
274  case kWebMIdDiscardPadding: {
275  if (discard_padding_set_ || size <= 0 || size > 8)
276  return false;
277  discard_padding_set_ = true;
278 
279  // Read in the big-endian integer.
280  discard_padding_ = static_cast<int8_t>(data[0]);
281  for (int i = 1; i < size; ++i)
282  discard_padding_ = (discard_padding_ << 8) | data[i];
283 
284  return true;
285  }
286  case kWebMIdReferenceBlock:
287  // We use ReferenceBlock to determine whether the current Block contains a
288  // keyframe or not. Other than that, we don't care about the value of the
289  // ReferenceBlock element itself.
290  reference_block_set_ = true;
291  return true;
292  default:
293  return true;
294  }
295 }
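// Illustration of the BlockAdditional packing above (not from the original
// source): with |block_add_id_| == 1 and a 3-byte BlockAdditional payload
// {0xAA, 0xBB, 0xCC}, |block_additional_data_| holds 11 bytes:
//   00 00 00 00 00 00 00 01 AA BB CC
// i.e. the BlockAddID as a big-endian 64-bit integer followed by the
// payload, mirroring the side-data layout produced by ffmpeg's demuxer.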
296 
297 bool WebMClusterParser::OnBlock(bool is_simple_block,
298  int track_num,
299  int timecode,
300  int block_duration,
301  const uint8_t* data,
302  int size,
303  const uint8_t* additional,
304  int additional_size,
305  int64_t discard_padding,
306  bool is_key_frame) {
307  DCHECK_GE(size, 0);
308  if (cluster_timecode_ == -1) {
309  LOG(ERROR) << "Got a block before cluster timecode.";
310  return false;
311  }
312 
313  // TODO: Should relative negative timecode offsets be rejected? Or only when
314  // the absolute timecode is negative? See http://crbug.com/271794
315  if (timecode < 0) {
316  LOG(ERROR) << "Got a block with negative timecode offset " << timecode;
317  return false;
318  }
319 
320  if (last_block_timecode_ != -1 && timecode < last_block_timecode_) {
321  LOG(ERROR) << "Got a block with a timecode before the previous block.";
322  return false;
323  }
324 
325  Track* track = NULL;
326  StreamType stream_type = kStreamUnknown;
327  std::string encryption_key_id;
328  if (track_num == audio_.track_num()) {
329  track = &audio_;
330  encryption_key_id = audio_encryption_key_id_;
331  stream_type = kStreamAudio;
332  } else if (track_num == video_.track_num()) {
333  track = &video_;
334  encryption_key_id = video_encryption_key_id_;
335  stream_type = kStreamVideo;
336  } else if (ignored_tracks_.find(track_num) != ignored_tracks_.end()) {
337  return true;
338  } else if (Track* const text_track = FindTextTrack(track_num)) {
339  if (is_simple_block) // BlockGroup is required for WebVTT cues
340  return false;
341  if (block_duration < 0) // not specified
342  return false;
343  track = text_track;
344  stream_type = kStreamText;
345  } else {
346  LOG(ERROR) << "Unexpected track number " << track_num;
347  return false;
348  }
349  DCHECK_NE(stream_type, kStreamUnknown);
350 
351  last_block_timecode_ = timecode;
352 
353  int64_t timestamp = (cluster_timecode_ + timecode) * timecode_multiplier_;
354 
355  scoped_refptr<MediaSample> buffer;
356  if (stream_type != kStreamText) {
357  // Every encrypted Block has a signal byte and IV prepended to it. Current
358  // encrypted WebM request for comments specification is here
359  // http://wiki.webmproject.org/encryption/webm-encryption-rfc
360  std::unique_ptr<DecryptConfig> decrypt_config;
361  int data_offset = 0;
362  if (!encryption_key_id.empty() &&
363  !WebMCreateDecryptConfig(
364  data, size,
365  reinterpret_cast<const uint8_t*>(encryption_key_id.data()),
366  encryption_key_id.size(),
367  &decrypt_config, &data_offset)) {
368  return false;
369  }
370 
371  buffer = MediaSample::CopyFrom(data + data_offset, size - data_offset,
372  additional, additional_size, is_key_frame);
373 
374  if (decrypt_config) {
375  if (!decryptor_source_) {
376  // If the demuxer does not have the decryptor_source_, store
377  // decrypt_config so that the demuxed sample can be decrypted later.
378  buffer->set_decrypt_config(std::move(decrypt_config));
379  buffer->set_is_encrypted(true);
380  } else if (!decryptor_source_->DecryptSampleBuffer(
381  decrypt_config.get(), buffer->writable_data(),
382  buffer->data_size())) {
383  LOG(ERROR) << "Cannot decrypt samples";
384  return false;
385  }
386  }
387  } else {
388  std::string id, settings, content;
389  WebMWebVTTParser::Parse(data, size, &id, &settings, &content);
390 
391  std::vector<uint8_t> side_data;
392  MakeSideData(id.begin(), id.end(),
393  settings.begin(), settings.end(),
394  &side_data);
395 
396  buffer = MediaSample::CopyFrom(
397  reinterpret_cast<const uint8_t*>(content.data()), content.length(),
398  &side_data[0], side_data.size(), true);
399  }
400 
401  buffer->set_dts(timestamp);
402  buffer->set_pts(timestamp);
403  if (cluster_start_time_ == kNoTimestamp)
404  cluster_start_time_ = timestamp;
405  buffer->set_duration(block_duration > 0
406  ? (block_duration * timecode_multiplier_)
407  : kNoTimestamp);
408 
409  if (!init_cb_.is_null() && !initialized_) {
410  std::vector<scoped_refptr<StreamInfo>> streams;
411  if (audio_stream_info_)
412  streams.push_back(audio_stream_info_);
413  if (video_stream_info_) {
414  if (stream_type == kStreamVideo) {
415  std::unique_ptr<VPxParser> vpx_parser;
416  switch (video_stream_info_->codec()) {
417  case kCodecVP8:
418  vpx_parser.reset(new VP8Parser);
419  break;
420  case kCodecVP9:
421  vpx_parser.reset(new VP9Parser);
422  break;
423  default:
424  NOTIMPLEMENTED() << "Unsupported codec "
425  << video_stream_info_->codec();
426  return false;
427  }
428  std::vector<VPxFrameInfo> vpx_frames;
429  if (!vpx_parser->Parse(buffer->data(), buffer->data_size(),
430  &vpx_frames)) {
431  LOG(ERROR) << "Failed to parse vpx frame.";
432  return false;
433  }
434  if (vpx_frames.size() != 1u || !vpx_frames[0].is_keyframe) {
435  LOG(ERROR) << "The first frame should be a key frame.";
436  return false;
437  }
438 
439  VPCodecConfigurationRecord codec_config;
440  if (!video_stream_info_->codec_config().empty())
441  codec_config.ParseWebM(video_stream_info_->codec_config());
442  codec_config.MergeFrom(vpx_parser->codec_config());
443 
444  video_stream_info_->set_codec_string(
445  codec_config.GetCodecString(video_stream_info_->codec()));
446  std::vector<uint8_t> config_serialized;
447  codec_config.WriteMP4(&config_serialized);
448  video_stream_info_->set_codec_config(config_serialized);
449  streams.push_back(video_stream_info_);
450  init_cb_.Run(streams);
451  initialized_ = true;
452  }
453  } else {
454  init_cb_.Run(streams);
455  initialized_ = true;
456  }
457  }
458 
459  return track->EmitBuffer(buffer);
460 }
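// NOTE (editorial summary of OnBlock() above, not from the original source):
// when a video stream is present, |init_cb_| is deliberately not run until
// the first video block arrives, because the VP8/VP9 codec configuration
// (and hence the codec string) is extracted from that first keyframe and
// merged into |video_stream_info_| before the streams are reported. When
// there is no video stream, the callback runs on the first emitted block of
// any kind.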
461 
462 WebMClusterParser::Track::Track(int track_num,
463  bool is_video,
464  int64_t default_duration,
465  const MediaParser::NewSampleCB& new_sample_cb)
466  : track_num_(track_num),
467  is_video_(is_video),
468  default_duration_(default_duration),
469  estimated_next_frame_duration_(kNoTimestamp),
470  new_sample_cb_(new_sample_cb) {
471  DCHECK(default_duration_ == kNoTimestamp || default_duration_ > 0);
472 }
473 
474 WebMClusterParser::Track::~Track() {}
475 
476 bool WebMClusterParser::Track::EmitBuffer(
477  const scoped_refptr<MediaSample>& buffer) {
478  DVLOG(2) << "EmitBuffer() : " << track_num_
479  << " ts " << buffer->pts()
480  << " dur " << buffer->duration()
481  << " kf " << buffer->is_key_frame()
482  << " size " << buffer->data_size();
483 
484  if (last_added_buffer_missing_duration_.get()) {
485  int64_t derived_duration =
486  buffer->pts() - last_added_buffer_missing_duration_->pts();
487  last_added_buffer_missing_duration_->set_duration(derived_duration);
488 
489  DVLOG(2) << "EmitBuffer() : applied derived duration to held-back buffer : "
490  << " ts "
491  << last_added_buffer_missing_duration_->pts()
492  << " dur "
493  << last_added_buffer_missing_duration_->duration()
494  << " kf " << last_added_buffer_missing_duration_->is_key_frame()
495  << " size " << last_added_buffer_missing_duration_->data_size();
496  scoped_refptr<MediaSample> updated_buffer =
497  last_added_buffer_missing_duration_;
498  last_added_buffer_missing_duration_ = NULL;
499  if (!EmitBufferHelp(updated_buffer))
500  return false;
501  }
502 
503  if (buffer->duration() == kNoTimestamp) {
504  last_added_buffer_missing_duration_ = buffer;
505  DVLOG(2) << "EmitBuffer() : holding back buffer that is missing duration";
506  return true;
507  }
508 
509  return EmitBufferHelp(buffer);
510 }
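// Example of the hold-back logic above (illustrative, not from the original
// source): two SimpleBlocks at 0 us and 40000 us with no BlockDuration both
// arrive with duration kNoTimestamp. The first is held in
// |last_added_buffer_missing_duration_| until the second arrives, is then
// assigned the derived duration 40000 us and emitted, and the second is held
// in turn (or given an estimated duration by Flush() at cluster end).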
511 
512 bool WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() {
513  if (!last_added_buffer_missing_duration_.get())
514  return true;
515 
516  int64_t estimated_duration = GetDurationEstimate();
517  last_added_buffer_missing_duration_->set_duration(estimated_duration);
518 
519  VLOG(1) << "Track " << track_num_ << ": Estimating WebM block duration to be "
520  << estimated_duration / 1000
521  << "ms for the last (Simple)Block in the Cluster for this Track. Use "
522  "BlockGroups with BlockDurations at the end of each Track in a "
523  "Cluster to avoid estimation.";
524 
525  DVLOG(2) << " new dur : ts " << last_added_buffer_missing_duration_->pts()
526  << " dur " << last_added_buffer_missing_duration_->duration()
527  << " kf " << last_added_buffer_missing_duration_->is_key_frame()
528  << " size " << last_added_buffer_missing_duration_->data_size();
529 
530  // Don't use the applied duration as a future estimation (don't use
531  // EmitBufferHelp() here.)
532  if (!new_sample_cb_.Run(track_num_, last_added_buffer_missing_duration_))
533  return false;
534  last_added_buffer_missing_duration_ = NULL;
535  return true;
536 }
537 
538 void WebMClusterParser::Track::Reset() {
539  last_added_buffer_missing_duration_ = NULL;
540 }
541 
542 bool WebMClusterParser::Track::EmitBufferHelp(
543  const scoped_refptr<MediaSample>& buffer) {
544  DCHECK(!last_added_buffer_missing_duration_.get());
545 
546  int64_t duration = buffer->duration();
547  if (duration < 0 || duration == kNoTimestamp) {
548  LOG(ERROR) << "Invalid buffer duration: " << duration;
549  return false;
550  }
551 
552  // The estimated frame duration is the maximum non-zero duration since the
553  // last initialization segment.
554  if (duration > 0) {
555  int64_t orig_duration_estimate = estimated_next_frame_duration_;
556  if (estimated_next_frame_duration_ == kNoTimestamp) {
557  estimated_next_frame_duration_ = duration;
558  } else {
559  estimated_next_frame_duration_ =
560  std::max(duration, estimated_next_frame_duration_);
561  }
562 
563  if (orig_duration_estimate != estimated_next_frame_duration_) {
564  DVLOG(3) << "Updated duration estimate:"
565  << orig_duration_estimate
566  << " -> "
567  << estimated_next_frame_duration_
568  << " at timestamp: "
569  << buffer->dts();
570  }
571  }
572 
573  return new_sample_cb_.Run(track_num_, buffer);
574 }
575 
576 int64_t WebMClusterParser::Track::GetDurationEstimate() {
577  int64_t duration = kNoTimestamp;
578  if (default_duration_ != kNoTimestamp) {
579  duration = default_duration_;
580  DVLOG(3) << __FUNCTION__ << " : using track default duration " << duration;
581  } else if (estimated_next_frame_duration_ != kNoTimestamp) {
582  duration = estimated_next_frame_duration_;
583  DVLOG(3) << __FUNCTION__ << " : using estimated duration " << duration;
584  } else {
585  if (is_video_) {
586  duration = kDefaultVideoBufferDurationInMs * kMicrosecondsPerMillisecond;
587  } else {
588  duration = kDefaultAudioBufferDurationInMs * kMicrosecondsPerMillisecond;
589  }
590  DVLOG(3) << __FUNCTION__ << " : using hardcoded default duration "
591  << duration;
592  }
593 
594  DCHECK_GT(duration, 0);
595  DCHECK_NE(duration, kNoTimestamp);
596  return duration;
597 }
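// NOTE (illustrative): the precedence in GetDurationEstimate() above is
// (1) the default duration supplied at construction (normally the track's
// DefaultDuration), then (2) the largest non-zero block duration seen so far
// for this track (|estimated_next_frame_duration_|), then (3) a hardcoded
// per-stream-type default in milliseconds converted to microseconds.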
598 
599 void WebMClusterParser::ResetTextTracks() {
600  for (TextTrackMap::iterator it = text_track_map_.begin();
601  it != text_track_map_.end();
602  ++it) {
603  it->second.Reset();
604  }
605 }
606 
607 WebMClusterParser::Track*
608 WebMClusterParser::FindTextTrack(int track_num) {
609  const TextTrackMap::iterator it = text_track_map_.find(track_num);
610 
611  if (it == text_track_map_.end())
612  return NULL;
613 
614  return &it->second;
615 }
616 
617 } // namespace media
618 } // namespace shaka
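The timestamp arithmetic used throughout this file can be exercised in isolation. Below is a minimal standalone sketch (not part of the packager source; the function and variable names are invented for illustration), assuming the usual WebM convention that TimecodeScale is expressed in nanoseconds per timecode tick, with a default of 1,000,000 (1 ms). The constructor stores timecode_scale / 1000.0 as |timecode_multiplier_|, so (cluster_timecode + block_timecode) * timecode_multiplier_ yields a timestamp in microseconds, matching kMicrosecondsPerMillisecond above.

#include <cstdint>
#include <iostream>

// Hypothetical helper mirroring WebMClusterParser's conversion: returns the
// absolute block timestamp in microseconds.
int64_t BlockTimestampUs(int64_t timecode_scale_ns,
                         int64_t cluster_timecode,
                         int16_t block_timecode) {
  // Microseconds per WebM timecode tick.
  const double timecode_multiplier = timecode_scale_ns / 1000.0;
  return static_cast<int64_t>(
      (cluster_timecode + block_timecode) * timecode_multiplier);
}

int main() {
  // Cluster timecode 1000 ticks, block offset +2 ticks, default 1 ms ticks:
  // prints 1002000 (microseconds), i.e. 1002 ms.
  std::cout << BlockTimestampUs(1000000, 1000, 2) << std::endl;
  return 0;
}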
Cross-references from this page:
InitCB: base::Callback<void(const std::vector<scoped_refptr<StreamInfo>>& stream_info)> (Definition: media_parser.h:34)
int Parse(const uint8_t* buf, int size)
void Reset(): Resets the state of the parser so it can start parsing a new list. (Definition: webm_parser.cc:714)
int Parse(const uint8_t* buf, int size) (Definition: webm_parser.cc:719)
bool Flush() WARN_UNUSED_RESULT
static void Parse(const uint8_t* payload, int payload_size, std::string* id, std::string* settings, std::string* content): Utility function to parse the WebVTT cue from a byte stream.
NewSampleCB: base::Callback<bool(uint32_t track_id, const scoped_refptr<MediaSample>& media_sample)> (Definition: media_parser.h:43)
KeySource: KeySource is responsible for encryption key acquisition. (Definition: key_source.h:30)
static scoped_refptr<MediaSample> CopyFrom(const uint8_t* data, size_t size, bool is_key_frame) (Definition: media_sample.cc:45)
WebMClusterParser(int64_t timecode_scale, scoped_refptr<AudioStreamInfo> audio_stream_info, scoped_refptr<VideoStreamInfo> video_stream_info, int64_t audio_default_duration, int64_t video_default_duration, const WebMTracksParser::TextTracks& text_tracks, const std::set<int64_t>& ignored_tracks, const std::string& audio_encryption_key_id, const std::string& video_encryption_key_id, const MediaParser::NewSampleCB& new_sample_cb, const MediaParser::InitCB& init_cb, KeySource* decryption_key_source)
DecryptorSource: DecryptorSource wraps KeySource and is responsible for decryptor management.
void Reset(): Resets the parser state so it can accept a new cluster.