DASH Media Packaging SDK
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator
webm_cluster_parser.cc
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "packager/media/formats/webm/webm_cluster_parser.h"
6 
7 #include <algorithm>
8 #include <vector>
9 
10 #include "packager/base/logging.h"
11 #include "packager/base/sys_byteorder.h"
12 #include "packager/media/base/decrypt_config.h"
13 #include "packager/media/base/timestamp.h"
14 #include "packager/media/codecs/vp8_parser.h"
15 #include "packager/media/codecs/vp9_parser.h"
16 #include "packager/media/codecs/webvtt_util.h"
17 #include "packager/media/formats/webm/webm_constants.h"
18 #include "packager/media/formats/webm/webm_crypto_helpers.h"
19 #include "packager/media/formats/webm/webm_webvtt_parser.h"
20 
21 namespace shaka {
22 namespace media {
namespace {

// Conversion factor used when turning the hard-coded millisecond fallback
// durations into microseconds.
constexpr int64_t kMicrosecondsPerMillisecond = 1000;

}  // namespace
28 
30  int64_t timecode_scale,
31  std::shared_ptr<AudioStreamInfo> audio_stream_info,
32  std::shared_ptr<VideoStreamInfo> video_stream_info,
33  int64_t audio_default_duration,
34  int64_t video_default_duration,
35  const WebMTracksParser::TextTracks& text_tracks,
36  const std::set<int64_t>& ignored_tracks,
37  const std::string& audio_encryption_key_id,
38  const std::string& video_encryption_key_id,
39  const MediaParser::NewSampleCB& new_sample_cb,
40  const MediaParser::InitCB& init_cb,
41  KeySource* decryption_key_source)
42  : timecode_multiplier_(timecode_scale / 1000.0),
43  audio_stream_info_(audio_stream_info),
44  video_stream_info_(video_stream_info),
45  ignored_tracks_(ignored_tracks),
46  audio_encryption_key_id_(audio_encryption_key_id),
47  video_encryption_key_id_(video_encryption_key_id),
48  parser_(kWebMIdCluster, this),
49  initialized_(false),
50  init_cb_(init_cb),
51  cluster_start_time_(kNoTimestamp),
52  audio_(audio_stream_info ? audio_stream_info->track_id() : -1,
53  false,
54  audio_default_duration,
55  new_sample_cb),
56  video_(video_stream_info ? video_stream_info->track_id() : -1,
57  true,
58  video_default_duration,
59  new_sample_cb) {
60  if (decryption_key_source) {
61  decryptor_source_.reset(new DecryptorSource(decryption_key_source));
62  if (audio_stream_info_)
63  audio_stream_info_->set_is_encrypted(false);
64  if (video_stream_info_)
65  video_stream_info_->set_is_encrypted(false);
66  }
67  for (WebMTracksParser::TextTracks::const_iterator it = text_tracks.begin();
68  it != text_tracks.end();
69  ++it) {
70  text_track_map_.insert(std::make_pair(
71  it->first, Track(it->first, false, kNoTimestamp, new_sample_cb)));
72  }
73 }
74 
75 WebMClusterParser::~WebMClusterParser() {}
76 
78  last_block_timecode_ = -1;
79  cluster_timecode_ = -1;
80  cluster_start_time_ = kNoTimestamp;
81  cluster_ended_ = false;
82  parser_.Reset();
83  audio_.Reset();
84  video_.Reset();
85  ResetTextTracks();
86 }
87 
89  // Estimate the duration of the last frame if necessary.
90  bool audio_result = audio_.ApplyDurationEstimateIfNeeded();
91  bool video_result = video_.ApplyDurationEstimateIfNeeded();
92  Reset();
93  return audio_result && video_result;
94 }
95 
96 int WebMClusterParser::Parse(const uint8_t* buf, int size) {
97  int result = parser_.Parse(buf, size);
98 
99  if (result < 0) {
100  cluster_ended_ = false;
101  return result;
102  }
103 
104  cluster_ended_ = parser_.IsParsingComplete();
105  if (cluster_ended_) {
106  // If there were no buffers in this cluster, set the cluster start time to
107  // be the |cluster_timecode_|.
108  if (cluster_start_time_ == kNoTimestamp) {
109  // If the cluster did not even have a |cluster_timecode_|, signal parse
110  // error.
111  if (cluster_timecode_ < 0)
112  return -1;
113 
114  cluster_start_time_ = cluster_timecode_ * timecode_multiplier_;
115  }
116 
117  // Reset the parser if we're done parsing so that
118  // it is ready to accept another cluster on the next
119  // call.
120  parser_.Reset();
121 
122  last_block_timecode_ = -1;
123  cluster_timecode_ = -1;
124  }
125 
126  return result;
127 }
128 
129 WebMParserClient* WebMClusterParser::OnListStart(int id) {
130  if (id == kWebMIdCluster) {
131  cluster_timecode_ = -1;
132  cluster_start_time_ = kNoTimestamp;
133  } else if (id == kWebMIdBlockGroup) {
134  block_data_.reset();
135  block_data_size_ = -1;
136  block_duration_ = -1;
137  discard_padding_ = -1;
138  discard_padding_set_ = false;
139  reference_block_set_ = false;
140  } else if (id == kWebMIdBlockAdditions) {
141  block_add_id_ = -1;
142  block_additional_data_.reset();
143  block_additional_data_size_ = 0;
144  }
145 
146  return this;
147 }
148 
149 bool WebMClusterParser::OnListEnd(int id) {
150  if (id != kWebMIdBlockGroup)
151  return true;
152 
153  // Make sure the BlockGroup actually had a Block.
154  if (block_data_size_ == -1) {
155  LOG(ERROR) << "Block missing from BlockGroup.";
156  return false;
157  }
158 
159  bool result = ParseBlock(
160  false, block_data_.get(), block_data_size_, block_additional_data_.get(),
161  block_additional_data_size_, block_duration_,
162  discard_padding_set_ ? discard_padding_ : 0, reference_block_set_);
163  block_data_.reset();
164  block_data_size_ = -1;
165  block_duration_ = -1;
166  block_add_id_ = -1;
167  block_additional_data_.reset();
168  block_additional_data_size_ = 0;
169  discard_padding_ = -1;
170  discard_padding_set_ = false;
171  reference_block_set_ = false;
172  return result;
173 }
174 
175 bool WebMClusterParser::OnUInt(int id, int64_t val) {
176  int64_t* dst;
177  switch (id) {
178  case kWebMIdTimecode:
179  dst = &cluster_timecode_;
180  break;
181  case kWebMIdBlockDuration:
182  dst = &block_duration_;
183  break;
184  case kWebMIdBlockAddID:
185  dst = &block_add_id_;
186  break;
187  default:
188  return true;
189  }
190  if (*dst != -1)
191  return false;
192  *dst = val;
193  return true;
194 }
195 
196 bool WebMClusterParser::ParseBlock(bool is_simple_block,
197  const uint8_t* buf,
198  int size,
199  const uint8_t* additional,
200  int additional_size,
201  int duration,
202  int64_t discard_padding,
203  bool reference_block_set) {
204  if (size < 4)
205  return false;
206 
207  // Return an error if the trackNum > 127. We just aren't
208  // going to support large track numbers right now.
209  if (!(buf[0] & 0x80)) {
210  LOG(ERROR) << "TrackNumber over 127 not supported";
211  return false;
212  }
213 
214  int track_num = buf[0] & 0x7f;
215  int timecode = buf[1] << 8 | buf[2];
216  int flags = buf[3] & 0xff;
217  int lacing = (flags >> 1) & 0x3;
218 
219  if (lacing) {
220  LOG(ERROR) << "Lacing " << lacing << " is not supported yet.";
221  return false;
222  }
223 
224  // Sign extend negative timecode offsets.
225  if (timecode & 0x8000)
226  timecode |= ~0xffff;
227 
228  // The first bit of the flags is set when a SimpleBlock contains only
229  // keyframes. If this is a Block, then keyframe is inferred by the absence of
230  // the ReferenceBlock Element.
231  // http://www.matroska.org/technical/specs/index.html
232  bool is_key_frame =
233  is_simple_block ? (flags & 0x80) != 0 : !reference_block_set;
234 
235  const uint8_t* frame_data = buf + 4;
236  int frame_size = size - (frame_data - buf);
237  return OnBlock(is_simple_block, track_num, timecode, duration, frame_data,
238  frame_size, additional, additional_size, discard_padding,
239  is_key_frame);
240 }
241 
242 bool WebMClusterParser::OnBinary(int id, const uint8_t* data, int size) {
243  switch (id) {
244  case kWebMIdSimpleBlock:
245  return ParseBlock(true, data, size, NULL, 0, -1, 0, false);
246 
247  case kWebMIdBlock:
248  if (block_data_) {
249  LOG(ERROR) << "More than 1 Block in a BlockGroup is not "
250  "supported.";
251  return false;
252  }
253  block_data_.reset(new uint8_t[size]);
254  memcpy(block_data_.get(), data, size);
255  block_data_size_ = size;
256  return true;
257 
258  case kWebMIdBlockAdditional: {
259  uint64_t block_add_id = base::HostToNet64(block_add_id_);
260  if (block_additional_data_) {
261  // TODO: Technically, more than 1 BlockAdditional is allowed as per
262  // matroska spec. But for now we don't have a use case to support
263  // parsing of such files. Take a look at this again when such a case
264  // arises.
265  LOG(ERROR) << "More than 1 BlockAdditional in a "
266  "BlockGroup is not supported.";
267  return false;
268  }
269  // First 8 bytes of side_data in DecoderBuffer is the BlockAddID
270  // element's value in Big Endian format. This is done to mimic ffmpeg
271  // demuxer's behavior.
272  block_additional_data_size_ = size + sizeof(block_add_id);
273  block_additional_data_.reset(new uint8_t[block_additional_data_size_]);
274  memcpy(block_additional_data_.get(), &block_add_id,
275  sizeof(block_add_id));
276  memcpy(block_additional_data_.get() + 8, data, size);
277  return true;
278  }
279  case kWebMIdDiscardPadding: {
280  if (discard_padding_set_ || size <= 0 || size > 8)
281  return false;
282  discard_padding_set_ = true;
283 
284  // Read in the big-endian integer.
285  discard_padding_ = static_cast<int8_t>(data[0]);
286  for (int i = 1; i < size; ++i)
287  discard_padding_ = (discard_padding_ << 8) | data[i];
288 
289  return true;
290  }
291  case kWebMIdReferenceBlock:
292  // We use ReferenceBlock to determine whether the current Block contains a
293  // keyframe or not. Other than that, we don't care about the value of the
294  // ReferenceBlock element itself.
295  reference_block_set_ = true;
296  return true;
297  default:
298  return true;
299  }
300 }
301 
302 bool WebMClusterParser::OnBlock(bool is_simple_block,
303  int track_num,
304  int timecode,
305  int block_duration,
306  const uint8_t* data,
307  int size,
308  const uint8_t* additional,
309  int additional_size,
310  int64_t discard_padding,
311  bool is_key_frame) {
312  DCHECK_GE(size, 0);
313  if (cluster_timecode_ == -1) {
314  LOG(ERROR) << "Got a block before cluster timecode.";
315  return false;
316  }
317 
318  // TODO: Should relative negative timecode offsets be rejected? Or only when
319  // the absolute timecode is negative? See http://crbug.com/271794
320  if (timecode < 0) {
321  LOG(ERROR) << "Got a block with negative timecode offset " << timecode;
322  return false;
323  }
324 
325  if (last_block_timecode_ != -1 && timecode < last_block_timecode_) {
326  LOG(ERROR) << "Got a block with a timecode before the previous block.";
327  return false;
328  }
329 
330  Track* track = NULL;
331  StreamType stream_type = kStreamUnknown;
332  std::string encryption_key_id;
333  if (track_num == audio_.track_num()) {
334  track = &audio_;
335  encryption_key_id = audio_encryption_key_id_;
336  stream_type = kStreamAudio;
337  } else if (track_num == video_.track_num()) {
338  track = &video_;
339  encryption_key_id = video_encryption_key_id_;
340  stream_type = kStreamVideo;
341  } else if (ignored_tracks_.find(track_num) != ignored_tracks_.end()) {
342  return true;
343  } else if (Track* const text_track = FindTextTrack(track_num)) {
344  if (is_simple_block) // BlockGroup is required for WebVTT cues
345  return false;
346  if (block_duration < 0) // not specified
347  return false;
348  track = text_track;
349  stream_type = kStreamText;
350  } else {
351  LOG(ERROR) << "Unexpected track number " << track_num;
352  return false;
353  }
354  DCHECK_NE(stream_type, kStreamUnknown);
355 
356  last_block_timecode_ = timecode;
357 
358  int64_t timestamp = (cluster_timecode_ + timecode) * timecode_multiplier_;
359 
360  std::shared_ptr<MediaSample> buffer;
361  if (stream_type != kStreamText) {
362  // Every encrypted Block has a signal byte and IV prepended to it. Current
363  // encrypted WebM request for comments specification is here
364  // http://wiki.webmproject.org/encryption/webm-encryption-rfc
365  std::unique_ptr<DecryptConfig> decrypt_config;
366  int data_offset = 0;
367  if (!encryption_key_id.empty() &&
368  !WebMCreateDecryptConfig(
369  data, size,
370  reinterpret_cast<const uint8_t*>(encryption_key_id.data()),
371  encryption_key_id.size(),
372  &decrypt_config, &data_offset)) {
373  return false;
374  }
375 
376  buffer = MediaSample::CopyFrom(data + data_offset, size - data_offset,
377  additional, additional_size, is_key_frame);
378 
379  if (decrypt_config) {
380  if (!decryptor_source_) {
381  // If the demuxer does not have the decryptor_source_, store
382  // decrypt_config so that the demuxed sample can be decrypted later.
383  buffer->set_decrypt_config(std::move(decrypt_config));
384  buffer->set_is_encrypted(true);
385  } else if (!decryptor_source_->DecryptSampleBuffer(
386  decrypt_config.get(), buffer->writable_data(),
387  buffer->data_size())) {
388  LOG(ERROR) << "Cannot decrypt samples";
389  return false;
390  }
391  }
392  } else {
393  std::string id, settings, content;
394  WebMWebVTTParser::Parse(data, size, &id, &settings, &content);
395 
396  std::vector<uint8_t> side_data;
397  MakeSideData(id.begin(), id.end(),
398  settings.begin(), settings.end(),
399  &side_data);
400 
401  buffer = MediaSample::CopyFrom(
402  reinterpret_cast<const uint8_t*>(content.data()), content.length(),
403  &side_data[0], side_data.size(), true);
404  }
405 
406  buffer->set_dts(timestamp);
407  buffer->set_pts(timestamp);
408  if (cluster_start_time_ == kNoTimestamp)
409  cluster_start_time_ = timestamp;
410  buffer->set_duration(block_duration > 0
411  ? (block_duration * timecode_multiplier_)
412  : kNoTimestamp);
413 
414  if (!init_cb_.is_null() && !initialized_) {
415  std::vector<std::shared_ptr<StreamInfo>> streams;
416  if (audio_stream_info_)
417  streams.push_back(audio_stream_info_);
418  if (video_stream_info_) {
419  if (stream_type == kStreamVideo) {
420  std::unique_ptr<VPxParser> vpx_parser;
421  switch (video_stream_info_->codec()) {
422  case kCodecVP8:
423  vpx_parser.reset(new VP8Parser);
424  break;
425  case kCodecVP9:
426  vpx_parser.reset(new VP9Parser);
427  break;
428  default:
429  NOTIMPLEMENTED() << "Unsupported codec "
430  << video_stream_info_->codec();
431  return false;
432  }
433  std::vector<VPxFrameInfo> vpx_frames;
434  if (!vpx_parser->Parse(buffer->data(), buffer->data_size(),
435  &vpx_frames)) {
436  LOG(ERROR) << "Failed to parse vpx frame.";
437  return false;
438  }
439  if (vpx_frames.size() != 1u || !vpx_frames[0].is_keyframe) {
440  LOG(ERROR) << "The first frame should be a key frame.";
441  return false;
442  }
443 
444  VPCodecConfigurationRecord codec_config;
445  if (!video_stream_info_->codec_config().empty())
446  codec_config.ParseWebM(video_stream_info_->codec_config());
447  codec_config.MergeFrom(vpx_parser->codec_config());
448 
449  video_stream_info_->set_codec_string(
450  codec_config.GetCodecString(video_stream_info_->codec()));
451  std::vector<uint8_t> config_serialized;
452  codec_config.WriteMP4(&config_serialized);
453  video_stream_info_->set_codec_config(config_serialized);
454  streams.push_back(video_stream_info_);
455  init_cb_.Run(streams);
456  initialized_ = true;
457  }
458  } else {
459  init_cb_.Run(streams);
460  initialized_ = true;
461  }
462  }
463 
464  return track->EmitBuffer(buffer);
465 }
466 
467 WebMClusterParser::Track::Track(int track_num,
468  bool is_video,
469  int64_t default_duration,
470  const MediaParser::NewSampleCB& new_sample_cb)
471  : track_num_(track_num),
472  is_video_(is_video),
473  default_duration_(default_duration),
474  estimated_next_frame_duration_(kNoTimestamp),
475  new_sample_cb_(new_sample_cb) {
476  DCHECK(default_duration_ == kNoTimestamp || default_duration_ > 0);
477 }
478 
479 WebMClusterParser::Track::~Track() {}
480 
481 bool WebMClusterParser::Track::EmitBuffer(
482  const std::shared_ptr<MediaSample>& buffer) {
483  DVLOG(2) << "EmitBuffer() : " << track_num_
484  << " ts " << buffer->pts()
485  << " dur " << buffer->duration()
486  << " kf " << buffer->is_key_frame()
487  << " size " << buffer->data_size();
488 
489  if (last_added_buffer_missing_duration_.get()) {
490  int64_t derived_duration =
491  buffer->pts() - last_added_buffer_missing_duration_->pts();
492  last_added_buffer_missing_duration_->set_duration(derived_duration);
493 
494  DVLOG(2) << "EmitBuffer() : applied derived duration to held-back buffer : "
495  << " ts "
496  << last_added_buffer_missing_duration_->pts()
497  << " dur "
498  << last_added_buffer_missing_duration_->duration()
499  << " kf " << last_added_buffer_missing_duration_->is_key_frame()
500  << " size " << last_added_buffer_missing_duration_->data_size();
501  std::shared_ptr<MediaSample> updated_buffer =
502  last_added_buffer_missing_duration_;
503  last_added_buffer_missing_duration_ = NULL;
504  if (!EmitBufferHelp(updated_buffer))
505  return false;
506  }
507 
508  if (buffer->duration() == kNoTimestamp) {
509  last_added_buffer_missing_duration_ = buffer;
510  DVLOG(2) << "EmitBuffer() : holding back buffer that is missing duration";
511  return true;
512  }
513 
514  return EmitBufferHelp(buffer);
515 }
516 
517 bool WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() {
518  if (!last_added_buffer_missing_duration_.get())
519  return true;
520 
521  int64_t estimated_duration = GetDurationEstimate();
522  last_added_buffer_missing_duration_->set_duration(estimated_duration);
523 
524  VLOG(1) << "Track " << track_num_ << ": Estimating WebM block duration to be "
525  << estimated_duration / 1000
526  << "ms for the last (Simple)Block in the Cluster for this Track. Use "
527  "BlockGroups with BlockDurations at the end of each Track in a "
528  "Cluster to avoid estimation.";
529 
530  DVLOG(2) << " new dur : ts " << last_added_buffer_missing_duration_->pts()
531  << " dur " << last_added_buffer_missing_duration_->duration()
532  << " kf " << last_added_buffer_missing_duration_->is_key_frame()
533  << " size " << last_added_buffer_missing_duration_->data_size();
534 
535  // Don't use the applied duration as a future estimation (don't use
536  // EmitBufferHelp() here.)
537  if (!new_sample_cb_.Run(track_num_, last_added_buffer_missing_duration_))
538  return false;
539  last_added_buffer_missing_duration_ = NULL;
540  return true;
541 }
542 
543 void WebMClusterParser::Track::Reset() {
544  last_added_buffer_missing_duration_ = NULL;
545 }
546 
547 bool WebMClusterParser::Track::EmitBufferHelp(
548  const std::shared_ptr<MediaSample>& buffer) {
549  DCHECK(!last_added_buffer_missing_duration_.get());
550 
551  int64_t duration = buffer->duration();
552  if (duration < 0 || duration == kNoTimestamp) {
553  LOG(ERROR) << "Invalid buffer duration: " << duration;
554  return false;
555  }
556 
557  // The estimated frame duration is the maximum non-zero duration since the
558  // last initialization segment.
559  if (duration > 0) {
560  int64_t orig_duration_estimate = estimated_next_frame_duration_;
561  if (estimated_next_frame_duration_ == kNoTimestamp) {
562  estimated_next_frame_duration_ = duration;
563  } else {
564  estimated_next_frame_duration_ =
565  std::max(duration, estimated_next_frame_duration_);
566  }
567 
568  if (orig_duration_estimate != estimated_next_frame_duration_) {
569  DVLOG(3) << "Updated duration estimate:"
570  << orig_duration_estimate
571  << " -> "
572  << estimated_next_frame_duration_
573  << " at timestamp: "
574  << buffer->dts();
575  }
576  }
577 
578  return new_sample_cb_.Run(track_num_, buffer);
579 }
580 
581 int64_t WebMClusterParser::Track::GetDurationEstimate() {
582  int64_t duration = kNoTimestamp;
583  if (default_duration_ != kNoTimestamp) {
584  duration = default_duration_;
585  DVLOG(3) << __FUNCTION__ << " : using track default duration " << duration;
586  } else if (estimated_next_frame_duration_ != kNoTimestamp) {
587  duration = estimated_next_frame_duration_;
588  DVLOG(3) << __FUNCTION__ << " : using estimated duration " << duration;
589  } else {
590  if (is_video_) {
591  duration = kDefaultVideoBufferDurationInMs * kMicrosecondsPerMillisecond;
592  } else {
593  duration = kDefaultAudioBufferDurationInMs * kMicrosecondsPerMillisecond;
594  }
595  DVLOG(3) << __FUNCTION__ << " : using hardcoded default duration "
596  << duration;
597  }
598 
599  DCHECK_GT(duration, 0);
600  DCHECK_NE(duration, kNoTimestamp);
601  return duration;
602 }
603 
604 void WebMClusterParser::ResetTextTracks() {
605  for (TextTrackMap::iterator it = text_track_map_.begin();
606  it != text_track_map_.end();
607  ++it) {
608  it->second.Reset();
609  }
610 }
611 
612 WebMClusterParser::Track*
613 WebMClusterParser::FindTextTrack(int track_num) {
614  const TextTrackMap::iterator it = text_track_map_.find(track_num);
615 
616  if (it == text_track_map_.end())
617  return NULL;
618 
619  return &it->second;
620 }
621 
622 } // namespace media
623 } // namespace shaka
int Parse(const uint8_t *buf, int size)
base::Callback< void(const std::vector< std::shared_ptr< StreamInfo > > &stream_info)> InitCB
Definition: media_parser.h:34
base::Callback< bool(uint32_t track_id, const std::shared_ptr< MediaSample > &media_sample)> NewSampleCB
Definition: media_parser.h:43
void Reset()
Resets the state of the parser so it can start parsing a new list.
Definition: webm_parser.cc:714
int Parse(const uint8_t *buf, int size)
Definition: webm_parser.cc:719
bool Flush() WARN_UNUSED_RESULT
static void Parse(const uint8_t *payload, int payload_size, std::string *id, std::string *settings, std::string *content)
Utility function to parse the WebVTT cue from a byte stream.
static std::shared_ptr< MediaSample > CopyFrom(const uint8_t *data, size_t size, bool is_key_frame)
Definition: media_sample.cc:45
KeySource is responsible for encryption key acquisition.
Definition: key_source.h:45
DecryptorSource wraps KeySource and is responsible for decryptor management.
WebMClusterParser(int64_t timecode_scale, std::shared_ptr< AudioStreamInfo > audio_stream_info, std::shared_ptr< VideoStreamInfo > video_stream_info, int64_t audio_default_duration, int64_t video_default_duration, const WebMTracksParser::TextTracks &text_tracks, const std::set< int64_t > &ignored_tracks, const std::string &audio_encryption_key_id, const std::string &video_encryption_key_id, const MediaParser::NewSampleCB &new_sample_cb, const MediaParser::InitCB &init_cb, KeySource *decryption_key_source)
void Reset()
Resets the parser state so it can accept a new cluster.