DASH Media Packaging SDK
webm_cluster_parser.cc
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "packager/media/formats/webm/webm_cluster_parser.h"
6 
7 #include <vector>
8 
9 #include "packager/base/logging.h"
10 #include "packager/base/sys_byteorder.h"
11 #include "packager/media/base/decrypt_config.h"
12 #include "packager/media/base/timestamp.h"
13 #include "packager/media/codecs/vp8_parser.h"
14 #include "packager/media/codecs/vp9_parser.h"
15 #include "packager/media/codecs/webvtt_util.h"
16 #include "packager/media/formats/webm/webm_constants.h"
17 #include "packager/media/formats/webm/webm_crypto_helpers.h"
18 #include "packager/media/formats/webm/webm_webvtt_parser.h"
19 
20 namespace shaka {
21 namespace media {
22 namespace {
23 
24 const int64_t kMicrosecondsPerMillisecond = 1000;
25 
26 } // namespace
27 
28 WebMClusterParser::WebMClusterParser(
29  int64_t timecode_scale,
30  scoped_refptr<AudioStreamInfo> audio_stream_info,
31  scoped_refptr<VideoStreamInfo> video_stream_info,
32  int64_t audio_default_duration,
33  int64_t video_default_duration,
34  const WebMTracksParser::TextTracks& text_tracks,
35  const std::set<int64_t>& ignored_tracks,
36  const std::string& audio_encryption_key_id,
37  const std::string& video_encryption_key_id,
38  const MediaParser::NewSampleCB& new_sample_cb,
39  const MediaParser::InitCB& init_cb,
40  KeySource* decryption_key_source)
41  : timecode_multiplier_(timecode_scale / 1000.0),
42  audio_stream_info_(audio_stream_info),
43  video_stream_info_(video_stream_info),
44  ignored_tracks_(ignored_tracks),
45  audio_encryption_key_id_(audio_encryption_key_id),
46  video_encryption_key_id_(video_encryption_key_id),
47  parser_(kWebMIdCluster, this),
48  initialized_(false),
49  init_cb_(init_cb),
50  cluster_start_time_(kNoTimestamp),
51  audio_(audio_stream_info ? audio_stream_info->track_id() : -1,
52  false,
53  audio_default_duration,
54  new_sample_cb),
55  video_(video_stream_info ? video_stream_info->track_id() : -1,
56  true,
57  video_default_duration,
58  new_sample_cb) {
59  if (decryption_key_source)
60  decryptor_source_.reset(new DecryptorSource(decryption_key_source));
61  for (WebMTracksParser::TextTracks::const_iterator it = text_tracks.begin();
62  it != text_tracks.end();
63  ++it) {
64  text_track_map_.insert(std::make_pair(
65  it->first, Track(it->first, false, kNoTimestamp, new_sample_cb)));
66  }
67 }
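// Unit note for |timecode_multiplier_|: it converts Matroska timecode ticks
// into the microsecond timestamps used throughout this file. With the
// Matroska default TimecodeScale of 1,000,000 (nanoseconds per tick), the
// multiplier is 1,000,000 / 1000.0 = 1000, so a block timecode of 40 ticks
// maps to 40 * 1000 = 40,000 microseconds (40 ms).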
68 
69 WebMClusterParser::~WebMClusterParser() {}
70 
71 void WebMClusterParser::Reset() {
72  last_block_timecode_ = -1;
73  cluster_timecode_ = -1;
74  cluster_start_time_ = kNoTimestamp;
75  cluster_ended_ = false;
76  parser_.Reset();
77  audio_.Reset();
78  video_.Reset();
79  ResetTextTracks();
80 }
81 
82 bool WebMClusterParser::Flush() {
83  // Estimate the duration of the last frame if necessary.
84  bool audio_result = audio_.ApplyDurationEstimateIfNeeded();
85  bool video_result = video_.ApplyDurationEstimateIfNeeded();
86  Reset();
87  return audio_result && video_result;
88 }
89 
90 int WebMClusterParser::Parse(const uint8_t* buf, int size) {
91  int result = parser_.Parse(buf, size);
92 
93  if (result < 0) {
94  cluster_ended_ = false;
95  return result;
96  }
97 
98  cluster_ended_ = parser_.IsParsingComplete();
99  if (cluster_ended_) {
100  // If there were no buffers in this cluster, set the cluster start time to
101  // be the |cluster_timecode_|.
102  if (cluster_start_time_ == kNoTimestamp) {
103  // If the cluster did not even have a |cluster_timecode_|, signal parse
104  // error.
105  if (cluster_timecode_ < 0)
106  return -1;
107 
108  cluster_start_time_ = cluster_timecode_ * timecode_multiplier_;
109  }
110 
111  // Reset the parser if we're done parsing so that
112  // it is ready to accept another cluster on the next
113  // call.
114  parser_.Reset();
115 
116  last_block_timecode_ = -1;
117  cluster_timecode_ = -1;
118  }
119 
120  return result;
121 }
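// A minimal sketch of how a caller might drive this parser (not part of this
// file; |cluster_parser|, |data| and |size| are illustrative names). The
// return value mirrors the underlying WebMListParser contract: negative on
// error, 0 when more input is needed, otherwise the number of bytes parsed.
//
//   int consumed = cluster_parser->Parse(data, size);
//   if (consumed < 0) {
//     // Parse error; abort this stream.
//   } else {
//     // Advance the input by |consumed| bytes and call Parse() again once
//     // more data is available.
//   }
//   // After the last cluster, Flush() estimates the final frame durations:
//   bool flushed_ok = cluster_parser->Flush();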
122 
123 WebMParserClient* WebMClusterParser::OnListStart(int id) {
124  if (id == kWebMIdCluster) {
125  cluster_timecode_ = -1;
126  cluster_start_time_ = kNoTimestamp;
127  } else if (id == kWebMIdBlockGroup) {
128  block_data_.reset();
129  block_data_size_ = -1;
130  block_duration_ = -1;
131  discard_padding_ = -1;
132  discard_padding_set_ = false;
133  reference_block_set_ = false;
134  } else if (id == kWebMIdBlockAdditions) {
135  block_add_id_ = -1;
136  block_additional_data_.reset();
137  block_additional_data_size_ = 0;
138  }
139 
140  return this;
141 }
142 
143 bool WebMClusterParser::OnListEnd(int id) {
144  if (id != kWebMIdBlockGroup)
145  return true;
146 
147  // Make sure the BlockGroup actually had a Block.
148  if (block_data_size_ == -1) {
149  LOG(ERROR) << "Block missing from BlockGroup.";
150  return false;
151  }
152 
153  bool result = ParseBlock(
154  false, block_data_.get(), block_data_size_, block_additional_data_.get(),
155  block_additional_data_size_, block_duration_,
156  discard_padding_set_ ? discard_padding_ : 0, reference_block_set_);
157  block_data_.reset();
158  block_data_size_ = -1;
159  block_duration_ = -1;
160  block_add_id_ = -1;
161  block_additional_data_.reset();
162  block_additional_data_size_ = 0;
163  discard_padding_ = -1;
164  discard_padding_set_ = false;
165  reference_block_set_ = false;
166  return result;
167 }
168 
169 bool WebMClusterParser::OnUInt(int id, int64_t val) {
170  int64_t* dst;
171  switch (id) {
172  case kWebMIdTimecode:
173  dst = &cluster_timecode_;
174  break;
175  case kWebMIdBlockDuration:
176  dst = &block_duration_;
177  break;
178  case kWebMIdBlockAddID:
179  dst = &block_add_id_;
180  break;
181  default:
182  return true;
183  }
184  if (*dst != -1)
185  return false;
186  *dst = val;
187  return true;
188 }
189 
190 bool WebMClusterParser::ParseBlock(bool is_simple_block,
191  const uint8_t* buf,
192  int size,
193  const uint8_t* additional,
194  int additional_size,
195  int duration,
196  int64_t discard_padding,
197  bool reference_block_set) {
198  if (size < 4)
199  return false;
200 
201  // Return an error if the trackNum > 127. We just aren't
202  // going to support large track numbers right now.
203  if (!(buf[0] & 0x80)) {
204  LOG(ERROR) << "TrackNumber over 127 not supported";
205  return false;
206  }
207 
208  int track_num = buf[0] & 0x7f;
209  int timecode = buf[1] << 8 | buf[2];
210  int flags = buf[3] & 0xff;
211  int lacing = (flags >> 1) & 0x3;
212 
213  if (lacing) {
214  LOG(ERROR) << "Lacing " << lacing << " is not supported yet.";
215  return false;
216  }
217 
218  // Sign extend negative timecode offsets.
219  if (timecode & 0x8000)
220  timecode |= ~0xffff;
221 
222  // The first bit of the flags is set when a SimpleBlock contains only
223  // keyframes. If this is a Block, then keyframe is inferred by the absence of
224  // the ReferenceBlock Element.
225  // http://www.matroska.org/technical/specs/index.html
226  bool is_key_frame =
227  is_simple_block ? (flags & 0x80) != 0 : !reference_block_set;
228 
229  const uint8_t* frame_data = buf + 4;
230  int frame_size = size - (frame_data - buf);
231  return OnBlock(is_simple_block, track_num, timecode, duration, frame_data,
232  frame_size, additional, additional_size, discard_padding,
233  is_key_frame);
234 }
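// Worked example for the fixed 4-byte block header parsed above: given the
// header bytes {0x81, 0xFF, 0xFE, 0x80}, buf[0] = 0x81 has its high bit set,
// so the track number is 0x81 & 0x7f = 1; the relative timecode is
// 0xFF << 8 | 0xFE = 0xFFFE, which has bit 0x8000 set and sign-extends to
// -2 ticks; flags = 0x80 gives lacing = (0x80 >> 1) & 0x3 = 0 (none) and,
// for a SimpleBlock, marks the frame as a keyframe.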
235 
236 bool WebMClusterParser::OnBinary(int id, const uint8_t* data, int size) {
237  switch (id) {
238  case kWebMIdSimpleBlock:
239  return ParseBlock(true, data, size, NULL, 0, -1, 0, false);
240 
241  case kWebMIdBlock:
242  if (block_data_) {
243  LOG(ERROR) << "More than 1 Block in a BlockGroup is not "
244  "supported.";
245  return false;
246  }
247  block_data_.reset(new uint8_t[size]);
248  memcpy(block_data_.get(), data, size);
249  block_data_size_ = size;
250  return true;
251 
252  case kWebMIdBlockAdditional: {
253  uint64_t block_add_id = base::HostToNet64(block_add_id_);
254  if (block_additional_data_) {
255  // TODO: Technically, more than 1 BlockAdditional is allowed as per
256  // matroska spec. But for now we don't have a use case to support
257  // parsing of such files. Take a look at this again when such a case
258  // arises.
259  LOG(ERROR) << "More than 1 BlockAdditional in a "
260  "BlockGroup is not supported.";
261  return false;
262  }
263  // The first 8 bytes of the sample's side data are the BlockAddID
264  // element's value in big-endian format. This is done to mimic the
265  // ffmpeg demuxer's behavior.
266  block_additional_data_size_ = size + sizeof(block_add_id);
267  block_additional_data_.reset(new uint8_t[block_additional_data_size_]);
268  memcpy(block_additional_data_.get(), &block_add_id,
269  sizeof(block_add_id));
270  memcpy(block_additional_data_.get() + 8, data, size);
271  return true;
272  }
273  case kWebMIdDiscardPadding: {
274  if (discard_padding_set_ || size <= 0 || size > 8)
275  return false;
276  discard_padding_set_ = true;
277 
278  // Read in the big-endian integer.
279  discard_padding_ = static_cast<int8_t>(data[0]);
280  for (int i = 1; i < size; ++i)
281  discard_padding_ = (discard_padding_ << 8) | data[i];
282 
283  return true;
284  }
285  case kWebMIdReferenceBlock:
286  // We use ReferenceBlock to determine whether the current Block contains a
287  // keyframe or not. Other than that, we don't care about the value of the
288  // ReferenceBlock element itself.
289  reference_block_set_ = true;
290  return true;
291  default:
292  return true;
293  }
294 }
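// DiscardPadding above is read as a big-endian signed integer; per the
// Matroska spec the value is in nanoseconds. For example, the three bytes
// {0x0F, 0x42, 0x40} decode step by step to 15, then 3906, and finally
// 1,000,000 ns (1 ms of padding); a first byte with its top bit set would
// make the result negative via the int8_t cast.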
295 
296 bool WebMClusterParser::OnBlock(bool is_simple_block,
297  int track_num,
298  int timecode,
299  int block_duration,
300  const uint8_t* data,
301  int size,
302  const uint8_t* additional,
303  int additional_size,
304  int64_t discard_padding,
305  bool is_key_frame) {
306  DCHECK_GE(size, 0);
307  if (cluster_timecode_ == -1) {
308  LOG(ERROR) << "Got a block before cluster timecode.";
309  return false;
310  }
311 
312  // TODO: Should relative negative timecode offsets be rejected? Or only when
313  // the absolute timecode is negative? See http://crbug.com/271794
314  if (timecode < 0) {
315  LOG(ERROR) << "Got a block with negative timecode offset " << timecode;
316  return false;
317  }
318 
319  if (last_block_timecode_ != -1 && timecode < last_block_timecode_) {
320  LOG(ERROR) << "Got a block with a timecode before the previous block.";
321  return false;
322  }
323 
324  Track* track = NULL;
325  StreamType stream_type = kStreamUnknown;
326  std::string encryption_key_id;
327  if (track_num == audio_.track_num()) {
328  track = &audio_;
329  encryption_key_id = audio_encryption_key_id_;
330  stream_type = kStreamAudio;
331  } else if (track_num == video_.track_num()) {
332  track = &video_;
333  encryption_key_id = video_encryption_key_id_;
334  stream_type = kStreamVideo;
335  } else if (ignored_tracks_.find(track_num) != ignored_tracks_.end()) {
336  return true;
337  } else if (Track* const text_track = FindTextTrack(track_num)) {
338  if (is_simple_block) // BlockGroup is required for WebVTT cues
339  return false;
340  if (block_duration < 0) // not specified
341  return false;
342  track = text_track;
343  stream_type = kStreamText;
344  } else {
345  LOG(ERROR) << "Unexpected track number " << track_num;
346  return false;
347  }
348  DCHECK_NE(stream_type, kStreamUnknown);
349 
350  last_block_timecode_ = timecode;
351 
352  int64_t timestamp = (cluster_timecode_ + timecode) * timecode_multiplier_;
353 
354  scoped_refptr<MediaSample> buffer;
355  if (stream_type != kStreamText) {
356  // Every encrypted Block has a signal byte and IV prepended to it. The
357  // current encrypted WebM RFC is at
358  // http://wiki.webmproject.org/encryption/webm-encryption-rfc
359  std::unique_ptr<DecryptConfig> decrypt_config;
360  int data_offset = 0;
361  if (!encryption_key_id.empty() &&
362  !WebMCreateDecryptConfig(
363  data, size,
364  reinterpret_cast<const uint8_t*>(encryption_key_id.data()),
365  encryption_key_id.size(),
366  &decrypt_config, &data_offset)) {
367  return false;
368  }
369 
370  buffer = MediaSample::CopyFrom(data + data_offset, size - data_offset,
371  additional, additional_size, is_key_frame);
372 
373  if (decrypt_config) {
374  if (!decryptor_source_) {
375  LOG(ERROR) << "Encrypted media sample encountered, but decryption is "
376  "not enabled";
377  return false;
378  }
379  if (!decryptor_source_->DecryptSampleBuffer(decrypt_config.get(),
380  buffer->writable_data(),
381  buffer->data_size())) {
382  LOG(ERROR) << "Cannot decrypt samples";
383  return false;
384  }
385  }
386  } else {
387  std::string id, settings, content;
388  WebMWebVTTParser::Parse(data, size, &id, &settings, &content);
389 
390  std::vector<uint8_t> side_data;
391  MakeSideData(id.begin(), id.end(),
392  settings.begin(), settings.end(),
393  &side_data);
394 
395  buffer = MediaSample::CopyFrom(
396  reinterpret_cast<const uint8_t*>(content.data()), content.length(),
397  &side_data[0], side_data.size(), true);
398  }
399 
400  buffer->set_dts(timestamp);
401  buffer->set_pts(timestamp);
402  if (cluster_start_time_ == kNoTimestamp)
403  cluster_start_time_ = timestamp;
404  buffer->set_duration(block_duration > 0
405  ? (block_duration * timecode_multiplier_)
406  : kNoTimestamp);
407 
408  if (!init_cb_.is_null() && !initialized_) {
409  std::vector<scoped_refptr<StreamInfo>> streams;
410  if (audio_stream_info_)
411  streams.push_back(audio_stream_info_);
412  if (video_stream_info_) {
413  if (stream_type == kStreamVideo) {
414  std::unique_ptr<VPxParser> vpx_parser;
415  switch (video_stream_info_->codec()) {
416  case kCodecVP8:
417  vpx_parser.reset(new VP8Parser);
418  break;
419  case kCodecVP9:
420  vpx_parser.reset(new VP9Parser);
421  break;
422  default:
423  NOTIMPLEMENTED() << "Unsupported codec "
424  << video_stream_info_->codec();
425  return false;
426  }
427  std::vector<VPxFrameInfo> vpx_frames;
428  if (!vpx_parser->Parse(buffer->data(), buffer->data_size(),
429  &vpx_frames)) {
430  LOG(ERROR) << "Failed to parse vpx frame.";
431  return false;
432  }
433  if (vpx_frames.size() != 1u || !vpx_frames[0].is_keyframe) {
434  LOG(ERROR) << "The first frame should be a key frame.";
435  return false;
436  }
437 
438  VPCodecConfigurationRecord codec_config;
439  if (!video_stream_info_->codec_config().empty())
440  codec_config.ParseWebM(video_stream_info_->codec_config());
441  codec_config.MergeFrom(vpx_parser->codec_config());
442 
443  video_stream_info_->set_codec_string(
444  codec_config.GetCodecString(video_stream_info_->codec()));
445  std::vector<uint8_t> config_serialized;
446  codec_config.WriteMP4(&config_serialized);
447  video_stream_info_->set_codec_config(config_serialized);
448  streams.push_back(video_stream_info_);
449  init_cb_.Run(streams);
450  initialized_ = true;
451  }
452  } else {
453  init_cb_.Run(streams);
454  initialized_ = true;
455  }
456  }
457 
458  return track->EmitBuffer(buffer);
459 }
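// Note on the init path above: init_cb_ fires at most once, on the first
// emitted sample. For streams without video it fires right away; when a
// video stream is present it is deferred until the first video keyframe,
// which is run through VP8Parser/VP9Parser so the VPCodecConfigurationRecord
// (and the resulting codec string) can be refined before the stream infos
// are reported.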
460 
461 WebMClusterParser::Track::Track(int track_num,
462  bool is_video,
463  int64_t default_duration,
464  const MediaParser::NewSampleCB& new_sample_cb)
465  : track_num_(track_num),
466  is_video_(is_video),
467  default_duration_(default_duration),
468  estimated_next_frame_duration_(kNoTimestamp),
469  new_sample_cb_(new_sample_cb) {
470  DCHECK(default_duration_ == kNoTimestamp || default_duration_ > 0);
471 }
472 
473 WebMClusterParser::Track::~Track() {}
474 
475 bool WebMClusterParser::Track::EmitBuffer(
476  const scoped_refptr<MediaSample>& buffer) {
477  DVLOG(2) << "EmitBuffer() : " << track_num_
478  << " ts " << buffer->pts()
479  << " dur " << buffer->duration()
480  << " kf " << buffer->is_key_frame()
481  << " size " << buffer->data_size();
482 
483  if (last_added_buffer_missing_duration_.get()) {
484  int64_t derived_duration =
485  buffer->pts() - last_added_buffer_missing_duration_->pts();
486  last_added_buffer_missing_duration_->set_duration(derived_duration);
487 
488  DVLOG(2) << "EmitBuffer() : applied derived duration to held-back buffer : "
489  << " ts "
490  << last_added_buffer_missing_duration_->pts()
491  << " dur "
492  << last_added_buffer_missing_duration_->duration()
493  << " kf " << last_added_buffer_missing_duration_->is_key_frame()
494  << " size " << last_added_buffer_missing_duration_->data_size();
495  scoped_refptr<MediaSample> updated_buffer =
496  last_added_buffer_missing_duration_;
497  last_added_buffer_missing_duration_ = NULL;
498  if (!EmitBufferHelp(updated_buffer))
499  return false;
500  }
501 
502  if (buffer->duration() == kNoTimestamp) {
503  last_added_buffer_missing_duration_ = buffer;
504  DVLOG(2) << "EmitBuffer() : holding back buffer that is missing duration";
505  return true;
506  }
507 
508  return EmitBufferHelp(buffer);
509 }
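// Duration back-fill example for the hold-back logic above: a block that
// arrives at pts 0 without a BlockDuration is held; when the next block on
// the same track arrives at pts 33000 (microseconds), the held buffer is
// assigned the derived duration 33000 - 0 = 33000 and emitted, and the new
// block either carries its own duration or becomes the held buffer in turn.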
510 
511 bool WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() {
512  if (!last_added_buffer_missing_duration_.get())
513  return true;
514 
515  int64_t estimated_duration = GetDurationEstimate();
516  last_added_buffer_missing_duration_->set_duration(estimated_duration);
517 
518  VLOG(1) << "Track " << track_num_ << ": Estimating WebM block duration to be "
519  << estimated_duration / 1000
520  << "ms for the last (Simple)Block in the Cluster for this Track. Use "
521  "BlockGroups with BlockDurations at the end of each Track in a "
522  "Cluster to avoid estimation.";
523 
524  DVLOG(2) << " new dur : ts " << last_added_buffer_missing_duration_->pts()
525  << " dur " << last_added_buffer_missing_duration_->duration()
526  << " kf " << last_added_buffer_missing_duration_->is_key_frame()
527  << " size " << last_added_buffer_missing_duration_->data_size();
528 
529  // Don't use the applied duration as a future estimation (don't use
530  // EmitBufferHelp() here.)
531  if (!new_sample_cb_.Run(track_num_, last_added_buffer_missing_duration_))
532  return false;
533  last_added_buffer_missing_duration_ = NULL;
534  return true;
535 }
536 
537 void WebMClusterParser::Track::Reset() {
538  last_added_buffer_missing_duration_ = NULL;
539 }
540 
541 bool WebMClusterParser::Track::EmitBufferHelp(
542  const scoped_refptr<MediaSample>& buffer) {
543  DCHECK(!last_added_buffer_missing_duration_.get());
544 
545  int64_t duration = buffer->duration();
546  if (duration < 0 || duration == kNoTimestamp) {
547  LOG(ERROR) << "Invalid buffer duration: " << duration;
548  return false;
549  }
550 
551  // The estimated frame duration is the maximum non-zero duration since the
552  // last initialization segment.
553  if (duration > 0) {
554  int64_t orig_duration_estimate = estimated_next_frame_duration_;
555  if (estimated_next_frame_duration_ == kNoTimestamp) {
556  estimated_next_frame_duration_ = duration;
557  } else {
558  estimated_next_frame_duration_ =
559  std::max(duration, estimated_next_frame_duration_);
560  }
561 
562  if (orig_duration_estimate != estimated_next_frame_duration_) {
563  DVLOG(3) << "Updated duration estimate:"
564  << orig_duration_estimate
565  << " -> "
566  << estimated_next_frame_duration_
567  << " at timestamp: "
568  << buffer->dts();
569  }
570  }
571 
572  return new_sample_cb_.Run(track_num_, buffer);
573 }
574 
575 int64_t WebMClusterParser::Track::GetDurationEstimate() {
576  int64_t duration = kNoTimestamp;
577  if (default_duration_ != kNoTimestamp) {
578  duration = default_duration_;
579  DVLOG(3) << __FUNCTION__ << " : using track default duration " << duration;
580  } else if (estimated_next_frame_duration_ != kNoTimestamp) {
581  duration = estimated_next_frame_duration_;
582  DVLOG(3) << __FUNCTION__ << " : using estimated duration " << duration;
583  } else {
584  if (is_video_) {
585  duration = kDefaultVideoBufferDurationInMs * kMicrosecondsPerMillisecond;
586  } else {
587  duration = kDefaultAudioBufferDurationInMs * kMicrosecondsPerMillisecond;
588  }
589  DVLOG(3) << __FUNCTION__ << " : using hardcoded default duration "
590  << duration;
591  }
592 
593  DCHECK_GT(duration, 0);
594  DCHECK_NE(duration, kNoTimestamp);
595  return duration;
596 }
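// Fallback order used above when the last buffer in a cluster is missing a
// BlockDuration: the track's default duration from the TrackEntry, then the
// largest non-zero duration seen so far (estimated_next_frame_duration_),
// then the hardcoded kDefaultVideoBufferDurationInMs /
// kDefaultAudioBufferDurationInMs defaults converted to microseconds.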
597 
598 void WebMClusterParser::ResetTextTracks() {
599  for (TextTrackMap::iterator it = text_track_map_.begin();
600  it != text_track_map_.end();
601  ++it) {
602  it->second.Reset();
603  }
604 }
605 
606 WebMClusterParser::Track*
607 WebMClusterParser::FindTextTrack(int track_num) {
608  const TextTrackMap::iterator it = text_track_map_.find(track_num);
609 
610  if (it == text_track_map_.end())
611  return NULL;
612 
613  return &it->second;
614 }
615 
616 } // namespace media
617 } // namespace shaka