// Copyright 2014 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "packager/media/formats/webm/webm_cluster_parser.h" #include #include "packager/base/logging.h" #include "packager/base/sys_byteorder.h" #include "packager/media/base/decrypt_config.h" #include "packager/media/base/timestamp.h" #include "packager/media/filters/webvtt_util.h" #include "packager/media/formats/webm/webm_constants.h" #include "packager/media/formats/webm/webm_crypto_helpers.h" #include "packager/media/formats/webm/webm_webvtt_parser.h" // Logs only while |count| < |max|, increments |count| for each log, and warns // in the log if |count| has just reached |max|. #define LIMITED_LOG(level, count, max) \ LOG_IF(level, (count) < (max)) \ << (((count) + 1 == (max)) \ ? "(Log limit reached. Further similar entries " \ "may be suppressed): " \ : "") #define LIMITED_DLOG(level, count, max) \ DLOG_IF(level, (count) < (max)) \ << (((count) + 1 == (max)) \ ? "(Log limit reached. Further similar entries " \ "may be suppressed): " \ : "") namespace { const int64_t kMicrosecondsPerMillisecond = 1000; } // namespace namespace edash_packager { namespace media { const uint16_t WebMClusterParser::kOpusFrameDurationsMu[] = { 10000, 20000, 40000, 60000, 10000, 20000, 40000, 60000, 10000, 20000, 40000, 60000, 10000, 20000, 10000, 20000, 2500, 5000, 10000, 20000, 2500, 5000, 10000, 20000, 2500, 5000, 10000, 20000, 2500, 5000, 10000, 20000}; enum { // Limits the number of LOG() calls in the path of reading encoded // duration to avoid spamming for corrupted data. kMaxDurationErrorLogs = 10, // Limits the number of LOG() calls warning the user that buffer // durations have been estimated. kMaxDurationEstimateLogs = 10, }; WebMClusterParser::WebMClusterParser( int64_t timecode_scale, int audio_track_num, int64_t audio_default_duration, int video_track_num, int64_t video_default_duration, const WebMTracksParser::TextTracks& text_tracks, const std::set& ignored_tracks, const std::string& audio_encryption_key_id, const std::string& video_encryption_key_id, const AudioCodec audio_codec, const MediaParser::NewSampleCB& new_sample_cb) : timecode_multiplier_(timecode_scale / 1000.0), ignored_tracks_(ignored_tracks), audio_encryption_key_id_(audio_encryption_key_id), video_encryption_key_id_(video_encryption_key_id), audio_codec_(audio_codec), parser_(kWebMIdCluster, this), cluster_start_time_(kNoTimestamp), audio_(audio_track_num, false, audio_default_duration, new_sample_cb), video_(video_track_num, true, video_default_duration, new_sample_cb) { for (WebMTracksParser::TextTracks::const_iterator it = text_tracks.begin(); it != text_tracks.end(); ++it) { text_track_map_.insert(std::make_pair( it->first, Track(it->first, false, kNoTimestamp, new_sample_cb))); } } WebMClusterParser::~WebMClusterParser() {} void WebMClusterParser::Reset() { last_block_timecode_ = -1; cluster_timecode_ = -1; cluster_start_time_ = kNoTimestamp; cluster_ended_ = false; parser_.Reset(); audio_.Reset(); video_.Reset(); ResetTextTracks(); } int WebMClusterParser::Parse(const uint8_t* buf, int size) { int result = parser_.Parse(buf, size); if (result < 0) { cluster_ended_ = false; return result; } cluster_ended_ = parser_.IsParsingComplete(); if (cluster_ended_) { audio_.ApplyDurationEstimateIfNeeded(); video_.ApplyDurationEstimateIfNeeded(); // If there were no buffers in this cluster, set the cluster start time to // be the |cluster_timecode_|. if (cluster_start_time_ == kNoTimestamp) { // If the cluster did not even have a |cluster_timecode_|, signal parse // error. if (cluster_timecode_ < 0) return -1; cluster_start_time_ = cluster_timecode_ * timecode_multiplier_; } // Reset the parser if we're done parsing so that // it is ready to accept another cluster on the next // call. parser_.Reset(); last_block_timecode_ = -1; cluster_timecode_ = -1; } return result; } int64_t WebMClusterParser::TryGetEncodedAudioDuration( const uint8_t* data, int size) { // Duration is currently read assuming the *entire* stream is unencrypted. // The special "Signal Byte" prepended to Blocks in encrypted streams is // assumed to not be present. // TODO: Consider parsing "Signal Byte" for encrypted streams to return // duration for any unencrypted blocks. if (audio_codec_ == kCodecOpus) { return ReadOpusDuration(data, size); } // TODO: Implement duration reading for Vorbis. See motivations in // http://crbug.com/396634. return kNoTimestamp; } int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) { // Masks and constants for Opus packets. See // https://tools.ietf.org/html/rfc6716#page-14 static const uint8_t kTocConfigMask = 0xf8; static const uint8_t kTocFrameCountCodeMask = 0x03; static const uint8_t kFrameCountMask = 0x3f; static const int64_t kPacketDurationMax = 120; if (size < 1) { LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "Invalid zero-byte Opus packet; demuxed block duration may be " "imprecise."; return kNoTimestamp; } // Frame count type described by last 2 bits of Opus TOC byte. int frame_count_type = data[0] & kTocFrameCountCodeMask; int frame_count = 0; switch (frame_count_type) { case 0: frame_count = 1; break; case 1: case 2: frame_count = 2; break; case 3: // Type 3 indicates an arbitrary frame count described in the next byte. if (size < 2) { LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "Second byte missing from 'Code 3' Opus packet; demuxed block " "duration may be imprecise."; return kNoTimestamp; } frame_count = data[1] & kFrameCountMask; if (frame_count == 0) { LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "Illegal 'Code 3' Opus packet with frame count zero; demuxed " "block duration may be imprecise."; return kNoTimestamp; } break; default: LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "Unexpected Opus frame count type: " << frame_count_type << "; " << "demuxed block duration may be imprecise."; return kNoTimestamp; } int opusConfig = (data[0] & kTocConfigMask) >> 3; CHECK_GE(opusConfig, 0); CHECK_LT(opusConfig, static_cast(arraysize(kOpusFrameDurationsMu))); DCHECK_GT(frame_count, 0); int64_t duration = kOpusFrameDurationsMu[opusConfig] * frame_count; if (duration > kPacketDurationMax) { // Intentionally allowing packet to pass through for now. Decoder should // either handle or fail gracefully. LOG as breadcrumbs in case // things go sideways. LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "Warning, demuxed Opus packet with encoded duration: " << duration << "ms. Should be no greater than " << kPacketDurationMax << "ms."; } return duration; } WebMParserClient* WebMClusterParser::OnListStart(int id) { if (id == kWebMIdCluster) { cluster_timecode_ = -1; cluster_start_time_ = kNoTimestamp; } else if (id == kWebMIdBlockGroup) { block_data_.reset(); block_data_size_ = -1; block_duration_ = -1; discard_padding_ = -1; discard_padding_set_ = false; } else if (id == kWebMIdBlockAdditions) { block_add_id_ = -1; block_additional_data_.reset(); block_additional_data_size_ = 0; } return this; } bool WebMClusterParser::OnListEnd(int id) { if (id != kWebMIdBlockGroup) return true; // Make sure the BlockGroup actually had a Block. if (block_data_size_ == -1) { LOG(ERROR) << "Block missing from BlockGroup."; return false; } bool result = ParseBlock(false, block_data_.get(), block_data_size_, block_additional_data_.get(), block_additional_data_size_, block_duration_, discard_padding_set_ ? discard_padding_ : 0); block_data_.reset(); block_data_size_ = -1; block_duration_ = -1; block_add_id_ = -1; block_additional_data_.reset(); block_additional_data_size_ = 0; discard_padding_ = -1; discard_padding_set_ = false; return result; } bool WebMClusterParser::OnUInt(int id, int64_t val) { int64_t* dst; switch (id) { case kWebMIdTimecode: dst = &cluster_timecode_; break; case kWebMIdBlockDuration: dst = &block_duration_; break; case kWebMIdBlockAddID: dst = &block_add_id_; break; default: return true; } if (*dst != -1) return false; *dst = val; return true; } bool WebMClusterParser::ParseBlock(bool is_simple_block, const uint8_t* buf, int size, const uint8_t* additional, int additional_size, int duration, int64_t discard_padding) { if (size < 4) return false; // Return an error if the trackNum > 127. We just aren't // going to support large track numbers right now. if (!(buf[0] & 0x80)) { LOG(ERROR) << "TrackNumber over 127 not supported"; return false; } int track_num = buf[0] & 0x7f; int timecode = buf[1] << 8 | buf[2]; int flags = buf[3] & 0xff; int lacing = (flags >> 1) & 0x3; if (lacing) { LOG(ERROR) << "Lacing " << lacing << " is not supported yet."; return false; } // Sign extend negative timecode offsets. if (timecode & 0x8000) timecode |= ~0xffff; const uint8_t* frame_data = buf + 4; int frame_size = size - (frame_data - buf); return OnBlock(is_simple_block, track_num, timecode, duration, flags, frame_data, frame_size, additional, additional_size, discard_padding); } bool WebMClusterParser::OnBinary(int id, const uint8_t* data, int size) { switch (id) { case kWebMIdSimpleBlock: return ParseBlock(true, data, size, NULL, 0, -1, 0); case kWebMIdBlock: if (block_data_) { LOG(ERROR) << "More than 1 Block in a BlockGroup is not " "supported."; return false; } block_data_.reset(new uint8_t[size]); memcpy(block_data_.get(), data, size); block_data_size_ = size; return true; case kWebMIdBlockAdditional: { uint64_t block_add_id = base::HostToNet64(block_add_id_); if (block_additional_data_) { // TODO: Technically, more than 1 BlockAdditional is allowed as per // matroska spec. But for now we don't have a use case to support // parsing of such files. Take a look at this again when such a case // arises. LOG(ERROR) << "More than 1 BlockAdditional in a " "BlockGroup is not supported."; return false; } // First 8 bytes of side_data in DecoderBuffer is the BlockAddID // element's value in Big Endian format. This is done to mimic ffmpeg // demuxer's behavior. block_additional_data_size_ = size + sizeof(block_add_id); block_additional_data_.reset(new uint8_t[block_additional_data_size_]); memcpy(block_additional_data_.get(), &block_add_id, sizeof(block_add_id)); memcpy(block_additional_data_.get() + 8, data, size); return true; } case kWebMIdDiscardPadding: { if (discard_padding_set_ || size <= 0 || size > 8) return false; discard_padding_set_ = true; // Read in the big-endian integer. discard_padding_ = static_cast(data[0]); for (int i = 1; i < size; ++i) discard_padding_ = (discard_padding_ << 8) | data[i]; return true; } default: return true; } } bool WebMClusterParser::OnBlock(bool is_simple_block, int track_num, int timecode, int block_duration, int flags, const uint8_t* data, int size, const uint8_t* additional, int additional_size, int64_t discard_padding) { DCHECK_GE(size, 0); if (cluster_timecode_ == -1) { LOG(ERROR) << "Got a block before cluster timecode."; return false; } // TODO: Should relative negative timecode offsets be rejected? Or only when // the absolute timecode is negative? See http://crbug.com/271794 if (timecode < 0) { LOG(ERROR) << "Got a block with negative timecode offset " << timecode; return false; } if (last_block_timecode_ != -1 && timecode < last_block_timecode_) { LOG(ERROR) << "Got a block with a timecode before the previous block."; return false; } Track* track = NULL; StreamType stream_type = kStreamAudio; std::string encryption_key_id; int64_t encoded_duration = kNoTimestamp; if (track_num == audio_.track_num()) { track = &audio_; encryption_key_id = audio_encryption_key_id_; if (encryption_key_id.empty()) { encoded_duration = TryGetEncodedAudioDuration(data, size); } } else if (track_num == video_.track_num()) { track = &video_; encryption_key_id = video_encryption_key_id_; stream_type = kStreamVideo; } else if (ignored_tracks_.find(track_num) != ignored_tracks_.end()) { return true; } else if (Track* const text_track = FindTextTrack(track_num)) { if (is_simple_block) // BlockGroup is required for WebVTT cues return false; if (block_duration < 0) // not specified return false; track = text_track; stream_type = kStreamText; } else { LOG(ERROR) << "Unexpected track number " << track_num; return false; } last_block_timecode_ = timecode; int64_t timestamp = (cluster_timecode_ + timecode) * timecode_multiplier_; scoped_refptr buffer; if (stream_type != kStreamText) { // The first bit of the flags is set when a SimpleBlock contains only // keyframes. If this is a Block, then inspection of the payload is // necessary to determine whether it contains a keyframe or not. // http://www.matroska.org/technical/specs/index.html bool is_keyframe = is_simple_block ? (flags & 0x80) != 0 : track->IsKeyframe(data, size); // Every encrypted Block has a signal byte and IV prepended to it. Current // encrypted WebM request for comments specification is here // http://wiki.webmproject.org/encryption/webm-encryption-rfc scoped_ptr decrypt_config; int data_offset = 0; if (!encryption_key_id.empty() && !WebMCreateDecryptConfig( data, size, reinterpret_cast(encryption_key_id.data()), encryption_key_id.size(), &decrypt_config, &data_offset)) { return false; } buffer = MediaSample::CopyFrom(data + data_offset, size - data_offset, additional, additional_size, is_keyframe); if (decrypt_config) { // TODO(kqyang): Decrypt it if it is encrypted. buffer->set_is_encrypted(true); } } else { std::string id, settings, content; WebMWebVTTParser::Parse(data, size, &id, &settings, &content); std::vector side_data; MakeSideData(id.begin(), id.end(), settings.begin(), settings.end(), &side_data); buffer = MediaSample::CopyFrom( reinterpret_cast(content.data()), content.length(), &side_data[0], side_data.size(), true); } buffer->set_pts(timestamp); if (cluster_start_time_ == kNoTimestamp) cluster_start_time_ = timestamp; int64_t block_duration_time_delta = kNoTimestamp; if (block_duration >= 0) { block_duration_time_delta = block_duration * timecode_multiplier_; } // Prefer encoded duration over BlockGroup->BlockDuration or // TrackEntry->DefaultDuration when available. This layering violation is a // workaround for http://crbug.com/396634, decreasing the likelihood of // fall-back to rough estimation techniques for Blocks that lack a // BlockDuration at the end of a cluster. Cross cluster durations are not // feasible given flexibility of cluster ordering and MSE APIs. Duration // estimation may still apply in cases of encryption and codecs for which // we do not extract encoded duration. Within a cluster, estimates are applied // as Block Timecode deltas, or once the whole cluster is parsed in the case // of the last Block in the cluster. See Track::EmitBuffer and // ApplyDurationEstimateIfNeeded(). if (encoded_duration != kNoTimestamp) { DCHECK(encoded_duration != kInfiniteDuration); DCHECK(encoded_duration > 0); buffer->set_duration(encoded_duration); DVLOG(3) << __FUNCTION__ << " : " << "Using encoded duration " << encoded_duration; if (block_duration_time_delta != kNoTimestamp) { int64_t duration_difference = block_duration_time_delta - encoded_duration; const auto kWarnDurationDiff = timecode_multiplier_ * 2; if (duration_difference > kWarnDurationDiff) { LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs) << "BlockDuration (" << block_duration_time_delta << "ms) differs significantly from encoded duration (" << encoded_duration << "ms)."; } } } else if (block_duration_time_delta != kNoTimestamp) { buffer->set_duration(block_duration_time_delta); } else { buffer->set_duration(track->default_duration()); } return track->EmitBuffer(buffer); } WebMClusterParser::Track::Track(int track_num, bool is_video, int64_t default_duration, const MediaParser::NewSampleCB& new_sample_cb) : track_num_(track_num), is_video_(is_video), default_duration_(default_duration), estimated_next_frame_duration_(kNoTimestamp), new_sample_cb_(new_sample_cb) { DCHECK(default_duration_ == kNoTimestamp || default_duration_ > 0); } WebMClusterParser::Track::~Track() {} bool WebMClusterParser::Track::EmitBuffer( const scoped_refptr& buffer) { DVLOG(2) << "EmitBuffer() : " << track_num_ << " ts " << buffer->pts() << " dur " << buffer->duration() << " kf " << buffer->is_key_frame() << " size " << buffer->data_size(); if (last_added_buffer_missing_duration_.get()) { int64_t derived_duration = buffer->pts() - last_added_buffer_missing_duration_->pts(); last_added_buffer_missing_duration_->set_duration(derived_duration); DVLOG(2) << "EmitBuffer() : applied derived duration to held-back buffer : " << " ts " << last_added_buffer_missing_duration_->pts() << " dur " << last_added_buffer_missing_duration_->duration() << " kf " << last_added_buffer_missing_duration_->is_key_frame() << " size " << last_added_buffer_missing_duration_->data_size(); scoped_refptr updated_buffer = last_added_buffer_missing_duration_; last_added_buffer_missing_duration_ = NULL; if (!EmitBufferHelp(updated_buffer)) return false; } if (buffer->duration() == kNoTimestamp) { last_added_buffer_missing_duration_ = buffer; DVLOG(2) << "EmitBuffer() : holding back buffer that is missing duration"; return true; } return EmitBufferHelp(buffer); } void WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() { if (!last_added_buffer_missing_duration_.get()) return; int64_t estimated_duration = GetDurationEstimate(); last_added_buffer_missing_duration_->set_duration(estimated_duration); if (is_video_) { // Exposing estimation so splicing/overlap frame processing can make // informed decisions downstream. // TODO(kqyang): Should we wait for the next cluster to set the duration? // last_added_buffer_missing_duration_->set_is_duration_estimated(true); } LIMITED_LOG(INFO, num_duration_estimates_, kMaxDurationEstimateLogs) << "Estimating WebM block duration to be " << estimated_duration << "ms for the last (Simple)Block in the Cluster for this Track. Use " "BlockGroups with BlockDurations at the end of each Track in a " "Cluster to avoid estimation."; DVLOG(2) << __FUNCTION__ << " new dur : ts " << last_added_buffer_missing_duration_->pts() << " dur " << last_added_buffer_missing_duration_->duration() << " kf " << last_added_buffer_missing_duration_->is_key_frame() << " size " << last_added_buffer_missing_duration_->data_size(); // Don't use the applied duration as a future estimation (don't use // EmitBufferHelp() here.) new_sample_cb_.Run(track_num_, last_added_buffer_missing_duration_); last_added_buffer_missing_duration_ = NULL; } void WebMClusterParser::Track::Reset() { last_added_buffer_missing_duration_ = NULL; } bool WebMClusterParser::Track::IsKeyframe(const uint8_t* data, int size) const { // For now, assume that all blocks are keyframes for datatypes other than // video. This is a valid assumption for Vorbis, WebVTT, & Opus. if (!is_video_) return true; // Make sure the block is big enough for the minimal keyframe header size. if (size < 7) return false; // The LSb of the first byte must be a 0 for a keyframe. // http://tools.ietf.org/html/rfc6386 Section 19.1 if ((data[0] & 0x01) != 0) return false; // Verify VP8 keyframe startcode. // http://tools.ietf.org/html/rfc6386 Section 19.1 if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) return false; return true; } bool WebMClusterParser::Track::EmitBufferHelp( const scoped_refptr& buffer) { DCHECK(!last_added_buffer_missing_duration_.get()); int64_t duration = buffer->duration(); if (duration < 0 || duration == kNoTimestamp) { LOG(ERROR) << "Invalid buffer duration: " << duration; return false; } // The estimated frame duration is the minimum (for audio) or the maximum // (for video) non-zero duration since the last initialization segment. The // minimum is used for audio to ensure frame durations aren't overestimated, // triggering unnecessary frame splicing. For video, splicing does not apply, // so maximum is used and overlap is simply resolved by showing the // later of the overlapping frames at its given PTS, effectively trimming down // the over-estimated duration of the previous frame. // TODO: Use max for audio and disable splicing whenever estimated buffers are // encountered. if (duration > 0) { int64_t orig_duration_estimate = estimated_next_frame_duration_; if (estimated_next_frame_duration_ == kNoTimestamp) { estimated_next_frame_duration_ = duration; } else if (is_video_) { estimated_next_frame_duration_ = std::max(duration, estimated_next_frame_duration_); } else { estimated_next_frame_duration_ = std::min(duration, estimated_next_frame_duration_); } if (orig_duration_estimate != estimated_next_frame_duration_) { DVLOG(3) << "Updated duration estimate:" << orig_duration_estimate << " -> " << estimated_next_frame_duration_ << " at timestamp: " << buffer->dts(); } } new_sample_cb_.Run(track_num_, buffer); return true; } int64_t WebMClusterParser::Track::GetDurationEstimate() { int64_t duration = estimated_next_frame_duration_; if (duration != kNoTimestamp) { DVLOG(3) << __FUNCTION__ << " : using estimated duration"; } else { DVLOG(3) << __FUNCTION__ << " : using hardcoded default duration"; if (is_video_) { duration = kDefaultVideoBufferDurationInMs * kMicrosecondsPerMillisecond; } else { duration = kDefaultAudioBufferDurationInMs * kMicrosecondsPerMillisecond; } } DCHECK(duration > 0); DCHECK(duration != kNoTimestamp); return duration; } void WebMClusterParser::ResetTextTracks() { for (TextTrackMap::iterator it = text_track_map_.begin(); it != text_track_map_.end(); ++it) { it->second.Reset(); } } WebMClusterParser::Track* WebMClusterParser::FindTextTrack(int track_num) { const TextTrackMap::iterator it = text_track_map_.find(track_num); if (it == text_track_map_.end()) return NULL; return &it->second; } } // namespace media } // namespace edash_packager