shaka-packager/packager/media/formats/webm/webm_cluster_parser.cc

729 lines
25 KiB
C++
Raw Normal View History

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "packager/media/formats/webm/webm_cluster_parser.h"
#include <vector>
#include "packager/base/logging.h"
#include "packager/base/sys_byteorder.h"
#include "packager/media/base/decrypt_config.h"
#include "packager/media/base/timestamp.h"
#include "packager/media/filters/webvtt_util.h"
#include "packager/media/formats/webm/webm_constants.h"
#include "packager/media/formats/webm/webm_crypto_helpers.h"
#include "packager/media/formats/webm/webm_webvtt_parser.h"
// Logs only while |count| < |max|, increments |count| for each log, and warns
// in the log if |count| has just reached |max|.
// The increment is folded into the LOG_IF condition so that |count| advances
// once per emitted message; "|| true" keeps the condition true when the
// post-increment yields 0 (the very first message). |count| must therefore be
// a modifiable integer lvalue.
#define LIMITED_LOG(level, count, max)                            \
  LOG_IF(level, (count) < (max) && ((count)++ || true))           \
      << (((count) == (max))                                      \
              ? "(Log limit reached. Further similar entries "    \
                "may be suppressed): "                            \
              : "")
// Debug-log variant of the rate-limited logger: logs only while |count| <
// |max|, increments |count| for each log, and prefixes the message with a
// warning once |count| has just reached |max|.
// The increment is part of the DLOG_IF condition ("|| true" keeps the
// condition true when the post-increment yields 0). |count| must be a
// modifiable integer lvalue.
#define LIMITED_DLOG(level, count, max)                           \
  DLOG_IF(level, (count) < (max) && ((count)++ || true))          \
      << (((count) == (max))                                      \
              ? "(Log limit reached. Further similar entries "    \
                "may be suppressed): "                            \
              : "")
namespace {

// Number of microseconds in one millisecond; used to convert the hard-coded
// default buffer durations (given in milliseconds) to microseconds.
constexpr int64_t kMicrosecondsPerMillisecond = 1000;

}  // namespace
namespace edash_packager {
namespace media {
// Per-frame duration lookup table for Opus packets, indexed by the 5-bit
// "config" value taken from the top bits of the Opus TOC byte (see
// ReadOpusDuration(), which multiplies the entry by the packet's frame count).
// The table has 32 entries, one per possible config value.
// NOTE(review): values appear to be microseconds (e.g. 2500 == 2.5 ms,
// matching the frame sizes listed in RFC 6716); confirm against the header.
const uint16_t WebMClusterParser::kOpusFrameDurationsMu[] = {
10000, 20000, 40000, 60000, 10000, 20000, 40000, 60000, 10000, 20000, 40000,
60000, 10000, 20000, 10000, 20000, 2500, 5000, 10000, 20000, 2500, 5000,
10000, 20000, 2500, 5000, 10000, 20000, 2500, 5000, 10000, 20000};
// Upper bounds passed as the |max| argument of LIMITED_LOG / LIMITED_DLOG in
// this file. Kept as enumerators (rather than variables) so they are plain
// compile-time constants.
enum {
// Limits the number of LOG() calls in the path of reading encoded
// duration to avoid spamming for corrupted data. Paired with the
// |num_duration_errors_| counter.
kMaxDurationErrorLogs = 10,
// Limits the number of LOG() calls warning the user that buffer
// durations have been estimated. Paired with the |num_duration_estimates_|
// counter.
kMaxDurationEstimateLogs = 10,
};
// Builds a cluster parser for one audio track, one video track and any number
// of text tracks. Samples for every track are delivered through
// |new_sample_cb|. Blocks whose track number is in |ignored_tracks| are
// silently dropped by OnBlock().
//
// |timecode_scale| is divided by 1000.0 to obtain the factor applied to raw
// WebM timecodes (presumably converting a nanosecond-based scale into
// microseconds -- confirm against the tracks/info parser).
WebMClusterParser::WebMClusterParser(
    int64_t timecode_scale,
    int audio_track_num,
    int64_t audio_default_duration,
    int video_track_num,
    int64_t video_default_duration,
    const WebMTracksParser::TextTracks& text_tracks,
    const std::set<int64_t>& ignored_tracks,
    const std::string& audio_encryption_key_id,
    const std::string& video_encryption_key_id,
    const AudioCodec audio_codec,
    const MediaParser::NewSampleCB& new_sample_cb)
    : timecode_multiplier_(timecode_scale / 1000.0),
      ignored_tracks_(ignored_tracks),
      audio_encryption_key_id_(audio_encryption_key_id),
      video_encryption_key_id_(video_encryption_key_id),
      audio_codec_(audio_codec),
      parser_(kWebMIdCluster, this),
      cluster_start_time_(kNoTimestamp),
      audio_(audio_track_num, false, audio_default_duration, new_sample_cb),
      video_(video_track_num, true, video_default_duration, new_sample_cb) {
  // Register one non-video Track, without a default duration, for each
  // configured text track, keyed by its track number.
  for (const auto& text_track : text_tracks) {
    const auto text_track_num = text_track.first;
    text_track_map_.insert(std::make_pair(
        text_track_num,
        Track(text_track_num, false, kNoTimestamp, new_sample_cb)));
  }
}
// Nothing to release explicitly; members clean up after themselves.
WebMClusterParser::~WebMClusterParser() = default;
// Returns the parser to its "no cluster seen yet" state: clears the cluster
// bookkeeping, resets the underlying list parser, and drops any buffer the
// audio, video or text tracks were holding back.
void WebMClusterParser::Reset() {
  // Cluster-level bookkeeping; -1 / kNoTimestamp mean "not set".
  cluster_ended_ = false;
  cluster_start_time_ = kNoTimestamp;
  cluster_timecode_ = -1;
  last_block_timecode_ = -1;

  // Element parser and per-track state.
  parser_.Reset();
  audio_.Reset();
  video_.Reset();
  ResetTextTracks();
}
// Feeds |size| bytes at |buf| to the underlying list parser.
//
// Returns the value produced by |parser_|.Parse() (negative on error), or -1
// when a cluster finished without containing either a buffer or a cluster
// timecode. When a cluster completes, pending per-track duration estimates are
// applied and the parser is re-armed for the next cluster.
int WebMClusterParser::Parse(const uint8_t* buf, int size) {
  const int parse_result = parser_.Parse(buf, size);
  if (parse_result < 0) {
    cluster_ended_ = false;
    return parse_result;
  }

  cluster_ended_ = parser_.IsParsingComplete();
  if (!cluster_ended_)
    return parse_result;

  // The cluster is complete: flush any buffers still waiting for a duration.
  audio_.ApplyDurationEstimateIfNeeded();
  video_.ApplyDurationEstimateIfNeeded();

  // A cluster without buffers takes its start time from |cluster_timecode_|.
  if (cluster_start_time_ == kNoTimestamp) {
    // Not even a |cluster_timecode_| was present: signal a parse error.
    if (cluster_timecode_ < 0)
      return -1;

    cluster_start_time_ = cluster_timecode_ * timecode_multiplier_;
  }

  // Get ready to accept another cluster on the next call.
  parser_.Reset();
  last_block_timecode_ = -1;
  cluster_timecode_ = -1;
  return parse_result;
}
// Attempts to read the duration encoded in the audio payload itself.
// Returns kNoTimestamp when the codec is not supported for this purpose or
// when the payload cannot be interpreted.
//
// The payload is read assuming the *entire* stream is unencrypted, i.e. the
// special "Signal Byte" prepended to Blocks in encrypted streams is assumed
// to be absent.
// TODO: Consider parsing "Signal Byte" for encrypted streams to return
// duration for any unencrypted blocks.
int64_t WebMClusterParser::TryGetEncodedAudioDuration(
    const uint8_t* data,
    int size) {
  if (audio_codec_ != kCodecOpus) {
    // TODO: Implement duration reading for Vorbis. See motivations in
    // http://crbug.com/396634.
    return kNoTimestamp;
  }

  return ReadOpusDuration(data, size);
}
// Computes the duration of an Opus packet from its TOC byte (and, for
// "Code 3" packets, the frame-count byte that follows it).
//
// |data| / |size| describe the raw Opus packet. Returns the per-frame
// duration from kOpusFrameDurationsMu multiplied by the number of frames in
// the packet (i.e. in the same unit as that table, microseconds), or
// kNoTimestamp if the packet is too short or declares zero frames.
int64_t WebMClusterParser::ReadOpusDuration(const uint8_t* data, int size) {
  // Masks and constants for Opus packets. See
  // https://tools.ietf.org/html/rfc6716#page-14
  static const uint8_t kTocConfigMask = 0xf8;
  static const uint8_t kTocFrameCountCodeMask = 0x03;
  static const uint8_t kFrameCountMask = 0x3f;
  // Longest legal Opus packet (120 ms), expressed in microseconds so that it
  // is directly comparable with the values in kOpusFrameDurationsMu.
  static const int64_t kPacketDurationMaxUs = 120 * 1000;

  if (size < 1) {
    LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
        << "Invalid zero-byte Opus packet; demuxed block duration may be "
           "imprecise.";
    return kNoTimestamp;
  }

  // Frame count type described by last 2 bits of Opus TOC byte.
  int frame_count_type = data[0] & kTocFrameCountCodeMask;

  int frame_count = 0;
  switch (frame_count_type) {
    case 0:
      frame_count = 1;
      break;
    case 1:
    case 2:
      frame_count = 2;
      break;
    case 3:
      // Type 3 indicates an arbitrary frame count described in the next byte.
      if (size < 2) {
        LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
            << "Second byte missing from 'Code 3' Opus packet; demuxed block "
               "duration may be imprecise.";
        return kNoTimestamp;
      }

      frame_count = data[1] & kFrameCountMask;

      if (frame_count == 0) {
        LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
            << "Illegal 'Code 3' Opus packet with frame count zero; demuxed "
               "block duration may be imprecise.";
        return kNoTimestamp;
      }

      break;
    default:
      // Not reachable with a 2-bit mask; kept as a defensive fallback.
      LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
          << "Unexpected Opus frame count type: " << frame_count_type << "; "
          << "demuxed block duration may be imprecise.";
      return kNoTimestamp;
  }

  // The top five bits of the TOC byte select the entry in the duration table.
  int opusConfig = (data[0] & kTocConfigMask) >> 3;
  CHECK_GE(opusConfig, 0);
  CHECK_LT(opusConfig, static_cast<int>(arraysize(kOpusFrameDurationsMu)));

  DCHECK_GT(frame_count, 0);
  int64_t duration =
      static_cast<int64_t>(kOpusFrameDurationsMu[opusConfig]) * frame_count;

  if (duration > kPacketDurationMaxUs) {
    // Intentionally allowing packet to pass through for now. Decoder should
    // either handle or fail gracefully. LOG as breadcrumbs in case
    // things go sideways.
    LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
        << "Warning, demuxed Opus packet with encoded duration: "
        << duration << "us. Should be no greater than "
        << kPacketDurationMaxUs << "us.";
  }

  return duration;
}
// Called when a list element with ID |id| starts. Clears the state that will
// be accumulated while the list's children are parsed. Always returns |this|,
// so this object also receives the callbacks for the nested elements.
WebMParserClient* WebMClusterParser::OnListStart(int id) {
  switch (id) {
    case kWebMIdCluster:
      // New cluster: forget the previous cluster's timing.
      cluster_timecode_ = -1;
      cluster_start_time_ = kNoTimestamp;
      break;

    case kWebMIdBlockGroup:
      // New BlockGroup: no Block, duration or discard padding seen yet.
      block_data_.reset();
      block_data_size_ = -1;
      block_duration_ = -1;
      discard_padding_ = -1;
      discard_padding_set_ = false;
      break;

    case kWebMIdBlockAdditions:
      // New BlockAdditions: no BlockAddID / BlockAdditional seen yet.
      block_add_id_ = -1;
      block_additional_data_.reset();
      block_additional_data_size_ = 0;
      break;

    default:
      break;
  }
  return this;
}
// Called when the list element with ID |id| ends. Only the end of a
// BlockGroup does any work: the Block collected while parsing the group is
// handed to ParseBlock() together with the group's optional additional data,
// duration and discard padding, and the per-group state is cleared afterwards.
// Returns false if the BlockGroup had no Block or if ParseBlock() fails.
bool WebMClusterParser::OnListEnd(int id) {
  if (id != kWebMIdBlockGroup)
    return true;

  // Make sure the BlockGroup actually had a Block.
  const bool group_has_block = block_data_size_ != -1;
  if (!group_has_block) {
    LOG(ERROR) << "Block missing from BlockGroup.";
    return false;
  }

  // An absent DiscardPadding element is reported as 0.
  const int64_t effective_discard_padding =
      discard_padding_set_ ? discard_padding_ : 0;
  const bool block_parsed = ParseBlock(
      false, block_data_.get(), block_data_size_,
      block_additional_data_.get(), block_additional_data_size_,
      block_duration_, effective_discard_padding);

  // Clear everything gathered for this BlockGroup, regardless of the outcome.
  block_data_.reset();
  block_additional_data_.reset();
  block_data_size_ = -1;
  block_additional_data_size_ = 0;
  block_duration_ = -1;
  block_add_id_ = -1;
  discard_padding_ = -1;
  discard_padding_set_ = false;
  return block_parsed;
}
// Called for unsigned-integer elements. Stores the cluster Timecode,
// BlockDuration and BlockAddID values; every other ID is accepted and
// ignored. Returns false if the corresponding value was already set (i.e. is
// not -1), which rejects duplicated elements.
bool WebMClusterParser::OnUInt(int id, int64_t val) {
  // Pick the member that receives this element's value, if any.
  int64_t* target = nullptr;
  if (id == kWebMIdTimecode) {
    target = &cluster_timecode_;
  } else if (id == kWebMIdBlockDuration) {
    target = &block_duration_;
  } else if (id == kWebMIdBlockAddID) {
    target = &block_add_id_;
  }

  if (target == nullptr)
    return true;

  // -1 marks "not set yet"; anything else means the element appeared twice.
  const bool already_set = *target != -1;
  if (already_set)
    return false;

  *target = val;
  return true;
}
// Decodes the 4-byte header of a (Simple)Block -- one-byte track number,
// 16-bit signed big-endian timecode offset and a flags byte -- and forwards
// the remaining payload to OnBlock().
//
// Returns false when the block is shorter than its header, when the track
// number does not fit in a single byte (> 127), when lacing is used, or when
// OnBlock() rejects the block.
bool WebMClusterParser::ParseBlock(bool is_simple_block,
                                   const uint8_t* buf,
                                   int size,
                                   const uint8_t* additional,
                                   int additional_size,
                                   int duration,
                                   int64_t discard_padding) {
  const int kBlockHeaderSize = 4;
  if (size < kBlockHeaderSize)
    return false;

  // Return an error if the trackNum > 127. We just aren't
  // going to support large track numbers right now.
  const bool is_one_byte_track_num = (buf[0] & 0x80) != 0;
  if (!is_one_byte_track_num) {
    LOG(ERROR) << "TrackNumber over 127 not supported";
    return false;
  }

  const int track_num = buf[0] & 0x7f;
  const int flags = buf[3] & 0xff;

  // Bits 1-2 of the flags byte carry the lacing mode.
  const int lacing = (flags >> 1) & 0x3;
  if (lacing != 0) {
    LOG(ERROR) << "Lacing " << lacing << " is not supported yet.";
    return false;
  }

  // Big-endian 16-bit offset; sign extend negative timecode offsets.
  int timecode = (buf[1] << 8) | buf[2];
  if (timecode & 0x8000)
    timecode |= ~0xffff;

  const uint8_t* const payload = buf + kBlockHeaderSize;
  const int payload_size = size - kBlockHeaderSize;
  return OnBlock(is_simple_block, track_num, timecode, duration, flags,
                 payload, payload_size, additional, additional_size,
                 discard_padding);
}
// Called for binary elements inside a cluster.
//  - SimpleBlock: parsed and emitted immediately.
//  - Block: copied and kept until the enclosing BlockGroup ends (OnListEnd()).
//  - BlockAdditional: copied, prefixed with the big-endian BlockAddID.
//  - DiscardPadding: decoded as a signed big-endian integer of 1..8 bytes.
// Any other ID is accepted and ignored. Returns false for duplicated
// Block / BlockAdditional / DiscardPadding elements, for a DiscardPadding of
// invalid size, or when ParseBlock() fails.
bool WebMClusterParser::OnBinary(int id, const uint8_t* data, int size) {
  switch (id) {
    case kWebMIdSimpleBlock:
      return ParseBlock(true, data, size, NULL, 0, -1, 0);

    case kWebMIdBlock:
      if (block_data_) {
        LOG(ERROR) << "More than 1 Block in a BlockGroup is not "
                      "supported.";
        return false;
      }
      block_data_.reset(new uint8_t[size]);
      memcpy(block_data_.get(), data, size);
      block_data_size_ = size;
      return true;

    case kWebMIdBlockAdditional: {
      if (block_additional_data_) {
        // TODO: Technically, more than 1 BlockAdditional is allowed as per
        // matroska spec. But for now we don't have a use case to support
        // parsing of such files. Take a look at this again when such a case
        // arises.
        LOG(ERROR) << "More than 1 BlockAdditional in a "
                      "BlockGroup is not supported.";
        return false;
      }
      // First 8 bytes of side_data in DecoderBuffer is the BlockAddID
      // element's value in Big Endian format. This is done to mimic ffmpeg
      // demuxer's behavior.
      const uint64_t block_add_id = base::HostToNet64(block_add_id_);
      block_additional_data_size_ = size + sizeof(block_add_id);
      block_additional_data_.reset(new uint8_t[block_additional_data_size_]);
      memcpy(block_additional_data_.get(), &block_add_id,
             sizeof(block_add_id));
      // The payload follows directly after the ID prefix.
      memcpy(block_additional_data_.get() + sizeof(block_add_id), data, size);
      return true;
    }

    case kWebMIdDiscardPadding: {
      if (discard_padding_set_ || size <= 0 || size > 8)
        return false;
      discard_padding_set_ = true;

      // Read in the big-endian integer. The first byte carries the sign, so
      // it is sign-extended; the accumulation itself is done on an unsigned
      // value because left-shifting a negative signed integer is undefined
      // behaviour before C++20.
      uint64_t raw_padding = static_cast<uint64_t>(
          static_cast<int64_t>(static_cast<int8_t>(data[0])));
      for (int i = 1; i < size; ++i)
        raw_padding = (raw_padding << 8) | data[i];
      discard_padding_ = static_cast<int64_t>(raw_padding);
      return true;
    }

    default:
      return true;
  }
}
bool WebMClusterParser::OnBlock(bool is_simple_block,
int track_num,
int timecode,
int block_duration,
int flags,
const uint8_t* data,
int size,
const uint8_t* additional,
int additional_size,
int64_t discard_padding) {
DCHECK_GE(size, 0);
if (cluster_timecode_ == -1) {
LOG(ERROR) << "Got a block before cluster timecode.";
return false;
}
// TODO: Should relative negative timecode offsets be rejected? Or only when
// the absolute timecode is negative? See http://crbug.com/271794
if (timecode < 0) {
LOG(ERROR) << "Got a block with negative timecode offset " << timecode;
return false;
}
if (last_block_timecode_ != -1 && timecode < last_block_timecode_) {
LOG(ERROR) << "Got a block with a timecode before the previous block.";
return false;
}
Track* track = NULL;
StreamType stream_type = kStreamAudio;
std::string encryption_key_id;
int64_t encoded_duration = kNoTimestamp;
if (track_num == audio_.track_num()) {
track = &audio_;
encryption_key_id = audio_encryption_key_id_;
if (encryption_key_id.empty()) {
encoded_duration = TryGetEncodedAudioDuration(data, size);
}
} else if (track_num == video_.track_num()) {
track = &video_;
encryption_key_id = video_encryption_key_id_;
stream_type = kStreamVideo;
} else if (ignored_tracks_.find(track_num) != ignored_tracks_.end()) {
return true;
} else if (Track* const text_track = FindTextTrack(track_num)) {
if (is_simple_block) // BlockGroup is required for WebVTT cues
return false;
if (block_duration < 0) // not specified
return false;
track = text_track;
stream_type = kStreamText;
} else {
LOG(ERROR) << "Unexpected track number " << track_num;
return false;
}
last_block_timecode_ = timecode;
int64_t timestamp = (cluster_timecode_ + timecode) * timecode_multiplier_;
scoped_refptr<MediaSample> buffer;
if (stream_type != kStreamText) {
// The first bit of the flags is set when a SimpleBlock contains only
// keyframes. If this is a Block, then inspection of the payload is
// necessary to determine whether it contains a keyframe or not.
// http://www.matroska.org/technical/specs/index.html
bool is_keyframe =
is_simple_block ? (flags & 0x80) != 0 : track->IsKeyframe(data, size);
// Every encrypted Block has a signal byte and IV prepended to it. Current
// encrypted WebM request for comments specification is here
// http://wiki.webmproject.org/encryption/webm-encryption-rfc
scoped_ptr<DecryptConfig> decrypt_config;
int data_offset = 0;
if (!encryption_key_id.empty() &&
!WebMCreateDecryptConfig(
data, size,
reinterpret_cast<const uint8_t*>(encryption_key_id.data()),
encryption_key_id.size(),
&decrypt_config, &data_offset)) {
return false;
}
buffer = MediaSample::CopyFrom(data + data_offset, size - data_offset,
additional, additional_size, is_keyframe);
if (decrypt_config) {
// TODO(kqyang): Decrypt it if it is encrypted.
buffer->set_is_encrypted(true);
}
} else {
std::string id, settings, content;
WebMWebVTTParser::Parse(data, size, &id, &settings, &content);
std::vector<uint8_t> side_data;
MakeSideData(id.begin(), id.end(),
settings.begin(), settings.end(),
&side_data);
buffer = MediaSample::CopyFrom(
reinterpret_cast<const uint8_t*>(content.data()), content.length(),
&side_data[0], side_data.size(), true);
}
buffer->set_pts(timestamp);
if (cluster_start_time_ == kNoTimestamp)
cluster_start_time_ = timestamp;
int64_t block_duration_time_delta = kNoTimestamp;
if (block_duration >= 0) {
block_duration_time_delta = block_duration * timecode_multiplier_;
}
// Prefer encoded duration over BlockGroup->BlockDuration or
// TrackEntry->DefaultDuration when available. This layering violation is a
// workaround for http://crbug.com/396634, decreasing the likelihood of
// fall-back to rough estimation techniques for Blocks that lack a
// BlockDuration at the end of a cluster. Cross cluster durations are not
// feasible given flexibility of cluster ordering and MSE APIs. Duration
// estimation may still apply in cases of encryption and codecs for which
// we do not extract encoded duration. Within a cluster, estimates are applied
// as Block Timecode deltas, or once the whole cluster is parsed in the case
// of the last Block in the cluster. See Track::EmitBuffer and
// ApplyDurationEstimateIfNeeded().
if (encoded_duration != kNoTimestamp) {
DCHECK(encoded_duration != kInfiniteDuration);
DCHECK(encoded_duration > 0);
buffer->set_duration(encoded_duration);
DVLOG(3) << __FUNCTION__ << " : "
<< "Using encoded duration " << encoded_duration;
if (block_duration_time_delta != kNoTimestamp) {
int64_t duration_difference =
block_duration_time_delta - encoded_duration;
const auto kWarnDurationDiff = timecode_multiplier_ * 2;
if (duration_difference > kWarnDurationDiff) {
LIMITED_DLOG(INFO, num_duration_errors_, kMaxDurationErrorLogs)
<< "BlockDuration (" << block_duration_time_delta
<< "ms) differs significantly from encoded duration ("
<< encoded_duration << "ms).";
}
}
} else if (block_duration_time_delta != kNoTimestamp) {
buffer->set_duration(block_duration_time_delta);
} else {
buffer->set_duration(track->default_duration());
}
return track->EmitBuffer(buffer);
}
// Creates the per-track state. |default_duration| is either kNoTimestamp
// (no default known) or a strictly positive duration. Completed samples are
// reported through |new_sample_cb|.
WebMClusterParser::Track::Track(int track_num,
                                bool is_video,
                                int64_t default_duration,
                                const MediaParser::NewSampleCB& new_sample_cb)
    : track_num_(track_num),
      is_video_(is_video),
      default_duration_(default_duration),
      estimated_next_frame_duration_(kNoTimestamp),
      new_sample_cb_(new_sample_cb) {
  // A default duration, when provided, must be positive.
  DCHECK(default_duration == kNoTimestamp || default_duration > 0);
}
// Nothing to release explicitly; members clean up after themselves.
WebMClusterParser::Track::~Track() = default;
bool WebMClusterParser::Track::EmitBuffer(
const scoped_refptr<MediaSample>& buffer) {
DVLOG(2) << "EmitBuffer() : " << track_num_
<< " ts " << buffer->pts()
<< " dur " << buffer->duration()
<< " kf " << buffer->is_key_frame()
<< " size " << buffer->data_size();
if (last_added_buffer_missing_duration_.get()) {
int64_t derived_duration =
buffer->pts() - last_added_buffer_missing_duration_->pts();
last_added_buffer_missing_duration_->set_duration(derived_duration);
DVLOG(2) << "EmitBuffer() : applied derived duration to held-back buffer : "
<< " ts "
<< last_added_buffer_missing_duration_->pts()
<< " dur "
<< last_added_buffer_missing_duration_->duration()
<< " kf " << last_added_buffer_missing_duration_->is_key_frame()
<< " size " << last_added_buffer_missing_duration_->data_size();
scoped_refptr<MediaSample> updated_buffer =
last_added_buffer_missing_duration_;
last_added_buffer_missing_duration_ = NULL;
if (!EmitBufferHelp(updated_buffer))
return false;
}
if (buffer->duration() == kNoTimestamp) {
last_added_buffer_missing_duration_ = buffer;
DVLOG(2) << "EmitBuffer() : holding back buffer that is missing duration";
return true;
}
return EmitBufferHelp(buffer);
}
// If a buffer is still held back because it has no duration (the last block
// of this track in the cluster), assigns it the estimate from
// GetDurationEstimate() and delivers it through |new_sample_cb_|. Does nothing
// when no buffer is pending.
void WebMClusterParser::Track::ApplyDurationEstimateIfNeeded() {
  if (!last_added_buffer_missing_duration_.get())
    return;

  int64_t estimated_duration = GetDurationEstimate();
  last_added_buffer_missing_duration_->set_duration(estimated_duration);

  // For video, exposing the estimation would let splicing/overlap frame
  // processing make informed decisions downstream.
  // TODO(kqyang): Should we wait for the next cluster to set the duration?
  // last_added_buffer_missing_duration_->set_is_duration_estimated(true);

  // The estimate is in the same unit as buffer durations, i.e. microseconds
  // (see GetDurationEstimate(), which scales its millisecond defaults by
  // kMicrosecondsPerMillisecond), so label it accordingly.
  LIMITED_LOG(INFO, num_duration_estimates_, kMaxDurationEstimateLogs)
      << "Estimating WebM block duration to be "
      << estimated_duration
      << "us for the last (Simple)Block in the Cluster for this Track. Use "
         "BlockGroups with BlockDurations at the end of each Track in a "
         "Cluster to avoid estimation.";

  DVLOG(2) << __FUNCTION__ << " new dur : ts "
           << last_added_buffer_missing_duration_->pts()
           << " dur "
           << last_added_buffer_missing_duration_->duration()
           << " kf " << last_added_buffer_missing_duration_->is_key_frame()
           << " size " << last_added_buffer_missing_duration_->data_size();

  // Don't use the applied duration as a future estimation (don't use
  // EmitBufferHelp() here.)
  new_sample_cb_.Run(track_num_, last_added_buffer_missing_duration_);
  last_added_buffer_missing_duration_ = NULL;
}
// Drops the buffer (if any) that was being held back for lack of a duration.
// The running duration estimate is intentionally left untouched.
void WebMClusterParser::Track::Reset() {
  last_added_buffer_missing_duration_ = nullptr;
}
// Reports whether the block payload |data| (|size| bytes) is a keyframe.
// Non-video tracks always answer true; video payloads are checked against the
// VP8 keyframe header layout.
bool WebMClusterParser::Track::IsKeyframe(const uint8_t* data, int size) const {
  // For now, assume that all blocks are keyframes for datatypes other than
  // video. This is a valid assumption for Vorbis, WebVTT, & Opus.
  if (!is_video_)
    return true;

  // The block must be big enough for the minimal keyframe header size.
  const int kMinKeyframeHeaderSize = 7;

  // A keyframe needs all of:
  //  - enough bytes for the header,
  //  - a 0 in the LSb of the first byte, and
  //  - the VP8 keyframe start code in bytes 3..5.
  // http://tools.ietf.org/html/rfc6386 Section 19.1
  // Short-circuit evaluation guarantees |data| is not read when it is too
  // short.
  return size >= kMinKeyframeHeaderSize &&
         (data[0] & 0x01) == 0 &&
         data[3] == 0x9d && data[4] == 0x01 && data[5] == 0x2a;
}
// Delivers |buffer| through |new_sample_cb_| after folding its duration into
// the running estimate used for buffers that lack one. Must only be called
// while no buffer is being held back. Returns false (without delivering) if
// the buffer's duration is negative or unset.
bool WebMClusterParser::Track::EmitBufferHelp(
    const scoped_refptr<MediaSample>& buffer) {
  DCHECK(!last_added_buffer_missing_duration_.get());

  const int64_t duration = buffer->duration();
  if (duration < 0 || duration == kNoTimestamp) {
    LOG(ERROR) << "Invalid buffer duration: " << duration;
    return false;
  }

  // The estimated frame duration is the minimum (for audio) or the maximum
  // (for video) non-zero duration since the last initialization segment. The
  // minimum is used for audio to ensure frame durations aren't overestimated,
  // triggering unnecessary frame splicing. For video, splicing does not apply,
  // so maximum is used and overlap is simply resolved by showing the
  // later of the overlapping frames at its given PTS, effectively trimming down
  // the over-estimated duration of the previous frame.
  // TODO: Use max for audio and disable splicing whenever estimated buffers are
  // encountered.
  if (duration > 0) {
    const int64_t previous_estimate = estimated_next_frame_duration_;

    // The first positive duration seeds the estimate; later ones refine it.
    int64_t refined_estimate = duration;
    if (previous_estimate != kNoTimestamp) {
      refined_estimate = is_video_ ? std::max(duration, previous_estimate)
                                   : std::min(duration, previous_estimate);
    }
    estimated_next_frame_duration_ = refined_estimate;

    if (previous_estimate != estimated_next_frame_duration_) {
      DVLOG(3) << "Updated duration estimate:"
               << previous_estimate
               << " -> "
               << estimated_next_frame_duration_
               << " at timestamp: "
               << buffer->dts();
    }
  }

  new_sample_cb_.Run(track_num_, buffer);
  return true;
}
// Returns the duration to assign to a buffer that has none: the running
// estimate gathered from previously emitted buffers if there is one, otherwise
// the hard-coded per-media-type default (given in milliseconds and scaled by
// kMicrosecondsPerMillisecond). The result is always positive.
int64_t WebMClusterParser::Track::GetDurationEstimate() {
  int64_t estimate = estimated_next_frame_duration_;
  if (estimate == kNoTimestamp) {
    DVLOG(3) << __FUNCTION__ << " : using hardcoded default duration";
    estimate = is_video_
                   ? kDefaultVideoBufferDurationInMs *
                         kMicrosecondsPerMillisecond
                   : kDefaultAudioBufferDurationInMs *
                         kMicrosecondsPerMillisecond;
  } else {
    DVLOG(3) << __FUNCTION__ << " : using estimated duration";
  }

  DCHECK(estimate > 0);
  DCHECK(estimate != kNoTimestamp);
  return estimate;
}
// Resets every registered text track (see Track::Reset()).
void WebMClusterParser::ResetTextTracks() {
  for (auto& track_entry : text_track_map_)
    track_entry.second.Reset();
}
// Looks up the text track registered under |track_num|.
// Returns a pointer into |text_track_map_|, or a null pointer if there is no
// such text track.
WebMClusterParser::Track*
WebMClusterParser::FindTextTrack(int track_num) {
  const auto found = text_track_map_.find(track_num);
  const bool is_known_track = found != text_track_map_.end();
  return is_known_track ? &found->second : nullptr;
}
} // namespace media
} // namespace edash_packager