shaka-packager/packager/media/formats/mp4/mp4_muxer.cc

529 lines
19 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2014 Google Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd
#include "packager/media/formats/mp4/mp4_muxer.h"
#include "packager/base/time/clock.h"
#include "packager/base/time/time.h"
#include "packager/file/file.h"
#include "packager/media/base/aes_encryptor.h"
#include "packager/media/base/audio_stream_info.h"
#include "packager/media/base/fourccs.h"
#include "packager/media/base/key_source.h"
#include "packager/media/base/media_sample.h"
#include "packager/media/base/text_stream_info.h"
#include "packager/media/base/video_stream_info.h"
#include "packager/media/codecs/es_descriptor.h"
#include "packager/media/event/muxer_listener.h"
#include "packager/media/formats/mp4/box_definitions.h"
#include "packager/media/formats/mp4/multi_segment_segmenter.h"
#include "packager/media/formats/mp4/single_segment_segmenter.h"
namespace shaka {
namespace media {
namespace mp4 {
namespace {
// Sets the range start and end value from offset and size.
// |start| and |end| are for byte-range-spec specified in RFC2616.
void SetStartAndEndFromOffsetAndSize(size_t offset,
size_t size,
Range* range) {
DCHECK(range);
range->start = static_cast<uint32_t>(offset);
// Note that ranges are inclusive. So we need - 1.
range->end = range->start + static_cast<uint32_t>(size) - 1;
}
FourCC CodecToFourCC(Codec codec, H26xStreamFormat h26x_stream_format) {
switch (codec) {
case kCodecH264:
return h26x_stream_format ==
H26xStreamFormat::kNalUnitStreamWithParameterSetNalus
? FOURCC_avc3
: FOURCC_avc1;
case kCodecH265:
return h26x_stream_format ==
H26xStreamFormat::kNalUnitStreamWithParameterSetNalus
? FOURCC_hev1
: FOURCC_hvc1;
case kCodecVP8:
return FOURCC_vp08;
case kCodecVP9:
return FOURCC_vp09;
case kCodecVP10:
return FOURCC_vp10;
case kCodecAAC:
return FOURCC_mp4a;
case kCodecAC3:
return FOURCC_ac_3;
case kCodecDTSC:
return FOURCC_dtsc;
case kCodecDTSH:
return FOURCC_dtsh;
case kCodecDTSL:
return FOURCC_dtsl;
case kCodecDTSE:
return FOURCC_dtse;
case kCodecDTSM:
return FOURCC_dtsm;
case kCodecEAC3:
return FOURCC_ec_3;
case kCodecFlac:
return FOURCC_fLaC;
case kCodecOpus:
return FOURCC_Opus;
default:
return FOURCC_NULL;
}
}
void GenerateSinf(FourCC old_type,
const EncryptionConfig& encryption_config,
ProtectionSchemeInfo* sinf) {
sinf->format.format = old_type;
DCHECK_NE(encryption_config.protection_scheme, FOURCC_NULL);
sinf->type.type = encryption_config.protection_scheme;
// The version of cenc implemented here. CENC 4.
const int kCencSchemeVersion = 0x00010000;
sinf->type.version = kCencSchemeVersion;
auto& track_encryption = sinf->info.track_encryption;
track_encryption.default_is_protected = 1;
track_encryption.default_crypt_byte_block =
encryption_config.crypt_byte_block;
track_encryption.default_skip_byte_block = encryption_config.skip_byte_block;
switch (encryption_config.protection_scheme) {
case FOURCC_cenc:
case FOURCC_cbc1:
DCHECK_EQ(track_encryption.default_crypt_byte_block, 0u);
DCHECK_EQ(track_encryption.default_skip_byte_block, 0u);
// CENCv3 10.1 cenc AES-CTR scheme and 10.2 cbc1 AES-CBC scheme:
// The version of the Track Encryption Box (tenc) SHALL be 0.
track_encryption.version = 0;
break;
case FOURCC_cbcs:
case FOURCC_cens:
if (track_encryption.default_skip_byte_block == 0) {
// Some clients, e.g. Safari v11.0.3 does not like having
// crypt_byte_block as a non-zero value when skip_byte_block is zero.
track_encryption.default_crypt_byte_block = 0;
}
// CENCv3 10.3 cens AES-CTR subsample pattern encryption scheme and
// 10.4 cbcs AES-CBC subsample pattern encryption scheme:
// The version of the Track Encryption Box (tenc) SHALL be 1.
track_encryption.version = 1;
break;
default:
NOTREACHED() << "Unexpected protection scheme "
<< encryption_config.protection_scheme;
}
track_encryption.default_per_sample_iv_size =
encryption_config.per_sample_iv_size;
track_encryption.default_constant_iv = encryption_config.constant_iv;
track_encryption.default_kid = encryption_config.key_id;
}
} // namespace
MP4Muxer::MP4Muxer(const MuxerOptions& options) : Muxer(options) {}
MP4Muxer::~MP4Muxer() {}
Status MP4Muxer::InitializeMuxer() {
DCHECK(!streams().empty());
std::unique_ptr<FileType> ftyp(new FileType);
std::unique_ptr<Movie> moov(new Movie);
ftyp->major_brand = FOURCC_isom;
ftyp->compatible_brands.push_back(FOURCC_iso8);
ftyp->compatible_brands.push_back(FOURCC_mp41);
ftyp->compatible_brands.push_back(FOURCC_dash);
if (streams().size() == 1) {
FourCC codec_fourcc = FOURCC_NULL;
if (streams()[0]->stream_type() == kStreamVideo) {
codec_fourcc =
CodecToFourCC(streams()[0]->codec(),
static_cast<const VideoStreamInfo*>(streams()[0].get())
->h26x_stream_format());
if (codec_fourcc != FOURCC_NULL)
ftyp->compatible_brands.push_back(codec_fourcc);
}
// CMAF allows only one track/stream per file.
// CMAF requires single initialization switching for AVC3/HEV1, which is not
// supported yet.
if (codec_fourcc != FOURCC_avc3 && codec_fourcc != FOURCC_hev1)
ftyp->compatible_brands.push_back(FOURCC_cmfc);
}
moov->header.creation_time = IsoTimeNow();
moov->header.modification_time = IsoTimeNow();
moov->header.next_track_id = static_cast<uint32_t>(streams().size()) + 1;
moov->tracks.resize(streams().size());
moov->extends.tracks.resize(streams().size());
// Initialize tracks.
for (uint32_t i = 0; i < streams().size(); ++i) {
const StreamInfo* stream = streams()[i].get();
Track& trak = moov->tracks[i];
trak.header.track_id = i + 1;
TrackExtends& trex = moov->extends.tracks[i];
trex.track_id = trak.header.track_id;
trex.default_sample_description_index = 1;
bool generate_trak_result = false;
switch (stream->stream_type()) {
case kStreamVideo:
generate_trak_result = GenerateVideoTrak(
static_cast<const VideoStreamInfo*>(stream), &trak, i + 1);
break;
case kStreamAudio:
generate_trak_result = GenerateAudioTrak(
static_cast<const AudioStreamInfo*>(stream), &trak, i + 1);
break;
case kStreamText:
generate_trak_result = GenerateTextTrak(
static_cast<const TextStreamInfo*>(stream), &trak, i + 1);
break;
default:
NOTIMPLEMENTED() << "Not implemented for stream type: "
<< stream->stream_type();
}
if (!generate_trak_result)
return Status(error::MUXER_FAILURE, "Failed to generate trak.");
if (stream->is_encrypted() && options().mp4_params.include_pssh_in_stream) {
const auto& key_system_info = stream->encryption_config().key_system_info;
moov->pssh.resize(key_system_info.size());
for (size_t j = 0; j < key_system_info.size(); j++)
moov->pssh[j].raw_box = key_system_info[j].psshs;
}
}
if (options().segment_template.empty()) {
segmenter_.reset(new SingleSegmentSegmenter(options(), std::move(ftyp),
std::move(moov)));
} else {
segmenter_.reset(
new MultiSegmentSegmenter(options(), std::move(ftyp), std::move(moov)));
}
const Status segmenter_initialized =
segmenter_->Initialize(streams(), muxer_listener(), progress_listener());
if (!segmenter_initialized.ok())
return segmenter_initialized;
FireOnMediaStartEvent();
return Status::OK;
}
Status MP4Muxer::Finalize() {
DCHECK(segmenter_);
Status segmenter_finalized = segmenter_->Finalize();
if (!segmenter_finalized.ok())
return segmenter_finalized;
FireOnMediaEndEvent();
LOG(INFO) << "MP4 file '" << options().output_file_name << "' finalized.";
return Status::OK;
}
Status MP4Muxer::AddSample(size_t stream_id, const MediaSample& sample) {
DCHECK(segmenter_);
return segmenter_->AddSample(stream_id, sample);
}
Status MP4Muxer::FinalizeSegment(size_t stream_id,
const SegmentInfo& segment_info) {
DCHECK(segmenter_);
VLOG(3) << "Finalize " << (segment_info.is_subsegment ? "sub" : "")
<< "segment " << segment_info.start_timestamp << " duration "
<< segment_info.duration;
return segmenter_->FinalizeSegment(stream_id, segment_info);
}
void MP4Muxer::InitializeTrak(const StreamInfo* info, Track* trak) {
int64_t now = IsoTimeNow();
trak->header.creation_time = now;
trak->header.modification_time = now;
trak->header.duration = 0;
trak->media.header.creation_time = now;
trak->media.header.modification_time = now;
trak->media.header.timescale = info->time_scale();
trak->media.header.duration = 0;
if (!info->language().empty()) {
// Strip off the subtag, if any.
std::string main_language = info->language();
size_t dash = main_language.find('-');
if (dash != std::string::npos) {
main_language.erase(dash);
}
// ISO-639-2/T main language code should be 3 characters.
if (main_language.size() != 3) {
LOG(WARNING) << "'" << main_language << "' is not a valid ISO-639-2 "
<< "language code, ignoring.";
} else {
trak->media.header.language.code = main_language;
}
}
}
bool MP4Muxer::GenerateVideoTrak(const VideoStreamInfo* video_info,
Track* trak,
uint32_t track_id) {
InitializeTrak(video_info, trak);
// width and height specify the track's visual presentation size as
// fixed-point 16.16 values.
uint32_t pixel_width = video_info->pixel_width();
uint32_t pixel_height = video_info->pixel_height();
if (pixel_width == 0 || pixel_height == 0) {
LOG(WARNING) << "pixel width/height are not set. Assuming 1:1.";
pixel_width = 1;
pixel_height = 1;
}
const double sample_aspect_ratio =
static_cast<double>(pixel_width) / pixel_height;
trak->header.width = video_info->width() * sample_aspect_ratio * 0x10000;
trak->header.height = video_info->height() * 0x10000;
VideoSampleEntry video;
video.format =
CodecToFourCC(video_info->codec(), video_info->h26x_stream_format());
video.width = video_info->width();
video.height = video_info->height();
video.codec_configuration.data = video_info->codec_config();
if (pixel_width != 1 || pixel_height != 1) {
video.pixel_aspect.h_spacing = pixel_width;
video.pixel_aspect.v_spacing = pixel_height;
}
SampleDescription& sample_description =
trak->media.information.sample_table.description;
sample_description.type = kVideo;
sample_description.video_entries.push_back(video);
if (video_info->is_encrypted()) {
if (video_info->has_clear_lead()) {
// Add a second entry for clear content.
sample_description.video_entries.push_back(video);
}
// Convert the first entry to an encrypted entry.
VideoSampleEntry& entry = sample_description.video_entries[0];
GenerateSinf(entry.format, video_info->encryption_config(), &entry.sinf);
entry.format = FOURCC_encv;
}
return true;
}
bool MP4Muxer::GenerateAudioTrak(const AudioStreamInfo* audio_info,
Track* trak,
uint32_t track_id) {
InitializeTrak(audio_info, trak);
trak->header.volume = 0x100;
AudioSampleEntry audio;
audio.format =
CodecToFourCC(audio_info->codec(), H26xStreamFormat::kUnSpecified);
switch(audio_info->codec()){
case kCodecAAC:
audio.esds.es_descriptor.set_object_type(
ObjectType::kISO_14496_3); // MPEG4 AAC.
audio.esds.es_descriptor.set_esid(track_id);
audio.esds.es_descriptor.set_decoder_specific_info(
audio_info->codec_config());
audio.esds.es_descriptor.set_max_bitrate(audio_info->max_bitrate());
audio.esds.es_descriptor.set_avg_bitrate(audio_info->avg_bitrate());
break;
case kCodecDTSC:
case kCodecDTSH:
case kCodecDTSL:
case kCodecDTSE:
case kCodecDTSM:
audio.ddts.extra_data = audio_info->codec_config();
audio.ddts.max_bitrate = audio_info->max_bitrate();
audio.ddts.avg_bitrate = audio_info->avg_bitrate();
audio.ddts.sampling_frequency = audio_info->sampling_frequency();
audio.ddts.pcm_sample_depth = audio_info->sample_bits();
break;
case kCodecAC3:
audio.dac3.data = audio_info->codec_config();
break;
case kCodecEAC3:
audio.dec3.data = audio_info->codec_config();
break;
case kCodecFlac:
audio.dfla.data = audio_info->codec_config();
break;
case kCodecOpus:
audio.dops.opus_identification_header = audio_info->codec_config();
break;
default:
NOTIMPLEMENTED() << " Unsupported audio codec " << audio_info->codec();
return false;
}
if (audio_info->codec() == kCodecAC3 || audio_info->codec() == kCodecEAC3) {
// AC3 and EC3 does not fill in actual channel count and sample size in
// sample description entry. Instead, two constants are used.
audio.channelcount = 2;
audio.samplesize = 16;
} else {
audio.channelcount = audio_info->num_channels();
audio.samplesize = audio_info->sample_bits();
}
audio.samplerate = audio_info->sampling_frequency();
SampleTable& sample_table = trak->media.information.sample_table;
SampleDescription& sample_description = sample_table.description;
sample_description.type = kAudio;
sample_description.audio_entries.push_back(audio);
if (audio_info->is_encrypted()) {
if (audio_info->has_clear_lead()) {
// Add a second entry for clear content.
sample_description.audio_entries.push_back(audio);
}
// Convert the first entry to an encrypted entry.
AudioSampleEntry& entry = sample_description.audio_entries[0];
GenerateSinf(entry.format, audio_info->encryption_config(), &entry.sinf);
entry.format = FOURCC_enca;
}
// Opus requires at least one sample group description box and at least one
// sample to group box with grouping type 'roll' within sample table box.
if (audio_info->codec() == kCodecOpus) {
sample_table.sample_group_descriptions.resize(1);
SampleGroupDescription& sample_group_description =
sample_table.sample_group_descriptions.back();
sample_group_description.grouping_type = FOURCC_roll;
sample_group_description.audio_roll_recovery_entries.resize(1);
// The roll distance is expressed in sample units and always takes negative
// values.
const uint64_t kNanosecondsPerSecond = 1000000000ull;
sample_group_description.audio_roll_recovery_entries[0].roll_distance =
(0 - (audio_info->seek_preroll_ns() * audio.samplerate +
kNanosecondsPerSecond / 2)) /
kNanosecondsPerSecond;
sample_table.sample_to_groups.resize(1);
SampleToGroup& sample_to_group = sample_table.sample_to_groups.back();
sample_to_group.grouping_type = FOURCC_roll;
sample_to_group.entries.resize(1);
SampleToGroupEntry& sample_to_group_entry = sample_to_group.entries.back();
// All samples are in track fragments.
sample_to_group_entry.sample_count = 0;
sample_to_group_entry.group_description_index =
SampleToGroupEntry::kTrackGroupDescriptionIndexBase + 1;
} else if (audio_info->seek_preroll_ns() != 0) {
LOG(WARNING) << "Unexpected seek preroll for codec " << audio_info->codec();
}
return true;
}
bool MP4Muxer::GenerateTextTrak(const TextStreamInfo* text_info,
Track* trak,
uint32_t track_id) {
InitializeTrak(text_info, trak);
if (text_info->codec_string() == "wvtt") {
// Handle WebVTT.
TextSampleEntry webvtt;
webvtt.format = FOURCC_wvtt;
webvtt.config.config.assign(text_info->codec_config().begin(),
text_info->codec_config().end());
// TODO(rkuroiwa): This should be the source file URI(s). Putting bogus
// string for now so that the box will be there for samples with overlapping
// cues.
webvtt.label.source_label = "source_label";
SampleDescription& sample_description =
trak->media.information.sample_table.description;
sample_description.type = kText;
sample_description.text_entries.push_back(webvtt);
return true;
}
NOTIMPLEMENTED() << text_info->codec_string()
<< " handling not implemented yet.";
return false;
}
base::Optional<Range> MP4Muxer::GetInitRangeStartAndEnd() {
size_t range_offset = 0;
size_t range_size = 0;
const bool has_range = segmenter_->GetInitRange(&range_offset, &range_size);
if (!has_range)
return base::nullopt;
Range range;
SetStartAndEndFromOffsetAndSize(range_offset, range_size, &range);
return range;
}
base::Optional<Range> MP4Muxer::GetIndexRangeStartAndEnd() {
size_t range_offset = 0;
size_t range_size = 0;
const bool has_range = segmenter_->GetIndexRange(&range_offset, &range_size);
if (!has_range)
return base::nullopt;
Range range;
SetStartAndEndFromOffsetAndSize(range_offset, range_size, &range);
return range;
}
void MP4Muxer::FireOnMediaStartEvent() {
if (!muxer_listener())
return;
if (streams().size() > 1) {
LOG(ERROR) << "MuxerListener cannot take more than 1 stream.";
return;
}
DCHECK(!streams().empty()) << "Media started without a stream.";
const uint32_t timescale = segmenter_->GetReferenceTimeScale();
muxer_listener()->OnMediaStart(options(), *streams().front(), timescale,
MuxerListener::kContainerMp4);
}
void MP4Muxer::FireOnMediaEndEvent() {
if (!muxer_listener())
return;
MuxerListener::MediaRanges media_range;
media_range.init_range = GetInitRangeStartAndEnd();
media_range.index_range = GetIndexRangeStartAndEnd();
media_range.subsegment_ranges = segmenter_->GetSegmentRanges();
const float duration_seconds = static_cast<float>(segmenter_->GetDuration());
muxer_listener()->OnMediaEnd(media_range, duration_seconds);
}
uint64_t MP4Muxer::IsoTimeNow() {
// Time in seconds from Jan. 1, 1904 to epoch time, i.e. Jan. 1, 1970.
const uint64_t kIsomTimeOffset = 2082844800l;
return kIsomTimeOffset +
(clock() ? clock()->Now() : base::Time::Now()).ToDoubleT();
}
} // namespace mp4
} // namespace media
} // namespace shaka