5 #include "packager/media/formats/mp4/mp4_media_parser.h"
9 #include "packager/base/callback.h"
10 #include "packager/base/callback_helpers.h"
11 #include "packager/base/logging.h"
12 #include "packager/base/strings/string_number_conversions.h"
13 #include "packager/file/file.h"
14 #include "packager/file/file_closer.h"
15 #include "packager/media/base/audio_stream_info.h"
16 #include "packager/media/base/buffer_reader.h"
17 #include "packager/media/base/decrypt_config.h"
18 #include "packager/media/base/key_source.h"
19 #include "packager/media/base/macros.h"
20 #include "packager/media/base/media_sample.h"
21 #include "packager/media/base/rcheck.h"
22 #include "packager/media/base/video_stream_info.h"
23 #include "packager/media/base/video_util.h"
24 #include "packager/media/codecs/ac3_audio_util.h"
25 #include "packager/media/codecs/av1_codec_configuration_record.h"
26 #include "packager/media/codecs/avc_decoder_configuration_record.h"
27 #include "packager/media/codecs/dovi_decoder_configuration_record.h"
28 #include "packager/media/codecs/ec3_audio_util.h"
29 #include "packager/media/codecs/ac4_audio_util.h"
30 #include "packager/media/codecs/es_descriptor.h"
31 #include "packager/media/codecs/hevc_decoder_configuration_record.h"
32 #include "packager/media/codecs/vp_codec_configuration_record.h"
33 #include "packager/media/formats/mp4/box_definitions.h"
34 #include "packager/media/formats/mp4/box_reader.h"
35 #include "packager/media/formats/mp4/track_run_iterator.h"
// Rescales a time value from the old scale to the new scale. The visible
// return expression divides the input by old_scale in double precision and
// multiplies the quotient by new_scale; the result is then converted to the
// uint64_t return type. The declarations of old_scale and new_scale are not
// visible in this excerpt.
// NOTE(review): the arithmetic goes through double, so inputs above 2^53
// cannot be represented exactly — confirm that is acceptable for callers.
42 uint64_t Rescale(uint64_t time_in_old_scale,
45 return (
static_cast<double>(time_in_old_scale) / old_scale) * new_scale;
// Maps a FourCC to an H26xStreamFormat value. Three return statements are
// visible: kNalUnitStreamWithoutParameterSetNalus,
// kNalUnitStreamWithParameterSetNalus and kUnSpecified. The conditions that
// select between them are not shown in this excerpt.
48 H26xStreamFormat GetH26xStreamFormat(FourCC fourcc) {
53 return H26xStreamFormat::kNalUnitStreamWithoutParameterSetNalus;
57 return H26xStreamFormat::kNalUnitStreamWithParameterSetNalus;
59 return H26xStreamFormat::kUnSpecified;
// Translates a FourCC into a Codec enumerator. Only two of the return
// statements are visible in this excerpt: kCodecH265DolbyVision and
// kUnknownCodec (the last return shown, presumably the fallback for
// unrecognized FourCCs — confirm against the full source).
63 Codec FourCCToCodec(FourCC fourcc) {
72 return kCodecH265DolbyVision;
103 return kUnknownCodec;
// Translates an ObjectType into a Codec enumerator by switching on
// object_type.
107 Codec ObjectTypeToCodec(ObjectType object_type) {
108 switch (object_type) {
// The case labels visible here are kISO_14496_3, kISO_13818_7_AAC_LC, kDTSC,
// kDTSE, kDTSH and kDTSL; the value returned for each label is not shown in
// this excerpt.
109 case ObjectType::kISO_14496_3:
110 case ObjectType::kISO_13818_7_AAC_LC:
112 case ObjectType::kDTSC:
114 case ObjectType::kDTSE:
116 case ObjectType::kDTSH:
118 case ObjectType::kDTSL:
// Last return statement shown for this function.
121 return kUnknownCodec;
// Scans `configs` for an entry whose box_type is FOURCC_dvcC or FOURCC_dvvC.
// What is returned for a matching entry is not visible in this excerpt; the
// last visible statement returns an empty byte vector.
125 std::vector<uint8_t> GetDOVIDecoderConfig(
126 const std::vector<CodecConfiguration>& configs) {
127 for (
const CodecConfiguration& config : configs) {
128 if (config.box_type == FOURCC_dvcC || config.box_type == FOURCC_dvvC) {
132 return std::vector<uint8_t>();
// Updates *codec_string using the Dolby Vision decoder configuration record
// obtained from `configs` via GetDOVIDecoderConfig().
//
// actual_format: FourCC that the switch below dispatches on.
// configs:       extra codec configurations searched for the Dolby Vision
//                record.
// codec_string:  in/out; either overwritten or appended to (see below).
//
// The return statements are not visible in this excerpt.
135 bool UpdateCodecStringForDolbyVision(
136 FourCC actual_format,
137 const std::vector<CodecConfiguration>& configs,
138 std::string* codec_string) {
139 DOVIDecoderConfigurationRecord dovi_config;
// A record that fails to parse is reported through the error log.
140 if (!dovi_config.Parse(GetDOVIDecoderConfig(configs))) {
141 LOG(ERROR) <<
"Failed to parse Dolby Vision decoder "
142 "configuration record.";
// The case labels are not visible here. Three outcomes are: replace the
// codec string with the Dolby Vision string for actual_format, or append
// ";" followed by the Dolby Vision string for FOURCC_dvhe or FOURCC_dvh1.
145 switch (actual_format) {
150 *codec_string = dovi_config.GetCodecString(actual_format);
155 *codec_string +=
";" + dovi_config.GetCodecString(FOURCC_dvhe);
159 *codec_string +=
";" + dovi_config.GetCodecString(FOURCC_dvh1);
// Remaining visible branch: the format is logged as unsupported.
162 LOG(ERROR) <<
"Unsupported format with extra codec "
163 << FourCCToString(actual_format);
// Number of nanoseconds in one second (10^9), used as the multiplier when
// converting sample counts to nanoseconds in ParseMoov().
169 const uint64_t kNanosecondsPerSecond = 1000000000ull;
// Constructs the parser with state_ set to kWaitingForInit and no decryption
// key source. The rest of the initializer list is not visible in this
// excerpt.
173 MP4MediaParser::MP4MediaParser()
174 : state_(kWaitingForInit),
175 decryption_key_source_(NULL),
// Destructor with an empty body.
179 MP4MediaParser::~MP4MediaParser() {}
// Prepares the parser for use: validates preconditions, moves to the
// kParsingBoxes state and stores the sample callback and the decryption key
// source. Part of the parameter list and some statements are not visible in
// this excerpt.
181 void MP4MediaParser::Init(
const InitCB& init_cb,
// Debug-only preconditions: Init() has not run before (state is still
// kWaitingForInit and init_cb_ is unset) and both callbacks are non-null.
185 DCHECK_EQ(state_, kWaitingForInit);
186 DCHECK(init_cb_.is_null());
187 DCHECK(!init_cb.is_null());
188 DCHECK(!new_media_sample_cb.is_null());
190 ChangeState(kParsingBoxes);
192 new_sample_cb_ = new_media_sample_cb;
193 decryption_key_source_ = decryption_key_source;
// The statement guarded by this check is not visible in this excerpt.
194 if (decryption_key_source)
// Reset(): only the signature is visible in this excerpt; see the full source
// for the members it clears.
198 void MP4MediaParser::Reset() {
// Flushes the parser. Must not be called before Init() (debug-checked via
// state_), and switches the parser back to kParsingBoxes. Other statements,
// including the return, are not visible in this excerpt.
205 bool MP4MediaParser::Flush() {
206 DCHECK_NE(state_, kWaitingForInit);
208 ChangeState(kParsingBoxes);
// Feeds `size` bytes from `buf` into the parser and processes as much of the
// queued data as possible.
//
// buf:  pointer to the bytes to append to the internal queue.
// size: number of bytes available at buf.
//
// The return statements are not visible in this excerpt.
212 bool MP4MediaParser::Parse(
const uint8_t* buf,
int size) {
213 DCHECK_NE(state_, kWaitingForInit);
// The action taken when the parser is already in the error state is not
// visible in this excerpt.
215 if (state_ == kError)
218 queue_.Push(buf, size);
// Only `err` is initialized here; `result` is assigned by ParseBox() or
// EnqueueSample() below.
220 bool result, err =
false;
// While in kParsingBoxes a box is parsed; otherwise the state is expected to
// be kEmittingSamples and one sample is enqueued.
223 if (state_ == kParsingBoxes) {
224 result = ParseBox(&err);
226 DCHECK_EQ(kEmittingSamples, state_);
227 result = EnqueueSample(&err);
// Discards 'mdat' data up to the run iterator's maximum clear offset, taken
// relative to moof_head_. A failure here sets err.
229 int64_t max_clear = runs_->GetMaxClearOffset() + moof_head_;
230 err = !ReadAndDiscardMDATsUntil(max_clear);
// The loop continues while the last step made progress and no error occurred.
233 }
while (result && !err);
236 DLOG(ERROR) <<
"Error while parsing MP4";
// Reads box headers from the file at `file_path`, looking for the 'moov' box,
// and feeds that box to Parse(). Other boxes are skipped by seeking past
// them.
//
// file_path: path of the media file to open for reading.
//
// The enclosing loop, several declarations (e.g. box_type, box_size) and the
// return statements are not visible in this excerpt.
246 bool MP4MediaParser::LoadMoov(
const std::string& file_path) {
// The file is owned by a unique_ptr with a FileCloser deleter.
247 std::unique_ptr<File, FileCloser> file(
248 File::OpenWithNoBuffering(file_path.c_str(),
"r"));
250 LOG(ERROR) <<
"Unable to open media file '" << file_path <<
"'";
// A failed Seek(0) is only logged as a warning.
253 if (!file->Seek(0)) {
254 LOG(WARNING) <<
"Filesystem does not support seeking on file '" << file_path
259 uint64_t file_position(0);
260 bool mdat_seen(
false);
// Each box header is read as a fixed 16-byte chunk.
262 const uint32_t kBoxHeaderReadSize(16);
263 std::vector<uint8_t> buffer(kBoxHeaderReadSize);
264 int64_t bytes_read = file->Read(&buffer[0], kBoxHeaderReadSize);
// Reading zero bytes is reported as "moov not found".
265 if (bytes_read == 0) {
266 LOG(ERROR) <<
"Could not find 'moov' box in file '" << file_path <<
"'";
// Any other short read is reported as a read error.
269 if (bytes_read < kBoxHeaderReadSize) {
270 LOG(ERROR) <<
"Error reading media file '" << file_path <<
"'";
276 if (!BoxReader::StartBox(&buffer[0], kBoxHeaderReadSize, &box_type,
278 LOG(ERROR) <<
"Could not start box from file '" << file_path <<
"'";
// The body of the 'mdat' branch is not visible in this excerpt.
281 if (box_type == FOURCC_mdat) {
283 }
else if (box_type == FOURCC_moov) {
// 'moov' found: the header bytes already read are parsed first.
289 if (!Parse(&buffer[0], bytes_read)) {
290 LOG(ERROR) <<
"Error parsing mp4 file '" << file_path <<
"'";
// The remainder of the box is then read and parsed until none is left.
// NOTE(review): box_size - bytes_read is unsigned arithmetic; no guard
// against box_size < bytes_read is visible in this excerpt — confirm.
293 uint64_t bytes_to_read = box_size - bytes_read;
294 buffer.resize(bytes_to_read);
295 while (bytes_to_read > 0) {
296 bytes_read = file->Read(&buffer[0], bytes_to_read);
297 if (bytes_read <= 0) {
298 LOG(ERROR) <<
"Error reading 'moov' contents from file '" << file_path
302 if (!Parse(&buffer[0], bytes_read)) {
303 LOG(ERROR) <<
"Error parsing mp4 file '" << file_path <<
"'";
306 bytes_to_read -= bytes_read;
// Skips to the start of the next box by seeking past the current one.
312 file_position += box_size;
313 if (!file->Seek(file_position)) {
314 LOG(ERROR) <<
"Error skipping box in mp4 file '" << file_path <<
"'";
// Parses one top-level box from the front of the queue.
//
// err: out-parameter; passed to BoxReader::ReadBox() and set from the result
//      of ParseMoov()/ParseMoof().
//
// The return statements and some branch bodies are not visible in this
// excerpt.
321 bool MP4MediaParser::ParseBox(
bool* err) {
324 queue_.Peek(&buf, &size);
// The action taken when ReadBox() returns no reader is not visible in this
// excerpt.
328 std::unique_ptr<BoxReader> reader(BoxReader::ReadBox(buf, size, err));
329 if (reader.get() == NULL)
// 'mdat' handling: the visible messages cover an 'mdat' that precedes 'moov'
// in a non-seekable file (not implemented) and an unused 'mdat' that is
// ignored. The conditions choosing between them are not visible here.
332 if (reader->type() == FOURCC_mdat) {
338 NOTIMPLEMENTED() <<
" Non-seekable Files with 'mdat' box before 'moov' "
339 "box is not supported.";
346 <<
"Ignore unused 'mdat' box - this could be as a result of extra "
347 "not usable 'mdat' or 'mdat' associated with unrecognized track.";
// Records the queue offset at which the current box ends.
352 mdat_tail_ = queue_.head() + reader->size();
354 if (reader->type() == FOURCC_moov) {
355 *err = !ParseMoov(reader.get());
356 }
else if (reader->type() == FOURCC_moof) {
// moof_head_ records the queue offset of this 'moof' box; Parse() and
// EnqueueSample() add it to offsets reported by the run iterator.
357 moof_head_ = queue_.head();
358 *err = !ParseMoof(reader.get());
366 VLOG(2) <<
"Skipping top-level box: " << FourCCToString(reader->type());
// Removes the box from the queue.
369 queue_.Pop(
static_cast<int>(reader->size()));
// Parses the 'moov' box from `reader`, builds one StreamInfo per audio/video
// track, reports the streams through init_cb_, fetches decryption keys if
// needed, and switches the parser to kEmittingSamples.
//
// reader: box reader positioned on the 'moov' box.
//
// Many lines of this function (case labels, closing braces, return
// statements) are not visible in this excerpt; the comments below describe
// only the visible statements.
373 bool MP4MediaParser::ParseMoov(BoxReader* reader) {
377 moov_.reset(
new Movie);
378 RCHECK(moov_->Parse(reader));
// Stream descriptions collected from all tracks.
381 std::vector<std::shared_ptr<StreamInfo>> streams;
383 for (std::vector<Track>::const_iterator track = moov_->tracks.begin();
384 track != moov_->tracks.end(); ++track) {
385 const uint32_t timescale = track->media.header.timescale;
// Duration selection, in priority order: the track's media header duration;
// else the movie-extends fragment duration rescaled from the movie
// timescale; else the movie header duration (when positive and not the
// uint64_t maximum) rescaled to the track timescale. Otherwise it stays 0.
388 uint64_t duration = 0;
389 if (track->media.header.duration > 0) {
390 duration = track->media.header.duration;
391 }
else if (moov_->extends.header.fragment_duration > 0) {
392 DCHECK(moov_->header.timescale != 0);
393 duration = Rescale(moov_->extends.header.fragment_duration,
394 moov_->header.timescale,
396 }
else if (moov_->header.duration > 0 &&
397 moov_->header.duration != std::numeric_limits<uint64_t>::max()) {
398 DCHECK(moov_->header.timescale != 0);
400 Rescale(moov_->header.duration, moov_->header.timescale, timescale);
403 const SampleDescription& samp_descr =
404 track->media.information.sample_table.description;
// Sample description index: when track-extends entries exist, the entry
// whose track_id matches this track supplies
// default_sample_description_index.
410 if (moov_->extends.tracks.size() > 0) {
411 for (
size_t t = 0; t < moov_->extends.tracks.size(); t++) {
412 const TrackExtends& trex = moov_->extends.tracks[t];
413 if (trex.track_id == track->header.track_id) {
414 desc_idx = trex.default_sample_description_index;
// Other visible source for the index: the first sample-to-chunk entry, which
// is required to exist.
419 const std::vector<ChunkInfo>& chunk_info =
420 track->media.information.sample_table.sample_to_chunk.chunk_info;
421 RCHECK(chunk_info.size() > 0);
422 desc_idx = chunk_info[0].sample_description_index;
// An index of zero is rejected.
424 RCHECK(desc_idx > 0);
// ---- Audio track ----
427 if (samp_descr.type == kAudio) {
428 RCHECK(!samp_descr.audio_entries.empty());
// The action taken for an out-of-range index is not visible in this excerpt.
432 if (desc_idx >= samp_descr.audio_entries.size())
435 const AudioSampleEntry& entry = samp_descr.audio_entries[desc_idx];
436 const FourCC actual_format = entry.GetActualFormat();
// Defaults taken from the sample entry; the format-specific branches below
// may override them.
437 Codec codec = FourCCToCodec(actual_format);
438 uint8_t num_channels = entry.channelcount;
439 uint32_t sampling_frequency = entry.samplerate;
440 uint64_t codec_delay_ns = 0;
441 uint8_t audio_object_type = 0;
442 uint32_t max_bitrate = 0;
443 uint32_t avg_bitrate = 0;
444 std::vector<uint8_t> codec_config;
// Case labels are not visible; each branch is identified by the entry field
// it reads.
446 switch (actual_format) {
// Branch reading entry.esds: bitrates and codec come from the decoder config
// descriptor.
448 const DecoderConfigDescriptor& decoder_config =
449 entry.esds.es_descriptor.decoder_config_descriptor();
450 max_bitrate = decoder_config.max_bitrate();
451 avg_bitrate = decoder_config.avg_bitrate();
453 codec = ObjectTypeToCodec(decoder_config.object_type());
// For AAC, channel count, sample rate and audio object type are read from
// the AAC audio specific config instead of the sample entry.
454 if (codec == kCodecAAC) {
455 const AACAudioSpecificConfig& aac_audio_specific_config =
456 entry.esds.aac_audio_specific_config;
457 num_channels = aac_audio_specific_config.GetNumChannels();
459 aac_audio_specific_config.GetSamplesPerSecond();
460 audio_object_type = aac_audio_specific_config.GetAudioObjectType();
462 decoder_config.decoder_specific_info_descriptor().data();
463 }
else if (codec == kUnknownCodec) {
468 LOG(WARNING) <<
"Unsupported audio object type "
469 <<
static_cast<int>(decoder_config.object_type())
470 <<
" in stsd.es_desriptor.";
// Several case labels fall through to the branch that reads entry.ddts.
475 FALLTHROUGH_INTENDED;
477 FALLTHROUGH_INTENDED;
479 FALLTHROUGH_INTENDED;
481 FALLTHROUGH_INTENDED;
483 codec_config = entry.ddts.extra_data;
484 max_bitrate = entry.ddts.max_bitrate;
485 avg_bitrate = entry.ddts.avg_bitrate;
// Branch reading entry.dac3: channel count is derived from the codec config.
488 codec_config = entry.dac3.data;
489 num_channels =
static_cast<uint8_t
>(GetAc3NumChannels(codec_config));
// Branch reading entry.dec3: channel count is derived from the codec config.
492 codec_config = entry.dec3.data;
493 num_channels =
static_cast<uint8_t
>(GetEc3NumChannels(codec_config));
// Branch reading entry.dac4: GetAc4CodecInfo() fills audio_object_type.
496 codec_config = entry.dac4.data;
500 if (!GetAc4CodecInfo(codec_config, &audio_object_type)) {
501 LOG(ERROR) <<
"Failed to parse dac4.";
// Branch reading entry.dfla.
506 codec_config = entry.dfla.data;
// Branch reading entry.dops: the pre-skip sample count is converted to
// nanoseconds.
// NOTE(review): no check that sampling_frequency is non-zero is visible
// before this division, unlike the roll-distance path below, which uses
// RCHECK — confirm against the full source.
509 codec_config = entry.dops.opus_identification_header;
511 entry.dops.preskip * kNanosecondsPerSecond / sampling_frequency;
519 LOG(WARNING) <<
"Unsupported audio format '"
520 << FourCCToString(actual_format) <<
"' in stsd box.";
// Seek pre-roll: taken from the 'roll' sample group description, which is
// expected to hold exactly one entry.
525 uint64_t seek_preroll_ns = 0;
526 for (
const auto& sample_group_description :
527 track->media.information.sample_table.sample_group_descriptions) {
528 if (sample_group_description.grouping_type != FOURCC_roll)
530 const auto& audio_roll_recovery_entries =
531 sample_group_description.audio_roll_recovery_entries;
532 if (audio_roll_recovery_entries.size() != 1) {
533 LOG(WARNING) <<
"Unexpected number of entries in "
534 "SampleGroupDescription table with grouping type "
// A negative roll distance (in samples) is negated and converted to
// nanoseconds; other values are reported in the log message below.
538 const int16_t roll_distance_in_samples =
539 audio_roll_recovery_entries[0].roll_distance;
540 if (roll_distance_in_samples < 0) {
541 RCHECK(sampling_frequency != 0);
542 seek_preroll_ns = kNanosecondsPerSecond *
543 (-roll_distance_in_samples) / sampling_frequency;
546 <<
"Roll distance is supposed to be negative, but seeing "
547 << roll_distance_in_samples;
// Encryption flag; the first part of this conditional expression is not
// visible in this excerpt.
553 const bool is_encrypted =
556 : entry.sinf.info.track_encryption.default_is_protected == 1;
557 DVLOG(1) <<
"is_audio_track_encrypted_: " << is_encrypted;
// Adds the audio stream description to the list.
558 streams.emplace_back(
new AudioStreamInfo(
559 track->header.track_id, timescale, duration, codec,
560 AudioStreamInfo::GetCodecString(codec, audio_object_type),
561 codec_config.data(), codec_config.size(), entry.samplesize,
562 num_channels, sampling_frequency, seek_preroll_ns, codec_delay_ns,
563 max_bitrate, avg_bitrate, track->media.header.language.code,
// ---- Video track ----
567 if (samp_descr.type == kVideo) {
568 RCHECK(!samp_descr.video_entries.empty());
// The action taken for an out-of-range index is not visible in this excerpt.
569 if (desc_idx >= samp_descr.video_entries.size())
571 const VideoSampleEntry& entry = samp_descr.video_entries[desc_idx];
// A local copy is made; the VP branch below may rewrite it via WriteMP4().
572 std::vector<uint8_t> codec_configuration_data =
573 entry.codec_configuration.data;
575 uint32_t coded_width = entry.width;
576 uint32_t coded_height = entry.height;
577 uint32_t pixel_width = entry.pixel_aspect.h_spacing;
578 uint32_t pixel_height = entry.pixel_aspect.v_spacing;
// When the sample entry carries no pixel aspect ratio (both spacings zero),
// it is derived from the coded size and the track header size.
579 if (pixel_width == 0 && pixel_height == 0) {
580 DerivePixelWidthHeight(coded_width, coded_height, track->header.width,
581 track->header.height, &pixel_width,
584 std::string codec_string;
585 uint8_t nalu_length_size = 0;
586 uint8_t transfer_characteristics = 0;
588 const FourCC actual_format = entry.GetActualFormat();
589 const Codec video_codec = FourCCToCodec(actual_format);
// Case labels are not visible; each branch is identified by the
// configuration record type it parses.
590 switch (actual_format) {
// AV1 codec configuration record.
592 AV1CodecConfigurationRecord av1_config;
593 if (!av1_config.Parse(codec_configuration_data)) {
594 LOG(ERROR) <<
"Failed to parse av1c.";
597 codec_string = av1_config.GetCodecString();
// AVC decoder configuration record.
602 AVCDecoderConfigurationRecord avc_config;
603 if (!avc_config.Parse(codec_configuration_data)) {
604 LOG(ERROR) <<
"Failed to parse avcc.";
607 codec_string = avc_config.GetCodecString(actual_format);
608 nalu_length_size = avc_config.nalu_length_size();
609 transfer_characteristics = avc_config.transfer_characteristics();
// When the AVC record reports a resolution, it replaces the sample entry's
// coded size, with a warning logged on mismatch.
612 if (avc_config.coded_width() != 0) {
613 DCHECK_NE(avc_config.coded_height(), 0u);
614 if (coded_width != avc_config.coded_width() ||
615 coded_height != avc_config.coded_height()) {
616 LOG(WARNING) <<
"Resolution in VisualSampleEntry (" << coded_width
617 <<
"," << coded_height
618 <<
") does not match with resolution in "
619 "AVCDecoderConfigurationRecord ("
620 << avc_config.coded_width() <<
","
621 << avc_config.coded_height()
622 <<
"). Use AVCDecoderConfigurationRecord.";
623 coded_width = avc_config.coded_width();
624 coded_height = avc_config.coded_height();
// The pixel aspect ratio is likewise replaced by the AVC record's values.
// The warning is logged only when the existing ratio is not 1:1.
627 DCHECK_NE(avc_config.pixel_width(), 0u);
628 DCHECK_NE(avc_config.pixel_height(), 0u);
629 if (pixel_width != avc_config.pixel_width() ||
630 pixel_height != avc_config.pixel_height()) {
631 LOG_IF(WARNING, pixel_width != 1 || pixel_height != 1)
632 <<
"Pixel aspect ratio in PASP box (" << pixel_width <<
","
634 <<
") does not match with SAR in "
635 "AVCDecoderConfigurationRecord "
637 << avc_config.pixel_width() <<
","
638 << avc_config.pixel_height()
639 <<
"). Use AVCDecoderConfigurationRecord.";
640 pixel_width = avc_config.pixel_width();
641 pixel_height = avc_config.pixel_height();
// HEVC decoder configuration record.
650 HEVCDecoderConfigurationRecord hevc_config;
651 if (!hevc_config.Parse(codec_configuration_data)) {
652 LOG(ERROR) <<
"Failed to parse hevc.";
655 codec_string = hevc_config.GetCodecString(actual_format);
656 nalu_length_size = hevc_config.nalu_length_size();
657 transfer_characteristics = hevc_config.transfer_characteristics();
// Extra codec configs, when present, are used to update the codec string for
// Dolby Vision.
659 if (!entry.extra_codec_configs.empty()) {
661 if (!UpdateCodecStringForDolbyVision(
662 actual_format, entry.extra_codec_configs, &codec_string)) {
// VP codec configuration record.
670 VPCodecConfigurationRecord vp_config;
671 if (!vp_config.ParseMP4(codec_configuration_data)) {
672 LOG(ERROR) <<
"Failed to parse vpcc.";
// For vp09 with no level set (or level 0), a level is computed from the
// coded size with an unknown sample duration, and the record is written back
// into codec_configuration_data.
675 if (actual_format == FOURCC_vp09 &&
676 (!vp_config.is_level_set() || vp_config.level() == 0)) {
677 const double kUnknownSampleDuration = 0.0;
678 vp_config.SetVP9Level(coded_width, coded_height,
679 kUnknownSampleDuration);
680 vp_config.WriteMP4(&codec_configuration_data);
682 codec_string = vp_config.GetCodecString(video_codec);
691 LOG(WARNING) <<
"Unsupported video format '"
692 << FourCCToString(actual_format) <<
"' in stsd box.";
// Encryption flag; the first part of this conditional expression is not
// visible in this excerpt.
697 const bool is_encrypted =
700 : entry.sinf.info.track_encryption.default_is_protected == 1;
701 DVLOG(1) <<
"is_video_track_encrypted_: " << is_encrypted;
702 std::shared_ptr<VideoStreamInfo> video_stream_info(
new VideoStreamInfo(
703 track->header.track_id, timescale, duration, video_codec,
704 GetH26xStreamFormat(actual_format), codec_string,
705 codec_configuration_data.data(), codec_configuration_data.size(),
706 coded_width, coded_height, pixel_width, pixel_height,
707 transfer_characteristics,
709 nalu_length_size, track->media.header.language.code, is_encrypted));
710 video_stream_info->set_extra_config(entry.ExtraCodecConfigsAsVector());
// When 'pssh' boxes are present, their raw bytes are concatenated and
// attached to the video stream as EME init data.
713 if (moov_->pssh.size() > 0) {
714 std::vector<uint8_t> pssh_raw_data;
715 for (
const auto& pssh : moov_->pssh) {
716 pssh_raw_data.insert(pssh_raw_data.end(), pssh.raw_box.begin(),
719 video_stream_info->set_eme_init_data(pssh_raw_data.data(),
720 pssh_raw_data.size());
723 streams.push_back(video_stream_info);
// Reports the streams, fetches keys, creates and initializes the track run
// iterator, then switches to sample emission.
727 init_cb_.Run(streams);
728 if (!FetchKeysIfNecessary(moov_->pssh))
730 runs_.reset(
new TrackRunIterator(moov_.get()));
731 RCHECK(runs_->Init());
732 ChangeState(kEmittingSamples);
// Parses a 'moof' box from `reader`, rebuilds the track run iterator for it,
// fetches decryption keys from the fragment's 'pssh' data if needed, and
// switches the parser to kEmittingSamples.
//
// reader: box reader positioned on the 'moof' box.
//
// The declaration of `moof` and the return statements are not visible in
// this excerpt.
736 bool MP4MediaParser::ParseMoof(BoxReader* reader) {
740 RCHECK(moof.Parse(reader));
// A new run iterator is created from moov_ and initialized with this
// fragment.
742 runs_.reset(
new TrackRunIterator(moov_.get()));
743 RCHECK(runs_->Init(moof));
// The action taken when key fetching fails is not visible in this excerpt.
744 if (!FetchKeysIfNecessary(moof.pssh))
746 ChangeState(kEmittingSamples);
// Asks the decryption key source to fetch keys for the given 'pssh' headers.
//
// headers: protection system specific headers whose raw boxes are
//          concatenated and passed to FetchKeys() as CENC init data.
//
// The return statements and the declaration of `status` are not visible in
// this excerpt.
750 bool MP4MediaParser::FetchKeysIfNecessary(
751 const std::vector<ProtectionSystemSpecificHeader>& headers) {
// The action taken when there is no key source is not visible in this
// excerpt.
756 if (!decryption_key_source_)
// Concatenates the raw bytes of every header into one buffer.
759 std::vector<uint8_t> pssh_raw_data;
760 for (
const auto& header : headers) {
761 pssh_raw_data.insert(pssh_raw_data.end(), header.raw_box.begin(),
762 header.raw_box.end());
765 decryption_key_source_->FetchKeys(EmeInitDataType::CENC, pssh_raw_data);
767 LOG(ERROR) <<
"Error fetching decryption keys: " << status;
// Emits the current sample of the track run iterator through new_sample_cb_
// and advances to the next sample.
//
// err: out-parameter; set from the result of CacheAuxInfo() in the visible
//      code.
//
// The return statements and several branch bodies are not visible in this
// excerpt.
773 bool MP4MediaParser::EnqueueSample(
bool* err) {
// Current run no longer valid: the queue is trimmed to mdat_tail_ and the
// parser goes back to parsing boxes.
774 if (!runs_->IsRunValid()) {
777 if (!queue_.Trim(mdat_tail_))
780 ChangeState(kParsingBoxes);
// The body of this branch is not visible in this excerpt.
784 if (!runs_->IsSampleValid()) {
793 queue_.Peek(&buf, &buf_size);
// The handling of runs that are neither audio nor video is not visible in
// this excerpt.
798 if (!runs_->is_audio() && !runs_->is_video())
// Auxiliary info is located at aux_info_offset() relative to moof_head_. It
// is cached only once enough bytes are queued.
808 if (runs_->AuxInfoNeedsToBeCached()) {
809 queue_.PeekAt(runs_->aux_info_offset() + moof_head_, &buf, &buf_size);
810 if (buf_size < runs_->aux_info_size())
812 *err = !runs_->CacheAuxInfo(buf, buf_size);
// Sample data is located at sample_offset() relative to moof_head_.
816 int64_t sample_offset = runs_->sample_offset() + moof_head_;
817 queue_.PeekAt(sample_offset, &buf, &buf_size);
// Not enough bytes queued for the sample. An offset that lies before the
// queue head is additionally logged as an error.
818 if (buf_size < runs_->sample_size()) {
819 if (sample_offset < queue_.head()) {
820 LOG(ERROR) <<
"Incorrect sample offset " << sample_offset
821 <<
" < " << queue_.head();
827 const uint8_t* media_data = buf;
828 const size_t media_data_size = runs_->sample_size();
// The sample is created with zero bytes of data; the payload is attached
// later through SetData() or TransferData().
831 const size_t kDummyDataSize = 0;
832 std::shared_ptr<MediaSample> stream_sample(
833 MediaSample::CopyFrom(media_data, kDummyDataSize, runs_->is_keyframe()));
835 if (runs_->is_encrypted()) {
// Output buffer for decryption, the same size as the sample and released
// with the array deleter.
836 std::shared_ptr<uint8_t> decrypted_media_data(
837 new uint8_t[media_data_size], std::default_delete<uint8_t[]>());
838 std::unique_ptr<DecryptConfig> decrypt_config = runs_->GetDecryptConfig();
839 if (!decrypt_config) {
841 LOG(ERROR) <<
"Missing decrypt config.";
// Without a decryptor source the sample keeps its encrypted payload and is
// tagged with the decrypt config and the encrypted flag.
845 if (!decryptor_source_) {
846 stream_sample->SetData(media_data, media_data_size);
849 stream_sample->set_decrypt_config(std::move(decrypt_config));
850 stream_sample->set_is_encrypted(
true);
// NOTE(review): decrypt_config is moved from above and read through get()
// below. The branch structure separating the two (presumably an else) is
// not visible in this excerpt — confirm they are mutually exclusive.
852 if (!decryptor_source_->DecryptSampleBuffer(decrypt_config.get(),
853 media_data, media_data_size,
854 decrypted_media_data.get())) {
856 LOG(ERROR) <<
"Cannot decrypt samples.";
859 stream_sample->TransferData(std::move(decrypted_media_data),
// SetData() with the raw payload is the other visible way the data is
// attached (the enclosing branch is not visible in this excerpt).
863 stream_sample->SetData(media_data, media_data_size);
// Timing comes from the run iterator; the iterator's cts() is stored as the
// sample's pts.
866 stream_sample->set_dts(runs_->dts());
867 stream_sample->set_pts(runs_->cts());
868 stream_sample->set_duration(runs_->duration());
870 DVLOG(3) <<
"Pushing frame: "
871 <<
", key=" << runs_->is_keyframe()
872 <<
", dur=" << runs_->duration()
873 <<
", dts=" << runs_->dts()
874 <<
", cts=" << runs_->cts()
875 <<
", size=" << runs_->sample_size();
// Hands the sample to the client callback. A false result is logged as an
// error.
877 if (!new_sample_cb_.Run(runs_->track_id(), stream_sample)) {
879 LOG(ERROR) <<
"Failed to process the sample.";
883 runs_->AdvanceSample();
// Advances mdat_tail_ box by box until it reaches `offset`, then trims the
// queue up to the smaller of mdat_tail_ and offset.
//
// offset: queue offset up to which data may be discarded.
//
// The return statements and the declarations of buf, size, type, box_sz and
// err are not visible in this excerpt.
887 bool MP4MediaParser::ReadAndDiscardMDATsUntil(
const int64_t offset) {
889 while (mdat_tail_ < offset) {
// Reads the header of the box that starts at mdat_tail_ to learn its size.
892 queue_.PeekAt(mdat_tail_, &buf, &size);
// The action taken when the box header cannot be started is not visible in
// this excerpt.
896 if (!BoxReader::StartBox(buf, size, &type, &box_sz, &err))
899 mdat_tail_ += box_sz;
901 queue_.Trim(std::min(mdat_tail_, offset));
// Changes the parser state to `new_state`. The visible statement logs the
// requested state at verbose level 2; the assignment itself is not visible
// in this excerpt.
905 void MP4MediaParser::ChangeState(State new_state) {
906 DVLOG(2) <<
"Changing state: " << new_state;
All virtual methods are declared virtual only so that they can be mocked.