feat: This patch adds support for DTS:X Profile 2 audio in MP4 files. (#1303)

feat: Added audio specific configuration udts box to AudioSampleEntry
for MP4 input/output. DASH tags for DTS audio as specified in ETSI TS
103 491 and ETSI TS 102 114.

Closes #1301

---------

Co-authored-by: Cosmin Stejerean <cstejerean@meta.com>
This commit is contained in:
Roy-Funderburk 2024-02-14 23:03:03 -08:00 committed by GitHub
parent f7b3986818
commit 07f780dae1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 373 additions and 3 deletions

View File

@ -927,6 +927,13 @@ class PackagerFunctionalTest(PackagerAppTest):
self._GetFlags(output_dash=True))
self._CheckTestResults('acc-he')
def testDtsx(self):
self.assertPackageSuccess(
self._GetStreams(
['audio'], test_files=['bear-dtsx.mp4']),
self._GetFlags(output_dash=True))
self._CheckTestResults('dtsx-dash')
def testVideoAudioWebVTT(self):
audio_video_streams = self._GetStreams(['audio', 'video'])
text_stream = self._GetStreams(['text'], test_files=['bear-english.vtt'])

Binary file not shown.

View File

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--Generated with https://github.com/shaka-project/shaka-packager version <tag>-<hash>-<test>-->
<MPD xmlns="urn:mpeg:dash:schema:mpd:2011" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 DASH-MPD.xsd" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" minBufferTime="PT2S" type="static" mediaPresentationDuration="PT3.114667S">
<Period id="0">
<AdaptationSet id="0" contentType="audio" subsegmentAlignment="true">
<Representation id="0" bandwidth="227665" codecs="dtsx" mimeType="audio/mp4" audioSamplingRate="48000">
<AudioChannelConfiguration schemeIdUri="tag:dts.com,2018:uhd:audio_channel_configuration" value="0000003F"/>
<BaseURL>bear-dtsx-audio.mp4</BaseURL>
<SegmentBase indexRange="742-821" timescale="48000">
<Initialization range="0-741"/>
</SegmentBase>
</Representation>
</AdaptationSet>
</Period>
</MPD>

View File

@ -150,6 +150,8 @@ std::string AudioStreamInfo::GetCodecString(Codec codec,
return "dts-";
case kCodecDTSP:
return "dts+";
case kCodecDTSX:
return "dtsx";
case kCodecEAC3:
return "ec-3";
case kCodecAC4:

View File

@ -58,6 +58,7 @@ enum FourCC : uint32_t {
FOURCC_dtsl = 0x6474736c,
FOURCC_dtsm = 0x6474732d, // "dts-"
FOURCC_dtsp = 0x6474732b, // "dts+"
FOURCC_dtsx = 0x64747378, // "dtsx"
FOURCC_dvcC = 0x64766343,
FOURCC_dvh1 = 0x64766831,
FOURCC_dvhe = 0x64766865,
@ -151,6 +152,7 @@ enum FourCC : uint32_t {
FOURCC_trex = 0x74726578,
FOURCC_trun = 0x7472756e,
FOURCC_udta = 0x75647461,
FOURCC_udts = 0x75647473, // "udts"
FOURCC_url = 0x75726c20, // "url "
FOURCC_urn = 0x75726e20, // "urn "
FOURCC_uuid = 0x75756964,

View File

@ -49,6 +49,7 @@ enum Codec {
kCodecDTSL,
kCodecDTSM,
kCodecDTSP,
kCodecDTSX,
kCodecEAC3,
kCodecFlac,
kCodecOpus,

View File

@ -12,6 +12,7 @@ add_library(media_codecs STATIC
avc_decoder_configuration_record.cc
decoder_configuration_record.cc
dovi_decoder_configuration_record.cc
dts_audio_specific_config.cc
ec3_audio_util.cc
ac4_audio_util.cc
es_descriptor.cc

View File

@ -0,0 +1,28 @@
// Copyright (c) 2023 Google Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <packager/media/codecs/dts_audio_specific_config.h>
#include <packager/media/base/bit_reader.h>
#include <packager/media/base/rcheck.h>
namespace shaka {
namespace media {
bool GetDTSXChannelMask(const std::vector<uint8_t>& udts, uint32_t& mask) {
// udts is the DTS-UHD Specific Box: ETSI TS 103 491 V1.2.1 Table B-2
// DecoderProfileCode(6 bits)
// FrameDurationCode(2 bits)
// MaxPayloadCode(3 bits)
// NumPresentationsCode(5 bits)
// ChannelMask (32 bits)
BitReader bit_reader(udts.data(), udts.size());
RCHECK(bit_reader.SkipBits(16));
RCHECK(bit_reader.ReadBits(32, &mask));
return true;
}
} // namespace media
} // namespace shaka

View File

@ -0,0 +1,24 @@
// Copyright (c) 2023 Google Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef PACKAGER_MEDIA_CODECS_DTS_AUDIO_SPECIFIC_CONFIG_H_
#define PACKAGER_MEDIA_CODECS_DTS_AUDIO_SPECIFIC_CONFIG_H_
#include <stddef.h>
#include <stdint.h>
#include <vector>
namespace shaka {
namespace media {
class BitReader;
bool GetDTSXChannelMask(const std::vector<uint8_t>& udts, uint32_t& mask);
} // namespace media
} // namespace shaka
#endif // PACKAGER_MEDIA_CODECS_DTS_AUDIO_SPECIFIC_CONFIG_H_

View File

@ -0,0 +1,37 @@
// Copyright 2023 Google Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <gtest/gtest.h>
#include "packager/media/codecs/dts_audio_specific_config.h"
namespace shaka {
namespace media {
TEST(DTSAudioSpecificConfigTest, BasicProfileTest) {
uint8_t buffer[] = {0x01, 0x20, 0x00, 0x00, 0x0, 0x3F, 0x80, 0x00};
std::vector<uint8_t> data(std::begin(buffer), std::end(buffer));
uint32_t mask;
EXPECT_TRUE(GetDTSXChannelMask(data, mask));
EXPECT_EQ(0x3F, mask);
}
TEST(DTSAudioSpecificConfigTest, ChannelMaskBytes) {
uint8_t buffer[] = {0x01, 0x20, 0x12, 0x34, 0x56, 0x78, 0x80, 0x00};
std::vector<uint8_t> data(std::begin(buffer), std::end(buffer));
uint32_t mask;
EXPECT_TRUE(GetDTSXChannelMask(data, mask));
EXPECT_EQ(0x12345678, mask);
}
TEST(DTSAudioSpecificConfigTest, Truncated) {
uint8_t buffer[] = {0x01, 0x20, 0x00, 0x00, 0x00};
std::vector<uint8_t> data(std::begin(buffer), std::end(buffer));
uint32_t mask;
EXPECT_FALSE(GetDTSXChannelMask(data, mask));
}
} // namespace media
} // namespace shaka

View File

@ -20,6 +20,7 @@
#include <packager/media/base/text_stream_info.h>
#include <packager/media/base/video_stream_info.h>
#include <packager/media/codecs/ac4_audio_util.h>
#include <packager/media/codecs/dts_audio_specific_config.h>
#include <packager/media/codecs/ec3_audio_util.h>
#include <packager/mpd/base/media_info.pb.h>
#include <packager/utils/bytes_to_string_view.h>
@ -165,6 +166,16 @@ void AddAudioInfo(const AudioStreamInfo* audio_stream_info,
codec_data->set_ac4_ims_flag(ac4_ims_flag);
codec_data->set_ac4_cbi_flag(ac4_cbi_flag);
}
if (audio_stream_info->codec() == kCodecDTSX) {
auto* codec_data = audio_info->mutable_codec_specific_data();
uint32_t channel_mask;
if (!GetDTSXChannelMask(codec_config, channel_mask)) {
LOG(ERROR) << "Failed to parse DTSX channel mask.";
return;
}
codec_data->set_channel_mask(channel_mask);
}
}
void AddTextInfo(const TextStreamInfo& text_stream_info,

View File

@ -72,6 +72,24 @@ TEST_F(MuxerListenerInternalVideoStreamTest, TransferCharacteristics) {
EXPECT_EQ(18u, media_info.video_info().transfer_characteristics());
}
class MuxerListenerInternalAudioStreamTest : public MuxerListenerInternalTest {
};
// AddAudioInfo function should parse the channel mask
TEST_F(MuxerListenerInternalAudioStreamTest, DTSX) {
MediaInfo media_info;
std::shared_ptr<AudioStreamInfo> audio_info = CreateAudioStreamInfo(
GetAudioStreamInfoParams(kCodecDTSX, "dtsx",
{0x01, 0x20, 0x00, 0x00, 0x0, 0x3F, 0x80,
0x00})); // Channel mask = 3F
ASSERT_TRUE(GenerateMediaInfo(MuxerOptions(), *audio_info,
kReferenceTimeScale,
MuxerListener::kContainerMp4, &media_info));
MediaInfo_AudioInfo* info = media_info.mutable_audio_info();
auto* codec_data = info->mutable_codec_specific_data();
EXPECT_EQ(0x3F, codec_data->channel_mask());
}
} // namespace internal
} // namespace media
} // namespace shaka

View File

@ -104,5 +104,53 @@ std::vector<ProtectionSystemSpecificInfo> GetDefaultKeySystemInfo() {
std::end(kExpectedDefaultPsshBox) - 1}}};
}
AudioStreamInfoParameters::AudioStreamInfoParameters() {}
AudioStreamInfoParameters::~AudioStreamInfoParameters() {}
std::shared_ptr<AudioStreamInfo> CreateAudioStreamInfo(
const AudioStreamInfoParameters& param) {
return std::make_shared<AudioStreamInfo>(
param.track_id, param.time_scale, param.duration, param.codec,
param.codec_string, param.codec_config.data(), param.codec_config.size(),
param.sample_bits, param.num_channels, param.sampling_frequency,
param.seek_preroll_ns, param.codec_delay_ns, param.max_bitrate,
param.avg_bitrate, param.language, param.is_encrypted);
}
AudioStreamInfoParameters GetAudioStreamInfoParams(
Codec codec,
const char* codec_string,
const std::vector<uint8_t>& codec_config) {
const int kTrackId = 0;
const int32_t kTimeScale = 10;
const int64_t kAudioStreamDuration = 200;
const char* kLanuageUndefined = "und";
const uint8_t kSampleBits = 16;
const uint8_t kNumChannels = 6;
const uint32_t kSamplingFrequency = 48000;
const uint64_t kSeekPrerollNs = 0;
const uint64_t kCodecDelayNs = 0;
const uint32_t kMaxBitrate = 0;
const uint32_t kAvgBitrate = 0;
const bool kEncryptedFlag = false;
AudioStreamInfoParameters params;
params.track_id = kTrackId;
params.time_scale = kTimeScale;
params.duration = kAudioStreamDuration;
params.codec = codec;
params.codec_string = codec_string;
params.language = kLanuageUndefined;
params.sample_bits = kSampleBits;
params.num_channels = kNumChannels;
params.sampling_frequency = kSamplingFrequency;
params.seek_preroll_ns = kSeekPrerollNs;
params.codec_delay_ns = kCodecDelayNs;
params.max_bitrate = kMaxBitrate;
params.avg_bitrate = kAvgBitrate;
params.codec_config = codec_config;
params.is_encrypted = kEncryptedFlag;
return params;
}
} // namespace media
} // namespace shaka

View File

@ -10,6 +10,7 @@
#include <cstdint>
#include <vector>
#include <packager/media/base/audio_stream_info.h>
#include <packager/media/base/key_source.h>
#include <packager/media/base/muxer_options.h>
#include <packager/media/base/stream_info.h>
@ -95,6 +96,29 @@ struct VideoStreamInfoParameters {
bool is_encrypted;
};
// Struct that gets passed for to CreateAudioStreamInfo() to create a
// StreamInfo instance. Useful for generating multiple AudioStreamInfo with
// slightly different parameters.
struct AudioStreamInfoParameters {
AudioStreamInfoParameters();
~AudioStreamInfoParameters();
int track_id;
int32_t time_scale;
int64_t duration;
Codec codec;
std::string codec_string;
std::vector<uint8_t> codec_config;
uint8_t sample_bits;
uint8_t num_channels;
uint32_t sampling_frequency;
uint64_t seek_preroll_ns;
uint64_t codec_delay_ns;
uint32_t max_bitrate;
uint32_t avg_bitrate;
std::string language;
bool is_encrypted;
};
struct OnNewSegmentParameters {
std::string file_name;
int64_t start_time;
@ -115,6 +139,16 @@ std::shared_ptr<VideoStreamInfo> CreateVideoStreamInfo(
// Returns the "default" VideoStreamInfoParameters for testing.
VideoStreamInfoParameters GetDefaultVideoStreamInfoParams();
// Creates StreamInfo instance from AudioStreamInfoParameters.
std::shared_ptr<AudioStreamInfo> CreateAudioStreamInfo(
const AudioStreamInfoParameters& param);
// Returns the "default" configuration for testing given codec and parameters.
AudioStreamInfoParameters GetAudioStreamInfoParams(
Codec codec,
const char* codec_string,
const std::vector<uint8_t>& codec_config);
// Returns the "default" values for OnMediaEnd().
OnMediaEndParameters GetDefaultOnMediaEndParams();

View File

@ -1811,6 +1811,27 @@ size_t DTSSpecific::ComputeSizeInternal() {
sizeof(kDdtsExtraData);
}
UDTSSpecific::UDTSSpecific() = default;
UDTSSpecific::~UDTSSpecific() = default;
FourCC UDTSSpecific::BoxType() const {
return FOURCC_udts;
}
bool UDTSSpecific::ReadWriteInternal(BoxBuffer* buffer) {
RCHECK(ReadWriteHeaderInternal(buffer) &&
buffer->ReadWriteVector(
&data, buffer->Reading() ? buffer->BytesLeft() : data.size()));
return true;
}
size_t UDTSSpecific::ComputeSizeInternal() {
// This box is optional. Skip it if not initialized.
if (data.empty())
return 0;
return HeaderSize() + data.size();
}
AC3Specific::AC3Specific() = default;
AC3Specific::~AC3Specific() = default;
@ -1983,6 +2004,7 @@ bool AudioSampleEntry::ReadWriteInternal(BoxBuffer* buffer) {
RCHECK(buffer->TryReadWriteChild(&esds));
RCHECK(buffer->TryReadWriteChild(&ddts));
RCHECK(buffer->TryReadWriteChild(&udts));
RCHECK(buffer->TryReadWriteChild(&dac3));
RCHECK(buffer->TryReadWriteChild(&dec3));
RCHECK(buffer->TryReadWriteChild(&dac4));
@ -2014,7 +2036,7 @@ size_t AudioSampleEntry::ComputeSizeInternal() {
sizeof(samplesize) + sizeof(samplerate) + sinf.ComputeSize() +
esds.ComputeSize() + ddts.ComputeSize() + dac3.ComputeSize() +
dec3.ComputeSize() + dops.ComputeSize() + dfla.ComputeSize() +
dac4.ComputeSize() + mhac.ComputeSize() +
dac4.ComputeSize() + mhac.ComputeSize() + udts.ComputeSize() +
// Reserved and predefined bytes.
6 + 8 + // 6 + 8 bytes reserved.
4; // 4 bytes predefined.

View File

@ -334,6 +334,12 @@ struct DTSSpecific : Box {
std::vector<uint8_t> extra_data;
};
struct UDTSSpecific : Box {
DECLARE_BOX_METHODS(UDTSSpecific);
std::vector<uint8_t> data;
};
struct AC3Specific : Box {
DECLARE_BOX_METHODS(AC3Specific);
@ -396,6 +402,7 @@ struct AudioSampleEntry : Box {
ElementaryStreamDescriptor esds;
DTSSpecific ddts;
UDTSSpecific udts;
AC3Specific dac3;
EC3Specific dec3;
AC4Specific dac4;

View File

@ -417,6 +417,16 @@ class BoxDefinitionsTestGeneral : public testing::Test {
ddts->pcm_sample_depth = 24;
}
void Fill(UDTSSpecific* udts) {
const uint8_t kUdtsData[] = {0x01, 0x20, 0x00, 0x00, 0x0, 0x3F, 0x80, 0x00};
udts->data.assign(kUdtsData, kUdtsData + std::size(kUdtsData));
}
void Modify(UDTSSpecific* udts) {
const uint8_t kUdtsData[] = {0x01, 0x20, 0x01, 0x80, 0xA, 0x3F, 0x80, 0x00};
udts->data.assign(kUdtsData, kUdtsData + std::size(kUdtsData));
}
void Fill(AC3Specific* dac3) {
const uint8_t kAc3Data[] = {0x50, 0x11, 0x60};
dac3->data.assign(kAc3Data, kAc3Data + std::size(kAc3Data));
@ -1217,6 +1227,20 @@ TEST_F(BoxDefinitionsTest, DTSSampleEntry) {
ASSERT_EQ(entry, entry_readback);
}
TEST_F(BoxDefinitionsTest, UDTSSampleEntry) {
AudioSampleEntry entry;
entry.format = FOURCC_dtsx;
entry.data_reference_index = 2;
entry.channelcount = 6;
entry.samplesize = 16;
entry.samplerate = 48000;
Fill(&entry.udts);
entry.Write(this->buffer_.get());
AudioSampleEntry entry_readback;
ASSERT_TRUE(ReadBack(&entry_readback));
ASSERT_EQ(entry, entry_readback);
}
TEST_F(BoxDefinitionsTest, AC3SampleEntry) {
AudioSampleEntry entry;
entry.format = FOURCC_ac_3;

View File

@ -90,6 +90,8 @@ Codec FourCCToCodec(FourCC fourcc) {
return kCodecDTSL;
case FOURCC_dtse:
return kCodecDTSE;
case FOURCC_dtsx:
return kCodecDTSX;
case FOURCC_dtsp:
return kCodecDTSP;
case FOURCC_dtsm:
@ -491,6 +493,9 @@ bool MP4MediaParser::ParseMoov(BoxReader* reader) {
max_bitrate = entry.ddts.max_bitrate;
avg_bitrate = entry.ddts.avg_bitrate;
break;
case FOURCC_dtsx:
codec_config = entry.udts.data;
break;
case FOURCC_ac_3:
codec_config = entry.dac3.data;
num_channels = static_cast<uint8_t>(GetAc3NumChannels(codec_config));

View File

@ -86,6 +86,8 @@ FourCC CodecToFourCC(Codec codec, H26xStreamFormat h26x_stream_format) {
return FOURCC_dtse;
case kCodecDTSM:
return FOURCC_dtsm;
case kCodecDTSX:
return FOURCC_dtsx;
case kCodecEAC3:
return FOURCC_ec_3;
case kCodecAC4:
@ -498,6 +500,9 @@ bool MP4Muxer::GenerateAudioTrak(const AudioStreamInfo* audio_info,
audio.ddts.sampling_frequency = audio_info->sampling_frequency();
audio.ddts.pcm_sample_depth = audio_info->sample_bits();
break;
case kCodecDTSX:
audio.udts.data = audio_info->codec_config();
break;
case kCodecAC3:
audio.dac3.data = audio_info->codec_config();
break;

View File

@ -118,3 +118,11 @@ sintel-1024x436.mp4 - First 6 seconds of Sintel stream.
// http://download.opencontent.netflix.com/?prefix=TechblogAssets/Sparks/Sparks_DolbyVision_P3D65_PQ_5994fps_4096x2160_LtRt_IMF_20170214/
sparks_dovi_5.mp4 - Dolby Vision profile 5
sparks_dovi_8.mp4 - Dolby Vision profile 8
// DTS:X Profile 2 5.1 channel audio
bear-dtsx.mp4:
Generated using ffmpeg and proprietary encoder:
ffmpeg -i bear.aiff -i bear.aiff -i bear.aiff \
-filter_complex "[0:a][1:a][2:a]concat=n=3:v=0:a=1[out]" \
-ar 48000 -map "[out]" -y bear.wav
dtsx-ott-enc -w -o bear-dtsx.mp4 -c 5.1 bear.wav bear.wav bear.wav

Binary file not shown.

View File

@ -49,6 +49,9 @@ typedef MediaInfo::VideoInfo VideoInfo;
namespace {
const char kEC3Codec[] = "ec-3";
const char kAC4Codec[] = "ac-4";
const char kDTSCCodec[] = "dtsc";
const char kDTSECodec[] = "dtse";
const char kDTSXCodec[] = "dtsx";
std::string RangeToString(const Range& range) {
return absl::StrFormat("%u-%u", range.begin(), range.end());
@ -618,6 +621,18 @@ bool RepresentationXmlNode::AddAudioChannelInfo(const AudioInfo& audio_info) {
"1");
}
return ret;
} else if (audio_info.codec() == kDTSCCodec ||
audio_info.codec() == kDTSECodec) {
audio_channel_config_value =
absl::StrFormat("%u", audio_info.num_channels());
audio_channel_config_scheme =
"tag:dts.com,2014:dash:audio_channel_configuration:2012";
} else if (audio_info.codec() == kDTSXCodec) {
const auto& codec_data = audio_info.codec_specific_data();
audio_channel_config_value =
absl::StrFormat("%08X", codec_data.channel_mask());
audio_channel_config_scheme =
"tag:dts.com,2018:uhd:audio_channel_configuration";
} else {
audio_channel_config_value =
absl::StrFormat("%u", audio_info.num_channels());

View File

@ -766,5 +766,61 @@ TEST_F(LowLatencySegmentTest, LowLatencySegmentTemplate) {
"</Representation>"));
}
TEST(XmlNodeTest, AddDTSCAudioInfo) {
MediaInfo::AudioInfo audio_info;
audio_info.set_codec("dtsc");
audio_info.set_sampling_frequency(48000);
audio_info.set_num_channels(6);
RepresentationXmlNode representation;
ASSERT_TRUE(representation.AddAudioInfo(audio_info));
EXPECT_THAT(
representation,
XmlNodeEqual(
"<Representation audioSamplingRate=\"48000\">\n"
" <AudioChannelConfiguration\n"
" schemeIdUri=\n"
" \"tag:dts.com,2014:dash:audio_channel_configuration:2012\"\n"
" value=\"6\"/>\n"
"</Representation>\n"));
}
TEST(XmlNodeTest, AddDTSEAudioInfo) {
MediaInfo::AudioInfo audio_info;
audio_info.set_codec("dtse");
audio_info.set_sampling_frequency(48000);
audio_info.set_num_channels(6);
RepresentationXmlNode representation;
ASSERT_TRUE(representation.AddAudioInfo(audio_info));
EXPECT_THAT(
representation,
XmlNodeEqual(
"<Representation audioSamplingRate=\"48000\">\n"
" <AudioChannelConfiguration\n"
" schemeIdUri=\n"
" \"tag:dts.com,2014:dash:audio_channel_configuration:2012\"\n"
" value=\"6\"/>\n"
"</Representation>\n"));
}
TEST(XmlNodeTest, AddDTSXAudioInfo) {
MediaInfo::AudioInfo audio_info;
audio_info.set_codec("dtsx");
audio_info.set_sampling_frequency(48000);
audio_info.mutable_codec_specific_data()->set_channel_mask(0x3F);
RepresentationXmlNode representation;
ASSERT_TRUE(representation.AddAudioInfo(audio_info));
EXPECT_THAT(
representation,
XmlNodeEqual("<Representation audioSamplingRate=\"48000\">\n"
" <AudioChannelConfiguration\n"
" schemeIdUri=\n"
" \"tag:dts.com,2018:uhd:audio_channel_configuration\"\n"
" value=\"0000003F\"/>\n"
"</Representation>\n"));
}
} // namespace xml
} // namespace shaka