diff --git a/packager/app/test/packager_test.py b/packager/app/test/packager_test.py index 51e86d9944..3628d290a5 100755 --- a/packager/app/test/packager_test.py +++ b/packager/app/test/packager_test.py @@ -92,6 +92,16 @@ class PackagerAppTest(unittest.TestCase): self._DiffGold(self.output[1], 'bear-640x360-v-golden.mp4') self._DiffGold(self.mpd_output, 'bear-640x360-av-golden.mpd') + def testPackageAacHe(self): + self.packager.Package( + self._GetStreams( + ['audio'], test_files=['bear-640x360-aac_he-silent_right.mp4']), + self._GetFlags()) + self._DiffGold(self.output[0], + 'bear-640x360-aac_he-silent_right-golden.mp4') + self._DiffGold(self.mpd_output, + 'bear-640x360-aac_he-silent_right-golden.mpd') + # Package all video, audio, and text. def testPackageVideoAudioText(self): audio_video_streams = self._GetStreams(['audio', 'video']) diff --git a/packager/app/test/testdata/bear-640x360-aac_he-silent_right-golden.mp4 b/packager/app/test/testdata/bear-640x360-aac_he-silent_right-golden.mp4 new file mode 100644 index 0000000000..1bc42a7c78 Binary files /dev/null and b/packager/app/test/testdata/bear-640x360-aac_he-silent_right-golden.mp4 differ diff --git a/packager/app/test/testdata/bear-640x360-aac_he-silent_right-golden.mpd b/packager/app/test/testdata/bear-640x360-aac_he-silent_right-golden.mpd new file mode 100644 index 0000000000..5994c47f3b --- /dev/null +++ b/packager/app/test/testdata/bear-640x360-aac_he-silent_right-golden.mpd @@ -0,0 +1,15 @@ + + + + + + + + output_audio.mp4 + + + + + + + diff --git a/packager/media/codecs/aac_audio_specific_config.cc b/packager/media/codecs/aac_audio_specific_config.cc index 128b1c4bbd..7783405f47 100644 --- a/packager/media/codecs/aac_audio_specific_config.cc +++ b/packager/media/codecs/aac_audio_specific_config.cc @@ -25,14 +25,7 @@ const uint8_t kChannelConfigs[] = {0, 1, 2, 3, 4, 5, 6, 8}; namespace shaka { namespace media { -AACAudioSpecificConfig::AACAudioSpecificConfig() - : audio_object_type_(0), - 
frequency_index_(0), - channel_config_(0), - ps_present_(false), - frequency_(0), - extension_frequency_(0), - num_channels_(0) {} +AACAudioSpecificConfig::AACAudioSpecificConfig() {} AACAudioSpecificConfig::~AACAudioSpecificConfig() {} @@ -41,9 +34,10 @@ bool AACAudioSpecificConfig::Parse(const std::vector& data) { return false; BitReader reader(&data[0], data.size()); - uint8_t extension_type = 0; + uint8_t extension_type = AOT_NULL; uint8_t extension_frequency_index = 0xff; + sbr_present_ = false; ps_present_ = false; frequency_ = 0; extension_frequency_ = 0; @@ -62,9 +56,10 @@ bool AACAudioSpecificConfig::Parse(const std::vector& data) { RCHECK(reader.ReadBits(4, &channel_config_)); // Read extension configuration. - if (audio_object_type_ == 5 || audio_object_type_ == 29) { - ps_present_ = (audio_object_type_ == 29); - extension_type = 5; + if (audio_object_type_ == AOT_SBR || audio_object_type_ == AOT_PS) { + sbr_present_ = audio_object_type_ == AOT_SBR; + ps_present_ = audio_object_type_ == AOT_PS; + extension_type = AOT_SBR; RCHECK(reader.ReadBits(4, &extension_frequency_index)); if (extension_frequency_index == 0xf) RCHECK(reader.ReadBits(24, &extension_frequency_)); @@ -78,7 +73,7 @@ bool AACAudioSpecificConfig::Parse(const std::vector& data) { // Read extension configuration again // Note: The check for 16 available bits comes from the AAC spec. 
- if (extension_type != 5 && reader.bits_available() >= 16) { + if (extension_type != AOT_SBR && reader.bits_available() >= 16) { uint16_t sync_extension_type; uint8_t sbr_present_flag; uint8_t ps_present_flag; @@ -87,6 +82,7 @@ bool AACAudioSpecificConfig::Parse(const std::vector& data) { sync_extension_type == 0x2b7) { if (reader.ReadBits(5, &extension_type) && extension_type == 5) { RCHECK(reader.ReadBits(1, &sbr_present_flag)); + sbr_present_ = sbr_present_flag != 0; if (sbr_present_flag) { RCHECK(reader.ReadBits(4, &extension_frequency_index)); @@ -141,7 +137,7 @@ bool AACAudioSpecificConfig::ConvertToADTS(std::vector* buffer) const { adts[0] = 0xff; adts[1] = 0xf1; adts[2] = ((audio_object_type_ - 1) << 6) + (frequency_index_ << 2) + - (channel_config_ >> 2); + (channel_config_ >> 2); adts[3] = ((channel_config_ & 0x3) << 6) + static_cast(size >> 11); adts[4] = static_cast((size & 0x7ff) >> 3); adts[5] = static_cast(((size & 7) << 5) + 0x1f); @@ -150,12 +146,20 @@ bool AACAudioSpecificConfig::ConvertToADTS(std::vector* buffer) const { return true; } -uint32_t AACAudioSpecificConfig::GetOutputSamplesPerSecond( - bool sbr_in_mimetype) const { +AACAudioSpecificConfig::AudioObjectType +AACAudioSpecificConfig::GetAudioObjectType() const { + if (ps_present_) + return AOT_PS; + if (sbr_present_) + return AOT_SBR; + return audio_object_type_; +} + +uint32_t AACAudioSpecificConfig::GetSamplesPerSecond() const { if (extension_frequency_ > 0) return extension_frequency_; - if (!sbr_in_mimetype) + if (!sbr_present_) return frequency_; // The following code is written according to ISO 14496 Part 3 Table 1.11 and @@ -165,11 +169,11 @@ uint32_t AACAudioSpecificConfig::GetOutputSamplesPerSecond( return std::min(2 * frequency_, 48000u); } -uint8_t AACAudioSpecificConfig::GetNumChannels(bool sbr_in_mimetype) const { +uint8_t AACAudioSpecificConfig::GetNumChannels() const { // Check for implicit signalling of HE-AAC and indicate stereo output // if the mono channel 
configuration is signalled. // See ISO-14496-3 Section 1.6.6.1.2 for details about this special casing. - if (sbr_in_mimetype && channel_config_ == 1) + if (sbr_present_ && channel_config_ == 1) return 2; // CHANNEL_LAYOUT_STEREO // When Parametric Stereo is on, mono will be played as stereo. diff --git a/packager/media/codecs/aac_audio_specific_config.h b/packager/media/codecs/aac_audio_specific_config.h index e7e61742b3..5be25b8276 100644 --- a/packager/media/codecs/aac_audio_specific_config.h +++ b/packager/media/codecs/aac_audio_specific_config.h @@ -22,6 +22,56 @@ class BitReader; /// for more details. class AACAudioSpecificConfig { public: + // Audio Object Types specified in ISO 14496-3 (2005), Table 1.3 + enum AudioObjectType { + AOT_NULL = 0, + AOT_AAC_MAIN = 1, // Main + AOT_AAC_LC = 2, // Low Complexity + AOT_AAC_SSR = 3, // Scalable Sample Rate + AOT_AAC_LTP = 4, // Long Term Prediction + AOT_SBR = 5, // Spectral Band Replication + AOT_AAC_SCALABLE = 6, // Scalable + AOT_TWINVQ = 7, // Twin Vector Quantizer + AOT_CELP = 8, // Code Excited Linear Prediction + AOT_HVXC = 9, // Harmonic Vector eXcitation Coding + AOT_TTSI = 12, // Text-To-Speech Interface + AOT_MAINSYNTH = 13, // Main Synthesis + AOT_WAVESYNTH = 14, // Wavetable Synthesis + AOT_MIDI = 15, // General MIDI + AOT_SAFX = 16, // Algorithmic Synthesis and Audio Effects + AOT_ER_AAC_LC = 17, // Error Resilient Low Complexity + AOT_ER_AAC_LTP = 19, // Error Resilient Long Term Prediction + AOT_ER_AAC_SCALABLE = 20, // Error Resilient Scalable + AOT_ER_TWINVQ = 21, // Error Resilient Twin Vector Quantizer + AOT_ER_BSAC = 22, // Error Resilient Bit-Sliced Arithmetic Coding + AOT_ER_AAC_LD = 23, // Error Resilient Low Delay + AOT_ER_CELP = 24, // Error Resilient Code Excited Linear + // Prediction + AOT_ER_HVXC = 25, // Error Resilient Harmonic Vector eXcitation + // Coding + AOT_ER_HILN = 26, // Error Resilient Harmonic and Individual Lines + // plus Noise + AOT_ER_PARAM = 27, // Error Resilient 
Parametric + AOT_SSC = 28, // SinuSoidal Coding + AOT_PS = 29, // Parametric Stereo + AOT_SURROUND = 30, // MPEG Surround + AOT_ESCAPE = 31, // Escape Value + AOT_L1 = 32, // Layer 1 + AOT_L2 = 33, // Layer 2 + AOT_L3 = 34, // Layer 3 + AOT_DST = 35, // Direct Stream Transfer + AOT_ALS = 36, // Audio LosslesS + AOT_SLS = 37, // Scalable LosslesS + AOT_SLS_NON_CORE = 38, // Scalable LosslesS (non core) + AOT_ER_AAC_ELD = 39, // Error Resilient Enhanced Low Delay + AOT_SMR_SIMPLE = 40, // Symbolic Music Representation Simple + AOT_SMR_MAIN = 41, // Symbolic Music Representation Main + AOT_USAC_NOSBR = 42, // Unified Speech and Audio Coding (no SBR) + AOT_SAOC = 43, // Spatial Audio Object Coding + AOT_LD_SURROUND = 44, // Low Delay MPEG Surround + AOT_USAC = 45, // Unified Speech and Audio Coding + }; + AACAudioSpecificConfig(); virtual ~AACAudioSpecificConfig(); @@ -40,28 +90,26 @@ class AACAudioSpecificConfig { /// @return true on success, false otherwise. virtual bool ConvertToADTS(std::vector* buffer) const; - /// @param sbr_in_mimetype indicates whether SBR mode is specified in the - /// mimetype, i.e. codecs parameter contains mp4a.40.5. - /// @return Output sample rate for the AAC stream. - uint32_t GetOutputSamplesPerSecond(bool sbr_in_mimetype) const; + /// @return The audio object type for this AAC config, with possible extension + /// considered. + AudioObjectType GetAudioObjectType() const; - /// @param sbr_in_mimetype indicates whether SBR mode is specified in the - /// mimetype, i.e. codecs parameter contains mp4a.40.5. - /// @return Number of channels for the AAC stream. - uint8_t GetNumChannels(bool sbr_in_mimetype) const; + /// @return Sample rate for the AAC stream, with possible extensions + /// considered. + uint32_t GetSamplesPerSecond() const; - /// @return The audio object type for this AAC config. - uint8_t audio_object_type() const { return audio_object_type_; } - - /// @return The sampling frequency for this AAC config. 
- uint32_t frequency() const { return frequency_; } - - /// @return Number of channels for this AAC config. - uint8_t num_channels() const { return num_channels_; } + /// @return Number of channels for the AAC stream, with possible extensions + /// considered. + uint8_t GetNumChannels() const; /// Size in bytes of the ADTS header added by ConvertEsdsToADTS(). static const size_t kADTSHeaderSize = 7; + /// @return whether Spectral Band Replication (SBR) is present in the stream. + bool sbr_present() const { return sbr_present_; } + /// Indicate whether SBR is present in the stream. + void set_sbr_present(bool sbr_present) { sbr_present_ = sbr_present; } + private: bool SkipDecoderGASpecificConfig(BitReader* bit_reader) const; bool SkipErrorSpecificConfig() const; @@ -69,18 +117,20 @@ class AACAudioSpecificConfig { // The following variables store the AAC specific configuration information // that are used to generate the ADTS header. - uint8_t audio_object_type_; - uint8_t frequency_index_; - uint8_t channel_config_; - // Is Parametric Stereo on? - bool ps_present_; + AudioObjectType audio_object_type_ = AOT_NULL; + uint8_t frequency_index_ = 0; + uint8_t channel_config_ = 0; + // Is Spectral Band Replication (SBR) available? + bool sbr_present_ = false; + // Is Parametric Stereo available? + bool ps_present_ = false; // The following variables store audio configuration information. // They are based on the AAC specific configuration but can be overridden // by extensions in elementary stream descriptor. 
- uint32_t frequency_; - uint32_t extension_frequency_; - uint8_t num_channels_; + uint32_t frequency_ = 0; + uint32_t extension_frequency_ = 0; + uint8_t num_channels_ = 0; }; } // namespace media diff --git a/packager/media/codecs/aac_audio_specific_config_unittest.cc b/packager/media/codecs/aac_audio_specific_config_unittest.cc index a5432d83ea..ab46631f63 100644 --- a/packager/media/codecs/aac_audio_specific_config_unittest.cc +++ b/packager/media/codecs/aac_audio_specific_config_unittest.cc @@ -17,8 +17,10 @@ TEST(AACAudioSpecificConfigTest, BasicProfileTest) { data.assign(buffer, buffer + sizeof(buffer)); EXPECT_TRUE(aac_audio_specific_config.Parse(data)); - EXPECT_EQ(44100u, aac_audio_specific_config.GetOutputSamplesPerSecond(false)); - EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels(false)); + EXPECT_EQ(44100u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_AAC_LC, + aac_audio_specific_config.GetAudioObjectType()); } TEST(AACAudioSpecificConfigTest, ExtensionTest) { @@ -29,9 +31,11 @@ TEST(AACAudioSpecificConfigTest, ExtensionTest) { data.assign(buffer, buffer + sizeof(buffer)); EXPECT_TRUE(aac_audio_specific_config.Parse(data)); - EXPECT_EQ(48000u, aac_audio_specific_config.GetOutputSamplesPerSecond(false)); - EXPECT_EQ(48000u, aac_audio_specific_config.GetOutputSamplesPerSecond(true)); - EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels(false)); + EXPECT_EQ(48000u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels()); + EXPECT_TRUE(aac_audio_specific_config.sbr_present()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_PS, + aac_audio_specific_config.GetAudioObjectType()); } // Test implicit SBR with mono channel config. @@ -47,13 +51,17 @@ TEST(AACAudioSpecificConfigTest, ImplicitSBR_ChannelConfig0) { EXPECT_TRUE(aac_audio_specific_config.Parse(data)); - // Test w/o implict SBR. 
- EXPECT_EQ(24000u, aac_audio_specific_config.GetOutputSamplesPerSecond(false)); - EXPECT_EQ(1u, aac_audio_specific_config.GetNumChannels(false)); + EXPECT_EQ(24000u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(1u, aac_audio_specific_config.GetNumChannels()); + EXPECT_FALSE(aac_audio_specific_config.sbr_present()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_AAC_LC, + aac_audio_specific_config.GetAudioObjectType()); - // Test implicit SBR. - EXPECT_EQ(48000u, aac_audio_specific_config.GetOutputSamplesPerSecond(true)); - EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels(true)); + aac_audio_specific_config.set_sbr_present(true); + EXPECT_EQ(48000u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_SBR, + aac_audio_specific_config.GetAudioObjectType()); } // Tests implicit SBR with a stereo channel config. @@ -66,13 +74,17 @@ TEST(AACAudioSpecificConfigTest, ImplicitSBR_ChannelConfig1) { EXPECT_TRUE(aac_audio_specific_config.Parse(data)); - // Test w/o implict SBR. - EXPECT_EQ(24000u, aac_audio_specific_config.GetOutputSamplesPerSecond(false)); - EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels(false)); + EXPECT_EQ(24000u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels()); + EXPECT_FALSE(aac_audio_specific_config.sbr_present()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_AAC_LC, + aac_audio_specific_config.GetAudioObjectType()); - // Test implicit SBR. 
- EXPECT_EQ(48000u, aac_audio_specific_config.GetOutputSamplesPerSecond(true)); - EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels(true)); + aac_audio_specific_config.set_sbr_present(true); + EXPECT_EQ(48000u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_SBR, + aac_audio_specific_config.GetAudioObjectType()); } TEST(AACAudioSpecificConfigTest, SixChannelTest) { @@ -83,8 +95,10 @@ TEST(AACAudioSpecificConfigTest, SixChannelTest) { data.assign(buffer, buffer + sizeof(buffer)); EXPECT_TRUE(aac_audio_specific_config.Parse(data)); - EXPECT_EQ(48000u, aac_audio_specific_config.GetOutputSamplesPerSecond(false)); - EXPECT_EQ(6u, aac_audio_specific_config.GetNumChannels(false)); + EXPECT_EQ(48000u, aac_audio_specific_config.GetSamplesPerSecond()); + EXPECT_EQ(6u, aac_audio_specific_config.GetNumChannels()); + EXPECT_EQ(AACAudioSpecificConfig::AOT_AAC_LC, + aac_audio_specific_config.GetAudioObjectType()); } TEST(AACAudioSpecificConfigTest, DataTooShortTest) { diff --git a/packager/media/formats/mp2t/program_map_table_writer.cc b/packager/media/formats/mp2t/program_map_table_writer.cc index 491a367b90..d1b5385b24 100644 --- a/packager/media/formats/mp2t/program_map_table_writer.cc +++ b/packager/media/formats/mp2t/program_map_table_writer.cc @@ -209,20 +209,15 @@ bool WriteAacAudioSetupInformation(const uint8_t* aac_audio_specific_config, return false; } - const uint8_t kAacLc = 2; - const uint8_t kAacHeV1 = 5; - const uint8_t kAacHeV2 = 29; - uint8_t audio_object_type = 2; - audio_object_type = config.audio_object_type(); - + auto audio_object_type = config.GetAudioObjectType(); switch (audio_object_type) { - case kAacLc: + case AACAudioSpecificConfig::AOT_AAC_LC: audio_setup_information->AppendInt(FOURCC_zaac); break; - case kAacHeV1: + case AACAudioSpecificConfig::AOT_SBR: audio_setup_information->AppendInt(FOURCC_zach); break; - case kAacHeV2: + case 
AACAudioSpecificConfig::AOT_PS: audio_setup_information->AppendInt(FOURCC_zacp); break; default: diff --git a/packager/media/formats/mp4/mp4_media_parser.cc b/packager/media/formats/mp4/mp4_media_parser.cc index 06e1c65015..c256c4fbb1 100644 --- a/packager/media/formats/mp4/mp4_media_parser.cc +++ b/packager/media/formats/mp4/mp4_media_parser.cc @@ -369,9 +369,10 @@ bool MP4MediaParser::ParseMoov(BoxReader* reader) { codec = kCodecAAC; const AACAudioSpecificConfig& aac_audio_specific_config = entry.esds.aac_audio_specific_config; - num_channels = aac_audio_specific_config.num_channels(); - sampling_frequency = aac_audio_specific_config.frequency(); - audio_object_type = aac_audio_specific_config.audio_object_type(); + num_channels = aac_audio_specific_config.GetNumChannels(); + sampling_frequency = + aac_audio_specific_config.GetSamplesPerSecond(); + audio_object_type = aac_audio_specific_config.GetAudioObjectType(); codec_config = entry.esds.es_descriptor.decoder_specific_info(); break; } else if (entry.esds.es_descriptor.IsDTS()) { @@ -394,7 +395,7 @@ bool MP4MediaParser::ParseMoov(BoxReader* reader) { << " in stsd box."; return false; } - num_channels = entry.esds.aac_audio_specific_config.num_channels(); + num_channels = entry.channelcount; // For dts audio in esds, current supported number of channels is 6 // as the only supported channel layout is 5.1. 
if (num_channels != kDtsAudioNumChannels) { diff --git a/packager/media/formats/wvm/wvm_media_parser.cc b/packager/media/formats/wvm/wvm_media_parser.cc index aa4ca37693..1931253a85 100644 --- a/packager/media/formats/wvm/wvm_media_parser.cc +++ b/packager/media/formats/wvm/wvm_media_parser.cc @@ -920,7 +920,7 @@ bool WvmMediaParser::Output(bool output_encrypted_sample) { adts_header.GetSamplingFrequency()); std::vector audio_specific_config; if (!adts_header.GetAudioSpecificConfig(&audio_specific_config)) { - LOG(ERROR) << "Could not compute AACaudiospecificconfig"; + LOG(ERROR) << "Could not compute AACAudiospecificconfig"; return false; } audio_stream_info->set_codec_config(audio_specific_config); @@ -935,10 +935,11 @@ bool WvmMediaParser::Output(bool output_encrypted_sample) { LOG(ERROR) << "Could not parse AACAudioSpecificconfig"; return false; } - audio_stream_info->set_sampling_frequency(aac_config.frequency()); + audio_stream_info->set_sampling_frequency( + aac_config.GetSamplesPerSecond()); audio_stream_info->set_codec_string( AudioStreamInfo::GetCodecString( - kCodecAAC, aac_config.audio_object_type())); + kCodecAAC, aac_config.GetAudioObjectType())); } } } diff --git a/packager/media/test/data/README b/packager/media/test/data/README index 78d7b3e899..4bb7dcf419 100644 --- a/packager/media/test/data/README +++ b/packager/media/test/data/README @@ -31,6 +31,8 @@ bear-640x360-hevc.mp4 - Same content, but encoded with HEVC. bear-320x180.mp4 - Same as above, but in a different resolution. bear-640x360-trailing-moov.mp4 - Same content, but with moov box in the end. bear-640x360-av_frag.mp4 - Same content, but in fragmented mp4. +bear-640x360-aac_lc-silent_right.mp4 - Audio only, stereo, but right channel is silent, with AAC-LC profile. +bear-640x360-aac_he-silent_right.mp4 - Same as above, but with AAC-HE profile. // Non square pixels. 
bear-640x360-non_square_pixel-with_pasp.mp4 - A non-square pixel version of the video track of bear-640x360.mp4 with PixelAspectRatio box. diff --git a/packager/media/test/data/bear-640x360-aac_he-silent_right.mp4 b/packager/media/test/data/bear-640x360-aac_he-silent_right.mp4 new file mode 100644 index 0000000000..d0638e1eeb Binary files /dev/null and b/packager/media/test/data/bear-640x360-aac_he-silent_right.mp4 differ diff --git a/packager/media/test/data/bear-640x360-aac_lc-silent_right.mp4 b/packager/media/test/data/bear-640x360-aac_lc-silent_right.mp4 new file mode 100644 index 0000000000..f547844625 Binary files /dev/null and b/packager/media/test/data/bear-640x360-aac_lc-silent_right.mp4 differ