shaka-packager/packager/media/codecs/aac_audio_specific_config.cc

398 lines
13 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <packager/media/codecs/aac_audio_specific_config.h>
#include <algorithm>
#include <glog/logging.h>
#include <packager/media/base/bit_reader.h>
#include <packager/media/base/rcheck.h>
namespace shaka {
namespace media {
namespace {
// Sampling Frequency Index table, from ISO 14496-3 Table 1.16
static const uint32_t kSampleRates[] = {96000, 88200, 64000, 48000, 44100,
32000, 24000, 22050, 16000, 12000,
11025, 8000, 7350};
// Channel Configuration table, from ISO 14496-3 Table 1.17
const uint8_t kChannelConfigs[] = {0, 1, 2, 3, 4, 5, 6, 8};
// ISO 14496-3 Table 4.2 Syntax of program_config_element()
// program_config_element()
// ...
// element_is_cpe[i]; 1 bslbf
// element_tag_select[i]; 4 uimsbf
bool CountChannels(uint8_t num_elements,
uint8_t* num_channels,
BitReader* bit_reader) {
for (uint8_t i = 0; i < num_elements; ++i) {
bool is_pair = false;
RCHECK(bit_reader->ReadBits(1, &is_pair));
*num_channels += is_pair ? 2 : 1;
RCHECK(bit_reader->SkipBits(4));
}
return true;
}
} // namespace
AACAudioSpecificConfig::AACAudioSpecificConfig() {}
AACAudioSpecificConfig::~AACAudioSpecificConfig() {}
bool AACAudioSpecificConfig::Parse(const std::vector<uint8_t>& data) {
if (data.empty())
return false;
BitReader reader(&data[0], data.size());
uint8_t extension_type = AOT_NULL;
uint8_t extension_frequency_index = 0xff;
sbr_present_ = false;
ps_present_ = false;
frequency_ = 0;
extension_frequency_ = 0;
// The following code is written according to ISO 14496 Part 3 Table 1.13 -
// Syntax of AudioSpecificConfig.
// Read base configuration.
// Audio Object Types specified in "ISO/IEC 14496-3:2019, Table 1.19"
RCHECK(ParseAudioObjectType(&reader));
RCHECK(reader.ReadBits(4, &frequency_index_));
if (frequency_index_ == 0xf)
RCHECK(reader.ReadBits(24, &frequency_));
RCHECK(reader.ReadBits(4, &channel_config_));
RCHECK(channel_config_ < std::size(kChannelConfigs));
num_channels_ = kChannelConfigs[channel_config_];
// Read extension configuration.
if (audio_object_type_ == AOT_SBR || audio_object_type_ == AOT_PS) {
sbr_present_ = audio_object_type_ == AOT_SBR;
ps_present_ = audio_object_type_ == AOT_PS;
extension_type = AOT_SBR;
RCHECK(reader.ReadBits(4, &extension_frequency_index));
if (extension_frequency_index == 0xf)
RCHECK(reader.ReadBits(24, &extension_frequency_));
RCHECK(ParseAudioObjectType(&reader));
}
RCHECK(ParseDecoderGASpecificConfig(&reader));
RCHECK(SkipErrorSpecificConfig());
// Read extension configuration again
// Note: The check for 16 available bits comes from the AAC spec.
if (extension_type != AOT_SBR && reader.bits_available() >= 16) {
uint16_t sync_extension_type;
uint8_t sbr_present_flag;
uint8_t ps_present_flag;
if (reader.ReadBits(11, &sync_extension_type) &&
sync_extension_type == 0x2b7) {
if (reader.ReadBits(5, &extension_type) && extension_type == 5) {
RCHECK(reader.ReadBits(1, &sbr_present_flag));
sbr_present_ = sbr_present_flag != 0;
if (sbr_present_flag) {
RCHECK(reader.ReadBits(4, &extension_frequency_index));
if (extension_frequency_index == 0xf)
RCHECK(reader.ReadBits(24, &extension_frequency_));
// Note: The check for 12 available bits comes from the AAC spec.
if (reader.bits_available() >= 12) {
RCHECK(reader.ReadBits(11, &sync_extension_type));
if (sync_extension_type == 0x548) {
RCHECK(reader.ReadBits(1, &ps_present_flag));
ps_present_ = ps_present_flag != 0;
}
}
}
}
}
}
if (frequency_ == 0) {
RCHECK(frequency_index_ < std::size(kSampleRates));
frequency_ = kSampleRates[frequency_index_];
}
if (extension_frequency_ == 0 && extension_frequency_index != 0xff) {
RCHECK(extension_frequency_index < std::size(kSampleRates));
extension_frequency_ = kSampleRates[extension_frequency_index];
}
if (audio_object_type_ == AOT_USAC) {
return frequency_ != 0 && num_channels_ != 0 && channel_config_ <= 7;
} else {
return frequency_ != 0 && num_channels_ != 0 && audio_object_type_ >= 1 &&
audio_object_type_ <= 4 && frequency_index_ != 0xf &&
channel_config_ <= 7;
}
}
bool AACAudioSpecificConfig::ConvertToADTS(
const uint8_t* data,
size_t data_size,
std::vector<uint8_t>* audio_frame) const {
DCHECK(audio_object_type_ >= 1 && audio_object_type_ <= 4 &&
frequency_index_ != 0xf && channel_config_ <= 7);
size_t size = kADTSHeaderSize + data_size;
// ADTS header uses 13 bits for packet size.
if (size >= (1 << 13))
return false;
audio_frame->reserve(size);
audio_frame->resize(kADTSHeaderSize);
audio_frame->at(0) = 0xff;
audio_frame->at(1) = 0xf1;
audio_frame->at(2) = ((audio_object_type_ - 1) << 6) +
(frequency_index_ << 2) + (channel_config_ >> 2);
audio_frame->at(3) =
((channel_config_ & 0x3) << 6) + static_cast<uint8_t>(size >> 11);
audio_frame->at(4) = static_cast<uint8_t>((size & 0x7ff) >> 3);
audio_frame->at(5) = static_cast<uint8_t>(((size & 7) << 5) + 0x1f);
audio_frame->at(6) = 0xfc;
audio_frame->insert(audio_frame->end(), data, data + data_size);
return true;
}
AACAudioSpecificConfig::AudioObjectType
AACAudioSpecificConfig::GetAudioObjectType() const {
if (ps_present_)
return AOT_PS;
if (sbr_present_)
return AOT_SBR;
return audio_object_type_;
}
uint32_t AACAudioSpecificConfig::GetSamplesPerSecond() const {
if (extension_frequency_ > 0)
return extension_frequency_;
if (!sbr_present_)
return frequency_;
// The following code is written according to ISO 14496 Part 3 Table 1.11 and
// Table 1.22. (Table 1.11 refers to the capping to 48000, Table 1.22 refers
// to SBR doubling the AAC sample rate.)
DCHECK_GT(frequency_, 0u);
return std::min(2 * frequency_, 48000u);
}
uint8_t AACAudioSpecificConfig::GetNumChannels() const {
// Check for implicit signalling of HE-AAC and indicate stereo output
// if the mono channel configuration is signalled.
// See ISO-14496-3 Section 1.6.6.1.2 for details about this special casing.
if (sbr_present_ && channel_config_ == 1)
return 2; // CHANNEL_LAYOUT_STEREO
// When Parametric Stereo is on, mono will be played as stereo.
if (ps_present_ && channel_config_ == 1)
return 2; // CHANNEL_LAYOUT_STEREO
return num_channels_;
}
bool AACAudioSpecificConfig::ParseAudioObjectType(BitReader* bit_reader) {
RCHECK(bit_reader->ReadBits(5, &audio_object_type_));
if (audio_object_type_ == AOT_ESCAPE) {
uint8_t audioObjectTypeExt;
RCHECK(bit_reader->ReadBits(6, &audioObjectTypeExt));
audio_object_type_ = static_cast<AudioObjectType>(32 + audioObjectTypeExt);
}
return true;
}
// Currently this function only support GASpecificConfig defined in
// ISO 14496 Part 3 Table 4.1 - Syntax of GASpecificConfig()
bool AACAudioSpecificConfig::ParseDecoderGASpecificConfig(
BitReader* bit_reader) {
switch (audio_object_type_) {
case 1:
case 2:
case 3:
case 4:
case 6:
case 7:
case 17:
case 19:
case 20:
case 21:
case 22:
case 23:
return ParseGASpecificConfig(bit_reader);
case 42:
// Skip UsacConfig() parsing until required
RCHECK(bit_reader->SkipBits(bit_reader->bits_available()));
return true;
default:
break;
}
return false;
}
bool AACAudioSpecificConfig::SkipErrorSpecificConfig() const {
switch (audio_object_type_) {
case 17:
case 19:
case 20:
case 21:
case 22:
case 23:
case 24:
case 25:
case 26:
case 27:
return false;
default:
break;
}
return true;
}
// The following code is written according to ISO 14496 part 3 Table 4.1 -
// GASpecificConfig.
bool AACAudioSpecificConfig::ParseGASpecificConfig(BitReader* bit_reader) {
uint8_t extension_flag = 0;
uint8_t depends_on_core_coder;
uint16_t dummy;
RCHECK(bit_reader->ReadBits(1, &dummy)); // frameLengthFlag
RCHECK(bit_reader->ReadBits(1, &depends_on_core_coder));
if (depends_on_core_coder == 1)
RCHECK(bit_reader->ReadBits(14, &dummy)); // coreCoderDelay
RCHECK(bit_reader->ReadBits(1, &extension_flag));
if (channel_config_ == 0)
RCHECK(ParseProgramConfigElement(bit_reader));
if (audio_object_type_ == 6 || audio_object_type_ == 20)
RCHECK(bit_reader->ReadBits(3, &dummy)); // layerNr
if (extension_flag) {
if (audio_object_type_ == 22) {
RCHECK(bit_reader->ReadBits(5, &dummy)); // numOfSubFrame
RCHECK(bit_reader->ReadBits(11, &dummy)); // layer_length
}
if (audio_object_type_ == 17 || audio_object_type_ == 19 ||
audio_object_type_ == 20 || audio_object_type_ == 23) {
RCHECK(bit_reader->ReadBits(3, &dummy)); // resilience flags
}
RCHECK(bit_reader->ReadBits(1, &dummy)); // extensionFlag3
}
return true;
}
// ISO 14496-3 Table 4.2 Syntax of program_config_element()
// program_config_element()
// {
// element_instance_tag; 4 uimsbf
// object_type; 2 uimsbf
// sampling_frequency_index; 4 uimsbf
// num_front_channel_elements; 4 uimsbf
// num_side_channel_elements; 4 uimsbf
// num_back_channel_elements; 4 uimsbf
// num_lfe_channel_elements; 2 uimsbf
// num_assoc_data_elements; 3 uimsbf
// num_valid_cc_elements; 4 uimsbf
// mono_mixdown_present; 1 uimsbf
// if (mono_mixdown_present == 1)
// mono_mixdown_element_number; 4 uimsbf
// stereo_mixdown_present; 1 uimsbf
// if (stereo_mixdown_present == 1)
// stereo_mixdown_element_number; 4 uimsbf
// matrix_mixdown_idx_present; 1 uimsbf
// if (matrix_mixdown_idx_present == 1) {
// matrix_mixdown_idx ; 2 uimsbf
// pseudo_surround_enable; 1 uimsbf
// }
// for (i = 0; i < num_front_channel_elements; i++) {
// front_element_is_cpe[i]; 1 bslbf
// front_element_tag_select[i]; 4 uimsbf
// }
// for (i = 0; i < num_side_channel_elements; i++) {
// side_element_is_cpe[i]; 1 bslbf
// side_element_tag_select[i]; 4 uimsbf
// }
// for (i = 0; i < num_back_channel_elements; i++) {
// back_element_is_cpe[i]; 1 bslbf
// back_element_tag_select[i]; 4 uimsbf
// }
// for (i = 0; i < num_lfe_channel_elements; i++)
// lfe_element_tag_select[i]; 4 uimsbf
// for ( i = 0; i < num_assoc_data_elements; i++)
// assoc_data_element_tag_select[i]; 4 uimsbf
// for (i = 0; i < num_valid_cc_elements; i++) {
// cc_element_is_ind_sw[i]; 1 uimsbf
// valid_cc_element_tag_select[i]; 4 uimsbf
// }
// byte_alignment(); Note 1
// comment_field_bytes; 8 uimsbf
// for (i = 0; i < comment_field_bytes; i++)
// comment_field_data[i]; 8 uimsbf
// }
// Note 1: If called from within an AudioSpecificConfig(), this
// byte_alignment shall be relative to the start of the AudioSpecificConfig().
bool AACAudioSpecificConfig::ParseProgramConfigElement(BitReader* bit_reader) {
// element_instance_tag (4), object_type (2), sampling_frequency_index (4).
RCHECK(bit_reader->SkipBits(4 + 2 + 4));
uint8_t num_front_channel_elements = 0;
uint8_t num_side_channel_elements = 0;
uint8_t num_back_channel_elements = 0;
uint8_t num_lfe_channel_elements = 0;
RCHECK(bit_reader->ReadBits(4, &num_front_channel_elements));
RCHECK(bit_reader->ReadBits(4, &num_side_channel_elements));
RCHECK(bit_reader->ReadBits(4, &num_back_channel_elements));
RCHECK(bit_reader->ReadBits(2, &num_lfe_channel_elements));
uint8_t num_assoc_data_elements = 0;
RCHECK(bit_reader->ReadBits(3, &num_assoc_data_elements));
uint8_t num_valid_cc_elements = 0;
RCHECK(bit_reader->ReadBits(4, &num_valid_cc_elements));
RCHECK(bit_reader->SkipBitsConditional(true, 4)); // mono_mixdown
RCHECK(bit_reader->SkipBitsConditional(true, 4)); // stereo_mixdown
RCHECK(bit_reader->SkipBitsConditional(true, 3)); // matrix_mixdown_idx
num_channels_ = 0;
RCHECK(CountChannels(num_front_channel_elements, &num_channels_, bit_reader));
RCHECK(CountChannels(num_side_channel_elements, &num_channels_, bit_reader));
RCHECK(CountChannels(num_back_channel_elements, &num_channels_, bit_reader));
num_channels_ += num_lfe_channel_elements;
RCHECK(bit_reader->SkipBits(4 * num_lfe_channel_elements));
RCHECK(bit_reader->SkipBits(4 * num_assoc_data_elements));
RCHECK(bit_reader->SkipBits(5 * num_valid_cc_elements));
bit_reader->SkipToNextByte();
uint8_t comment_field_bytes = 0;
RCHECK(bit_reader->ReadBits(8, &comment_field_bytes));
RCHECK(bit_reader->SkipBytes(comment_field_bytes));
return true;
}
} // namespace media
} // namespace shaka