feat: Add xHE-AAC support (#1092)

Note: * An xHE-AAC capable encoder will automatically adjust the user-specified SAP/RAP interval to the allowed grid on which SAPs/RAPs can occur, e.g. `-rapInterval 5000` (5s) may result in actual SAPs/RAPs every 4.984s. * To ensure that a SAP/RAP starts a new segment, Shaka needs to be executed with a `--segment_duration` that is less than or equal to that adjusted value. * If every SAP/RAP should trigger a new segment, just set the segment duration to a very low value, e.g. `--segment_duration 0.1`
2022-10-18 10:14:31 -07:00 · 2022-10-18 10:14:31 -07:00 · 5d998fca7f
parent 31129eed64
commit 5d998fca7f
4 changed files with 49 additions and 13 deletions
--- a/1
+++ b/1
@ -31,6 +31,7 @@ Daniel Cantarín <canta@canta.com.ar>
 David Cavar <pal3thorn@gmail.com>
 Evgeny Zajcev <zevlg@yandex.ru>
 Gabe Kopley <gabe@philo.com>
+Geoff Jukes <geoff@jukes.org>
 Haoming Chen <hmchen@google.com>
 Jacob Trimble <modmaker@google.com>
 Joe Foraci <jforaci@gmail.com>
--- a/packager/media/codecs/aac_audio_specific_config.cc
+++ b/packager/media/codecs/aac_audio_specific_config.cc
@ -62,10 +62,9 @@ bool AACAudioSpecificConfig::Parse(const std::vector<uint8_t>& data) {
  // Syntax of AudioSpecificConfig.

  // Read base configuration.
-  // Audio Object Types specified in ISO 14496-3, Table 1.15.
-  RCHECK(reader.ReadBits(5, &audio_object_type_));
-  // Audio objects type >=31 is not supported yet.
-  RCHECK(audio_object_type_ < 31);
+  // Audio Object Types specified in "ISO/IEC 14496-3:2019, Table 1.19"
+  RCHECK(ParseAudioObjectType(&reader));
+
  RCHECK(reader.ReadBits(4, &frequency_index_));
  if (frequency_index_ == 0xf)
    RCHECK(reader.ReadBits(24, &frequency_));
@ -82,9 +81,7 @@ bool AACAudioSpecificConfig::Parse(const std::vector<uint8_t>& data) {
    RCHECK(reader.ReadBits(4, &extension_frequency_index));
    if (extension_frequency_index == 0xf)
      RCHECK(reader.ReadBits(24, &extension_frequency_));
-    RCHECK(reader.ReadBits(5, &audio_object_type_));
-    // Audio objects type >=31 is not supported yet.
-    RCHECK(audio_object_type_ < 31);
+    RCHECK(ParseAudioObjectType(&reader));
  }

  RCHECK(ParseDecoderGASpecificConfig(&reader));
@ -131,10 +128,14 @@ bool AACAudioSpecificConfig::Parse(const std::vector<uint8_t>& data) {
    RCHECK(extension_frequency_index < arraysize(kSampleRates));
    extension_frequency_ = kSampleRates[extension_frequency_index];
  }
-
-  return frequency_ != 0 && num_channels_ != 0 && audio_object_type_ >= 1 &&
-         audio_object_type_ <= 4 && frequency_index_ != 0xf &&
-         channel_config_ <= 7;
+  
+  if (audio_object_type_ == AOT_USAC) {
+    return frequency_ != 0 && num_channels_ != 0 && channel_config_ <= 7;
+  } else {
+    return frequency_ != 0 && num_channels_ != 0 && audio_object_type_ >= 1 &&
+           audio_object_type_ <= 4 && frequency_index_ != 0xf &&
+           channel_config_ <= 7;
+  }
 }

 bool AACAudioSpecificConfig::ConvertToADTS(
@ -205,6 +206,18 @@ uint8_t AACAudioSpecificConfig::GetNumChannels() const {
  return num_channels_;
 }

+bool AACAudioSpecificConfig::ParseAudioObjectType(BitReader* bit_reader) {  // Reads the audio object type from |bit_reader| into |audio_object_type_|.
+  RCHECK(bit_reader->ReadBits(5, &audio_object_type_));  // Base 5-bit field.
+  // When the 5-bit field equals AOT_ESCAPE, a 6-bit extension field is read next.
+  if (audio_object_type_ == AOT_ESCAPE) {
+    uint8_t audioObjectTypeExt;  // Extension value; the final type is 32 + this.
+    RCHECK(bit_reader->ReadBits(6, &audioObjectTypeExt));
+    audio_object_type_ = static_cast<AudioObjectType>(32 + audioObjectTypeExt);
+  }
+  
+  return true;
+}
+
 // Currently this function only support GASpecificConfig defined in
 // ISO 14496 Part 3 Table 4.1 - Syntax of GASpecificConfig()
 bool AACAudioSpecificConfig::ParseDecoderGASpecificConfig(
@ -223,6 +236,10 @@ bool AACAudioSpecificConfig::ParseDecoderGASpecificConfig(
    case 22:
    case 23:
      return ParseGASpecificConfig(bit_reader);
+    case 42:
+      // Skip UsacConfig() parsing until required
+      RCHECK(bit_reader->SkipBits(bit_reader->bits_available()));
+      return true;
    default:
      break;
  }
--- a/packager/media/codecs/aac_audio_specific_config.h
+++ b/packager/media/codecs/aac_audio_specific_config.h
@ -66,10 +66,10 @@ class AACAudioSpecificConfig {
    AOT_ER_AAC_ELD       = 39,  // Error Resilient Enhanced Low Delay
    AOT_SMR_SIMPLE       = 40,  // Symbolic Music Representation Simple
    AOT_SMR_MAIN         = 41,  // Symbolic Music Representation Main
-    AOT_USAC_NOSBR       = 42,  // Unified Speech and Audio Coding (no SBR)
+    AOT_USAC             = 42,  // Unified Speech and Audio Coding
    AOT_SAOC             = 43,  // Spatial Audio Object Coding
    AOT_LD_SURROUND      = 44,  // Low Delay MPEG Surround
-    AOT_USAC             = 45,  // Unified Speech and Audio Coding
+    SAOC_DE              = 45,  // Spatial Audio Object Coding Dialogue Enhancement
  };

  AACAudioSpecificConfig();
@ -114,6 +114,7 @@ class AACAudioSpecificConfig {
  void set_sbr_present(bool sbr_present) { sbr_present_ = sbr_present; }

 private:
+  bool ParseAudioObjectType(BitReader* bit_reader);
  bool ParseDecoderGASpecificConfig(BitReader* bit_reader);
  bool SkipErrorSpecificConfig() const;
  // Parse GASpecificConfig. Calls |ParseProgramConfigElement| if
--- a/packager/media/codecs/aac_audio_specific_config_unittest.cc
+++ b/packager/media/codecs/aac_audio_specific_config_unittest.cc
@ -101,6 +101,23 @@ TEST(AACAudioSpecificConfigTest, SixChannelTest) {
            aac_audio_specific_config.GetAudioObjectType());
 }

+TEST(AACAudioSpecificConfigTest, UsacTest) {  // Parses a USAC config and checks the reported sample rate, channel count and object type.
+  AACAudioSpecificConfig aac_audio_specific_config;
+  uint8_t buffer[] = { // Leading bits: 11111 (5 bits), 001010 (6 bits, 32 + 10 = 42), 1111 (4 bits).
+    0xF9, 0x5E, 0x01, 0x2C, 0x00, 0x52, 0x42, 0x2C, 0xC0, 0x51,  // The next 24 bits are 0x012C << 7 = 38400.
+    0x17, 0x55, 0x4F, 0x36, 0x00, 0x42, 0x80, 0x01, 0x00, 0x04,
+    0xA8, 0x82, 0x34, 0xE5, 0x80
+  };

+  std::vector<uint8_t> data(std::begin(buffer), std::end(buffer));

+  EXPECT_TRUE(aac_audio_specific_config.Parse(data));
+  EXPECT_EQ(38400u, aac_audio_specific_config.GetSamplesPerSecond());  // Matches the explicit 24-bit value in the buffer.
+  EXPECT_EQ(2u, aac_audio_specific_config.GetNumChannels());
+  EXPECT_EQ(AACAudioSpecificConfig::AOT_USAC,
+            aac_audio_specific_config.GetAudioObjectType());
+}
+
 TEST(AACAudioSpecificConfigTest, ProgramConfigElementTest) {
  uint8_t buffer[] = {
      0x11, 0x80, 0x04, 0xC8, 0x44, 0x00, 0x20, 0x00, 0xC4,