From 94401d750a72752c68f2494cb832f8370a8f12be Mon Sep 17 00:00:00 2001
From: KongQun Yang <kqyang@google.com>
Date: Wed, 18 Nov 2015 11:51:15 -0800
Subject: [PATCH] VP9 codec string from bitstream and subsample encryption
 support

- Parse vp9 bitstream to get vpx codec configuration
- Add subsample encryption for vp9
- Also fixed a bug in VP9 parser if segmentation update_map is enabled

Change-Id: I69dc97088aa38c94c6d37fdbcf3d9cfc942a3df6
---
 packager/media/base/audio_stream_info.h       |   2 +-
 packager/media/base/bit_reader.cc             |   9 +-
 packager/media/base/bit_reader.h              |  11 +-
 packager/media/base/video_stream_info.h       |   1 +
 packager/media/filters/vp9_parser.cc          |  58 +++++--
 packager/media/filters/vp9_parser.h           |  12 +-
 packager/media/filters/vp9_parser_unittest.cc |  31 +++-
 .../media/filters/vp_codec_configuration.h    |  12 +-
 .../formats/mp4/encrypting_fragmenter.cc      |  61 +++++---
 .../media/formats/mp4/encrypting_fragmenter.h |  15 +-
 .../formats/mp4/key_rotation_fragmenter.cc    |   2 +
 .../formats/mp4/key_rotation_fragmenter.h     |   4 +
 packager/media/formats/mp4/segmenter.cc       |  24 +--
 .../media/formats/webm/webm_cluster_parser.cc | 146 +++++++++++++-----
 .../media/formats/webm/webm_cluster_parser.h  |  23 +--
 .../webm/webm_cluster_parser_unittest.cc      | 125 +++++++++++++--
 .../media/formats/webm/webm_media_parser.cc   |  28 ++--
 .../media/formats/webm/webm_video_client.cc   |  22 +--
 18 files changed, 423 insertions(+), 163 deletions(-)

diff --git a/packager/media/base/audio_stream_info.h b/packager/media/base/audio_stream_info.h
index 71e3d79995..0f5ff088e9 100644
--- a/packager/media/base/audio_stream_info.h
+++ b/packager/media/base/audio_stream_info.h
@@ -71,11 +71,11 @@ class AudioStreamInfo : public StreamInfo {
     return static_cast<uint32_t>(num_channels_) * sample_bits_ / 8;
   }
 
+  void set_codec(AudioCodec codec) { codec_ = codec; }
   void set_sampling_frequency(const uint32_t sampling_frequency) {
     sampling_frequency_ = sampling_frequency;
   }
 
-
   /// @param audio_object_type is only used by AAC Codec, ignored otherwise.
   /// @return The codec string.
   static std::string GetCodecString(AudioCodec codec,
diff --git a/packager/media/base/bit_reader.cc b/packager/media/base/bit_reader.cc
index 5b1824e519..50fc6c3c70 100644
--- a/packager/media/base/bit_reader.cc
+++ b/packager/media/base/bit_reader.cc
@@ -10,7 +10,10 @@ namespace edash_packager {
 namespace media {
 
 BitReader::BitReader(const uint8_t* data, off_t size)
-    : data_(data), bytes_left_(size), num_remaining_bits_in_curr_byte_(0) {
+    : data_(data),
+      initial_size_(size),
+      bytes_left_(size),
+      num_remaining_bits_in_curr_byte_(0) {
   DCHECK(data_ != NULL && bytes_left_ > 0);
 
   UpdateCurrByte();
@@ -50,10 +53,6 @@ bool BitReader::SkipBits(int num_bits) {
   return ReadBitsInternal(num_bits, &not_needed);
 }
 
-int BitReader::bits_available() const {
-  return 8 * bytes_left_ + num_remaining_bits_in_curr_byte_;
-}
-
 bool BitReader::ReadBitsInternal(int num_bits, uint64_t* out) {
   DCHECK_LE(num_bits, 64);
 
diff --git a/packager/media/base/bit_reader.h b/packager/media/base/bit_reader.h
index 65a494056b..bf00f9b560 100644
--- a/packager/media/base/bit_reader.h
+++ b/packager/media/base/bit_reader.h
@@ -49,7 +49,12 @@ class BitReader {
   bool SkipBits(int num_bits);
 
   /// @return The number of bits available for reading.
-  int bits_available() const;
+  int bits_available() const {
+    return 8 * bytes_left_ + num_remaining_bits_in_curr_byte_;
+  }
+
+  /// @return The current bit position.
+  int bit_position() const { return 8 * initial_size_ - bits_available(); }
 
  private:
   // Help function used by ReadBits to avoid inlining the bit reading logic.
@@ -63,6 +68,10 @@ class BitReader {
   // Pointer to the next unread (not in curr_byte_) byte in the stream.
   const uint8_t* data_;
 
+  // Initial size of the input data.
+  // TODO(kqyang): Use size_t instead of off_t instead.
+  off_t initial_size_;
+
   // Bytes left in the stream (without the curr_byte_).
   off_t bytes_left_;
 
diff --git a/packager/media/base/video_stream_info.h b/packager/media/base/video_stream_info.h
index 6786d5457f..91292e696d 100644
--- a/packager/media/base/video_stream_info.h
+++ b/packager/media/base/video_stream_info.h
@@ -67,6 +67,7 @@ class VideoStreamInfo : public StreamInfo {
   uint8_t nalu_length_size() const { return nalu_length_size_; }
   int16_t trick_play_rate() const { return trick_play_rate_; }
 
+  void set_codec(VideoCodec codec) { codec_ = codec; }
   void set_width(uint32_t width) { width_ = width; }
   void set_height(uint32_t height) { height_ = height; }
   void set_pixel_width(uint32_t pixel_width) { pixel_width_ = pixel_width; }
diff --git a/packager/media/filters/vp9_parser.cc b/packager/media/filters/vp9_parser.cc
index 5db7079bf7..f04d70618a 100644
--- a/packager/media/filters/vp9_parser.cc
+++ b/packager/media/filters/vp9_parser.cc
@@ -184,6 +184,10 @@ VPCodecConfiguration::ColorSpace GetColorSpace(uint8_t color_space) {
       return VPCodecConfiguration::COLOR_SPACE_BT_601;
     case VPX_COLOR_SPACE_BT_709:
       return VPCodecConfiguration::COLOR_SPACE_BT_709;
+    case VPX_COLOR_SPACE_SMPTE_170:
+      return VPCodecConfiguration::COLOR_SPACE_SMPTE_170;
+    case VPX_COLOR_SPACE_SMPTE_240:
+      return VPCodecConfiguration::COLOR_SPACE_SMPTE_240;
     case VPX_COLOR_SPACE_BT_2020:
       // VP9 does not specify if it is in the form of “constant luminance” or
       // “non-constant luminance”. As such, application should rely on the
@@ -368,15 +372,14 @@ bool ReadSegmentation(VP9BitReader* reader) {
   bool update_map;
   RCHECK(reader->ReadBits(1, &update_map));
   if (update_map) {
-    for (uint32_t i = 0; i < SEG_TREE_PROBS; ++i) {
+    for (uint32_t i = 0; i < SEG_TREE_PROBS; ++i)
       RCHECK(reader->SkipBitsConditional(8));
 
-      bool temporal_update;
-      RCHECK(reader->ReadBits(1, &temporal_update));
-      if (temporal_update) {
-        for (uint32_t j = 0; j < PREDICTION_PROBS; ++j)
-          RCHECK(reader->SkipBitsConditional(8));
-      }
+    bool temporal_update;
+    RCHECK(reader->ReadBits(1, &temporal_update));
+    if (temporal_update) {
+      for (uint32_t j = 0; j < PREDICTION_PROBS; ++j)
+        RCHECK(reader->SkipBitsConditional(8));
     }
   }
 
@@ -449,23 +452,23 @@ bool VP9Parser::Parse(const uint8_t* data,
       // End of current frame data. There should be no more bytes available.
       RCHECK(reader.bits_available() < 8);
 
-      vpx_frame.is_key_frame = false;
+      vpx_frame.is_keyframe = false;
       vpx_frame.uncompressed_header_size = vpx_frame.frame_size;
       vpx_frame.width = width_;
       vpx_frame.height = height_;
       continue;
     }
 
-    bool is_inter_frame;
-    RCHECK(reader.ReadBits(1, &is_inter_frame));
-    vpx_frame.is_key_frame = !is_inter_frame;
+    bool is_interframe;
+    RCHECK(reader.ReadBits(1, &is_interframe));
+    vpx_frame.is_keyframe = !is_interframe;
 
     bool show_frame;
     RCHECK(reader.ReadBits(1, &show_frame));
     bool error_resilient_mode;
     RCHECK(reader.ReadBits(1, &error_resilient_mode));
 
-    if (vpx_frame.is_key_frame) {
+    if (vpx_frame.is_keyframe) {
       RCHECK(ReadSyncCode(&reader));
       RCHECK(ReadBitDepthAndColorSpace(&reader, &codec_config_));
       RCHECK(ReadFrameSizes(&reader, &width_, &height_));
@@ -516,8 +519,7 @@ bool VP9Parser::Parse(const uint8_t* data,
     }
     RCHECK(reader.SkipBits(FRAME_CONTEXTS_LOG2));  // frame_context_idx
 
-    VLOG(4) << "bit offset: "
-            << vpx_frame.frame_size * 8 - reader.bits_available();
+    VLOG(4) << "Bits read before ReadLoopFilter: " << reader.bit_position();
     RCHECK(ReadLoopFilter(&reader));
     RCHECK(ReadQuantization(&reader));
     RCHECK(ReadSegmentation(&reader));
@@ -532,8 +534,7 @@ bool VP9Parser::Parse(const uint8_t* data,
 
     VLOG(3) << "\n frame_size: " << vpx_frame.frame_size
             << "\n header_size: " << vpx_frame.uncompressed_header_size
-            << "\n bits_read: "
-            << vpx_frame.frame_size * 8 - reader.bits_available()
+            << "\n Bits read: " << reader.bit_position()
             << "\n first_partition_size: " << first_partition_size;
 
     RCHECK(first_partition_size > 0);
@@ -544,5 +545,30 @@ bool VP9Parser::Parse(const uint8_t* data,
   return true;
 }
 
+bool VP9Parser::IsKeyframe(const uint8_t* data, size_t data_size) {
+  VP9BitReader reader(data, data_size);
+  uint8_t frame_marker;
+  RCHECK(reader.ReadBits(2, &frame_marker));
+  RCHECK(frame_marker == VP9_FRAME_MARKER);
+
+  VPCodecConfiguration codec_config;
+  RCHECK(ReadProfile(&reader, &codec_config));
+
+  bool show_existing_frame;
+  RCHECK(reader.ReadBits(1, &show_existing_frame));
+  if (show_existing_frame)
+    return false;
+
+  bool is_interframe;
+  RCHECK(reader.ReadBits(1, &is_interframe));
+  if (is_interframe)
+    return false;
+
+  RCHECK(reader.SkipBits(2));  // show_frame, error_resilient_mode.
+
+  RCHECK(ReadSyncCode(&reader));
+  return true;
+}
+
 }  // namespace media
 }  // namespace edash_packager
diff --git a/packager/media/filters/vp9_parser.h b/packager/media/filters/vp9_parser.h
index cd9e532d2b..effd7769b2 100644
--- a/packager/media/filters/vp9_parser.h
+++ b/packager/media/filters/vp9_parser.h
@@ -11,7 +11,6 @@
 #include <stdlib.h>
 
 #include "packager/base/macros.h"
-#include "packager/base/memory/scoped_ptr.h"
 #include "packager/media/filters/vp_codec_configuration.h"
 
 namespace edash_packager {
@@ -20,7 +19,7 @@ namespace media {
 struct VPxFrameInfo {
   size_t frame_size;
   size_t uncompressed_header_size;
-  bool is_key_frame;
+  bool is_keyframe;
   uint32_t width;
   uint32_t height;
 };
@@ -42,9 +41,16 @@ class VP9Parser {
              std::vector<VPxFrameInfo>* vpx_frames);
 
   /// @return VPx codec configuration extracted. Note that it is only valid
-  ///         after parsing a key frame or intra frame successfully.
+  ///         after parsing a keyframe or intra frame successfully.
   const VPCodecConfiguration& codec_config() { return codec_config_; }
 
+  /// A convenient utility function to check whether the frame is a keyframe.
+  /// Note that this function does not do a full parse of the frame header, so
+  /// should be more efficient than Parse().
+  /// @param data_size Size of the sample in bytes.
+  /// @return true if it is, false if it is not or if there is parsing error.
+  static bool IsKeyframe(const uint8_t* data, size_t data_size);
+
  private:
   // Keep track of the current width and height. Note that they may change from
   // frame to frame.
diff --git a/packager/media/filters/vp9_parser_unittest.cc b/packager/media/filters/vp9_parser_unittest.cc
index 4c5eef64c5..ae1607391e 100644
--- a/packager/media/filters/vp9_parser_unittest.cc
+++ b/packager/media/filters/vp9_parser_unittest.cc
@@ -17,16 +17,16 @@ namespace {
 MATCHER_P5(EqualVPxFrame,
            frame_size,
            uncompressed_header_size,
-           is_key_frame,
+           is_keyframe,
            width,
            height,
            "") {
   *result_listener << "which is (" << arg.frame_size << ", "
-                   << arg.uncompressed_header_size << ", " << arg.is_key_frame
+                   << arg.uncompressed_header_size << ", " << arg.is_keyframe
                    << ", " << arg.width << ", " << arg.height << ").";
   return arg.frame_size == frame_size &&
          arg.uncompressed_header_size == uncompressed_header_size &&
-         arg.is_key_frame == is_key_frame && arg.width == width &&
+         arg.is_keyframe == is_keyframe && arg.width == width &&
          arg.height == height;
 }
 }  // namespace
@@ -47,6 +47,8 @@ TEST(VP9ParserTest, Superframe) {
       0xc9, 0x3c, 0x00, 0x48, 0x00, 0xc9,
   };
 
+  EXPECT_FALSE(VP9Parser::IsKeyframe(data, arraysize(data)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(data, arraysize(data), &frames));
@@ -69,6 +71,8 @@ TEST(VP9ParserTest, KeyframeChroma420) {
       0x35, 0x7a, 0x88, 0x69, 0xf7, 0x1f, 0x26, 0x8b,
   };
 
+  EXPECT_TRUE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -89,6 +93,8 @@ TEST(VP9ParserTest, KeyframeProfile1Chroma422) {
       0xa0, 0x96, 0xa7, 0xb8, 0xf4, 0xb4, 0x65, 0xff,
   };
 
+  EXPECT_TRUE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -109,6 +115,8 @@ TEST(VP9ParserTest, KeyframeProfile2Chroma420) {
       0xa4, 0xdf, 0x05, 0xaf, 0x6f, 0xff, 0xd1, 0x74,
   };
 
+  EXPECT_TRUE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -119,7 +127,7 @@ TEST(VP9ParserTest, KeyframeProfile2Chroma420) {
 }
 
 TEST(VP9ParserTest, KeyframeProfile3Chroma444) {
-  uint8_t kData[] = {
+  const uint8_t kData[] = {
       0xb1, 0x24, 0xc1, 0xa1, 0x40, 0x00, 0x4f, 0x80, 0x2c, 0xa0, 0x41, 0xc1,
       0x20, 0xe0, 0xc3, 0xf0, 0x00, 0x09, 0x00, 0x7c, 0x57, 0x77, 0x3f, 0x67,
       0x99, 0x3e, 0x1f, 0xfb, 0xdf, 0x0f, 0x02, 0x0a, 0x37, 0x81, 0x53, 0x80,
@@ -129,6 +137,8 @@ TEST(VP9ParserTest, KeyframeProfile3Chroma444) {
       0xe1, 0xe6, 0xef, 0xff, 0xfd, 0xf7, 0x4f, 0x0f,
   };
 
+  EXPECT_TRUE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -146,8 +156,11 @@ TEST(VP9ParserTest, Intra) {
       0xe2, 0xbd, 0x53, 0xd9, 0x00, 0x3a, 0x70, 0xe0, 0x00, 0x78, 0xea, 0xa5,
       0x61, 0x08, 0xb7, 0x9f, 0x33, 0xe5, 0xf8, 0xa5, 0x82, 0x32, 0xbb, 0xa3,
       0x75, 0xb4, 0x60, 0xf3, 0x39, 0x75, 0x1f, 0x2b,
+
   };
 
+  EXPECT_FALSE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -159,6 +172,7 @@ TEST(VP9ParserTest, Intra) {
 
 TEST(VP9ParserTest, ShowExisting) {
   const uint8_t kData[] = {0x88};
+  EXPECT_FALSE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -177,6 +191,8 @@ TEST(VP9ParserTest, Interframe) {
       0x90, 0xeb, 0x8c, 0xad, 0x5f, 0x69, 0xb7, 0x9b,
   };
 
+  EXPECT_FALSE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_TRUE(parser.Parse(kData, arraysize(kData), &frames));
@@ -186,6 +202,7 @@ TEST(VP9ParserTest, Interframe) {
 
 TEST(VP9ParserTest, CorruptedFrameMarker) {
   const uint8_t kData[] = {0xc8};
+  EXPECT_FALSE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_FALSE(parser.Parse(kData, arraysize(kData), &frames));
@@ -202,6 +219,8 @@ TEST(VP9ParserTest, CorruptedSynccode) {
       0x35, 0x7a, 0x88, 0x69, 0xf7, 0x1f, 0x26, 0x8b,
   };
 
+  EXPECT_FALSE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   ASSERT_FALSE(parser.Parse(kData, arraysize(kData), &frames));
@@ -218,6 +237,10 @@ TEST(VP9ParserTest, NotEnoughBytesForFirstPartitionSize) {
       0x07, 0xf4, 0x7f, 0xc7, 0xff, 0x6d, 0xff, 0xeb,
   };
 
+  // IsKeyframe only parses the bytes that is necessary to determine whether it
+  // is a keyframe.
+  EXPECT_TRUE(VP9Parser::IsKeyframe(kData, arraysize(kData)));
+
   VP9Parser parser;
   std::vector<VPxFrameInfo> frames;
   EXPECT_FALSE(parser.Parse(kData, arraysize(kData), &frames));
diff --git a/packager/media/filters/vp_codec_configuration.h b/packager/media/filters/vp_codec_configuration.h
index 398c1bdd84..4fe04fd03e 100644
--- a/packager/media/filters/vp_codec_configuration.h
+++ b/packager/media/filters/vp_codec_configuration.h
@@ -24,9 +24,11 @@ class VPCodecConfiguration {
     COLOR_SPACE_UNSPECIFIED = 0,
     COLOR_SPACE_BT_601 = 1,
     COLOR_SPACE_BT_709 = 2,
-    COLOR_SPACE_BT_2020_NON_CONSTANT_LUMINANCE = 3,
-    COLOR_SPACE_BT_2020_CONSTANT_LUMINANCE = 4,
-    COLOR_SPACE_SRGB = 5,
+    COLOR_SPACE_SMPTE_170 = 3,
+    COLOR_SPACE_SMPTE_240 = 4,
+    COLOR_SPACE_BT_2020_NON_CONSTANT_LUMINANCE = 5,
+    COLOR_SPACE_BT_2020_CONSTANT_LUMINANCE = 6,
+    COLOR_SPACE_SRGB = 7,
   };
 
   enum ChromaSubsampling {
@@ -91,7 +93,9 @@ class VPCodecConfiguration {
   bool video_full_range_flag_;
   std::vector<uint8_t> codec_initialization_data_;
 
-  DISALLOW_COPY_AND_ASSIGN(VPCodecConfiguration);
+  // Not using DISALLOW_COPY_AND_ASSIGN here intentionally to allow the compiler
+  // generated copy constructor and assignment operator. Since the internal data
+  // is small, the performance impact is minimal.
 };
 
 }  // namespace media
diff --git a/packager/media/formats/mp4/encrypting_fragmenter.cc b/packager/media/formats/mp4/encrypting_fragmenter.cc
index 5ee7484fbd..8113c89a7f 100644
--- a/packager/media/formats/mp4/encrypting_fragmenter.cc
+++ b/packager/media/formats/mp4/encrypting_fragmenter.cc
@@ -10,6 +10,7 @@
 #include "packager/media/base/buffer_reader.h"
 #include "packager/media/base/key_source.h"
 #include "packager/media/base/media_sample.h"
+#include "packager/media/filters/vp9_parser.h"
 #include "packager/media/formats/mp4/box_definitions.h"
 #include "packager/media/formats/mp4/cenc.h"
 
@@ -26,15 +27,19 @@ EncryptingFragmenter::EncryptingFragmenter(
     TrackFragment* traf,
     scoped_ptr<EncryptionKey> encryption_key,
     int64_t clear_time,
+    VideoCodec video_codec,
     uint8_t nalu_length_size)
     : Fragmenter(traf),
       encryption_key_(encryption_key.Pass()),
+      video_codec_(video_codec),
       nalu_length_size_(nalu_length_size),
       clear_time_(clear_time) {
   DCHECK(encryption_key_);
+  if (video_codec == kCodecVP9)
+    vp9_parser_.reset(new VP9Parser);
 }
-EncryptingFragmenter::~EncryptingFragmenter() {}
 
+EncryptingFragmenter::~EncryptingFragmenter() {}
 
 Status EncryptingFragmenter::AddSample(scoped_refptr<MediaSample> sample) {
   DCHECK(sample);
@@ -134,30 +139,48 @@ Status EncryptingFragmenter::EncryptSample(scoped_refptr<MediaSample> sample) {
 
   FrameCENCInfo cenc_info(encryptor_->iv());
   uint8_t* data = sample->writable_data();
-  if (!IsSubsampleEncryptionRequired()) {
-    EncryptBytes(data, sample->data_size());
-  } else {
-    BufferReader reader(data, sample->data_size());
-    while (reader.HasBytes(1)) {
-      uint64_t nalu_length;
-      if (!reader.ReadNBytesInto8(&nalu_length, nalu_length_size_))
-        return Status(error::MUXER_FAILURE, "Fail to read nalu_length.");
-
-      SubsampleEntry subsample;
-      subsample.clear_bytes = nalu_length_size_ + 1;
-      subsample.cipher_bytes = nalu_length - 1;
-      if (!reader.SkipBytes(nalu_length)) {
-        return Status(error::MUXER_FAILURE,
-                      "Sample size does not match nalu_length.");
+  if (IsSubsampleEncryptionRequired()) {
+    if (video_codec_ == kCodecVP9) {
+      std::vector<VPxFrameInfo> vpx_frames;
+      if (!vp9_parser_->Parse(sample->data(), sample->data_size(),
+                              &vpx_frames)) {
+        return Status(error::MUXER_FAILURE, "Failed to parse vp9 frame.");
       }
+      for (const VPxFrameInfo& frame : vpx_frames) {
+        SubsampleEntry subsample;
+        subsample.clear_bytes = frame.uncompressed_header_size;
+        subsample.cipher_bytes =
+            frame.frame_size - frame.uncompressed_header_size;
+        cenc_info.AddSubsample(subsample);
+        if (subsample.cipher_bytes > 0)
+          EncryptBytes(data + subsample.clear_bytes, subsample.cipher_bytes);
+        data += frame.frame_size;
+      }
+    } else {
+      BufferReader reader(data, sample->data_size());
+      while (reader.HasBytes(1)) {
+        uint64_t nalu_length;
+        if (!reader.ReadNBytesInto8(&nalu_length, nalu_length_size_))
+          return Status(error::MUXER_FAILURE, "Fail to read nalu_length.");
 
-      EncryptBytes(data + subsample.clear_bytes, subsample.cipher_bytes);
-      cenc_info.AddSubsample(subsample);
-      data += nalu_length_size_ + nalu_length;
+        SubsampleEntry subsample;
+        subsample.clear_bytes = nalu_length_size_ + 1;
+        subsample.cipher_bytes = nalu_length - 1;
+        if (!reader.SkipBytes(nalu_length)) {
+          return Status(error::MUXER_FAILURE,
+                        "Sample size does not match nalu_length.");
+        }
+
+        EncryptBytes(data + subsample.clear_bytes, subsample.cipher_bytes);
+        cenc_info.AddSubsample(subsample);
+        data += nalu_length_size_ + nalu_length;
+      }
     }
 
     // The length of per-sample auxiliary datum, defined in CENC ch. 7.
     traf()->auxiliary_size.sample_info_sizes.push_back(cenc_info.ComputeSize());
+  } else {
+    EncryptBytes(data, sample->data_size());
   }
 
   cenc_info.Write(aux_data());
diff --git a/packager/media/formats/mp4/encrypting_fragmenter.h b/packager/media/formats/mp4/encrypting_fragmenter.h
index f0c772adff..75b4531e81 100644
--- a/packager/media/formats/mp4/encrypting_fragmenter.h
+++ b/packager/media/formats/mp4/encrypting_fragmenter.h
@@ -7,6 +7,8 @@
 #ifndef MEDIA_FORMATS_MP4_ENCRYPTING_FRAGMENTER_H_
 #define MEDIA_FORMATS_MP4_ENCRYPTING_FRAGMENTER_H_
 
+#include "packager/base/memory/scoped_ptr.h"
+#include "packager/media/filters/vp9_parser.h"
 #include "packager/media/formats/mp4/fragmenter.h"
 
 namespace edash_packager {
@@ -24,11 +26,15 @@ class EncryptingFragmenter : public Fragmenter {
   /// @param encryption_key contains the encryption parameters.
   /// @param clear_time specifies clear lead duration in units of the current
   ///        track's timescale.
+  /// @param video_codec specifies the codec if input is a video stream; it
+  ///        should be set to kUnknownVideoCodec for audio stream. This
+  ///        parameter is used for proper subsample encryption.
   /// @param nalu_length_size specifies the size of NAL unit length, in bytes,
   ///        for subsample encryption.
   EncryptingFragmenter(TrackFragment* traf,
                        scoped_ptr<EncryptionKey> encryption_key,
                        int64_t clear_time,
+                       VideoCodec video_codec,
                        uint8_t nalu_length_size);
 
   ~EncryptingFragmenter() override;
@@ -64,16 +70,23 @@ class EncryptingFragmenter : public Fragmenter {
   Status EncryptSample(scoped_refptr<MediaSample> sample);
 
   // Should we enable subsample encryption?
-  bool IsSubsampleEncryptionRequired() { return nalu_length_size_ != 0; }
+  bool IsSubsampleEncryptionRequired() {
+    return video_codec_ == kCodecVP9 || nalu_length_size_ != 0;
+  }
 
   scoped_ptr<EncryptionKey> encryption_key_;
   scoped_ptr<AesCtrEncryptor> encryptor_;
+  // For VP8/VP9, uncompressed_header should not be encrypted; for AVC/HEVC,
+  // the size and type NAL units should not be encrypted.
+  VideoCodec video_codec_;
   // If this stream contains AVC, subsample encryption specifies that the size
   // and type of NAL units remain unencrypted. This field specifies the size of
   // the size field. Can be 1, 2 or 4 bytes.
   const uint8_t nalu_length_size_;
   int64_t clear_time_;
 
+  scoped_ptr<VP9Parser> vp9_parser_;
+
   DISALLOW_COPY_AND_ASSIGN(EncryptingFragmenter);
 };
 
diff --git a/packager/media/formats/mp4/key_rotation_fragmenter.cc b/packager/media/formats/mp4/key_rotation_fragmenter.cc
index 3e4a389bcb..d8cbe135db 100644
--- a/packager/media/formats/mp4/key_rotation_fragmenter.cc
+++ b/packager/media/formats/mp4/key_rotation_fragmenter.cc
@@ -23,11 +23,13 @@ KeyRotationFragmenter::KeyRotationFragmenter(MovieFragment* moof,
                                              KeySource::TrackType track_type,
                                              int64_t crypto_period_duration,
                                              int64_t clear_time,
+                                             VideoCodec video_codec,
                                              uint8_t nalu_length_size,
                                              MuxerListener* muxer_listener)
     : EncryptingFragmenter(traf,
                            scoped_ptr<EncryptionKey>(new EncryptionKey()),
                            clear_time,
+                           video_codec,
                            nalu_length_size),
       moof_(moof),
       encryption_key_source_(encryption_key_source),
diff --git a/packager/media/formats/mp4/key_rotation_fragmenter.h b/packager/media/formats/mp4/key_rotation_fragmenter.h
index 06c68146f5..2ec49f7228 100644
--- a/packager/media/formats/mp4/key_rotation_fragmenter.h
+++ b/packager/media/formats/mp4/key_rotation_fragmenter.h
@@ -31,6 +31,9 @@ class KeyRotationFragmenter : public EncryptingFragmenter {
   ///        of the current track's timescale.
   /// @param clear_time specifies clear lead duration in units of the current
   ///        track's timescale.
+  /// @param video_codec specifies the codec if input is a video stream; it
+  ///        should be set to kUnknownVideoCodec for audio stream. This
+  ///        parameter is used for proper subsample encryption.
   /// @param nalu_length_size NAL unit length size, in bytes, for subsample
   ///        encryption.
   /// @param muxer_listener is a pointer to MuxerListener for notifying
@@ -41,6 +44,7 @@ class KeyRotationFragmenter : public EncryptingFragmenter {
                         KeySource::TrackType track_type,
                         int64_t crypto_period_duration,
                         int64_t clear_time,
+                        VideoCodec video_codec,
                         uint8_t nalu_length_size,
                         MuxerListener* muxer_listener);
   ~KeyRotationFragmenter() override;
diff --git a/packager/media/formats/mp4/segmenter.cc b/packager/media/formats/mp4/segmenter.cc
index e4cc16b76b..4be256b454 100644
--- a/packager/media/formats/mp4/segmenter.cc
+++ b/packager/media/formats/mp4/segmenter.cc
@@ -89,6 +89,14 @@ void GenerateEncryptedSampleEntry(const EncryptionKey& encryption_key,
   }
 }
 
+VideoCodec GetVideoCodec(const StreamInfo& stream_info) {
+  if (stream_info.stream_type() != kStreamVideo)
+    return kUnknownVideoCodec;
+  const VideoStreamInfo& video_stream_info =
+      static_cast<const VideoStreamInfo&>(stream_info);
+  return video_stream_info.codec();
+}
+
 uint8_t GetNaluLengthSize(const StreamInfo& stream_info) {
   if (stream_info.stream_type() != kStreamVideo)
     return 0;
@@ -160,6 +168,7 @@ Status Segmenter::Initialize(const std::vector<MediaStream*>& streams,
       continue;
     }
 
+    VideoCodec video_codec = GetVideoCodec(*streams[i]->info());
     uint8_t nalu_length_size = GetNaluLengthSize(*streams[i]->info());
     KeySource::TrackType track_type =
         GetTrackTypeForEncryption(*streams[i]->info(), max_sd_pixels);
@@ -182,14 +191,10 @@ Status Segmenter::Initialize(const std::vector<MediaStream*>& streams,
       }
 
       fragmenters_[i] = new KeyRotationFragmenter(
-          moof_.get(),
-          &moof_->tracks[i],
-          encryption_key_source,
-          track_type,
+          moof_.get(), &moof_->tracks[i], encryption_key_source, track_type,
           crypto_period_duration_in_seconds * streams[i]->info()->time_scale(),
-          clear_lead_in_seconds * streams[i]->info()->time_scale(),
-          nalu_length_size,
-          muxer_listener_);
+          clear_lead_in_seconds * streams[i]->info()->time_scale(), video_codec,
+          nalu_length_size, muxer_listener_);
       continue;
     }
 
@@ -217,9 +222,8 @@ Status Segmenter::Initialize(const std::vector<MediaStream*>& streams,
     }
 
     fragmenters_[i] = new EncryptingFragmenter(
-        &moof_->tracks[i],
-        encryption_key.Pass(),
-        clear_lead_in_seconds * streams[i]->info()->time_scale(),
+        &moof_->tracks[i], encryption_key.Pass(),
+        clear_lead_in_seconds * streams[i]->info()->time_scale(), video_codec,
         nalu_length_size);
   }
 
diff --git a/packager/media/formats/webm/webm_cluster_parser.cc b/packager/media/formats/webm/webm_cluster_parser.cc
index fdf8a93238..ba074432f9 100644
--- a/packager/media/formats/webm/webm_cluster_parser.cc
+++ b/packager/media/formats/webm/webm_cluster_parser.cc
@@ -10,6 +10,7 @@
 #include "packager/base/sys_byteorder.h"
 #include "packager/media/base/decrypt_config.h"
 #include "packager/media/base/timestamp.h"
+#include "packager/media/filters/vp9_parser.h"
 #include "packager/media/filters/webvtt_util.h"
 #include "packager/media/formats/webm/webm_constants.h"
 #include "packager/media/formats/webm/webm_crypto_helpers.h"
@@ -30,17 +31,11 @@
                 "may be suppressed): "                         \
               : "")
 
-namespace {
-const int64_t kMicrosecondsPerMillisecond = 1000;
-}  // namespace
-
 namespace edash_packager {
 namespace media {
+namespace {
 
-const uint16_t WebMClusterParser::kOpusFrameDurationsMu[] = {
-    10000, 20000, 40000, 60000, 10000, 20000, 40000, 60000, 10000, 20000, 40000,
-    60000, 10000, 20000, 10000, 20000, 2500,  5000,  10000, 20000, 2500,  5000,
-    10000, 20000, 2500,  5000,  10000, 20000, 2500,  5000,  10000, 20000};
+const int64_t kMicrosecondsPerMillisecond = 1000;
 
 enum {
   // Limits the number of LOG() calls in the path of reading encoded
@@ -51,27 +46,78 @@ enum {
   kMaxDurationEstimateLogs = 10,
 };
 
+// Helper function used to inspect block data to determine if the
+// block is a keyframe.
+// |data| contains the bytes in the block.
+// |size| indicates the number of bytes in |data|.
+bool IsKeyframe(bool is_video,
+                VideoCodec codec,
+                const uint8_t* data,
+                int size) {
+  // For now, assume that all blocks are keyframes for datatypes other than
+  // video. This is a valid assumption for Vorbis, WebVTT, & Opus.
+  if (!is_video)
+    return true;
+
+  if (codec == kCodecVP9)
+    return VP9Parser::IsKeyframe(data, size);
+
+  CHECK_EQ(kCodecVP8, codec);
+
+  // Make sure the block is big enough for the minimal keyframe header size.
+  if (size < 7)
+    return false;
+
+  // The LSb of the first byte must be a 0 for a keyframe.
+  // http://tools.ietf.org/html/rfc6386 Section 19.1
+  if ((data[0] & 0x01) != 0)
+    return false;
+
+  // Verify VP8 keyframe startcode.
+  // http://tools.ietf.org/html/rfc6386 Section 19.1
+  if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a)
+    return false;
+
+  return true;
+}
+
+}  // namespace
+
+const uint16_t WebMClusterParser::kOpusFrameDurationsMu[] = {
+    10000, 20000, 40000, 60000, 10000, 20000, 40000, 60000, 10000, 20000, 40000,
+    60000, 10000, 20000, 10000, 20000, 2500,  5000,  10000, 20000, 2500,  5000,
+    10000, 20000, 2500,  5000,  10000, 20000, 2500,  5000,  10000, 20000};
+
 WebMClusterParser::WebMClusterParser(
     int64_t timecode_scale,
-    int audio_track_num,
+    scoped_refptr<AudioStreamInfo> audio_stream_info,
+    scoped_refptr<VideoStreamInfo> video_stream_info,
     int64_t audio_default_duration,
-    int video_track_num,
     int64_t video_default_duration,
     const WebMTracksParser::TextTracks& text_tracks,
     const std::set<int64_t>& ignored_tracks,
     const std::string& audio_encryption_key_id,
     const std::string& video_encryption_key_id,
-    const AudioCodec audio_codec,
-    const MediaParser::NewSampleCB& new_sample_cb)
+    const MediaParser::NewSampleCB& new_sample_cb,
+    const MediaParser::InitCB& init_cb)
     : timecode_multiplier_(timecode_scale / 1000.0),
+      audio_stream_info_(audio_stream_info),
+      video_stream_info_(video_stream_info),
       ignored_tracks_(ignored_tracks),
       audio_encryption_key_id_(audio_encryption_key_id),
       video_encryption_key_id_(video_encryption_key_id),
-      audio_codec_(audio_codec),
       parser_(kWebMIdCluster, this),
+      initialized_(false),
+      init_cb_(init_cb),
       cluster_start_time_(kNoTimestamp),
-      audio_(audio_track_num, false, audio_default_duration, new_sample_cb),
-      video_(video_track_num, true, video_default_duration, new_sample_cb) {
+      audio_(audio_stream_info ? audio_stream_info->track_id() : -1,
+             false,
+             audio_default_duration,
+             new_sample_cb),
+      video_(video_stream_info ? video_stream_info->track_id() : -1,
+             true,
+             video_default_duration,
+             new_sample_cb) {
   for (WebMTracksParser::TextTracks::const_iterator it = text_tracks.begin();
        it != text_tracks.end();
        ++it) {
@@ -143,7 +189,8 @@ int64_t WebMClusterParser::TryGetEncodedAudioDuration(
   // TODO: Consider parsing "Signal Byte" for encrypted streams to return
   // duration for any unencrypted blocks.
 
-  if (audio_codec_ == kCodecOpus) {
+  DCHECK(audio_stream_info_);
+  if (audio_stream_info_->codec() == kCodecOpus) {
     return ReadOpusDuration(data, size);
   }
 
@@ -450,7 +497,12 @@ bool WebMClusterParser::OnBlock(bool is_simple_block,
     // necessary to determine whether it contains a keyframe or not.
     // http://www.matroska.org/technical/specs/index.html
     bool is_keyframe =
-        is_simple_block ? (flags & 0x80) != 0 : track->IsKeyframe(data, size);
+        is_simple_block
+            ? (flags & 0x80) != 0
+            : IsKeyframe(stream_type == kStreamVideo,
+                         video_stream_info_ ? video_stream_info_->codec()
+                                            : kUnknownVideoCodec,
+                         data, size);
 
     // Every encrypted Block has a signal byte and IV prepended to it. Current
     // encrypted WebM request for comments specification is here
@@ -531,6 +583,44 @@ bool WebMClusterParser::OnBlock(bool is_simple_block,
     buffer->set_duration(track->default_duration());
   }
 
+  if (!initialized_) {
+    std::vector<scoped_refptr<StreamInfo>> streams;
+    if (audio_stream_info_)
+      streams.push_back(audio_stream_info_);
+    if (video_stream_info_) {
+      if (stream_type == kStreamVideo) {
+        VPCodecConfiguration codec_config;
+        if (video_stream_info_->codec() == kCodecVP9) {
+          VP9Parser vp9_parser;
+          std::vector<VPxFrameInfo> vpx_frames;
+          if (!vp9_parser.Parse(buffer->data(), buffer->data_size(),
+                                &vpx_frames)) {
+            LOG(ERROR) << "Failed to parse vp9 frame.";
+            return false;
+          }
+          if (vpx_frames.size() != 1u || !vpx_frames[0].is_keyframe) {
+            LOG(ERROR) << "The first frame should be a key frame.";
+            return false;
+          }
+          codec_config = vp9_parser.codec_config();
+        }
+        // TODO(kqyang): Support VP8.
+
+        video_stream_info_->set_codec_string(
+            codec_config.GetCodecString(video_stream_info_->codec()));
+        std::vector<uint8_t> extra_data;
+        codec_config.Write(&extra_data);
+        video_stream_info_->set_extra_data(extra_data);
+        streams.push_back(video_stream_info_);
+        init_cb_.Run(streams);
+        initialized_ = true;
+      }
+    } else {
+      init_cb_.Run(streams);
+      initialized_ = true;
+    }
+  }
+
   return track->EmitBuffer(buffer);
 }
 
@@ -614,28 +704,6 @@ void WebMClusterParser::Track::Reset() {
   last_added_buffer_missing_duration_ = NULL;
 }
 
-bool WebMClusterParser::Track::IsKeyframe(const uint8_t* data, int size) const {
-  // For now, assume that all blocks are keyframes for datatypes other than
-  // video. This is a valid assumption for Vorbis, WebVTT, & Opus.
-  if (!is_video_)
-    return true;
-
-  // Make sure the block is big enough for the minimal keyframe header size.
-  if (size < 7)
-    return false;
-
-  // The LSb of the first byte must be a 0 for a keyframe.
-  // http://tools.ietf.org/html/rfc6386 Section 19.1
-  if ((data[0] & 0x01) != 0)
-    return false;
-
-  // Verify VP8 keyframe startcode.
-  // http://tools.ietf.org/html/rfc6386 Section 19.1
-  if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a)
-    return false;
-
-  return true;
-}
 
 bool WebMClusterParser::Track::EmitBufferHelp(
     const scoped_refptr<MediaSample>& buffer) {
diff --git a/packager/media/formats/webm/webm_cluster_parser.h b/packager/media/formats/webm/webm_cluster_parser.h
index 2f73b524f9..3bcb0aa63c 100644
--- a/packager/media/formats/webm/webm_cluster_parser.h
+++ b/packager/media/formats/webm/webm_cluster_parser.h
@@ -67,12 +67,6 @@ class WebMClusterParser : public WebMParserClient {
     // was missing duration.
     void Reset();
 
-    // Helper function used to inspect block data to determine if the
-    // block is a keyframe.
-    // |data| contains the bytes in the block.
-    // |size| indicates the number of bytes in |data|.
-    bool IsKeyframe(const uint8_t* data, int size) const;
-
     int64_t default_duration() const { return default_duration_; }
 
    private:
@@ -113,16 +107,16 @@ class WebMClusterParser : public WebMParserClient {
 
  public:
   WebMClusterParser(int64_t timecode_scale,
-                    int audio_track_num,
+                    scoped_refptr<AudioStreamInfo> audio_stream_info,
+                    scoped_refptr<VideoStreamInfo> video_stream_info,
                     int64_t audio_default_duration,
-                    int video_track_num,
                     int64_t video_default_duration,
                     const WebMTracksParser::TextTracks& text_tracks,
                     const std::set<int64_t>& ignored_tracks,
                     const std::string& audio_encryption_key_id,
                     const std::string& video_encryption_key_id,
-                    const AudioCodec audio_codec,
-                    const MediaParser::NewSampleCB& new_sample_cb);
+                    const MediaParser::NewSampleCB& new_sample_cb,
+                    const MediaParser::InitCB& init_cb);
   ~WebMClusterParser() override;
 
   /// Resets the parser state so it can accept a new cluster.
@@ -191,13 +185,20 @@ class WebMClusterParser : public WebMParserClient {
 
   double timecode_multiplier_;  // Multiplier used to convert timecodes into
                                 // microseconds.
+  scoped_refptr<AudioStreamInfo> audio_stream_info_;
+  scoped_refptr<VideoStreamInfo> video_stream_info_;
   std::set<int64_t> ignored_tracks_;
   std::string audio_encryption_key_id_;
   std::string video_encryption_key_id_;
-  const AudioCodec audio_codec_;
 
   WebMListParser parser_;
 
+  // Indicates whether init_cb has been executed. |init_cb| is executed when we
+  // have codec configuration of video stream, which is extracted from the first
+  // video sample.
+  bool initialized_;
+  MediaParser::InitCB init_cb_;
+
   int64_t last_block_timecode_ = -1;
   scoped_ptr<uint8_t[]> block_data_;
   int block_data_size_ = -1;
diff --git a/packager/media/formats/webm/webm_cluster_parser_unittest.cc b/packager/media/formats/webm/webm_cluster_parser_unittest.cc
index 83f91a186f..6f33b4716f 100644
--- a/packager/media/formats/webm/webm_cluster_parser_unittest.cc
+++ b/packager/media/formats/webm/webm_cluster_parser_unittest.cc
@@ -28,10 +28,6 @@ using ::testing::StrictMock;
 using ::testing::Mock;
 using ::testing::_;
 
-namespace {
-const int64_t kMicrosecondsPerMillisecond = 1000;
-}  // namespace
-
 namespace edash_packager {
 namespace media {
 
@@ -67,6 +63,7 @@ MATCHER_P2(WebMBlockDurationMismatchesOpusDuration,
 
 namespace {
 
+const int64_t kMicrosecondsPerMillisecond = 1000;
 // Timecode scale for millisecond timestamps.
 const int kTimecodeScale = 1000000;
 
@@ -76,6 +73,23 @@ const int kTextTrackNum = 3;
 const int kTestAudioFrameDefaultDurationInMs = 13;
 const int kTestVideoFrameDefaultDurationInMs = 17;
 
+// Constants for AudioStreamInfo and VideoStreamInfo. Most are not used.
+const uint32_t kTimeScale = 1000000u;
+const uint64_t kDuration = 10000000u;
+const char kCodecString[] = "codec_string";
+const char kLanguage[] = "eng";
+const uint8_t kBitsPerSample = 8u;
+const uint8_t kNumChannels = 2u;
+const uint32_t kSamplingFrequency = 48000u;
+const size_t kExtraDataSize = 0u;
+const bool kEncrypted = true;
+const uint16_t kWidth = 320u;
+const uint16_t kHeight = 180u;
+const uint32_t kPixelWidth = 1u;
+const uint32_t kPixelHeight = 1u;
+const int16_t kTrickPlayRate = 0u;
+const uint8_t kNaluLengthSize = 0u;
+
 // Test duration defaults must differ from parser estimation defaults to know
 // which durations parser used when emitting buffers.
 static_assert(
@@ -125,6 +139,16 @@ const uint8_t kEncryptedFrame[] = {
     0x01,
 };
 
+const uint8_t kVP9Frame[] = {
+    0xb1, 0x24, 0xc1, 0xa1, 0x40, 0x00, 0x4f, 0x80, 0x2c, 0xa0, 0x41, 0xc1,
+    0x20, 0xe0, 0xc3, 0xf0, 0x00, 0x09, 0x00, 0x7c, 0x57, 0x77, 0x3f, 0x67,
+    0x99, 0x3e, 0x1f, 0xfb, 0xdf, 0x0f, 0x02, 0x0a, 0x37, 0x81, 0x53, 0x80,
+    0x00, 0x7e, 0x6f, 0xfe, 0x74, 0x31, 0xc6, 0x4f, 0x23, 0x9d, 0x6e, 0x5f,
+    0xfc, 0xa8, 0xef, 0x67, 0xdc, 0xac, 0xf7, 0x3e, 0x31, 0x07, 0xab, 0xc7,
+    0x0c, 0x74, 0x48, 0x8b, 0x95, 0x30, 0xc9, 0xf0, 0x37, 0x3b, 0xe6, 0x11,
+    0xe1, 0xe6, 0xef, 0xff, 0xfd, 0xf7, 0x4f, 0x0f,
+};
+
 scoped_ptr<Cluster> CreateCluster(int timecode,
                                   const BlockInfo* block_info,
                                   int block_count) {
@@ -178,6 +202,14 @@ scoped_ptr<Cluster> CreateEncryptedCluster(int bytes_to_write) {
   return cb.Finish();
 }
 
+// Creates a Cluster with one vp9 frame (keyframe).
+scoped_ptr<Cluster> CreateVP9Cluster() {
+  ClusterBuilder cb;
+  cb.SetClusterTimecode(0);
+  cb.AddSimpleBlock(kVideoTrackNum, 0, 0, kVP9Frame, arraysize(kVP9Frame));
+  return cb.Finish();
+}
+
 bool VerifyBuffersHelper(const BufferQueue& audio_buffers,
                          const BufferQueue& video_buffers,
                          const BufferQueue& text_buffers,
@@ -268,7 +300,35 @@ void VerifyEncryptedBuffer(scoped_refptr<MediaSample> buffer) {
 
 class WebMClusterParserTest : public testing::Test {
  public:
-  WebMClusterParserTest() : parser_(CreateDefaultParser()) {}
+  WebMClusterParserTest()
+      : audio_stream_info_(new AudioStreamInfo(kAudioTrackNum,
+                                               kTimeScale,
+                                               kDuration,
+                                               kUnknownAudioCodec,
+                                               kCodecString,
+                                               kLanguage,
+                                               kBitsPerSample,
+                                               kNumChannels,
+                                               kSamplingFrequency,
+                                               NULL,
+                                               kExtraDataSize,
+                                               !kEncrypted)),
+        video_stream_info_(new VideoStreamInfo(kVideoTrackNum,
+                                               kTimeScale,
+                                               kDuration,
+                                               kCodecVP8,
+                                               kCodecString,
+                                               kLanguage,
+                                               kWidth,
+                                               kHeight,
+                                               kPixelWidth,
+                                               kPixelHeight,
+                                               kTrickPlayRate,
+                                               kNaluLengthSize,
+                                               NULL,
+                                               kExtraDataSize,
+                                               !kEncrypted)),
+        parser_(CreateDefaultParser()) {}
 
  protected:
   void ResetParserToHaveDefaultDurations() {
@@ -285,6 +345,10 @@ class WebMClusterParserTest : public testing::Test {
         default_audio_duration, default_video_duration));
   }
 
+  void InitEvent(const std::vector<scoped_refptr<StreamInfo>>& stream_info) {
+    streams_from_init_event_ = stream_info;
+  }
+
   bool NewSampleEvent(uint32_t track_id,
                       const scoped_refptr<MediaSample>& sample) {
     switch (track_id) {
@@ -313,20 +377,24 @@ class WebMClusterParserTest : public testing::Test {
       const std::set<int64_t>& ignored_tracks,
       const std::string& audio_encryption_key_id,
       const std::string& video_encryption_key_id,
-      const AudioCodec audio_codec) {
+      const AudioCodec audio_codec,
+      const VideoCodec video_codec) {
+    audio_stream_info_->set_codec(audio_codec);
+    video_stream_info_->set_codec(video_codec);
     return new WebMClusterParser(
-        kTimecodeScale, kAudioTrackNum, audio_default_duration, kVideoTrackNum,
-        video_default_duration, text_tracks, ignored_tracks,
-        audio_encryption_key_id, video_encryption_key_id, audio_codec,
+        kTimecodeScale, audio_stream_info_, video_stream_info_,
+        audio_default_duration, video_default_duration, text_tracks,
+        ignored_tracks, audio_encryption_key_id, video_encryption_key_id,
         base::Bind(&WebMClusterParserTest::NewSampleEvent,
-                   base::Unretained(this)));
+                   base::Unretained(this)),
+        base::Bind(&WebMClusterParserTest::InitEvent, base::Unretained(this)));
   }
 
   // Create a default version of the parser for test.
   WebMClusterParser* CreateDefaultParser() {
     return CreateParserHelper(kNoTimestamp, kNoTimestamp, TextTracks(),
                               std::set<int64_t>(), std::string(), std::string(),
-                              kUnknownAudioCodec);
+                              kUnknownAudioCodec, kCodecVP8);
   }
 
   // Create a parser for test with custom audio and video default durations, and
@@ -337,7 +405,7 @@ class WebMClusterParserTest : public testing::Test {
       const WebMTracksParser::TextTracks& text_tracks = TextTracks()) {
     return CreateParserHelper(audio_default_duration, video_default_duration,
                               text_tracks, std::set<int64_t>(), std::string(),
-                              std::string(), kUnknownAudioCodec);
+                              std::string(), kUnknownAudioCodec, kCodecVP8);
   }
 
   // Create a parser for test with custom ignored tracks.
@@ -345,7 +413,7 @@ class WebMClusterParserTest : public testing::Test {
       std::set<int64_t>& ignored_tracks) {
     return CreateParserHelper(kNoTimestamp, kNoTimestamp, TextTracks(),
                               ignored_tracks, std::string(), std::string(),
-                              kUnknownAudioCodec);
+                              kUnknownAudioCodec, kCodecVP8);
   }
 
   // Create a parser for test with custom encryption key ids and audio codec.
@@ -355,7 +423,14 @@ class WebMClusterParserTest : public testing::Test {
       const AudioCodec audio_codec) {
     return CreateParserHelper(kNoTimestamp, kNoTimestamp, TextTracks(),
                               std::set<int64_t>(), audio_encryption_key_id,
-                              video_encryption_key_id, audio_codec);
+                              video_encryption_key_id, audio_codec, kCodecVP8);
+  }
+
+  // Create a parser for test with custom video codec.
+  WebMClusterParser* CreateParserWithVideoCodec(const VideoCodec video_codec) {
+    return CreateParserHelper(kNoTimestamp, kNoTimestamp, TextTracks(),
+                              std::set<int64_t>(), std::string(), std::string(),
+                              kUnknownAudioCodec, video_codec);
   }
 
   bool VerifyBuffers(const BlockInfo* block_info, int block_count) {
@@ -368,7 +443,10 @@ class WebMClusterParserTest : public testing::Test {
     return result;
   }
 
+  scoped_refptr<AudioStreamInfo> audio_stream_info_;
+  scoped_refptr<VideoStreamInfo> video_stream_info_;
   scoped_ptr<WebMClusterParser> parser_;
+  std::vector<scoped_refptr<StreamInfo>> streams_from_init_event_;
   BufferQueue audio_buffers_;
   BufferQueue video_buffers_;
   TextBufferQueueMap text_buffers_map_;
@@ -485,6 +563,10 @@ TEST_F(WebMClusterParserTest, ParseClusterWithSingleCall) {
   int result = parser_->Parse(cluster->data(), cluster->size());
   EXPECT_EQ(cluster->size(), result);
   ASSERT_TRUE(VerifyBuffers(kDefaultBlockInfo, block_count));
+  // Verify init event called.
+  ASSERT_EQ(2u, streams_from_init_event_.size());
+  EXPECT_EQ(kStreamAudio, streams_from_init_event_[0]->stream_type());
+  EXPECT_EQ(kStreamVideo, streams_from_init_event_[1]->stream_type());
 }
 
 TEST_F(WebMClusterParserTest, ParseClusterWithMultipleCalls) {
@@ -698,6 +780,19 @@ TEST_F(WebMClusterParserTest, ParseMultipleTextTracks) {
   }
 }
 
+TEST_F(WebMClusterParserTest, ParseVP9) {
+  scoped_ptr<Cluster> cluster(CreateVP9Cluster());
+  parser_.reset(CreateParserWithVideoCodec(kCodecVP9));
+
+  EXPECT_EQ(cluster->size(), parser_->Parse(cluster->data(), cluster->size()));
+
+  ASSERT_EQ(2u, streams_from_init_event_.size());
+  EXPECT_EQ(kStreamAudio, streams_from_init_event_[0]->stream_type());
+  EXPECT_EQ(kStreamVideo, streams_from_init_event_[1]->stream_type());
+  EXPECT_EQ("vp09.03.00.12.00.03.00.00",
+            streams_from_init_event_[1]->codec_string());
+}
+
 TEST_F(WebMClusterParserTest, ParseEncryptedBlock) {
   scoped_ptr<Cluster> cluster(CreateEncryptedCluster(sizeof(kEncryptedFrame)));
 
@@ -728,6 +823,8 @@ TEST_F(WebMClusterParserTest, ParseInvalidZeroSizedCluster) {
   };
 
   EXPECT_EQ(-1, parser_->Parse(kBuffer, sizeof(kBuffer)));
+  // Verify init event not called.
+  ASSERT_EQ(0u, streams_from_init_event_.size());
 }
 
 TEST_F(WebMClusterParserTest, ParseInvalidUnknownButActuallyZeroSizedCluster) {
diff --git a/packager/media/formats/webm/webm_media_parser.cc b/packager/media/formats/webm/webm_media_parser.cc
index 63c2d6e7bd..14b2823cd8 100644
--- a/packager/media/formats/webm/webm_media_parser.cc
+++ b/packager/media/formats/webm/webm_media_parser.cc
@@ -181,37 +181,33 @@ int WebMMediaParser::ParseInfoAndTracks(const uint8_t* data, int size) {
   double timecode_scale_in_us = info_parser.timecode_scale() / 1000.0;
   int64_t duration_in_us = info_parser.duration() * timecode_scale_in_us;
 
-  std::vector<scoped_refptr<StreamInfo>> streams;
-  AudioCodec audio_codec = kCodecOpus;
-  if (tracks_parser.audio_stream_info()) {
-    streams.push_back(tracks_parser.audio_stream_info());
-    streams.back()->set_duration(duration_in_us);
-    if (streams.back()->is_encrypted())
+  scoped_refptr<AudioStreamInfo> audio_stream_info =
+      tracks_parser.audio_stream_info();
+  if (audio_stream_info) {
+    audio_stream_info->set_duration(duration_in_us);
+    if (audio_stream_info->is_encrypted())
       OnEncryptedMediaInitData(tracks_parser.audio_encryption_key_id());
-    audio_codec = tracks_parser.audio_stream_info()->codec();
   } else {
     VLOG(1) << "No audio track info found.";
   }
 
-  if (tracks_parser.video_stream_info()) {
-    streams.push_back(tracks_parser.video_stream_info());
-    streams.back()->set_duration(duration_in_us);
-    if (streams.back()->is_encrypted())
+  scoped_refptr<VideoStreamInfo> video_stream_info =
+      tracks_parser.video_stream_info();
+  if (video_stream_info) {
+    video_stream_info->set_duration(duration_in_us);
+    if (video_stream_info->is_encrypted())
       OnEncryptedMediaInitData(tracks_parser.video_encryption_key_id());
   } else {
     VLOG(1) << "No video track info found.";
   }
 
-  init_cb_.Run(streams);
-
   cluster_parser_.reset(new WebMClusterParser(
-      info_parser.timecode_scale(), tracks_parser.audio_track_num(),
+      info_parser.timecode_scale(), audio_stream_info, video_stream_info,
       tracks_parser.GetAudioDefaultDuration(timecode_scale_in_us),
-      tracks_parser.video_track_num(),
       tracks_parser.GetVideoDefaultDuration(timecode_scale_in_us),
       tracks_parser.text_tracks(), tracks_parser.ignored_tracks(),
       tracks_parser.audio_encryption_key_id(),
-      tracks_parser.video_encryption_key_id(), audio_codec, new_sample_cb_));
+      tracks_parser.video_encryption_key_id(), new_sample_cb_, init_cb_));
 
   return bytes_parsed;
 }
diff --git a/packager/media/formats/webm/webm_video_client.cc b/packager/media/formats/webm/webm_video_client.cc
index 373f8ae2f1..b99888cd6f 100644
--- a/packager/media/formats/webm/webm_video_client.cc
+++ b/packager/media/formats/webm/webm_video_client.cc
@@ -6,7 +6,6 @@
 
 #include "packager/base/logging.h"
 #include "packager/base/stl_util.h"
-#include "packager/media/filters/vp_codec_configuration.h"
 #include "packager/media/formats/webm/webm_constants.h"
 
 namespace {
@@ -106,25 +105,10 @@ scoped_refptr<VideoStreamInfo> WebMVideoClient::GetVideoStreamInfo(
   sar_x /= gcd;
   sar_y /= gcd;
 
-  // TODO(kqyang): Fill in the values for vp codec configuration.
-  const uint8_t profile = 0;
-  const uint8_t level = 0;
-  const uint8_t bit_depth = 8;
-  const uint8_t color_space = 0;
-  const uint8_t chroma_subsampling = 0;
-  const uint8_t transfer_function = 0;
-  const bool video_full_range_flag = false;
-  VPCodecConfiguration vp_config(profile, level, bit_depth, color_space,
-                                 chroma_subsampling, transfer_function,
-                                 video_full_range_flag, codec_private);
-  std::vector<uint8_t> extra_data;
-  vp_config.Write(&extra_data);
-
   return scoped_refptr<VideoStreamInfo>(new VideoStreamInfo(
-      track_num, kWebMTimeScale, 0, video_codec,
-      vp_config.GetCodecString(video_codec), std::string(), width_after_crop,
-      height_after_crop, sar_x, sar_y, 0, 0, vector_as_array(&extra_data),
-      extra_data.size(), is_encrypted));
+      track_num, kWebMTimeScale, 0, video_codec, std::string(), std::string(),
+      width_after_crop, height_after_crop, sar_x, sar_y, 0, 0, NULL, 0,
+      is_encrypted));
 }
 
 bool WebMVideoClient::OnUInt(int id, int64_t val) {