Add style support for cue fragments.

Text cues are now composed of nested fragments that can be styled individually, which allows portions of a cue to be bold, italic, or underlined. The WebVTT parser does not yet parse style tags in its input, so any tags in the original text are preserved verbatim in the WebVTT output. In addition, the WebVTT output adds tags when style fields are set on the cue object.

Change-Id: I6abba4175e376e4f753193f7d8cac63e958d3c89
2020-08-26 13:47:14 -07:00 · 2020-08-26 13:47:14 -07:00 · 1f21cc78cd
parent f4c07b9ce0
commit 1f21cc78cd
7 changed files with 240 additions and 17 deletions
--- a/packager/media/base/media_handler_test_base.cc
+++ b/packager/media/base/media_handler_test_base.cc
@ -267,7 +267,7 @@ std::unique_ptr<TextSample> MediaHandlerTestBase::GetTextSample(
    int64_t end,
    const std::string& payload) const {
  return std::unique_ptr<TextSample>{
-      new TextSample(id, start, end, {}, TextFragment{payload})};
+      new TextSample(id, start, end, {}, TextFragment{{}, payload})};
 }

 std::unique_ptr<CueEvent> MediaHandlerTestBase::GetCueEvent(
--- a/packager/media/base/text_sample.cc
+++ b/packager/media/base/text_sample.cc
@ -6,13 +6,18 @@

 #include "packager/media/base/text_sample.h"

+#include <algorithm>
+#include <functional>
+
 #include "packager/base/logging.h"

 namespace shaka {
 namespace media {

 bool TextFragment::is_empty() const {
-  return body.empty();
+  return std::all_of(sub_fragments.begin(), sub_fragments.end(),
+                     std::mem_fn(&TextFragment::is_empty)) &&
+         body.empty();
 }

 TextSample::TextSample(const std::string& id,
--- a/packager/media/base/text_sample.h
+++ b/packager/media/base/text_sample.h
@ -73,9 +73,31 @@ struct TextSettings {
  TextAlignment text_alignment = TextAlignment::kCenter;
 };

+struct TextFragmentStyle {
+  base::Optional<bool> underline;
+  base::Optional<bool> bold;
+  base::Optional<bool> italic;
+};
+
+/// Represents a recursive structure of styled blocks of text.  At most one of
+/// sub_fragments, body, or newline will be set.
 struct TextFragment {
-  // TODO(modmaker): Fill with settings and sub-fragments.
+  TextFragment() {}
+  TextFragment(const TextFragmentStyle& style,
+               const std::vector<TextFragment>& sub_fragments)
+      : style(style), sub_fragments(sub_fragments) {}
+  TextFragment(const TextFragmentStyle& style, const char* body)
+      : style(style), body(body) {}
+  TextFragment(const TextFragmentStyle& style, const std::string& body)
+      : style(style), body(body) {}
+  TextFragment(const TextFragmentStyle& style, bool newline)
+      : style(style), newline(newline) {}
+
+  TextFragmentStyle style;
+
+  std::vector<TextFragment> sub_fragments;
  std::string body;
+  bool newline = false;

  bool is_empty() const;
 };
--- a/packager/media/formats/webvtt/webvtt_parser.cc
+++ b/packager/media/formats/webvtt/webvtt_parser.cc
@ -349,11 +349,12 @@ bool WebVttParser::ParseCue(const std::string& id,
  // The rest of the block is the payload.
  // TODO: Parse tags to support <b>, <i>, etc.
  TextFragment body;
+  TextFragmentStyle no_styles;
  for (size_t i = 1; i < block_size; i++) {
    if (i > 1) {
-      body.body += "\n";
+      body.sub_fragments.emplace_back(no_styles, /* newline= */ true);
    }
-    body.body += block[i];
+    body.sub_fragments.emplace_back(no_styles, block[i]);
  }

  const auto sample =
--- a/packager/media/formats/webvtt/webvtt_parser_unittest.cc
+++ b/packager/media/formats/webvtt/webvtt_parser_unittest.cc
@ -24,6 +24,27 @@ std::string ToString(const std::vector<uint8_t>& v) {
  return std::string(v.begin(), v.end());
 }

+void ExpectNoStyle(const TextFragmentStyle& style) {
+  EXPECT_FALSE(style.underline);
+  EXPECT_FALSE(style.bold);
+  EXPECT_FALSE(style.italic);
+}
+
+void ExpectPlainCueWithBody(const TextFragment& fragment,
+                            const std::string& expected) {
+  ExpectNoStyle(fragment.style);
+  ASSERT_TRUE(fragment.body.empty());
+  ASSERT_FALSE(fragment.newline);
+
+  if (expected.empty()) {
+    EXPECT_TRUE(fragment.sub_fragments.empty());
+  } else {
+    ASSERT_EQ(fragment.sub_fragments.size(), 1u);
+    ExpectNoStyle(fragment.sub_fragments[0].style);
+    EXPECT_EQ(fragment.sub_fragments[0].body, expected);
+  }
+}
+
 }  // namespace

 class WebVttParserTest : public testing::Test {
@ -177,7 +198,7 @@ TEST_F(WebVttParserTest, ParseOneCue) {
  EXPECT_EQ(samples_[0]->id(), kNoId);
  EXPECT_EQ(samples_[0]->start_time(), 60000u);
  EXPECT_EQ(samples_[0]->duration(), 3540000u);
-  EXPECT_EQ(samples_[0]->body().body, "subtitle");
+  ExpectPlainCueWithBody(samples_[0]->body(), "subtitle");

  // No settings
  const auto& settings = samples_[0]->settings();
@ -221,7 +242,7 @@ TEST_F(WebVttParserTest, ParseOneCueWithStyleAndRegion) {
  EXPECT_EQ(samples_[0]->id(), kNoId);
  EXPECT_EQ(samples_[0]->start_time(), 60000u);
  EXPECT_EQ(samples_[0]->duration(), 3540000u);
-  EXPECT_EQ(samples_[0]->body().body, "subtitle");
+  ExpectPlainCueWithBody(samples_[0]->body(), "subtitle");
 }

 TEST_F(WebVttParserTest, ParseOneEmptyCue) {
@ -238,7 +259,7 @@ TEST_F(WebVttParserTest, ParseOneEmptyCue) {

  ASSERT_EQ(streams_.size(), 1u);
  ASSERT_EQ(samples_.size(), 1u);
-  EXPECT_EQ(samples_[0]->body().body, "");
+  ExpectPlainCueWithBody(samples_[0]->body(), "");
 }

 TEST_F(WebVttParserTest, FailToParseCueWithArrowInId) {
@ -271,7 +292,7 @@ TEST_F(WebVttParserTest, ParseOneCueWithId) {
  ASSERT_EQ(streams_.size(), 1u);
  ASSERT_EQ(samples_.size(), 1u);
  EXPECT_EQ(samples_[0]->id(), "id");
-  EXPECT_EQ(samples_[0]->body().body, "subtitle");
+  ExpectPlainCueWithBody(samples_[0]->body(), "subtitle");
 }

 TEST_F(WebVttParserTest, ParseOneEmptyCueWithId) {
@ -290,7 +311,7 @@ TEST_F(WebVttParserTest, ParseOneEmptyCueWithId) {
  ASSERT_EQ(streams_.size(), 1u);
  ASSERT_EQ(samples_.size(), 1u);
  EXPECT_EQ(samples_[0]->id(), "id");
-  EXPECT_EQ(samples_[0]->body().body, "");
+  ExpectPlainCueWithBody(samples_[0]->body(), "");
 }

 TEST_F(WebVttParserTest, ParseOneCueWithSettings) {
@ -363,13 +384,13 @@ TEST_F(WebVttParserTest, ParseMultipleCues) {

  EXPECT_EQ(samples_[0]->start_time(), 1000u);
  EXPECT_EQ(samples_[0]->duration(), 4200u);
-  EXPECT_EQ(samples_[0]->body().body, "subtitle A");
+  ExpectPlainCueWithBody(samples_[0]->body(), "subtitle A");
  EXPECT_EQ(samples_[1]->start_time(), 2321u);
  EXPECT_EQ(samples_[1]->duration(), 4679u);
-  EXPECT_EQ(samples_[1]->body().body, "subtitle B");
+  ExpectPlainCueWithBody(samples_[1]->body(), "subtitle B");
  EXPECT_EQ(samples_[2]->start_time(), 5800u);
  EXPECT_EQ(samples_[2]->duration(), 2200u);
-  EXPECT_EQ(samples_[2]->body().body, "subtitle C");
+  ExpectPlainCueWithBody(samples_[2]->body(), "subtitle C");
 }

 // Verify that a typical case with multiple cues works even when comments are
@ -405,9 +426,9 @@ TEST_F(WebVttParserTest, ParseWithComments) {
  ASSERT_EQ(streams_.size(), 1u);
  ASSERT_EQ(samples_.size(), 3u);

-  EXPECT_EQ(samples_[0]->body().body, "subtitle A");
-  EXPECT_EQ(samples_[1]->body().body, "subtitle B");
-  EXPECT_EQ(samples_[2]->body().body, "subtitle C");
+  ExpectPlainCueWithBody(samples_[0]->body(), "subtitle A");
+  ExpectPlainCueWithBody(samples_[1]->body(), "subtitle B");
+  ExpectPlainCueWithBody(samples_[2]->body(), "subtitle C");
 }

 }  // namespace media
--- a/packager/media/formats/webvtt/webvtt_utils.cc
+++ b/packager/media/formats/webvtt/webvtt_utils.cc
@ -9,6 +9,8 @@
 #include <ctype.h>
 #include <inttypes.h>

+#include <regex>
+
 #include "packager/base/logging.h"
 #include "packager/base/strings/string_number_conversions.h"
 #include "packager/base/strings/stringprintf.h"
@ -33,6 +35,103 @@ bool GetTotalMilliseconds(uint64_t hours,
  *out = 60 * 60 * 1000 * hours + 60 * 1000 * minutes + 1000 * seconds + ms;
  return true;
 }
+
+enum class StyleTagKind {
+  kUnderline,
+  kBold,
+  kItalic,
+};
+
+std::string GetOpenTag(StyleTagKind tag) {
+  switch (tag) {
+    case StyleTagKind::kUnderline:
+      return "<u>";
+    case StyleTagKind::kBold:
+      return "<b>";
+    case StyleTagKind::kItalic:
+      return "<i>";
+  }
+  return "";  // Not reached, but Windows doesn't like NOTREACHED.
+}
+
+std::string GetCloseTag(StyleTagKind tag) {
+  switch (tag) {
+    case StyleTagKind::kUnderline:
+      return "</u>";
+    case StyleTagKind::kBold:
+      return "</b>";
+    case StyleTagKind::kItalic:
+      return "</i>";
+  }
+  return "";  // Not reached, but Windows doesn't like NOTREACHED.
+}
+
+std::string WriteFragment(const TextFragment& fragment,
+                          std::list<StyleTagKind>* tags) {
+  std::string ret;
+  size_t local_tag_count = 0;
+  auto has = [tags](StyleTagKind tag) {
+    return std::find(tags->begin(), tags->end(), tag) != tags->end();
+  };
+  auto push_tag = [tags, &local_tag_count, &has](StyleTagKind tag) {
+    if (has(tag)) {
+      return std::string();
+    }
+    tags->push_back(tag);
+    local_tag_count++;
+    return GetOpenTag(tag);
+  };
+
+  if ((fragment.style.underline == false && has(StyleTagKind::kUnderline)) ||
+      (fragment.style.bold == false && has(StyleTagKind::kBold)) ||
+      (fragment.style.italic == false && has(StyleTagKind::kItalic))) {
+    LOG(WARNING) << "WebVTT output doesn't support disabling "
+                    "underline/bold/italic within a cue";
+  }
+
+  if (fragment.newline) {
+    // Newlines represent separate WebVTT cues. So close the existing tags to
+    // be nice and re-open them on the new line.
+    for (auto it = tags->rbegin(); it != tags->rend(); it++) {
+      ret += GetCloseTag(*it);
+    }
+    ret += "\n";
+    for (const auto tag : *tags) {
+      ret += GetOpenTag(tag);
+    }
+  } else {
+    if (fragment.style.underline == true) {
+      ret += push_tag(StyleTagKind::kUnderline);
+    }
+    if (fragment.style.bold == true) {
+      ret += push_tag(StyleTagKind::kBold);
+    }
+    if (fragment.style.italic == true) {
+      ret += push_tag(StyleTagKind::kItalic);
+    }
+
+    if (!fragment.body.empty()) {
+      // Replace newlines and consecutive whitespace with a single space.  If
+      // the user wanted an explicit newline, they should use the "newline"
+      // field.
+      std::regex whitespace("\\s+", std::regex_constants::ECMAScript);
+      ret += std::regex_replace(fragment.body, whitespace, std::string(" "));
+    } else {
+      for (const auto& frag : fragment.sub_fragments) {
+        ret += WriteFragment(frag, tags);
+      }
+    }
+
+    // Pop all the local tags we pushed.
+    while (local_tag_count > 0) {
+      ret += GetCloseTag(tags->back());
+      tags->pop_back();
+      local_tag_count--;
+    }
+  }
+  return ret;
+}
+
 }  // namespace

 bool WebVttTimestampToMs(const base::StringPiece& source, uint64_t* out) {
@ -157,7 +256,8 @@ std::string WebVttSettingsToString(const TextSettings& settings) {
 }

 std::string WebVttFragmentToString(const TextFragment& fragment) {
-  return fragment.body;
+  std::list<StyleTagKind> tags;
+  return WriteFragment(fragment, &tags);
 }

 }  // namespace media
--- a/packager/media/formats/webvtt/webvtt_utils_unittest.cc
+++ b/packager/media/formats/webvtt/webvtt_utils_unittest.cc
@ -11,6 +11,24 @@
 namespace shaka {
 namespace media {

+namespace {
+
+const TextFragmentStyle kNoStyle{};
+
+TextFragmentStyle GetItalicStyle() {
+  TextFragmentStyle style;
+  style.italic = true;
+  return style;
+}
+
+TextFragmentStyle GetBoldStyle() {
+  TextFragmentStyle style;
+  style.bold = true;
+  return style;
+}
+
+}  // namespace
+
 TEST(WebVttTimestampTest, TooShort) {
  uint64_t ms;
  EXPECT_FALSE(WebVttTimestampToMs("00.000", &ms));
@ -150,5 +168,61 @@ TEST(WebVttUtilsTest, SettingsToString_IgnoresDefaults) {
  EXPECT_EQ(actual, "region:foo");
 }

+TEST(WebVttUtilsTest, FragmentToString) {
+  TextFragment frag(GetBoldStyle(), "Foobar");
+  EXPECT_EQ(WebVttFragmentToString(frag), "<b>Foobar</b>");
+}
+
+TEST(WebVttUtilsTest, FragmentToString_PreservesTags) {
+  TextFragment frag(kNoStyle, "<i>Foobar</i>");
+  EXPECT_EQ(WebVttFragmentToString(frag), "<i>Foobar</i>");
+}
+
+TEST(WebVttUtilsTest, FragmentToString_HandlesNestedFragments) {
+  TextFragment frag;
+  frag.sub_fragments.emplace_back(kNoStyle, "Hello ");
+  frag.sub_fragments.emplace_back(kNoStyle, "World");
+  EXPECT_EQ(WebVttFragmentToString(frag), "Hello World");
+}
+
+TEST(WebVttUtilsTest, FragmentToString_HandlesNestedFragmentsWithStyle) {
+  TextFragment frag;
+  frag.style.bold = true;
+  frag.sub_fragments.emplace_back(GetItalicStyle(), "Hello");
+  frag.sub_fragments.emplace_back(kNoStyle, " World");
+  EXPECT_EQ(WebVttFragmentToString(frag), "<b><i>Hello</i> World</b>");
+}
+
+TEST(WebVttUtilsTest, FragmentToString_HandlesNewlines) {
+  TextFragment frag;
+  frag.sub_fragments.emplace_back(kNoStyle, "Hello");
+  frag.sub_fragments.emplace_back(kNoStyle, true);
+  frag.sub_fragments.emplace_back(kNoStyle, "World");
+  EXPECT_EQ(WebVttFragmentToString(frag), "Hello\nWorld");
+}
+
+TEST(WebVttUtilsTest, FragmentToString_HandlesNewlinesWithStyle) {
+  TextFragment frag;
+  frag.style.bold = true;
+  frag.sub_fragments.emplace_back(kNoStyle, "Hello");
+  frag.sub_fragments.emplace_back(kNoStyle, true);
+  frag.sub_fragments.emplace_back(kNoStyle, "World");
+  EXPECT_EQ(WebVttFragmentToString(frag), "<b>Hello</b>\n<b>World</b>");
+}
+
+TEST(WebVttUtilsTest, FragmentToString_HandlesNestedNewlinesWithStyle) {
+  TextFragment nested;
+  nested.sub_fragments.emplace_back(kNoStyle, "Hello");
+  nested.sub_fragments.emplace_back(kNoStyle, true);
+  nested.sub_fragments.emplace_back(kNoStyle, "World");
+
+  TextFragment frag;
+  frag.style.bold = true;
+  frag.sub_fragments.emplace_back(nested);
+  frag.sub_fragments.emplace_back(kNoStyle, " Now");
+
+  EXPECT_EQ(WebVttFragmentToString(frag), "<b>Hello</b>\n<b>World Now</b>");
+}
+
 }  // namespace media
 }  // namespace shaka