From 167b45475e830e507d8f87fb193bb76d721b671c Mon Sep 17 00:00:00 2001 From: rlaphoenix Date: Mon, 5 Feb 2024 12:37:21 +0000 Subject: [PATCH] Only decode text direction entities in Sub files Previously, all entities were decoded in Subtitle files because of a problem with SubtitleEdit and its /ReverseRtlStartEnd option not being entity-aware. It actually ends up reversing the `;` of `&rlm;`, instead of the actual value of `&rlm;`. Therefore, I decoded all entities before SubtitleEdit could have processed the Subtitle, but this has caused problems with more advanced formats like TTML and WebVTT as `&lt;` would decode to `<` causing syntax errors, among other problematic characters. According to the TTML and WebVTT spec, html entity encoding is allowed, and that makes sense or you wouldn't be able to use `<` etc. Any failure for players to show the decoded character would be a player problem and be out of scope with Devine. --- devine/core/manifests/hls.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/devine/core/manifests/hls.py b/devine/core/manifests/hls.py index bf23ca1..803a8ed 100644 --- a/devine/core/manifests/hls.py +++ b/devine/core/manifests/hls.py @@ -316,7 +316,11 @@ class HLS: if isinstance(track, Subtitle): segment_data = try_ensure_utf8(segment_data) if track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML): - segment_data = html.unescape(segment_data.decode("utf8")).encode("utf8") + # decode text direction entities or SubtitleEdit's /ReverseRtlStartEnd won't work + segment_data = segment_data.decode("utf8"). \ + replace("&lrm;", html.unescape("&lrm;")). \ + replace("&rlm;", html.unescape("&rlm;")). \ + encode("utf8") f.write(segment_data) segment_file.unlink()