From 2056e056a4bd2850d5b1b90ea3b8c0e5a9cfacb7 Mon Sep 17 00:00:00 2001 From: rlaphoenix Date: Thu, 18 Jan 2024 16:23:31 +0000 Subject: [PATCH] Unescape HTML Entities in Subtitles after Downloading This fixes some Subtitles having e.g., `&amp;` instead of just `&`, but especially for special entities like `&rlm;` which enables Right-to-Left mode on Hebrew and Arabic Subtitles. --- devine/commands/dl.py | 1 + devine/core/manifests/dash.py | 2 ++ devine/core/manifests/hls.py | 3 +++ 3 files changed, 6 insertions(+) diff --git a/devine/commands/dl.py b/devine/commands/dl.py index 216bfd5..6654297 100644 --- a/devine/commands/dl.py +++ b/devine/commands/dl.py @@ -913,6 +913,7 @@ class dl: if isinstance(track, Subtitle): track_data = track.path.read_bytes() track_data = try_ensure_utf8(track_data) + track_data = html.unescape(track_data.decode("utf8")).encode("utf8") track.path.write_bytes(track_data) progress(downloaded="Downloaded") diff --git a/devine/core/manifests/dash.py b/devine/core/manifests/dash.py index adc2f01..718f93e 100644 --- a/devine/core/manifests/dash.py +++ b/devine/core/manifests/dash.py @@ -1,6 +1,7 @@ from __future__ import annotations import base64 +import html import logging import math import re @@ -473,6 +474,7 @@ class DASH: track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML) ): segment_data = try_ensure_utf8(segment_data) + segment_data = html.unescape(segment_data.decode("utf8")).encode("utf8") f.write(segment_data) segment_file.unlink() diff --git a/devine/core/manifests/hls.py b/devine/core/manifests/hls.py index 6ba9b2c..bf23ca1 100644 --- a/devine/core/manifests/hls.py +++ b/devine/core/manifests/hls.py @@ -1,5 +1,6 @@ from __future__ import annotations +import html import logging import re import sys @@ -314,6 +315,8 @@ class HLS: segment_data = segment_file.read_bytes() if isinstance(track, Subtitle): segment_data = try_ensure_utf8(segment_data) + if track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML): + segment_data = 
html.unescape(segment_data.decode("utf8")).encode("utf8") f.write(segment_data) segment_file.unlink()