Unescape HTML Entities in Subtitles after Downloading

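This fixes subtitles that contain escaped HTML entities, e.g., `&amp;` instead of just `&`. It matters most for special entities like `&rlm;` (the right-to-left mark), which must be unescaped into the actual control character to enable Right-to-Left mode on Hebrew and Arabic subtitles.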
This commit is contained in:
rlaphoenix 2024-01-18 16:23:31 +00:00
parent 26d067915f
commit 2056e056a4
3 changed files with 6 additions and 0 deletions

View File

@ -913,6 +913,7 @@ class dl:
if isinstance(track, Subtitle): if isinstance(track, Subtitle):
track_data = track.path.read_bytes() track_data = track.path.read_bytes()
track_data = try_ensure_utf8(track_data) track_data = try_ensure_utf8(track_data)
track_data = html.unescape(track_data.decode("utf8")).encode("utf8")
track.path.write_bytes(track_data) track.path.write_bytes(track_data)
progress(downloaded="Downloaded") progress(downloaded="Downloaded")

View File

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import base64 import base64
import html
import logging import logging
import math import math
import re import re
@ -473,6 +474,7 @@ class DASH:
track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML) track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML)
): ):
segment_data = try_ensure_utf8(segment_data) segment_data = try_ensure_utf8(segment_data)
segment_data = html.unescape(segment_data.decode("utf8")).encode("utf8")
f.write(segment_data) f.write(segment_data)
segment_file.unlink() segment_file.unlink()

View File

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import html
import logging import logging
import re import re
import sys import sys
@ -314,6 +315,8 @@ class HLS:
segment_data = segment_file.read_bytes() segment_data = segment_file.read_bytes()
if isinstance(track, Subtitle): if isinstance(track, Subtitle):
segment_data = try_ensure_utf8(segment_data) segment_data = try_ensure_utf8(segment_data)
if track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML):
segment_data = html.unescape(segment_data.decode("utf8")).encode("utf8")
f.write(segment_data) f.write(segment_data)
segment_file.unlink() segment_file.unlink()