From c31ee338dca9068d45b3f368d9a54a6e8f59fcf6 Mon Sep 17 00:00:00 2001 From: Shivelight Date: Sat, 2 Dec 2023 19:00:55 +0800 Subject: [PATCH] Add option for automatic subtitle character encoding normalization (#68) * Add option for automatic subtitle character encoding normalization The rationale behind this function is that some services use ISO-8859-1 (latin1) or Windows-1252 (CP-1252) instead of UTF-8 encoding, whether intentionally or accidentally. Some services even stream subtitles with malformed/mixed encoding (each segment has a different encoding). * Remove Subtitle parameter `auto_fix_encoding` Just always attempt to fix encoding. If the subtitle is neither UTF-8 nor CP-1252, then it should realistically error out instead of producing garbage Subtitle data anyway. * Move Subtitle encoding fixing code out of if drm tree * Use chardet as a last ditch effort fixing Subs, or return original data * Move Subtitle.fix_encoding method to utilities as try_ensure_utf8 * Add Shivelight as a contributor --------- Co-authored-by: rlaphoenix --- README.md | 1 + devine/commands/dl.py | 7 ++++++- devine/core/manifests/dash.py | 8 ++++++-- devine/core/manifests/hls.py | 7 +++++-- devine/core/utilities.py | 27 +++++++++++++++++++++++++++ poetry.lock | 13 ++++++++++++- pyproject.toml | 1 + 7 files changed, 58 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index db4950b..ab1acb2 100644 --- a/README.md +++ b/README.md @@ -316,6 +316,7 @@ Please refrain from spam or asking for questions that infringe upon a Service's Arias800 varyg1001 Hollander-1908 +Shivelight ## Licensing diff --git a/devine/commands/dl.py b/devine/commands/dl.py index 80f4e3c..1224f50 100644 --- a/devine/commands/dl.py +++ b/devine/commands/dl.py @@ -52,7 +52,7 @@ from devine.core.services import Services from devine.core.titles import Movie, Song, Title_T from devine.core.titles.episode import Episode from devine.core.tracks import Audio, Subtitle, Tracks, Video -from 
devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since +from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since, try_ensure_utf8 from devine.core.utils.click_types import LANGUAGE_RANGE, QUALITY_LIST, SEASON_RANGE, ContextData from devine.core.utils.collections import merge_dict from devine.core.utils.subprocess import ffprobe @@ -922,6 +922,11 @@ class dl: track.OnDecrypted(track) progress(downloaded="Decrypted", completed=100) + if isinstance(track, Subtitle): + track_data = track.path.read_bytes() + track_data = try_ensure_utf8(track_data) + track.path.write_bytes(track_data) + progress(downloaded="Downloaded") except KeyboardInterrupt: self.DL_POOL_STOP.set() diff --git a/devine/core/manifests/dash.py b/devine/core/manifests/dash.py index 19abf38..3dffc8b 100644 --- a/devine/core/manifests/dash.py +++ b/devine/core/manifests/dash.py @@ -31,7 +31,7 @@ from devine.core.downloaders import downloader from devine.core.downloaders import requests as requests_downloader from devine.core.drm import Widevine from devine.core.tracks import Audio, Subtitle, Tracks, Video -from devine.core.utilities import is_close_match +from devine.core.utilities import is_close_match, try_ensure_utf8 from devine.core.utils.xml import load_xml @@ -471,7 +471,11 @@ class DASH: if init_data: f.write(init_data) for segment_file in sorted(save_dir.iterdir()): - f.write(segment_file.read_bytes()) + segment_data = segment_file.read_bytes() + # TODO: fix encoding after decryption? 
+ if not drm and isinstance(track, Subtitle): + segment_data = try_ensure_utf8(segment_data) + f.write(segment_data) segment_file.unlink() if drm: diff --git a/devine/core/manifests/hls.py b/devine/core/manifests/hls.py index 60094b2..bb2d00b 100644 --- a/devine/core/manifests/hls.py +++ b/devine/core/manifests/hls.py @@ -28,7 +28,7 @@ from devine.core.downloaders import downloader from devine.core.downloaders import requests as requests_downloader from devine.core.drm import DRM_T, ClearKey, Widevine from devine.core.tracks import Audio, Subtitle, Tracks, Video -from devine.core.utilities import is_close_match +from devine.core.utilities import is_close_match, try_ensure_utf8 class HLS: @@ -301,7 +301,10 @@ class HLS: with open(save_path, "wb") as f: for segment_file in sorted(save_dir.iterdir()): - f.write(segment_file.read_bytes()) + segment_data = segment_file.read_bytes() + if isinstance(track, Subtitle): + segment_data = try_ensure_utf8(segment_data) + f.write(segment_data) segment_file.unlink() progress(downloaded="Downloaded") diff --git a/devine/core/utilities.py b/devine/core/utilities.py index b810dd4..146a841 100644 --- a/devine/core/utilities.py +++ b/devine/core/utilities.py @@ -13,6 +13,7 @@ from types import ModuleType from typing import AsyncIterator, Optional, Sequence, Union from urllib.parse import urlparse +import chardet import pproxy import requests from construct import ValidationError @@ -215,6 +216,32 @@ def time_elapsed_since(start: float) -> str: return time_string +def try_ensure_utf8(data: bytes) -> bytes: + """ + Try to ensure that the given data is encoded in UTF-8. + + Parameters: + data: Input data that may or may not yet be UTF-8 or another encoding. + + Returns the input data encoded in UTF-8 if successful. If unable to detect the + encoding of the input data, then the original data is returned as-received. 
+ """ + try: + data.decode("utf8") + return data + except UnicodeDecodeError: + try: + # CP-1252 covers latin1's printable range, but bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are undefined in it, so this decode can still fail + return data.decode("cp1252").encode("utf8") + except UnicodeDecodeError: + try: + # last-ditch effort to detect the encoding; NOTE(review): chardet can report an encoding of None, and decode(None) raises TypeError, which the except below does not catch + detection_result = chardet.detect(data) + return data.decode(detection_result["encoding"]).encode("utf8") + except UnicodeDecodeError: + return data + + @contextlib.asynccontextmanager async def start_pproxy(proxy: str) -> AsyncIterator[str]: proxy = urlparse(proxy) diff --git a/poetry.lock b/poetry.lock index da42a10..bf47c21 100644 --- a/poetry.lock +++ b/poetry.lock @@ -294,6 +294,17 @@ files = [ {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, ] +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -1769,4 +1780,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.9.0,<3.12" -content-hash = "d19aedf3a21dff6327497d42b56dbff63cef4d2899fa886c52c058e5439fee87" +content-hash = "725552b13f9ba04c99b77a5cc96eef121abdd80eb5e67188981f6940fdc88015" diff --git a/pyproject.toml b/pyproject.toml index 574c072..44e35f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ sortedcontainers = "^2.4.0" subtitle-filter = "^1.4.6" Unidecode = "^1.3.6" urllib3 = "^2.0.4" +chardet = "^5.2.0" [tool.poetry.dev-dependencies] pre-commit = "^3.4.0"