Add option for automatic subtitle character encoding normalization (#68)

* Add option for automatic subtitle character encoding normalization

The rationale behind this function is that some services use ISO-8859-1
(latin1) or Windows-1252 (CP-1252) instead of UTF-8 encoding, whether
intentionally or accidentally. Some services even stream subtitles with
malformed/mixed encoding (each segment has a different encoding).

* Remove Subtitle parameter `auto_fix_encoding`

Just always attempt to fix encoding. If the subtitle is neither UTF-8 nor CP-1252, then it should realistically error out instead of producing garbage Subtitle data anyway.

* Move Subtitle encoding fixing code out of if drm tree

* Use chardet as a last ditch effort fixing Subs, or return original data

* Move Subtitle.fix_encoding method to utilities as try_ensure_utf8

* Add Shivelight as a contributor

---------

Co-authored-by: rlaphoenix <rlaphoenix@pm.me>
This commit is contained in:
Shivelight 2023-12-02 19:00:55 +08:00 committed by GitHub
parent 4b8cfabaac
commit c31ee338dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 58 additions and 6 deletions

View File

@ -316,6 +316,7 @@ Please refrain from spam or asking for questions that infringe upon a Service's
<a href="https://github.com/Arias800"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/24809312?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Arias800"/></a> <a href="https://github.com/Arias800"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/24809312?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Arias800"/></a>
<a href="https://github.com/varyg1001"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/88599103?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="varyg1001"/></a> <a href="https://github.com/varyg1001"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/88599103?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="varyg1001"/></a>
<a href="https://github.com/Hollander-1908"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/93162595?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Hollander-1908"/></a> <a href="https://github.com/Hollander-1908"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/93162595?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Hollander-1908"/></a>
<a href="https://github.com/Shivelight"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/20620780?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Shivelight"/></a>
## Licensing ## Licensing

View File

@ -52,7 +52,7 @@ from devine.core.services import Services
from devine.core.titles import Movie, Song, Title_T from devine.core.titles import Movie, Song, Title_T
from devine.core.titles.episode import Episode from devine.core.titles.episode import Episode
from devine.core.tracks import Audio, Subtitle, Tracks, Video from devine.core.tracks import Audio, Subtitle, Tracks, Video
from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since, try_ensure_utf8
from devine.core.utils.click_types import LANGUAGE_RANGE, QUALITY_LIST, SEASON_RANGE, ContextData from devine.core.utils.click_types import LANGUAGE_RANGE, QUALITY_LIST, SEASON_RANGE, ContextData
from devine.core.utils.collections import merge_dict from devine.core.utils.collections import merge_dict
from devine.core.utils.subprocess import ffprobe from devine.core.utils.subprocess import ffprobe
@ -922,6 +922,11 @@ class dl:
track.OnDecrypted(track) track.OnDecrypted(track)
progress(downloaded="Decrypted", completed=100) progress(downloaded="Decrypted", completed=100)
if isinstance(track, Subtitle):
track_data = track.path.read_bytes()
track_data = try_ensure_utf8(track_data)
track.path.write_bytes(track_data)
progress(downloaded="Downloaded") progress(downloaded="Downloaded")
except KeyboardInterrupt: except KeyboardInterrupt:
self.DL_POOL_STOP.set() self.DL_POOL_STOP.set()

View File

@ -31,7 +31,7 @@ from devine.core.downloaders import downloader
from devine.core.downloaders import requests as requests_downloader from devine.core.downloaders import requests as requests_downloader
from devine.core.drm import Widevine from devine.core.drm import Widevine
from devine.core.tracks import Audio, Subtitle, Tracks, Video from devine.core.tracks import Audio, Subtitle, Tracks, Video
from devine.core.utilities import is_close_match from devine.core.utilities import is_close_match, try_ensure_utf8
from devine.core.utils.xml import load_xml from devine.core.utils.xml import load_xml
@ -471,7 +471,11 @@ class DASH:
if init_data: if init_data:
f.write(init_data) f.write(init_data)
for segment_file in sorted(save_dir.iterdir()): for segment_file in sorted(save_dir.iterdir()):
f.write(segment_file.read_bytes()) segment_data = segment_file.read_bytes()
# TODO: fix encoding after decryption?
if not drm and isinstance(track, Subtitle):
segment_data = try_ensure_utf8(segment_data)
f.write(segment_data)
segment_file.unlink() segment_file.unlink()
if drm: if drm:

View File

@ -28,7 +28,7 @@ from devine.core.downloaders import downloader
from devine.core.downloaders import requests as requests_downloader from devine.core.downloaders import requests as requests_downloader
from devine.core.drm import DRM_T, ClearKey, Widevine from devine.core.drm import DRM_T, ClearKey, Widevine
from devine.core.tracks import Audio, Subtitle, Tracks, Video from devine.core.tracks import Audio, Subtitle, Tracks, Video
from devine.core.utilities import is_close_match from devine.core.utilities import is_close_match, try_ensure_utf8
class HLS: class HLS:
@ -301,7 +301,10 @@ class HLS:
with open(save_path, "wb") as f: with open(save_path, "wb") as f:
for segment_file in sorted(save_dir.iterdir()): for segment_file in sorted(save_dir.iterdir()):
f.write(segment_file.read_bytes()) segment_data = segment_file.read_bytes()
if isinstance(track, Subtitle):
segment_data = try_ensure_utf8(segment_data)
f.write(segment_data)
segment_file.unlink() segment_file.unlink()
progress(downloaded="Downloaded") progress(downloaded="Downloaded")

View File

@ -13,6 +13,7 @@ from types import ModuleType
from typing import AsyncIterator, Optional, Sequence, Union from typing import AsyncIterator, Optional, Sequence, Union
from urllib.parse import urlparse from urllib.parse import urlparse
import chardet
import pproxy import pproxy
import requests import requests
from construct import ValidationError from construct import ValidationError
@ -215,6 +216,32 @@ def time_elapsed_since(start: float) -> str:
return time_string return time_string
def try_ensure_utf8(data: bytes) -> bytes:
    """
    Try to ensure that the given data is encoded in UTF-8.

    Parameters:
        data: Input data that may or may not yet be UTF-8 or another encoding.

    Returns the input data encoded in UTF-8 if successful. If unable to detect the
    encoding of the input data, then the original data is returned as-received.
    """
    try:
        data.decode("utf8")
        return data
    except UnicodeDecodeError:
        try:
            # CP-1252 is a superset of latin1
            return data.decode("cp1252").encode("utf8")
        except UnicodeDecodeError:
            try:
                # last ditch effort to detect encoding
                detection_result = chardet.detect(data)
                if not detection_result["encoding"]:
                    # chardet could not detect any encoding; don't pass None to
                    # decode() (that would raise TypeError, not UnicodeDecodeError)
                    return data
                return data.decode(detection_result["encoding"]).encode("utf8")
            except (UnicodeDecodeError, LookupError):
                # LookupError: chardet named a codec Python doesn't recognize
                return data
@contextlib.asynccontextmanager @contextlib.asynccontextmanager
async def start_pproxy(proxy: str) -> AsyncIterator[str]: async def start_pproxy(proxy: str) -> AsyncIterator[str]:
proxy = urlparse(proxy) proxy = urlparse(proxy)

13
poetry.lock generated
View File

@ -294,6 +294,17 @@ files = [
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
] ]
[[package]]
name = "chardet"
version = "5.2.0"
description = "Universal encoding detector for Python 3"
optional = false
python-versions = ">=3.7"
files = [
{file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
{file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
]
[[package]] [[package]]
name = "charset-normalizer" name = "charset-normalizer"
version = "3.2.0" version = "3.2.0"
@ -1769,4 +1780,4 @@ multidict = ">=4.0"
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.9.0,<3.12" python-versions = ">=3.9.0,<3.12"
content-hash = "d19aedf3a21dff6327497d42b56dbff63cef4d2899fa886c52c058e5439fee87" content-hash = "725552b13f9ba04c99b77a5cc96eef121abdd80eb5e67188981f6940fdc88015"

View File

@ -60,6 +60,7 @@ sortedcontainers = "^2.4.0"
subtitle-filter = "^1.4.6" subtitle-filter = "^1.4.6"
Unidecode = "^1.3.6" Unidecode = "^1.3.6"
urllib3 = "^2.0.4" urllib3 = "^2.0.4"
chardet = "^5.2.0"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]
pre-commit = "^3.4.0" pre-commit = "^3.4.0"