mirror of https://github.com/devine-dl/devine.git
Add option for automatic subtitle character encoding normalization (#68)
* Add option for automatic subtitle character encoding normalization

  The rationale behind this function is that some services use ISO-8859-1 (latin1) or Windows-1252 (CP-1252) instead of UTF-8 encoding, whether intentionally or accidentally. Some services even stream subtitles with malformed/mixed encoding (each segment has a different encoding).

* Remove Subtitle parameter `auto_fix_encoding`

  Just always attempt to fix encoding. If the subtitle is neither UTF-8 nor CP-1252, then it should realistically error out instead of producing garbage Subtitle data anyway.

* Move Subtitle encoding fixing code out of if drm tree
* Use chardet as a last-ditch effort fixing Subs, or return original data
* Move Subtitle.fix_encoding method to utilities as try_ensure_utf8
* Add Shivelight as a contributor

---------

Co-authored-by: rlaphoenix <rlaphoenix@pm.me>
This commit is contained in:
parent
4b8cfabaac
commit
c31ee338dc
|
@ -316,6 +316,7 @@ Please refrain from spam or asking for questions that infringe upon a Service's
|
||||||
<a href="https://github.com/Arias800"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/24809312?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Arias800"/></a>
|
<a href="https://github.com/Arias800"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/24809312?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Arias800"/></a>
|
||||||
<a href="https://github.com/varyg1001"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/88599103?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="varyg1001"/></a>
|
<a href="https://github.com/varyg1001"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/88599103?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="varyg1001"/></a>
|
||||||
<a href="https://github.com/Hollander-1908"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/93162595?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Hollander-1908"/></a>
|
<a href="https://github.com/Hollander-1908"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/93162595?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Hollander-1908"/></a>
|
||||||
|
<a href="https://github.com/Shivelight"><img src="https://images.weserv.nl/?url=avatars.githubusercontent.com/u/20620780?v=4&h=25&w=25&fit=cover&mask=circle&maxage=7d" alt="Shivelight"/></a>
|
||||||
|
|
||||||
## Licensing
|
## Licensing
|
||||||
|
|
||||||
|
|
|
@ -52,7 +52,7 @@ from devine.core.services import Services
|
||||||
from devine.core.titles import Movie, Song, Title_T
|
from devine.core.titles import Movie, Song, Title_T
|
||||||
from devine.core.titles.episode import Episode
|
from devine.core.titles.episode import Episode
|
||||||
from devine.core.tracks import Audio, Subtitle, Tracks, Video
|
from devine.core.tracks import Audio, Subtitle, Tracks, Video
|
||||||
from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since
|
from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since, try_ensure_utf8
|
||||||
from devine.core.utils.click_types import LANGUAGE_RANGE, QUALITY_LIST, SEASON_RANGE, ContextData
|
from devine.core.utils.click_types import LANGUAGE_RANGE, QUALITY_LIST, SEASON_RANGE, ContextData
|
||||||
from devine.core.utils.collections import merge_dict
|
from devine.core.utils.collections import merge_dict
|
||||||
from devine.core.utils.subprocess import ffprobe
|
from devine.core.utils.subprocess import ffprobe
|
||||||
|
@ -922,6 +922,11 @@ class dl:
|
||||||
track.OnDecrypted(track)
|
track.OnDecrypted(track)
|
||||||
progress(downloaded="Decrypted", completed=100)
|
progress(downloaded="Decrypted", completed=100)
|
||||||
|
|
||||||
|
if isinstance(track, Subtitle):
|
||||||
|
track_data = track.path.read_bytes()
|
||||||
|
track_data = try_ensure_utf8(track_data)
|
||||||
|
track.path.write_bytes(track_data)
|
||||||
|
|
||||||
progress(downloaded="Downloaded")
|
progress(downloaded="Downloaded")
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
self.DL_POOL_STOP.set()
|
self.DL_POOL_STOP.set()
|
||||||
|
|
|
@ -31,7 +31,7 @@ from devine.core.downloaders import downloader
|
||||||
from devine.core.downloaders import requests as requests_downloader
|
from devine.core.downloaders import requests as requests_downloader
|
||||||
from devine.core.drm import Widevine
|
from devine.core.drm import Widevine
|
||||||
from devine.core.tracks import Audio, Subtitle, Tracks, Video
|
from devine.core.tracks import Audio, Subtitle, Tracks, Video
|
||||||
from devine.core.utilities import is_close_match
|
from devine.core.utilities import is_close_match, try_ensure_utf8
|
||||||
from devine.core.utils.xml import load_xml
|
from devine.core.utils.xml import load_xml
|
||||||
|
|
||||||
|
|
||||||
|
@ -471,7 +471,11 @@ class DASH:
|
||||||
if init_data:
|
if init_data:
|
||||||
f.write(init_data)
|
f.write(init_data)
|
||||||
for segment_file in sorted(save_dir.iterdir()):
|
for segment_file in sorted(save_dir.iterdir()):
|
||||||
f.write(segment_file.read_bytes())
|
segment_data = segment_file.read_bytes()
|
||||||
|
# TODO: fix encoding after decryption?
|
||||||
|
if not drm and isinstance(track, Subtitle):
|
||||||
|
segment_data = try_ensure_utf8(segment_data)
|
||||||
|
f.write(segment_data)
|
||||||
segment_file.unlink()
|
segment_file.unlink()
|
||||||
|
|
||||||
if drm:
|
if drm:
|
||||||
|
|
|
@ -28,7 +28,7 @@ from devine.core.downloaders import downloader
|
||||||
from devine.core.downloaders import requests as requests_downloader
|
from devine.core.downloaders import requests as requests_downloader
|
||||||
from devine.core.drm import DRM_T, ClearKey, Widevine
|
from devine.core.drm import DRM_T, ClearKey, Widevine
|
||||||
from devine.core.tracks import Audio, Subtitle, Tracks, Video
|
from devine.core.tracks import Audio, Subtitle, Tracks, Video
|
||||||
from devine.core.utilities import is_close_match
|
from devine.core.utilities import is_close_match, try_ensure_utf8
|
||||||
|
|
||||||
|
|
||||||
class HLS:
|
class HLS:
|
||||||
|
@ -301,7 +301,10 @@ class HLS:
|
||||||
|
|
||||||
with open(save_path, "wb") as f:
|
with open(save_path, "wb") as f:
|
||||||
for segment_file in sorted(save_dir.iterdir()):
|
for segment_file in sorted(save_dir.iterdir()):
|
||||||
f.write(segment_file.read_bytes())
|
segment_data = segment_file.read_bytes()
|
||||||
|
if isinstance(track, Subtitle):
|
||||||
|
segment_data = try_ensure_utf8(segment_data)
|
||||||
|
f.write(segment_data)
|
||||||
segment_file.unlink()
|
segment_file.unlink()
|
||||||
|
|
||||||
progress(downloaded="Downloaded")
|
progress(downloaded="Downloaded")
|
||||||
|
|
|
@ -13,6 +13,7 @@ from types import ModuleType
|
||||||
from typing import AsyncIterator, Optional, Sequence, Union
|
from typing import AsyncIterator, Optional, Sequence, Union
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import chardet
|
||||||
import pproxy
|
import pproxy
|
||||||
import requests
|
import requests
|
||||||
from construct import ValidationError
|
from construct import ValidationError
|
||||||
|
@ -215,6 +216,32 @@ def time_elapsed_since(start: float) -> str:
|
||||||
return time_string
|
return time_string
|
||||||
|
|
||||||
|
|
||||||
|
def try_ensure_utf8(data: bytes) -> bytes:
    """
    Try to ensure that the given data is encoded in UTF-8.

    The following attempts are made in order, and the first one that succeeds wins:
      1. The data already decodes as UTF-8; it is returned untouched.
      2. The data decodes as CP-1252; it is re-encoded as UTF-8.
      3. chardet guesses an encoding that decodes the data; it is re-encoded as UTF-8.

    Parameters:
        data: Input data that may or may not yet be UTF-8 or another encoding.

    Returns the input data encoded in UTF-8 if successful. If unable to detect the
    encoding of the input data, then the original data is returned as-received.
    This function never raises for undetectable or undecodable input.
    """
    try:
        data.decode("utf8")
        return data
    except UnicodeDecodeError:
        pass

    try:
        # Python's cp1252 codec has no mapping for a handful of byte values, so this
        # can still fail with UnicodeDecodeError on data that isn't really CP-1252.
        return data.decode("cp1252").encode("utf8")
    except UnicodeDecodeError:
        pass

    # last ditch effort to detect encoding
    encoding = chardet.detect(data).get("encoding")
    if not encoding:
        # chardet reports None when it cannot make a guess; decode(None) would
        # raise TypeError, so bail out with the original data instead.
        return data

    try:
        return data.decode(encoding).encode("utf8")
    except (UnicodeDecodeError, LookupError):
        # LookupError: chardet named an encoding that Python has no codec for.
        return data
|
||||||
|
|
||||||
|
|
||||||
@contextlib.asynccontextmanager
|
@contextlib.asynccontextmanager
|
||||||
async def start_pproxy(proxy: str) -> AsyncIterator[str]:
|
async def start_pproxy(proxy: str) -> AsyncIterator[str]:
|
||||||
proxy = urlparse(proxy)
|
proxy = urlparse(proxy)
|
||||||
|
|
|
@ -294,6 +294,17 @@ files = [
|
||||||
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
|
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "chardet"
|
||||||
|
version = "5.2.0"
|
||||||
|
description = "Universal encoding detector for Python 3"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
|
||||||
|
{file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.2.0"
|
version = "3.2.0"
|
||||||
|
@ -1769,4 +1780,4 @@ multidict = ">=4.0"
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.9.0,<3.12"
|
python-versions = ">=3.9.0,<3.12"
|
||||||
content-hash = "d19aedf3a21dff6327497d42b56dbff63cef4d2899fa886c52c058e5439fee87"
|
content-hash = "725552b13f9ba04c99b77a5cc96eef121abdd80eb5e67188981f6940fdc88015"
|
||||||
|
|
|
@ -60,6 +60,7 @@ sortedcontainers = "^2.4.0"
|
||||||
subtitle-filter = "^1.4.6"
|
subtitle-filter = "^1.4.6"
|
||||||
Unidecode = "^1.3.6"
|
Unidecode = "^1.3.6"
|
||||||
urllib3 = "^2.0.4"
|
urllib3 = "^2.0.4"
|
||||||
|
chardet = "^5.2.0"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
pre-commit = "^3.4.0"
|
pre-commit = "^3.4.0"
|
||||||
|
|
Loading…
Reference in New Issue