diff --git a/README.md b/README.md
index db4950b..ab1acb2 100644
--- a/README.md
+++ b/README.md
@@ -316,6 +316,7 @@ Please refrain from spam or asking for questions that infringe upon a Service's
+
## Licensing
diff --git a/devine/commands/dl.py b/devine/commands/dl.py
index 80f4e3c..1224f50 100644
--- a/devine/commands/dl.py
+++ b/devine/commands/dl.py
@@ -52,7 +52,7 @@ from devine.core.services import Services
from devine.core.titles import Movie, Song, Title_T
from devine.core.titles.episode import Episode
from devine.core.tracks import Audio, Subtitle, Tracks, Video
-from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since
+from devine.core.utilities import get_binary_path, is_close_match, time_elapsed_since, try_ensure_utf8
from devine.core.utils.click_types import LANGUAGE_RANGE, QUALITY_LIST, SEASON_RANGE, ContextData
from devine.core.utils.collections import merge_dict
from devine.core.utils.subprocess import ffprobe
@@ -922,6 +922,11 @@ class dl:
track.OnDecrypted(track)
progress(downloaded="Decrypted", completed=100)
+ if isinstance(track, Subtitle):
+ track_data = track.path.read_bytes()
+ track_data = try_ensure_utf8(track_data)
+ track.path.write_bytes(track_data)
+
progress(downloaded="Downloaded")
except KeyboardInterrupt:
self.DL_POOL_STOP.set()
diff --git a/devine/core/manifests/dash.py b/devine/core/manifests/dash.py
index 19abf38..3dffc8b 100644
--- a/devine/core/manifests/dash.py
+++ b/devine/core/manifests/dash.py
@@ -31,7 +31,7 @@ from devine.core.downloaders import downloader
from devine.core.downloaders import requests as requests_downloader
from devine.core.drm import Widevine
from devine.core.tracks import Audio, Subtitle, Tracks, Video
-from devine.core.utilities import is_close_match
+from devine.core.utilities import is_close_match, try_ensure_utf8
from devine.core.utils.xml import load_xml
@@ -471,7 +471,11 @@ class DASH:
if init_data:
f.write(init_data)
for segment_file in sorted(save_dir.iterdir()):
- f.write(segment_file.read_bytes())
+ segment_data = segment_file.read_bytes()
+ # TODO: fix encoding after decryption?
+ if not drm and isinstance(track, Subtitle):
+ segment_data = try_ensure_utf8(segment_data)
+ f.write(segment_data)
segment_file.unlink()
if drm:
diff --git a/devine/core/manifests/hls.py b/devine/core/manifests/hls.py
index 60094b2..bb2d00b 100644
--- a/devine/core/manifests/hls.py
+++ b/devine/core/manifests/hls.py
@@ -28,7 +28,7 @@ from devine.core.downloaders import downloader
from devine.core.downloaders import requests as requests_downloader
from devine.core.drm import DRM_T, ClearKey, Widevine
from devine.core.tracks import Audio, Subtitle, Tracks, Video
-from devine.core.utilities import is_close_match
+from devine.core.utilities import is_close_match, try_ensure_utf8
class HLS:
@@ -301,7 +301,10 @@ class HLS:
with open(save_path, "wb") as f:
for segment_file in sorted(save_dir.iterdir()):
- f.write(segment_file.read_bytes())
+ segment_data = segment_file.read_bytes()
+ if isinstance(track, Subtitle):
+ segment_data = try_ensure_utf8(segment_data)
+ f.write(segment_data)
segment_file.unlink()
progress(downloaded="Downloaded")
diff --git a/devine/core/utilities.py b/devine/core/utilities.py
index b810dd4..146a841 100644
--- a/devine/core/utilities.py
+++ b/devine/core/utilities.py
@@ -13,6 +13,7 @@ from types import ModuleType
from typing import AsyncIterator, Optional, Sequence, Union
from urllib.parse import urlparse
+import chardet
import pproxy
import requests
from construct import ValidationError
@@ -215,6 +216,32 @@ def time_elapsed_since(start: float) -> str:
return time_string
+def try_ensure_utf8(data: bytes) -> bytes:
+    """
+    Try to ensure that the given data is encoded in UTF-8.
+
+    Parameters:
+        data: Input data that may or may not yet be UTF-8 or another encoding.
+
+    Returns the input data encoded in UTF-8 if successful. If unable to detect or
+    decode the encoding of the input data, then the original data is returned as-received.
+    """
+    try:
+        data.decode("utf8")
+        return data
+    except UnicodeDecodeError:
+        try:
+            # try CP-1252 first; its codec rejects a few undefined bytes (e.g. 0x81)
+            return data.decode("cp1252").encode("utf8")
+        except UnicodeDecodeError:
+            try:
+                # last ditch effort; chardet reports encoding=None when it cannot guess
+                encoding = chardet.detect(data)["encoding"]
+                return data.decode(encoding).encode("utf8") if encoding else data
+            except (LookupError, UnicodeDecodeError):
+                return data
+
+
@contextlib.asynccontextmanager
async def start_pproxy(proxy: str) -> AsyncIterator[str]:
proxy = urlparse(proxy)
diff --git a/poetry.lock b/poetry.lock
index da42a10..bf47c21 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -294,6 +294,17 @@ files = [
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
]
+[[package]]
+name = "chardet"
+version = "5.2.0"
+description = "Universal encoding detector for Python 3"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"},
+ {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"},
+]
+
[[package]]
name = "charset-normalizer"
version = "3.2.0"
@@ -1769,4 +1780,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = ">=3.9.0,<3.12"
-content-hash = "d19aedf3a21dff6327497d42b56dbff63cef4d2899fa886c52c058e5439fee87"
+content-hash = "725552b13f9ba04c99b77a5cc96eef121abdd80eb5e67188981f6940fdc88015"
diff --git a/pyproject.toml b/pyproject.toml
index 574c072..44e35f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,7 @@ sortedcontainers = "^2.4.0"
subtitle-filter = "^1.4.6"
Unidecode = "^1.3.6"
urllib3 = "^2.0.4"
+chardet = "^5.2.0"
[tool.poetry.dev-dependencies]
pre-commit = "^3.4.0"