Add convert() method to Subtitle class

This commit is contained in:
rlaphoenix 2024-01-12 00:50:27 +00:00
parent f4d8bc8dd0
commit e76bc7201d
1 changed files with 71 additions and 0 deletions

View File

@ -5,6 +5,7 @@ import subprocess
from collections import defaultdict from collections import defaultdict
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path
from typing import Any, Iterable, Optional from typing import Any, Iterable, Optional
import pycaption import pycaption
@ -143,6 +144,76 @@ class Subtitle(Track):
track_name += flag track_name += flag
return track_name or None return track_name or None
def convert(self, codec: Subtitle.Codec) -> Path:
"""
Convert this Subtitle to another Format.
The file path location of the Subtitle data will be kept at the same
location but the file extension will be changed appropriately.
Supported formats:
- SubRip - SubtitleEdit or pycaption.SRTWriter
- TimedTextMarkupLang - SubtitleEdit or pycaption.DFXPWriter
- WebVTT - SubtitleEdit or pycaption.WebVTTWriter
- SubStationAlphav4 - SubtitleEdit
- fTTML* - custom code using some pycaption functions
- fVTT* - custom code using some pycaption functions
*: Can read from format, but cannot convert to format
Note: It currently prioritizes using SubtitleEdit over PyCaption as
I have personally noticed more oddities with PyCaption parsing over
SubtitleEdit. Especially when working with TTML/DFXP where it would
often have timecodes and stuff mixed in/duplicated.
Returns the new file path of the Subtitle.
"""
if not self.path or not self.path.exists():
raise ValueError("You must download the subtitle track first.")
if self.codec == codec:
return self.path
output_path = self.path.with_suffix(f".{codec.value.lower()}")
sub_edit_executable = get_binary_path("SubtitleEdit")
if sub_edit_executable and self.codec not in (Subtitle.Codec.fTTML, Subtitle.Codec.fVTT):
sub_edit_format = {
Subtitle.Codec.SubStationAlphav4: "AdvancedSubStationAlpha",
Subtitle.Codec.TimedTextMarkupLang: "TimedText1.0"
}.get(codec, codec.name)
subprocess.run(
[
sub_edit_executable,
"/Convert", self.path, sub_edit_format,
f"/outputfilename:{output_path.name}",
f"/outputfolder:{output_path.parent}",
"/encoding:utf8"
],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL
)
else:
writer = {
# pycaption generally only supports these subtitle formats
Subtitle.Codec.SubRip: pycaption.SRTWriter,
Subtitle.Codec.TimedTextMarkupLang: pycaption.DFXPWriter,
Subtitle.Codec.WebVTT: pycaption.WebVTTWriter,
}.get(codec)
if writer is None:
raise NotImplementedError(f"Cannot yet convert {self.codec.name} to {codec.name}.")
caption_set = self.parse(self.path.read_bytes(), self.codec)
Subtitle.merge_same_cues(caption_set)
subtitle_text = writer().write(caption_set)
output_path.write_text(subtitle_text, encoding="utf8")
self.swap(output_path)
self.codec = codec
return output_path
@staticmethod @staticmethod
def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet: def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
if not isinstance(data, bytes): if not isinstance(data, bytes):