Improve readability of Subtitle.parse() method

This commit is contained in:
rlaphoenix 2024-01-12 00:27:19 +00:00
parent c6c2e9ca51
commit 9683c34337
1 changed file with 16 additions and 15 deletions

View File

@@ -145,48 +145,49 @@ class Subtitle(Track):
@staticmethod @staticmethod
def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet: def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
# TODO: Use an "enum" for subtitle codecs
if not isinstance(data, bytes): if not isinstance(data, bytes):
raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}") raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}")
try: try:
if codec == Subtitle.Codec.fTTML: if codec == Subtitle.Codec.fTTML:
captions: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList) caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
for segment in ( for segment in (
Subtitle.parse(box.data, Subtitle.Codec.TimedTextMarkupLang) Subtitle.parse(box.data, Subtitle.Codec.TimedTextMarkupLang)
for box in MP4.parse_stream(BytesIO(data)) for box in MP4.parse_stream(BytesIO(data))
if box.type == b"mdat" if box.type == b"mdat"
): ):
for lang in segment.get_languages(): for lang in segment.get_languages():
captions[lang].extend(segment.get_captions(lang)) caption_lists[lang].extend(segment.get_captions(lang))
captions: pycaption.CaptionSet = pycaption.CaptionSet(captions) caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
return captions elif codec == Subtitle.Codec.TimedTextMarkupLang:
if codec == Subtitle.Codec.TimedTextMarkupLang: text = data.decode("utf8")
text = data.decode("utf8").replace("tt:", "") text = text.replace("tt:", "")
# negative size values aren't allowed in TTML/DFXP spec, replace with 0 # negative size values aren't allowed in TTML/DFXP spec, replace with 0
text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text) text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text)
return pycaption.DFXPReader().read(text) caption_set = pycaption.DFXPReader().read(text)
if codec == Subtitle.Codec.fVTT: elif codec == Subtitle.Codec.fVTT:
caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList) caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
caption_list, language = Subtitle.merge_segmented_wvtt(data) caption_list, language = Subtitle.merge_segmented_wvtt(data)
caption_lists[language] = caption_list caption_lists[language] = caption_list
caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists) caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
return caption_set elif codec == Subtitle.Codec.WebVTT:
if codec == Subtitle.Codec.WebVTT: text = data.decode("utf8")
# Segmented VTT when merged may have the WEBVTT headers part of the next caption # Segmented VTT when merged may have the WEBVTT headers part of the next caption
# if they are not separated far enough from the previous caption, hence the \n\n # if they are not separated far enough from the previous caption, hence the \n\n
text = data.decode("utf8"). \ text = text. \
replace("WEBVTT", "\n\nWEBVTT"). \ replace("WEBVTT", "\n\nWEBVTT"). \
replace("\r", ""). \ replace("\r", ""). \
replace("\n\n\n", "\n \n\n"). \ replace("\n\n\n", "\n \n\n"). \
replace("\n\n<", "\n<") replace("\n\n<", "\n<")
captions: pycaption.CaptionSet = pycaption.WebVTTReader().read(text) caption_set = pycaption.WebVTTReader().read(text)
return captions else:
raise ValueError(f"Unknown Subtitle format \"{codec}\"...")
except pycaption.exceptions.CaptionReadSyntaxError as e: except pycaption.exceptions.CaptionReadSyntaxError as e:
raise SyntaxError(f"A syntax error has occurred when reading the \"{codec}\" subtitle: {e}") raise SyntaxError(f"A syntax error has occurred when reading the \"{codec}\" subtitle: {e}")
except pycaption.exceptions.CaptionReadNoCaptions: except pycaption.exceptions.CaptionReadNoCaptions:
return pycaption.CaptionSet({"en": []}) return pycaption.CaptionSet({"en": []})
raise ValueError(f"Unknown Subtitle Format \"{codec}\"...") return caption_set
@staticmethod @staticmethod
def merge_same_cues(caption_set: pycaption.CaptionSet): def merge_same_cues(caption_set: pycaption.CaptionSet):