forked from DRMTalks/devine
Improve readability of Subtitle.parse() method
This commit is contained in:
parent
c6c2e9ca51
commit
9683c34337
|
@ -145,48 +145,49 @@ class Subtitle(Track):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
|
def parse(data: bytes, codec: Subtitle.Codec) -> pycaption.CaptionSet:
|
||||||
# TODO: Use an "enum" for subtitle codecs
|
|
||||||
if not isinstance(data, bytes):
|
if not isinstance(data, bytes):
|
||||||
raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}")
|
raise ValueError(f"Subtitle data must be parsed as bytes data, not {type(data).__name__}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if codec == Subtitle.Codec.fTTML:
|
if codec == Subtitle.Codec.fTTML:
|
||||||
captions: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
||||||
for segment in (
|
for segment in (
|
||||||
Subtitle.parse(box.data, Subtitle.Codec.TimedTextMarkupLang)
|
Subtitle.parse(box.data, Subtitle.Codec.TimedTextMarkupLang)
|
||||||
for box in MP4.parse_stream(BytesIO(data))
|
for box in MP4.parse_stream(BytesIO(data))
|
||||||
if box.type == b"mdat"
|
if box.type == b"mdat"
|
||||||
):
|
):
|
||||||
for lang in segment.get_languages():
|
for lang in segment.get_languages():
|
||||||
captions[lang].extend(segment.get_captions(lang))
|
caption_lists[lang].extend(segment.get_captions(lang))
|
||||||
captions: pycaption.CaptionSet = pycaption.CaptionSet(captions)
|
caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
|
||||||
return captions
|
elif codec == Subtitle.Codec.TimedTextMarkupLang:
|
||||||
if codec == Subtitle.Codec.TimedTextMarkupLang:
|
text = data.decode("utf8")
|
||||||
text = data.decode("utf8").replace("tt:", "")
|
text = text.replace("tt:", "")
|
||||||
# negative size values aren't allowed in TTML/DFXP spec, replace with 0
|
# negative size values aren't allowed in TTML/DFXP spec, replace with 0
|
||||||
text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text)
|
text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text)
|
||||||
return pycaption.DFXPReader().read(text)
|
caption_set = pycaption.DFXPReader().read(text)
|
||||||
if codec == Subtitle.Codec.fVTT:
|
elif codec == Subtitle.Codec.fVTT:
|
||||||
caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
||||||
caption_list, language = Subtitle.merge_segmented_wvtt(data)
|
caption_list, language = Subtitle.merge_segmented_wvtt(data)
|
||||||
caption_lists[language] = caption_list
|
caption_lists[language] = caption_list
|
||||||
caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
|
caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
|
||||||
return caption_set
|
elif codec == Subtitle.Codec.WebVTT:
|
||||||
if codec == Subtitle.Codec.WebVTT:
|
text = data.decode("utf8")
|
||||||
# Segmented VTT when merged may have the WEBVTT headers part of the next caption
|
# Segmented VTT when merged may have the WEBVTT headers part of the next caption
|
||||||
# if they are not separated far enough from the previous caption, hence the \n\n
|
# if they are not separated far enough from the previous caption, hence the \n\n
|
||||||
text = data.decode("utf8"). \
|
text = text. \
|
||||||
replace("WEBVTT", "\n\nWEBVTT"). \
|
replace("WEBVTT", "\n\nWEBVTT"). \
|
||||||
replace("\r", ""). \
|
replace("\r", ""). \
|
||||||
replace("\n\n\n", "\n \n\n"). \
|
replace("\n\n\n", "\n \n\n"). \
|
||||||
replace("\n\n<", "\n<")
|
replace("\n\n<", "\n<")
|
||||||
captions: pycaption.CaptionSet = pycaption.WebVTTReader().read(text)
|
caption_set = pycaption.WebVTTReader().read(text)
|
||||||
return captions
|
else:
|
||||||
|
raise ValueError(f"Unknown Subtitle format \"{codec}\"...")
|
||||||
except pycaption.exceptions.CaptionReadSyntaxError as e:
|
except pycaption.exceptions.CaptionReadSyntaxError as e:
|
||||||
raise SyntaxError(f"A syntax error has occurred when reading the \"{codec}\" subtitle: {e}")
|
raise SyntaxError(f"A syntax error has occurred when reading the \"{codec}\" subtitle: {e}")
|
||||||
except pycaption.exceptions.CaptionReadNoCaptions:
|
except pycaption.exceptions.CaptionReadNoCaptions:
|
||||||
return pycaption.CaptionSet({"en": []})
|
return pycaption.CaptionSet({"en": []})
|
||||||
|
|
||||||
raise ValueError(f"Unknown Subtitle Format \"{codec}\"...")
|
return caption_set
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def merge_same_cues(caption_set: pycaption.CaptionSet):
|
def merge_same_cues(caption_set: pycaption.CaptionSet):
|
||||||
|
|
Loading…
Reference in New Issue