From 4b5a2c703b85bf03bdbc909b9d92103c5aa0923d Mon Sep 17 00:00:00 2001 From: rlaphoenix Date: Sat, 11 Feb 2023 22:17:43 +0000 Subject: [PATCH] Fix subtitle conversion error where WEBVTT header is kept This happened because the WEBVTT segments, each with its own header, were appended to each other without enough newline separation, so pycaption thought the header was an actual caption to be kept. --- devine/core/tracks/subtitle.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/devine/core/tracks/subtitle.py b/devine/core/tracks/subtitle.py index 3b6e520..c49c1b1 100644 --- a/devine/core/tracks/subtitle.py +++ b/devine/core/tracks/subtitle.py @@ -170,7 +170,13 @@ class Subtitle(Track): caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists) return caption_set if codec == Subtitle.Codec.WebVTT: - text = data.decode("utf8").replace("\r", "").replace("\n\n\n", "\n \n\n").replace("\n\n<", "\n<") + # Segmented VTT when merged may have the WEBVTT headers part of the next caption + # if they are not separated far enough from the previous caption, hence the \n\n + text = data.decode("utf8"). \ + replace("WEBVTT", "\n\nWEBVTT"). \ + replace("\r", ""). \ + replace("\n\n\n", "\n \n\n"). \ + replace("\n\n<", "\n<") captions: pycaption.CaptionSet = pycaption.WebVTTReader().read(text) return captions except pycaption.exceptions.CaptionReadSyntaxError: