From 14ebe4ee1b8c36cc13d7eb3b4bf7dc10e5afe697 Mon Sep 17 00:00:00 2001 From: rlaphoenix Date: Fri, 12 Jan 2024 00:36:43 +0000 Subject: [PATCH] Ensure input is UTF-8 when parsing TTML and WebVTT Subtitles This fixes some conversion errors when working with non-Latin languages like Russian (Cyrillic) and Arabic. --- devine/core/tracks/subtitle.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devine/core/tracks/subtitle.py b/devine/core/tracks/subtitle.py index 0e3fb0a..eac1dca 100644 --- a/devine/core/tracks/subtitle.py +++ b/devine/core/tracks/subtitle.py @@ -15,7 +15,7 @@ from pymp4.parser import MP4 from subtitle_filter import Subtitles from devine.core.tracks.track import Track -from devine.core.utilities import get_binary_path +from devine.core.utilities import get_binary_path, try_ensure_utf8 class Subtitle(Track): @@ -160,7 +160,7 @@ class Subtitle(Track): caption_lists[lang].extend(segment.get_captions(lang)) caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists) elif codec == Subtitle.Codec.TimedTextMarkupLang: - text = data.decode("utf8") + text = try_ensure_utf8(data).decode("utf8") text = text.replace("tt:", "") # negative size values aren't allowed in TTML/DFXP spec, replace with 0 text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text) @@ -171,7 +171,7 @@ class Subtitle(Track): caption_lists[language] = caption_list caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists) elif codec == Subtitle.Codec.WebVTT: - text = data.decode("utf8") + text = try_ensure_utf8(data).decode("utf8") # Segmented VTT when merged may have the WEBVTT headers part of the next caption # if they are not separated far enough from the previous caption, hence the \n\n text = text. \