From 14ebe4ee1b8c36cc13d7eb3b4bf7dc10e5afe697 Mon Sep 17 00:00:00 2001
From: rlaphoenix <rlaphoenix@pm.me>
Date: Fri, 12 Jan 2024 00:36:43 +0000
Subject: [PATCH] Ensure input is UTF-8 when parsing TTML and WebVTT Subtitles

This fixes some conversion errors when working with non-Latin languages like Russian (Cyrillic) and Arabic.
---
 devine/core/tracks/subtitle.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/devine/core/tracks/subtitle.py b/devine/core/tracks/subtitle.py
index 0e3fb0a..eac1dca 100644
--- a/devine/core/tracks/subtitle.py
+++ b/devine/core/tracks/subtitle.py
@@ -15,7 +15,7 @@ from pymp4.parser import MP4
 from subtitle_filter import Subtitles
 
 from devine.core.tracks.track import Track
-from devine.core.utilities import get_binary_path
+from devine.core.utilities import get_binary_path, try_ensure_utf8
 
 
 class Subtitle(Track):
@@ -160,7 +160,7 @@ class Subtitle(Track):
                         caption_lists[lang].extend(segment.get_captions(lang))
                 caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
             elif codec == Subtitle.Codec.TimedTextMarkupLang:
-                text = data.decode("utf8")
+                text = try_ensure_utf8(data).decode("utf8")
                 text = text.replace("tt:", "")
                 # negative size values aren't allowed in TTML/DFXP spec, replace with 0
                 text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text)
@@ -171,7 +171,7 @@ class Subtitle(Track):
                 caption_lists[language] = caption_list
                 caption_set: pycaption.CaptionSet = pycaption.CaptionSet(caption_lists)
             elif codec == Subtitle.Codec.WebVTT:
-                text = data.decode("utf8")
+                text = try_ensure_utf8(data).decode("utf8")
                 # Segmented VTT when merged may have the WEBVTT headers part of the next caption
                 # if they are not separated far enough from the previous caption, hence the \n\n
                 text = text. \