# encoding: utf-8 # # -------------------------------------------- # based on https://github.com/yuppity/ttml2srt # -------------------------------------------- # SPDX-License-Identifier: LGPL-2.1-or-later from __future__ import unicode_literals, absolute_import, division import re import io import os.path import json from collections import OrderedDict from copy import deepcopy import tempfile try: from defusedxml import minidom # type: ignore except: from xml.dom import minidom class TimestampConverter(object): def __init__(self, frame_rate=23.976, tick_rate=1): self.tick_rate = tick_rate self.frame_rate = frame_rate def timeexpr_to_ms(self, *args): return self._timeexpr_to_ms(*args) def _timeexpr_to_ms(self, time_expr): """Use the given time expression to get a matching conversion method to overwrite self.timeexpr_to_ms() with. """ self.timeexpr_to_ms = self.determine_ms_convfn(time_expr) return self.timeexpr_to_ms(time_expr) def _hhmmss_to_ms(self, hh, mm, ss): return hh * 3600 * 1000 + mm * 60 * 1000 + ss * 1000 def subrip_to_ms(self, timestamp): """Desconstruct SubRip timecode down to milliseconds """ hh, mm, ss, ms = re.split(r'[:,]', timestamp) return int(int(hh) * 3.6e6 + int(mm) * 60000 + int(ss) * 1000 + int(ms)) def _metric_to_ms(self, metric_multiplier, metric_value): return int(metric_multiplier * metric_value) def _ms_to_hhmmssms(self, ms): hh = int(ms / 3.6e6) mm = int((ms % 3.6e6) / 60000) ss = int((ms % 60000) / 1000) ms = int(ms % 1000) return hh, mm, ss, ms def ms_to_subrip(self, ms): """Build SubRip timecode from milliseconds """ hh, mm, ss, ms = self._ms_to_hhmmssms(ms) return '{:02d}:{:02d}:{:02d},{:03d}'.format(hh, mm, ss, ms) def ms_to_ssa(self, ms): """Build SSA/ASS timecode from milliseconds """ hh, mm, ss, ms = self._ms_to_hhmmssms(ms) return '{:01d}:{:02d}:{:02d}.{:02d}'.format(hh, mm, ss, int(ms / 10)) def frames_to_ms(self, frames): """Convert frame count to ms """ return int(int(frames) * (1000 / self.frame_rate)) def offset_frames_to_ms(self, time): """Convert offset-time expression with f metric to milliseconds. """ frames = float(time[:-1]) return int(int(frames) * (1000 / self.frame_rate)) def offset_ticks_to_ms(self, time): """Convert offset-time expression with t metric to milliseconds. """ ticks = int(time[:-1]) seconds = 1.0 / self.tick_rate return (seconds * ticks) * 1000 def offset_hours_to_ms(self, time): """Convert offset-time expression with h metric to milliseconds. """ hours = float(time[:-1]) return self._metric_to_ms(3.6e6, hours) def offset_minutes_to_ms(self, time): """Convert offset-time expression with m metric to milliseconds. """ return self._metric_to_ms(60 * 1000, float(time[:-1])) def offset_seconds_to_ms(self, time): """Convert offset-time expression with s metric to milliseconds. """ seconds = float(time[:-1]) return self._metric_to_ms(1000, seconds) def offset_ms_to_ms(self, time): """Convert offset-time expression with ms metric to milliseconds. """ ms = int(time[:-2]) return ms def fraction_timestamp_to_ms(self, timestamp): """Convert hh:mm:ss.fraction to milliseconds """ hh, mm, ss, fraction = re.split(r'[:.]', timestamp) hh, mm, ss = [int(i) for i in (hh, mm, ss)] # Resolution beyond ms is useless for our purposes ms = int(fraction[:3]) return self._hhmmss_to_ms(hh, mm, ss) + ms def frame_timestamp_to_ms(self, timestamp): """Convert hh:mm:ss:frames to milliseconds Will handle hh:mm:ss:frames.sub-frames by discarding the sub-frame part """ hh, mm, ss, frames = [int(i) for i in timestamp.split('.')[0].split(':')] hhmmss_ms = self._hhmmss_to_ms(hh, mm, ss) ms = self.frames_to_ms(frames) return hhmmss_ms + ms def determine_ms_convfn(self, time_expr): """Determine approriate ms conversion fn to pass the time expression to. Args: time_exrp (str): TTML time expression Return: Conversion method (callable) Strips the time expression of digits and uses the resulting string as a key to a dict of conversion methods. """ # Map time expression delimiters to conversion methods. Saves # us from having to exec multibranch code on each line but assumes all # time expressions to be of the same form. time_expr_fns = { # clock-time, no frames or fraction # Example(s): "00:02:23" '::': self.frame_timestamp_to_ms, # clock-time, frames # Example(s): "00:02:23:12", "00:02:23:12.222" ':::': self.frame_timestamp_to_ms, ':::.': self.frame_timestamp_to_ms, # clock-time, fraction # Example(s): "00:02:23.283" '::.': self.fraction_timestamp_to_ms, # offset-time, hour metric # Example(s): "1h", "1.232837372637h" 'h': self.offset_hours_to_ms, '.h': self.offset_hours_to_ms, # offset-time, minute metric # Example(s): "1m", "13.72986323m" 'm': self.offset_minutes_to_ms, '.m': self.offset_minutes_to_ms, # offset-time, second metric # Example(s): "1s", "113.2312312s" 's': self.offset_seconds_to_ms, '.s': self.offset_seconds_to_ms, # offset-time, millisecond metric # Example(s): "1ms", "1000.1231231231223ms" 'ms': self.offset_ms_to_ms, '.ms': self.offset_ms_to_ms, # offset-time, frame metric # Example(s): "100f" 'f': self.offset_frames_to_ms, '.f': self.offset_frames_to_ms, # offset-time, tick metric # Example(s): "19298323t" 't': self.offset_ticks_to_ms, '.t': self.offset_ticks_to_ms, } try: delims = ''.join([i for i in time_expr if not i.isdigit()]) return time_expr_fns[delims] except KeyError: raise NotImplementedError( 'Unknown timestamp format ("{}")'.format(time_expr)) class Ttml2Ssa(object): VERSION = '0.3.8' TIME_BASES = [ 'media', 'smpte', ] SCALE = { 'NTSC2PAL' : 23.976/25, 'PAL2NTSC' : 25/23.976, 'NTSC2FILM' : 23.976/24, 'PAL2FILM' : 25/24, 'FILM2NTSC' : 24/23.976, 'FILM2PAL' : 24/25 } TOP_MARKER = '{\\an8}' def __init__(self, shift=0, source_fps=23.976, scale_factor=1, subtitle_language=None): self.shift = shift self.source_fps = source_fps self.subtitle_language = subtitle_language self.scale_factor = scale_factor self.ssa_timestamp_min_sep = 200 self.use_cosmetic_filter = True self.use_language_filter = True self.fix_amazon_errors = True self.allow_italics = True self.allow_top_pos = True self.allow_timestamp_manipulation = True self.fix_timestamp_collisions = True self.fix_duplicated_entries = True try: self.cache_directory = tempfile.gettempdir() # Fails on Android self.cache_downloaded_subtitles = True except: self.cache_directory = None self.cache_downloaded_subtitles = False self._styles = {} self._italic_style_ids = [] self._top_regions_ids = [] self._allowed_style_attrs = ( 'color', 'fontStyle', 'fontWeight', ) ## This variable stores the language ID from the xml file. # But it may not exist or it may be wrong. self.lang = None self.ssa_style = OrderedDict([ ('Fontname', 'Arial'), ('Fontsize', 50), ('PrimaryColour', '&H00EEEEEE'), ('SecondaryColour', '&H000000FF'), ('BackColour', '&H40000000'), ('OutlineColour', '&H00000000'), ('Bold', 0), ('Italic', 0), ('Underline', 0), ('Alignment', 2), ('BorderStyle', 1), ('Outline', 2), ('Shadow', 3), ('MarginL', 0), ('MarginR', 0), ('MarginV', 40), ('StrikeOut', 0), ('ScaleX', 100), ('ScaleY', 100), ('Spacing', 0), ('Angle', 0), ('Encoding', 1) ]) self.ssa_playresx = 1280 self.ssa_playresy = 720 self.entries = [] def set_video_aspect_ratio(self, ratio): """ Adjust the SSA options PlaResX and PlayRexY according to the aspect ratio of the video """ self.ssa_playresy = int(self.ssa_playresx / ratio) def parse_subtitle_file(self, filename, file_encoding=None): """Read and parse a subtitle file. If the file has the vtt or srt extension it will be parsed as a vtt. Otherwise it will be parsed as ttml. The result is stored in the `entries` list, as begin (ms), end (ms), text, position. """ extension = os.path.splitext(filename)[1].lower() if extension == ".srt" or extension == ".vtt": self.parse_vtt_file(filename, file_encoding) else: self.parse_ttml_file(filename, file_encoding) def parse_ttml_file(self, filename, file_encoding=None): """Read and parse a ttml/xml/dfxp file. The result is stored in the `entries` list, as begin (ms), end (ms), text, position. """ doc = self._read_file(filename, file_encoding) self.parse_ttml_from_string(doc.encode('utf-8')) def parse_ttml_from_string(self, doc): """Read and parse a ttml/xml/dfxp subtitle from a string. The result is stored in the `entries` list, as begin (ms), end (ms), text, position. """ def extract_rate(s): try: m = s.split(' ') return int(m[0]) / int(m[1]) except: return 1 del self.entries [:] self._tc = TimestampConverter() ttml_dom = minidom.parseString(doc) self._encoding = ttml_dom.encoding if self._encoding and self._encoding.lower() not in ['utf8', 'utf-8']: # Don't bother with subtitles that aren't utf-8 encoded # but assume utf-8 when the encoding attr is missing raise NotImplementedError('Source is not utf-8 encoded') # Get the root tt element (assume the file contains # a single subtitle document) tt_element = ttml_dom.getElementsByTagNameNS('*', 'tt')[0] # Extract doc language # https://tools.ietf.org/html/rfc4646#section-2.1 language_tag = tt_element.getAttribute('xml:lang') or '' self.lang = re.split(r'\s+', language_tag.strip())[0].split('-')[0] # Store TT parameters as instance vars (in camel case) opttime = {} for ttp_name, defval, convfn in ( # (tt param, default val, fn to process the str) ('frameRate', 0, lambda x: float(x)), ('tickRate', 0, lambda x: int(x)), ('timeBase', 'media', lambda x: x), ('clockMode', '', lambda x: x), #('frameRateMultiplier', 1, lambda x: int(x)), ('frameRateMultiplier', 1, lambda x: extract_rate(x)), ('subFrameRate', 1, lambda x: int(x)), ('markerMode', '', lambda x: x), ('dropMode', '', lambda x: x), ): ttp_val = getattr( tt_element.attributes.get('ttp:' + ttp_name), 'value', defval) opttime[Ttml2Ssa._snake_to_camel(ttp_name)] = convfn(ttp_val) if opttime['time_base'] not in Ttml2Ssa.TIME_BASES: raise NotImplementedError('No support for "{}" time base'.format( opttime['time_base'])) # Set effective tick rate as per # https://www.w3.org/TR/ttml1/#parameter-attribute-tickRate # This will obviously only be made use of if we encounter offset-time # expressions that have the tick metric. self._tc.tick_rate = opttime['tick_rate'] if not opttime['tick_rate'] and opttime['frame_rate']: self._tc.tick_rate = int(opttime['frame_rate'] * opttime['sub_frame_rate']) elif not opttime['tick_rate']: self._tc.tick_rate = 1 # Set FPS to source_fps if no TT param self._tc.frame_rate = opttime['frame_rate'] or self.source_fps # Grab