mirror of
https://github.com/CCExtractor/ccextractor.git
synced 2025-01-18 16:04:21 +00:00
47c5a6e73b
* Removed all extractors except the grid extractor. Removed the call to transcript extractor in ccx_encoders_transcript.c * Removed unnecessary array appening statements in python_grid_extractor. WIP: switch in extractor. * Added switch in g608 grid extractor. * Deleted comments from wrappers. * Refactored code in ccextractor.c and .h files. Removed all the commented part. Made proper changes according to the coding conventions. * Removed calls to extractor from all the encoders. The only call made to extractor is from ccx_encoders_python.c. * Removed a comment from wrapper.c. In init_write function of output.c added a call to free the output string returned by asprintf in case of sending filename to callback function. * Added calls to free the char* which is malloced by asprintf in extractor.c WIP: Free the global variable elements. * Sample testing correctly for italics tag. Also added a hack to print only 32 characters when unicode fails. WIP: Font tag. * Added support for handling font and italics in Python SRT generator. * modified the font generator. Also, added count method for checking blank strings in python_srt_generator. * Added free statements for avoiding memory leaks. * added return code for failure of asprintf calls. * Removing unnecessary code from api_testing.py * Made modifications to Makefile and build script. * Added recursive_tester.py Autoconf builds successfully. * BUG: Made change to get_line_encoded to encode the last \0 character in a line. Otherwise the EOL characted is absent causing garbage value to be present in SRT. * Exporting the encoding of the captions from CCExtractor to Python so that the python SRT generator can generate proper SRT files. * Modified the include statement in extractor.h
153 lines
4.8 KiB
Python
153 lines
4.8 KiB
Python
import ccextractor as cc
|
|
import re
|
|
"""
|
|
#Handling underline
|
|
buff = ""
|
|
underline_flag = 0
|
|
for i,font_type in enumerate(font_line):
|
|
if font_type == 'U' and not underline_flag:
|
|
buff = buff + '<u> '
|
|
underline_flag = 1
|
|
underline=1
|
|
elif font_type =="R" and underline_flag:
|
|
buff = buff + '</u>'
|
|
underline_flag = 0
|
|
continue;
|
|
buff += letter[i]
|
|
#adding a new line after buff has seen underline
|
|
#need to cross check with CCExtractor output as to how they are doing
|
|
if underline:
|
|
buff+= "\n"
|
|
else:
|
|
buff=""
|
|
"""
|
|
encodings_map = {
|
|
'0':'unicode',
|
|
'1':'latin1',
|
|
'2':'utf-8',
|
|
'3':'ascii',
|
|
}
|
|
|
|
color_text_start={
|
|
"0":"",
|
|
"1":"<font color=\"#00ff00\">",
|
|
"2":"<font color=\"#0000ff\">",
|
|
"3":"<font color=\"#00ffff\">",
|
|
"4":"<font color=\"#ff0000\">",
|
|
"5":"<font color=\"#ffff00\">",
|
|
"6":"<font color=\"#ff00ff\">",
|
|
"7":"<font color=\"",
|
|
"8":"",
|
|
"9":""
|
|
};
|
|
color_text_end={
|
|
"0":"",
|
|
"1":"</font",
|
|
"2":"</font>",
|
|
"3":"</font>",
|
|
"4":"</font>",
|
|
"5":"</font>",
|
|
"6":"</font>",
|
|
"7":"</font>",
|
|
"8":"",
|
|
"9":""
|
|
};
|
|
no_color_tag = ['0','8','9']
|
|
def comparing_text_font_grids(text, font, color):
|
|
original_text = text
|
|
original_color = color
|
|
temp_color = []
|
|
for letter,color_line in zip(original_text,color):
|
|
color = 0
|
|
prev = color_line[0]
|
|
buff = color_text_start[str(prev)]
|
|
if prev not in no_color_tag:
|
|
color_flag = 1
|
|
else:
|
|
color_flag = 0
|
|
if letter.count(" ")<32:
|
|
for i,color_type in enumerate(color_line):
|
|
if color_type not in no_color_tag and prev!=color_type and not color_flag:
|
|
color = 1
|
|
buff = buff + color_text_start[str(color_type)]
|
|
color_flag = 1
|
|
elif prev!=color_type and color_flag:
|
|
color = 1
|
|
buff = buff + color_text_end[str(prev)]
|
|
color_flag = 0
|
|
buff += letter[i]
|
|
prev=color_type
|
|
if color_flag:
|
|
color_flag=0
|
|
buff+=color_text_end[str(prev)]
|
|
if color:
|
|
temp_color.append((buff,1))
|
|
else:
|
|
temp_color.append((letter,0))
|
|
temp_font_italics=[]
|
|
for letter,font_line in zip(original_text,font):
|
|
if letter.count(" ")<32:
|
|
buff=""
|
|
underline,italics = 0,0
|
|
#Handling italics
|
|
italics_flag = 0
|
|
for i,font_type in enumerate(font_line):
|
|
if font_type == 'I' and not italics_flag:
|
|
italics=1
|
|
buff = buff + '<i>'
|
|
italics_flag = 1
|
|
elif font_type =="R" and italics_flag:
|
|
italics=1
|
|
buff = buff + '</i>'
|
|
italics_flag = 0
|
|
buff += letter[i]
|
|
if italics_flag:
|
|
buff+='</i>'
|
|
if italics:
|
|
temp_font_italics.append((buff,1))
|
|
else:
|
|
temp_font_italics.append((letter,0))
|
|
else:
|
|
temp_font_italics.append((letter,0))
|
|
final = []
|
|
for i,j in zip(temp_color,temp_font_italics):
|
|
if i[1] and not j[1]:
|
|
final.append(i[0])
|
|
elif j[1] and not i[1]:
|
|
final.append(j[0])
|
|
else:
|
|
if not i[1]:
|
|
final.append(i[0])
|
|
else:
|
|
print "error"
|
|
return (final,font,color)
|
|
|
|
|
|
def generate_output_srt(filename,d, encoding):
|
|
if encoding in encodings_map.keys():
|
|
if encoding!='0':
|
|
encoding_format = encodings_map[encoding]
|
|
else:
|
|
encoding_format = ""
|
|
else:
|
|
print "encoding error in python"
|
|
return
|
|
if encoding_format:
|
|
d['text'] = [unicode(item,encoding_format) for item in d['text']]
|
|
else:
|
|
d['text'] = [unicode(item) for item in d['text']]
|
|
d['text'],d['font'],d['color']= comparing_text_font_grids(d['text'],d['font'],d['color'])
|
|
for item in d['text']:
|
|
if item.count(" ")<32:
|
|
o=item
|
|
with open(filename,'ab+') as fh:
|
|
if encoding_format:
|
|
fh.write(o.encode(encoding_format))
|
|
else:
|
|
fh.write(str(o))
|
|
fh.write("\n")
|
|
fh.flush()
|
|
with open(filename,'ab+') as fh:
|
|
fh.write("\n")
|
|
fh.flush()
|