Merge pull request #721 from Diptanshu8/matroska

**[FIX]** Fixes issue #705
This commit is contained in:
Carlos Fernandez Sanz 2017-03-16 11:40:27 -07:00 committed by GitHub
commit ce9416e943
5 changed files with 115 additions and 29 deletions

View File

@ -66,6 +66,7 @@ void init_options (struct ccx_s_options *options)
options->dvblang = NULL; // By default, autodetect DVB language
options->ocrlang = NULL; // By default, autodetect .traineddata file
options->ocr_oem = 0; // By default, set Tesseract OEM mode OEM_TESSERACT_ONLY (0)
options->mkvlang = NULL; // By default, all the languages are extracted
options->ignore_pts_jumps = 1;
/*HardsubX related stuff*/

View File

@ -38,7 +38,7 @@ struct encoder_cfg
int force_flush; // Force flush on content write
int append_mode; // Append mode for output files
int ucla; // 1 if -UCLA used, 0 if not
enum ccx_encoding_type encoding;
enum ccx_output_date_format date_format;
char millis_separator;
@ -132,6 +132,7 @@ struct ccx_s_options // Options from user parameters
char *dvblang; // The name of the language stream for DVB
char *ocrlang; // The name of the .traineddata file to be loaded with tesseract
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
char *mkvlang; // The name of the language stream for MKV
/*HardsubX related stuff*/
int hardsubx_ocr_mode;

View File

@ -227,7 +227,7 @@ char* generate_timestamp_ass_ssa(ULLONG milliseconds) {
}
int find_sub_track_index(struct matroska_ctx* mkv_ctx, ULLONG track_number) {
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
for (int i = mkv_ctx->sub_tracks_count-1; i >=0 ; i--)
if (mkv_ctx->sub_tracks[i]->track_number == track_number)
return i;
return -1;
@ -259,6 +259,11 @@ struct matroska_sub_sentence* parse_segment_cluster_block_group_block(struct mat
sentence->time_start = timecode + cluster_timecode;
struct matroska_sub_track* track = mkv_ctx->sub_tracks[sub_track_index];
if (track->sentence_count==0){
track->sentences = malloc(sizeof(struct matroska_sub_sentence*));
}
else
track->sentences = realloc(track->sentences,(track->sentence_count+1)*sizeof(struct matroska_sub_sentence*));
track->sentences[track->sentence_count] = sentence;
track->sentence_count++;
@ -617,12 +622,11 @@ void parse_segment_track_entry(struct matroska_ctx* mkv_ctx) {
sub_track->lang_index = 0;
sub_track->codec_id = codec_id;
sub_track->sentence_count = 0;
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
if (strcmp((const char *)mkv_ctx->sub_tracks[i]->lang, (const char *)lang) == 0)
sub_track->lang_index++;
mkv_ctx->sub_tracks[mkv_ctx->sub_tracks_count] = sub_track;
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
if (strcmp((const char *)mkv_ctx->sub_tracks[i]->lang, (const char *)lang) == 0)
sub_track->lang_index++;
mkv_ctx->sub_tracks = realloc(mkv_ctx->sub_tracks, sizeof(struct matroska_sub_track*) * (mkv_ctx->sub_tracks_count + 1));
mkv_ctx->sub_tracks[mkv_ctx->sub_tracks_count] = sub_track;
mkv_ctx->sub_tracks_count++;
}
else
@ -644,8 +648,10 @@ void parse_segment_tracks(struct matroska_ctx* mkv_ctx)
switch (code) {
/* Tracks ids*/
case MATROSKA_SEGMENT_TRACK_ENTRY:
parse_segment_track_entry(mkv_ctx);
MATROSKA_SWITCH_BREAK(code, code_len);
parse_segment_track_entry(mkv_ctx);
MATROSKA_SWITCH_BREAK(code, code_len);
/* Misc ids */
case MATROSKA_VOID:
@ -677,7 +683,6 @@ void parse_segment(struct matroska_ctx* mkv_ctx)
code <<= 8;
code += mkv_read_byte(file);
code_len++;
switch (code) {
/* Segment ids */
case MATROSKA_SEGMENT_SEEK_HEAD:
@ -758,7 +763,6 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
{
char* filename = generate_filename_from_track(mkv_ctx, track);
mprint("\nOutput file: %s", filename);
int desc;
#ifdef WIN32
desc = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_APPEND, S_IREAD | S_IWRITE);
@ -779,8 +783,7 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
{
char number[9];
sprintf(number, "%d", i + 1);
char *timestamp_start = malloc(sizeof(char) * 80); //being generous
char *timestamp_start = malloc(sizeof(char) * 80); //being generous
timestamp_to_srttime(sentence->time_start, timestamp_start);
ULLONG time_end = sentence->time_end;
if (i + 1 < track->sentence_count)
@ -794,7 +797,10 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
write(desc, " --> ", 5);
write(desc, timestamp_end, strlen(timestamp_start));
write(desc, "\n", 1);
write(desc, sentence->text, sentence->text_size);
int size=0;
while (*(sentence->text+size)=='\n' || *(sentence->text+size)=='\r' )
size++;
write(desc, sentence->text+size, sentence->text_size-size);
write(desc, "\n\n", 2);
free(timestamp_start);
@ -814,6 +820,8 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
write(desc, timestamp_end, strlen(timestamp_start));
write(desc, ",", 1);
char* text = ass_ssa_sentence_erase_read_order(sentence->text);
while((text[0]=='\\') && (text[1]=='n' || text[1]=='N'))
text+=2;
write(desc, text, strlen(text));
write(desc, "\n", 1);
@ -838,10 +846,17 @@ void free_sub_track(struct matroska_sub_track* track)
free(track);
}
void matroska_save_all(struct matroska_ctx* mkv_ctx)
void matroska_save_all(struct matroska_ctx* mkv_ctx,char* lang)
{
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
char* match;
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++){
if (lang){
if (match = strstr(lang,mkv_ctx->sub_tracks[i]->lang) != NULL)
save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]);
}
else
save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]);
}
}
void matroska_free_all(struct matroska_ctx* mkv_ctx)
@ -864,7 +879,7 @@ void matroska_parse(struct matroska_ctx* mkv_ctx)
code_len++;
switch (code) {
/* Header ids*/
/* Header ids*/
case MATROSKA_EBML_HEADER:
parse_ebml(file);
MATROSKA_SWITCH_BREAK(code, code_len);
@ -889,6 +904,7 @@ void matroska_parse(struct matroska_ctx* mkv_ctx)
}
}
// Close file stream
fclose(file);
@ -921,6 +937,7 @@ int matroska_loop(struct lib_ccx_ctx *ctx)
mkv_ctx->current_second = 0;
mkv_ctx->filename = ctx->inputfile[ctx->current_file];
mkv_ctx->file = create_file(ctx);
mkv_ctx->sub_tracks = malloc(sizeof(struct matroska_sub_track**));
matroska_parse(mkv_ctx);
@ -928,7 +945,7 @@ int matroska_loop(struct lib_ccx_ctx *ctx)
activity_progress(100, (int) (mkv_ctx->current_second / 60),
(int) (mkv_ctx->current_second % 60));
matroska_save_all(mkv_ctx);
matroska_save_all(mkv_ctx,ccx_options.mkvlang);
int sentence_count = mkv_ctx->sentence_count;
matroska_free_all(mkv_ctx);

View File

@ -136,9 +136,7 @@
/* Other defines */
#define MATROSKA_MAX_ID_LENGTH 4
#define MATROSKA_MAX_TRACKS 128
#define MATROSKA_MAX_SENTENCES 8192
#define MAX_FILE_NAME_SIZE 200
#define MAX_FILE_NAME_SIZE 260
/* Enums */
enum matroska_track_entry_type {
@ -203,11 +201,11 @@ struct matroska_sub_track {
enum matroska_track_subtitle_codec_id codec_id;
int sentence_count;
struct matroska_sub_sentence* sentences[MATROSKA_MAX_SENTENCES];
struct matroska_sub_sentence** sentences;
};
struct matroska_ctx {
struct matroska_sub_track* sub_tracks[MATROSKA_MAX_TRACKS];
struct matroska_sub_track** sub_tracks;
struct lib_ccx_ctx* ctx;
int sub_tracks_count;
int sentence_count;
@ -244,13 +242,13 @@ void parse_segment(struct matroska_ctx* mkv_ctx);
char* generate_timestamp_utf8(ULLONG milliseconds);
char* generate_timestamp_ass_ssa(ULLONG milliseconds);
int find_sub_track_index(struct matroska_ctx* mkv_ctx, ULLONG track_number);
char* get_track_entry_type_description(enum matroska_track_entry_type type);
char* get_track_entry_type_description(enum matroska_track_entry_type type);
enum matroska_track_subtitle_codec_id get_track_subtitle_codec_id(char* codec_id);
char* generate_filename_from_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* track);
char* ass_ssa_sentence_erase_read_order(char* text);
void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* track);
void free_sub_track(struct matroska_sub_track* track);
void matroska_save_all(struct matroska_ctx* mkv_ctx);
void matroska_save_all(struct matroska_ctx* mkv_ctx,char* lang);
void matroska_free_all(struct matroska_ctx* mkv_ctx);
void matroska_parse(struct matroska_ctx* mkv_ctx);
FILE* create_file(struct lib_ccx_ctx *ctx);

View File

@ -7,6 +7,8 @@
#include "ccx_decoders_708.h"
#include "compile_info.h"
#include "../lib_hash/sha2.h"
#include <string.h>
#include <stdio.h>
#ifdef ENABLE_HARDSUBX
#include "hardsubx.h"
#endif
@ -497,7 +499,7 @@ void print_usage (void)
mprint (" less or equal than the max allowed..\n");
mprint (" -levdistmincnt value: Minimum distance we always allow regardless\n");
mprint (" of the length of the strings.Default 2. \n");
mprint (" This means that if the calculated distance \n");
mprint (" This means that if the calculated distance \n");
mprint (" is 0,1 or 2, we consider the strings to be equivalent.\n");
mprint (" -levdistmaxpct value: Maximum distance we allow, as a percentage of\n");
mprint (" the shortest string length. Default 10%.\n");
@ -591,6 +593,12 @@ void print_usage (void)
mprint (" 0: OEM_TESSERACT_ONLY - default value, the fastest mode.\n");
mprint (" 1: OEM_LSTM_ONLY - use LSTM algorithm for recognition.\n");
mprint (" 2: OEM_TESSERACT_LSTM_COMBINED - both algorithms.\n");
mprint (" -mkvlang: For MKV subtitles, select which language's caption\n");
mprint (" stream will be processed. e.g. 'eng' for English.\n");
mprint (" Language codes can be either the 3 letters bibliographic\n");
mprint (" ISO-639-2 form (like \"fre\" for french) or a language\n");
mprint (" code followed by a dash and a country code for specialities\n");
mprint (" in languages (like \"fre-ca\" for Canadian French).\n");
mprint ("\n");
mprint ("Options that affect how ccextractor reads and writes (buffering):\n");
@ -1021,6 +1029,58 @@ int atoi_hex (char *s)
}
}
/*
 * Validates a single language token of `len` characters starting at `token`.
 * A token is accepted when it is exactly 3 characters long ("xxx") or exactly
 * 6 characters long with a dash in the fourth position ("xxx-xx").
 * `is_last` only selects which wording is used in the error message: the
 * final token of the list has its own set of messages.
 * Any violation is reported through fatal().
 */
static void mkvlang_check_token(const char *token, size_t len, int is_last)
{
	if (len != 3 && len != 6)
	{
		if (len < 6)
		{
			if (is_last)
				fatal(EXIT_MALFORMED_PARAMETER, "last language code should be xxx.\n");
			else
				fatal(EXIT_MALFORMED_PARAMETER, "language codes should be xxx,xxx,xxx,....\n");
		}
		else
		{
			if (is_last)
				fatal(EXIT_MALFORMED_PARAMETER, "last language code should be xxx-xx.\n");
			else
				fatal(EXIT_MALFORMED_PARAMETER, "language codes should be xxx-xx,xxx-xx,xxx-xx,....\n");
		}
		return; // Do not inspect a token of the wrong length any further
	}

	// A 6-character token must have its dash exactly between the two parts.
	if (len == 6 && token[3] != '-')
	{
		if (is_last)
			fatal(EXIT_MALFORMED_PARAMETER, "last language code is not of the form xxx-xx\n");
		else
			fatal(EXIT_MALFORMED_PARAMETER, "language code is not of the form xxx-xx\n");
	}
}

/*
 * Checks the value given to -mkvlang: a comma separated list of language
 * codes, each either "xxx" or "xxx-xx".
 *
 * lang: NUL-terminated, writable string. It is converted to lower case in
 *       place (via cctolower) as a side effect. A NULL pointer is ignored.
 *
 * Malformed input is reported through fatal() with
 * EXIT_MALFORMED_PARAMETER. An empty string, or a list ending in a comma,
 * is rejected because the last token is then empty.
 */
void mkvlang_params_check(char* lang){
	if (lang == NULL)
		return;

	// Length is computed once; lower-casing never changes it.
	size_t total = strlen(lang);
	size_t token_start = 0;

	for (size_t pos = 0; pos < total; pos++)
	{
		lang[pos] = cctolower(lang[pos]);
		if (lang[pos] == ',')
		{
			// Token spans [token_start, pos): the comma itself is excluded.
			mkvlang_check_token(lang + token_start, pos - token_start, 0);
			token_start = pos + 1;
		}
	}

	// Whatever follows the last comma (or the whole string) is the last token.
	mkvlang_check_token(lang + token_start, total - token_start, 1);
}
int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
{
@ -1225,7 +1285,7 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
}
}
#endif
if (strcmp(argv[i], "-chapters") == 0){
opt->extract_chapters= 1;
continue;
@ -1412,6 +1472,15 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
continue;
}
if(strcmp(argv[i],"-mkvlang")==0 && i < argc-1)
{
	i++;
	/*
	 * Keep a private, writable copy of the argument: mkvlang_params_check()
	 * lower-cases it in place. The buffer must be sized from the string
	 * length; sizeof(argv[i]) is only the size of a pointer and made the
	 * copy overflow for any list longer than a single short code.
	 */
	size_t mkvlang_size = strlen(argv[i]) + 1; // +1 for the terminating NUL
	opt->mkvlang = (char *)malloc(mkvlang_size);
	if (opt->mkvlang != NULL)
	{
		memcpy(opt->mkvlang, argv[i], mkvlang_size);
		mkvlang_params_check(opt->mkvlang);
	}
	else
	{
		// mkvlang stays NULL, which means no language filter is applied.
		mprint("Not enough memory to store the -mkvlang value, all languages will be extracted.\n");
	}
	continue;
}
/* Output file formats */
if (strcmp (argv[i],"-srt")==0 ||
strcmp (argv[i],"-dvdraw")==0 ||
@ -2233,7 +2302,7 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
{
fatal (EXIT_INCOMPATIBLE_PARAMETERS, "MP4 requires an actual file, it's not possible to read from a stream, including stdin.\n");
}
if(opt->extract_chapters)
{
mprint("Request to extract chapters recieved.\n");