diff --git a/src/lib_ccx/ccx_common_option.c b/src/lib_ccx/ccx_common_option.c index cb5baf41..819a82d3 100644 --- a/src/lib_ccx/ccx_common_option.c +++ b/src/lib_ccx/ccx_common_option.c @@ -66,6 +66,7 @@ void init_options (struct ccx_s_options *options) options->dvblang = NULL; // By default, autodetect DVB language options->ocrlang = NULL; // By default, autodetect .traineddata file options->ocr_oem = 0; // By default, set Tesseract OEM mode OEM_TESSERACT_ONLY (0) + options->mkvlang = NULL; // By default, all the languages are extracted options->ignore_pts_jumps = 1; /*HardsubX related stuff*/ diff --git a/src/lib_ccx/ccx_common_option.h b/src/lib_ccx/ccx_common_option.h index 34d688bc..6ee94668 100644 --- a/src/lib_ccx/ccx_common_option.h +++ b/src/lib_ccx/ccx_common_option.h @@ -38,7 +38,7 @@ struct encoder_cfg int force_flush; // Force flush on content write int append_mode; // Append mode for output files int ucla; // 1 if -UCLA used, 0 if not - + enum ccx_encoding_type encoding; enum ccx_output_date_format date_format; char millis_separator; @@ -132,6 +132,7 @@ struct ccx_s_options // Options from user parameters char *dvblang; // The name of the language stream for DVB char *ocrlang; // The name of the .traineddata file to be loaded with tesseract int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2 + char *mkvlang; // The name of the language stream for MKV /*HardsubX related stuff*/ int hardsubx_ocr_mode; diff --git a/src/lib_ccx/matroska.c b/src/lib_ccx/matroska.c index a452dc9d..fbbfd469 100644 --- a/src/lib_ccx/matroska.c +++ b/src/lib_ccx/matroska.c @@ -227,7 +227,7 @@ char* generate_timestamp_ass_ssa(ULLONG milliseconds) { } int find_sub_track_index(struct matroska_ctx* mkv_ctx, ULLONG track_number) { - for (int i = 0; i < mkv_ctx->sub_tracks_count; i++) + for (int i = mkv_ctx->sub_tracks_count-1; i >=0 ; i--) if (mkv_ctx->sub_tracks[i]->track_number == track_number) return i; return -1; @@ -259,6 +259,11 @@ struct matroska_sub_sentence* parse_segment_cluster_block_group_block(struct mat sentence->time_start = timecode + cluster_timecode; struct matroska_sub_track* track = mkv_ctx->sub_tracks[sub_track_index]; + if (track->sentence_count==0){ + track->sentences = malloc(sizeof(struct matroska_sub_sentence*)); + } + else + track->sentences = realloc(track->sentences,(track->sentence_count+1)*sizeof(struct matroska_sub_sentence*)); track->sentences[track->sentence_count] = sentence; track->sentence_count++; @@ -617,12 +622,11 @@ void parse_segment_track_entry(struct matroska_ctx* mkv_ctx) { sub_track->lang_index = 0; sub_track->codec_id = codec_id; sub_track->sentence_count = 0; - - for (int i = 0; i < mkv_ctx->sub_tracks_count; i++) - if (strcmp((const char *)mkv_ctx->sub_tracks[i]->lang, (const char *)lang) == 0) - sub_track->lang_index++; - - mkv_ctx->sub_tracks[mkv_ctx->sub_tracks_count] = sub_track; + for (int i = 0; i < mkv_ctx->sub_tracks_count; i++) + if (strcmp((const char *)mkv_ctx->sub_tracks[i]->lang, (const char *)lang) == 0) + sub_track->lang_index++; + mkv_ctx->sub_tracks = realloc(mkv_ctx->sub_tracks, sizeof(struct matroska_sub_track*) * (mkv_ctx->sub_tracks_count + 1)); + mkv_ctx->sub_tracks[mkv_ctx->sub_tracks_count] = sub_track; mkv_ctx->sub_tracks_count++; } else @@ -644,8 +648,10 @@ void parse_segment_tracks(struct matroska_ctx* mkv_ctx) switch (code) { /* Tracks ids*/ case MATROSKA_SEGMENT_TRACK_ENTRY: - parse_segment_track_entry(mkv_ctx); - MATROSKA_SWITCH_BREAK(code, code_len); + + + parse_segment_track_entry(mkv_ctx); + MATROSKA_SWITCH_BREAK(code, code_len); /* Misc ids */ case MATROSKA_VOID: @@ -677,7 +683,6 @@ void parse_segment(struct matroska_ctx* mkv_ctx) code <<= 8; code += mkv_read_byte(file); code_len++; - switch (code) { /* Segment ids */ case MATROSKA_SEGMENT_SEEK_HEAD: @@ -758,7 +763,6 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra { char* filename = generate_filename_from_track(mkv_ctx, track); mprint("\nOutput file: %s", filename); - int desc; #ifdef WIN32 desc = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_APPEND, S_IREAD | S_IWRITE); @@ -779,8 +783,7 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra { char number[9]; sprintf(number, "%d", i + 1); - - char *timestamp_start = malloc(sizeof(char) * 80); //being generous + char *timestamp_start = malloc(sizeof(char) * 80); //being generous timestamp_to_srttime(sentence->time_start, timestamp_start); ULLONG time_end = sentence->time_end; if (i + 1 < track->sentence_count) @@ -794,7 +797,10 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra write(desc, " --> ", 5); write(desc, timestamp_end, strlen(timestamp_start)); write(desc, "\n", 1); - write(desc, sentence->text, sentence->text_size); + int size=0; + while (*(sentence->text+size)=='\n' || *(sentence->text+size)=='\r' ) + size++; + write(desc, sentence->text+size, sentence->text_size-size); write(desc, "\n\n", 2); free(timestamp_start); @@ -814,6 +820,8 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra write(desc, timestamp_end, strlen(timestamp_start)); write(desc, ",", 1); char* text = ass_ssa_sentence_erase_read_order(sentence->text); + while((text[0]=='\\') && (text[1]=='n' || text[1]=='N')) + text+=2; write(desc, text, strlen(text)); write(desc, "\n", 1); @@ -838,10 +846,17 @@ void free_sub_track(struct matroska_sub_track* track) free(track); } -void matroska_save_all(struct matroska_ctx* mkv_ctx) +void matroska_save_all(struct matroska_ctx* mkv_ctx,char* lang) { - for (int i = 0; i < mkv_ctx->sub_tracks_count; i++) + char* match; + for (int i = 0; i < mkv_ctx->sub_tracks_count; i++){ + if (lang){ + if (match = strstr(lang,mkv_ctx->sub_tracks[i]->lang) != NULL) + save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]); + } + else save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]); + } } void matroska_free_all(struct matroska_ctx* mkv_ctx) @@ -864,7 +879,7 @@ void matroska_parse(struct matroska_ctx* mkv_ctx) code_len++; switch (code) { - /* Header ids*/ + /* Header ids*/ case MATROSKA_EBML_HEADER: parse_ebml(file); MATROSKA_SWITCH_BREAK(code, code_len); @@ -889,6 +904,7 @@ void matroska_parse(struct matroska_ctx* mkv_ctx) } } + // Close file stream fclose(file); @@ -921,6 +937,7 @@ int matroska_loop(struct lib_ccx_ctx *ctx) mkv_ctx->current_second = 0; mkv_ctx->filename = ctx->inputfile[ctx->current_file]; mkv_ctx->file = create_file(ctx); + mkv_ctx->sub_tracks = malloc(sizeof(struct matroska_sub_track**)); matroska_parse(mkv_ctx); @@ -928,7 +945,7 @@ int matroska_loop(struct lib_ccx_ctx *ctx) activity_progress(100, (int) (mkv_ctx->current_second / 60), (int) (mkv_ctx->current_second % 60)); - matroska_save_all(mkv_ctx); + matroska_save_all(mkv_ctx,ccx_options.mkvlang); int sentence_count = mkv_ctx->sentence_count; matroska_free_all(mkv_ctx); diff --git a/src/lib_ccx/matroska.h b/src/lib_ccx/matroska.h index d233e094..82375912 100644 --- a/src/lib_ccx/matroska.h +++ b/src/lib_ccx/matroska.h @@ -136,9 +136,7 @@ /* Other defines */ #define MATROSKA_MAX_ID_LENGTH 4 -#define MATROSKA_MAX_TRACKS 128 -#define MATROSKA_MAX_SENTENCES 8192 -#define MAX_FILE_NAME_SIZE 200 +#define MAX_FILE_NAME_SIZE 260 /* Enums */ enum matroska_track_entry_type { @@ -203,11 +201,11 @@ struct matroska_sub_track { enum matroska_track_subtitle_codec_id codec_id; int sentence_count; - struct matroska_sub_sentence* sentences[MATROSKA_MAX_SENTENCES]; + struct matroska_sub_sentence** sentences; }; struct matroska_ctx { - struct matroska_sub_track* sub_tracks[MATROSKA_MAX_TRACKS]; + struct matroska_sub_track** sub_tracks; struct lib_ccx_ctx* ctx; int sub_tracks_count; int sentence_count; @@ -244,13 +242,13 @@ void parse_segment(struct matroska_ctx* mkv_ctx); char* generate_timestamp_utf8(ULLONG milliseconds); char* generate_timestamp_ass_ssa(ULLONG milliseconds); int find_sub_track_index(struct matroska_ctx* mkv_ctx, ULLONG track_number); -char* get_track_entry_type_description(enum matroska_track_entry_type type); +char* get_track_entry_type_description(enum matroska_track_entry_type type); enum matroska_track_subtitle_codec_id get_track_subtitle_codec_id(char* codec_id); char* generate_filename_from_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* track); char* ass_ssa_sentence_erase_read_order(char* text); void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* track); void free_sub_track(struct matroska_sub_track* track); -void matroska_save_all(struct matroska_ctx* mkv_ctx); +void matroska_save_all(struct matroska_ctx* mkv_ctx,char* lang); void matroska_free_all(struct matroska_ctx* mkv_ctx); void matroska_parse(struct matroska_ctx* mkv_ctx); FILE* create_file(struct lib_ccx_ctx *ctx); diff --git a/src/lib_ccx/params.c b/src/lib_ccx/params.c index e52303fc..4b911baf 100644 --- a/src/lib_ccx/params.c +++ b/src/lib_ccx/params.c @@ -7,6 +7,8 @@ #include "ccx_decoders_708.h" #include "compile_info.h" #include "../lib_hash/sha2.h" +#include +#include #ifdef ENABLE_HARDSUBX #include "hardsubx.h" #endif @@ -497,7 +499,7 @@ void print_usage (void) mprint (" less or equal than the max allowed..\n"); mprint (" -levdistmincnt value: Minimum distance we always allow regardless\n"); mprint (" of the length of the strings.Default 2. \n"); - mprint (" This means that if the calculated distance \n"); + mprint (" This means that if the calculated distance \n"); mprint (" is 0,1 or 2, we consider the strings to be equivalent.\n"); mprint (" -levdistmaxpct value: Maximum distance we allow, as a percentage of\n"); mprint (" the shortest string length. Default 10%.\n"); @@ -591,6 +593,12 @@ void print_usage (void) mprint (" 0: OEM_TESSERACT_ONLY - default value, the fastest mode.\n"); mprint (" 1: OEM_LSTM_ONLY - use LSTM algorithm for recognition.\n"); mprint (" 2: OEM_TESSERACT_LSTM_COMBINED - both algorithms.\n"); + mprint (" -mkvlang: For MKV subtitles, select which language's caption\n"); + mprint (" stream will be processed. e.g. 'eng' for English.\n"); + mprint (" Language codes can be either the 3 letters bibliographic\n"); + mprint (" ISO-639-2 form (like \"fre\" for french) or a language\n"); + mprint (" code followed by a dash and a country code for specialities\n"); + mprint (" in languages (like \"fre-ca\" for Canadian French).\n"); mprint ("\n"); mprint ("Options that affect how ccextractor reads and writes (buffering):\n"); @@ -1021,6 +1029,58 @@ int atoi_hex (char *s) } } +void mkvlang_params_check(char* lang){ + int initial=0, present=0; + for(int char_index=0; char_index < strlen(lang);char_index++){ + lang[char_index] = cctolower(lang[char_index]); + if (lang[char_index]==','){ + present=char_index; + if ((present-initial<6)&&(present-initial!=3)) + fatal(EXIT_MALFORMED_PARAMETER, "language codes should be xxx,xxx,xxx,....\n"); + + else if ((present-initial>3)&&(present-initial!=6)) + fatal(EXIT_MALFORMED_PARAMETER, "language codes should be xxx-xx,xxx-xx,xxx-xx,....\n"); + + if ((present-initial>3)&&(present-initial==6)){ + size_t length = present-initial; + char* block=calloc(length+1,sizeof(char)); + strncpy(block,lang+initial,length); + char* hiphen_pointer = strstr(block,"-"); + if (!hiphen_pointer) + fatal(EXIT_MALFORMED_PARAMETER, "language code is not of the form xxx-xx\n"); + free(block); + } + initial=present+1; + } + } + + //Steps to check for the last lang of multiple mkvlangs provided by the user. + present = strlen(lang)-1; + + for(int char_index=strlen(lang)-1; char_index >=0 ;char_index--) + if (lang[char_index]==','){ + initial=char_index+1; + break; + } + + if ((present-initial<5)&&(present-initial!=2)) + fatal(EXIT_MALFORMED_PARAMETER, "last language code should be xxx.\n"); + + else if ((present-initial>2)&&(present-initial!=5)) + fatal(EXIT_MALFORMED_PARAMETER, "last language code should be xxx-xx.\n"); + + if ((present-initial>2)&&(present-initial==5)){ + size_t length = present-initial; + char* block=calloc(length+1,sizeof(char)); + strncpy(block,lang+initial,length); + char* hiphen_pointer = strstr(block,"-"); + if (!hiphen_pointer) + fatal(EXIT_MALFORMED_PARAMETER, "last language code is not of the form xxx-xx\n"); + free(block); + } +} + + int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[]) { @@ -1225,7 +1285,7 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[]) } } #endif - + if (strcmp(argv[i], "-chapters") == 0){ opt->extract_chapters= 1; continue; @@ -1412,6 +1472,15 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[]) continue; } + if(strcmp(argv[i],"-mkvlang")==0 && i < argc-1) + { + i++; + opt->mkvlang = (char *)malloc(sizeof(argv[i])); + sprintf(opt->mkvlang,"%s",argv[i]); + mkvlang_params_check(opt->mkvlang); + continue; + } + /* Output file formats */ if (strcmp (argv[i],"-srt")==0 || strcmp (argv[i],"-dvdraw")==0 || @@ -2233,7 +2302,7 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[]) { fatal (EXIT_INCOMPATIBLE_PARAMETERS, "MP4 requires an actual file, it's not possible to read from a stream, including stdin.\n"); } - + if(opt->extract_chapters) { mprint("Request to extract chapters recieved.\n");