mirror of
https://github.com/CCExtractor/ccextractor.git
synced 2024-12-24 20:01:42 +00:00
Merge pull request #721 from Diptanshu8/matroska
**[FIX]** Fixes issue #705
This commit is contained in:
commit
ce9416e943
@ -66,6 +66,7 @@ void init_options (struct ccx_s_options *options)
|
||||
options->dvblang = NULL; // By default, autodetect DVB language
|
||||
options->ocrlang = NULL; // By default, autodetect .traineddata file
|
||||
options->ocr_oem = 0; // By default, set Tesseract OEM mode OEM_TESSERACT_ONLY (0)
|
||||
options->mkvlang = NULL; // By default, all the languages are extracted
|
||||
options->ignore_pts_jumps = 1;
|
||||
|
||||
/*HardsubX related stuff*/
|
||||
|
@ -38,7 +38,7 @@ struct encoder_cfg
|
||||
int force_flush; // Force flush on content write
|
||||
int append_mode; // Append mode for output files
|
||||
int ucla; // 1 if -UCLA used, 0 if not
|
||||
|
||||
|
||||
enum ccx_encoding_type encoding;
|
||||
enum ccx_output_date_format date_format;
|
||||
char millis_separator;
|
||||
@ -132,6 +132,7 @@ struct ccx_s_options // Options from user parameters
|
||||
char *dvblang; // The name of the language stream for DVB
|
||||
char *ocrlang; // The name of the .traineddata file to be loaded with tesseract
|
||||
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
|
||||
char *mkvlang; // The name of the language stream for MKV
|
||||
|
||||
/*HardsubX related stuff*/
|
||||
int hardsubx_ocr_mode;
|
||||
|
@ -227,7 +227,7 @@ char* generate_timestamp_ass_ssa(ULLONG milliseconds) {
|
||||
}
|
||||
|
||||
int find_sub_track_index(struct matroska_ctx* mkv_ctx, ULLONG track_number) {
|
||||
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
|
||||
for (int i = mkv_ctx->sub_tracks_count-1; i >=0 ; i--)
|
||||
if (mkv_ctx->sub_tracks[i]->track_number == track_number)
|
||||
return i;
|
||||
return -1;
|
||||
@ -259,6 +259,11 @@ struct matroska_sub_sentence* parse_segment_cluster_block_group_block(struct mat
|
||||
sentence->time_start = timecode + cluster_timecode;
|
||||
|
||||
struct matroska_sub_track* track = mkv_ctx->sub_tracks[sub_track_index];
|
||||
if (track->sentence_count==0){
|
||||
track->sentences = malloc(sizeof(struct matroska_sub_sentence*));
|
||||
}
|
||||
else
|
||||
track->sentences = realloc(track->sentences,(track->sentence_count+1)*sizeof(struct matroska_sub_sentence*));
|
||||
track->sentences[track->sentence_count] = sentence;
|
||||
track->sentence_count++;
|
||||
|
||||
@ -617,12 +622,11 @@ void parse_segment_track_entry(struct matroska_ctx* mkv_ctx) {
|
||||
sub_track->lang_index = 0;
|
||||
sub_track->codec_id = codec_id;
|
||||
sub_track->sentence_count = 0;
|
||||
|
||||
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
|
||||
if (strcmp((const char *)mkv_ctx->sub_tracks[i]->lang, (const char *)lang) == 0)
|
||||
sub_track->lang_index++;
|
||||
|
||||
mkv_ctx->sub_tracks[mkv_ctx->sub_tracks_count] = sub_track;
|
||||
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
|
||||
if (strcmp((const char *)mkv_ctx->sub_tracks[i]->lang, (const char *)lang) == 0)
|
||||
sub_track->lang_index++;
|
||||
mkv_ctx->sub_tracks = realloc(mkv_ctx->sub_tracks, sizeof(struct matroska_sub_track*) * (mkv_ctx->sub_tracks_count + 1));
|
||||
mkv_ctx->sub_tracks[mkv_ctx->sub_tracks_count] = sub_track;
|
||||
mkv_ctx->sub_tracks_count++;
|
||||
}
|
||||
else
|
||||
@ -644,8 +648,10 @@ void parse_segment_tracks(struct matroska_ctx* mkv_ctx)
|
||||
switch (code) {
|
||||
/* Tracks ids*/
|
||||
case MATROSKA_SEGMENT_TRACK_ENTRY:
|
||||
parse_segment_track_entry(mkv_ctx);
|
||||
MATROSKA_SWITCH_BREAK(code, code_len);
|
||||
|
||||
|
||||
parse_segment_track_entry(mkv_ctx);
|
||||
MATROSKA_SWITCH_BREAK(code, code_len);
|
||||
|
||||
/* Misc ids */
|
||||
case MATROSKA_VOID:
|
||||
@ -677,7 +683,6 @@ void parse_segment(struct matroska_ctx* mkv_ctx)
|
||||
code <<= 8;
|
||||
code += mkv_read_byte(file);
|
||||
code_len++;
|
||||
|
||||
switch (code) {
|
||||
/* Segment ids */
|
||||
case MATROSKA_SEGMENT_SEEK_HEAD:
|
||||
@ -758,7 +763,6 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
|
||||
{
|
||||
char* filename = generate_filename_from_track(mkv_ctx, track);
|
||||
mprint("\nOutput file: %s", filename);
|
||||
|
||||
int desc;
|
||||
#ifdef WIN32
|
||||
desc = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_APPEND, S_IREAD | S_IWRITE);
|
||||
@ -779,8 +783,7 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
|
||||
{
|
||||
char number[9];
|
||||
sprintf(number, "%d", i + 1);
|
||||
|
||||
char *timestamp_start = malloc(sizeof(char) * 80); //being generous
|
||||
char *timestamp_start = malloc(sizeof(char) * 80); //being generous
|
||||
timestamp_to_srttime(sentence->time_start, timestamp_start);
|
||||
ULLONG time_end = sentence->time_end;
|
||||
if (i + 1 < track->sentence_count)
|
||||
@ -794,7 +797,10 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
|
||||
write(desc, " --> ", 5);
|
||||
write(desc, timestamp_end, strlen(timestamp_start));
|
||||
write(desc, "\n", 1);
|
||||
write(desc, sentence->text, sentence->text_size);
|
||||
int size=0;
|
||||
while (*(sentence->text+size)=='\n' || *(sentence->text+size)=='\r' )
|
||||
size++;
|
||||
write(desc, sentence->text+size, sentence->text_size-size);
|
||||
write(desc, "\n\n", 2);
|
||||
|
||||
free(timestamp_start);
|
||||
@ -814,6 +820,8 @@ void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* tra
|
||||
write(desc, timestamp_end, strlen(timestamp_start));
|
||||
write(desc, ",", 1);
|
||||
char* text = ass_ssa_sentence_erase_read_order(sentence->text);
|
||||
while((text[0]=='\\') && (text[1]=='n' || text[1]=='N'))
|
||||
text+=2;
|
||||
write(desc, text, strlen(text));
|
||||
write(desc, "\n", 1);
|
||||
|
||||
@ -838,10 +846,17 @@ void free_sub_track(struct matroska_sub_track* track)
|
||||
free(track);
|
||||
}
|
||||
|
||||
void matroska_save_all(struct matroska_ctx* mkv_ctx)
|
||||
void matroska_save_all(struct matroska_ctx* mkv_ctx,char* lang)
|
||||
{
|
||||
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++)
|
||||
char* match;
|
||||
for (int i = 0; i < mkv_ctx->sub_tracks_count; i++){
|
||||
if (lang){
|
||||
if (match = strstr(lang,mkv_ctx->sub_tracks[i]->lang) != NULL)
|
||||
save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]);
|
||||
}
|
||||
else
|
||||
save_sub_track(mkv_ctx, mkv_ctx->sub_tracks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void matroska_free_all(struct matroska_ctx* mkv_ctx)
|
||||
@ -864,7 +879,7 @@ void matroska_parse(struct matroska_ctx* mkv_ctx)
|
||||
code_len++;
|
||||
|
||||
switch (code) {
|
||||
/* Header ids*/
|
||||
/* Header ids*/
|
||||
case MATROSKA_EBML_HEADER:
|
||||
parse_ebml(file);
|
||||
MATROSKA_SWITCH_BREAK(code, code_len);
|
||||
@ -889,6 +904,7 @@ void matroska_parse(struct matroska_ctx* mkv_ctx)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Close file stream
|
||||
fclose(file);
|
||||
|
||||
@ -921,6 +937,7 @@ int matroska_loop(struct lib_ccx_ctx *ctx)
|
||||
mkv_ctx->current_second = 0;
|
||||
mkv_ctx->filename = ctx->inputfile[ctx->current_file];
|
||||
mkv_ctx->file = create_file(ctx);
|
||||
mkv_ctx->sub_tracks = malloc(sizeof(struct matroska_sub_track**));
|
||||
|
||||
matroska_parse(mkv_ctx);
|
||||
|
||||
@ -928,7 +945,7 @@ int matroska_loop(struct lib_ccx_ctx *ctx)
|
||||
activity_progress(100, (int) (mkv_ctx->current_second / 60),
|
||||
(int) (mkv_ctx->current_second % 60));
|
||||
|
||||
matroska_save_all(mkv_ctx);
|
||||
matroska_save_all(mkv_ctx,ccx_options.mkvlang);
|
||||
int sentence_count = mkv_ctx->sentence_count;
|
||||
matroska_free_all(mkv_ctx);
|
||||
|
||||
|
@ -136,9 +136,7 @@
|
||||
|
||||
/* Other defines */
|
||||
#define MATROSKA_MAX_ID_LENGTH 4
|
||||
#define MATROSKA_MAX_TRACKS 128
|
||||
#define MATROSKA_MAX_SENTENCES 8192
|
||||
#define MAX_FILE_NAME_SIZE 200
|
||||
#define MAX_FILE_NAME_SIZE 260
|
||||
|
||||
/* Enums */
|
||||
enum matroska_track_entry_type {
|
||||
@ -203,11 +201,11 @@ struct matroska_sub_track {
|
||||
enum matroska_track_subtitle_codec_id codec_id;
|
||||
|
||||
int sentence_count;
|
||||
struct matroska_sub_sentence* sentences[MATROSKA_MAX_SENTENCES];
|
||||
struct matroska_sub_sentence** sentences;
|
||||
};
|
||||
|
||||
struct matroska_ctx {
|
||||
struct matroska_sub_track* sub_tracks[MATROSKA_MAX_TRACKS];
|
||||
struct matroska_sub_track** sub_tracks;
|
||||
struct lib_ccx_ctx* ctx;
|
||||
int sub_tracks_count;
|
||||
int sentence_count;
|
||||
@ -244,13 +242,13 @@ void parse_segment(struct matroska_ctx* mkv_ctx);
|
||||
char* generate_timestamp_utf8(ULLONG milliseconds);
|
||||
char* generate_timestamp_ass_ssa(ULLONG milliseconds);
|
||||
int find_sub_track_index(struct matroska_ctx* mkv_ctx, ULLONG track_number);
|
||||
char* get_track_entry_type_description(enum matroska_track_entry_type type);
|
||||
char* get_track_entry_type_description(enum matroska_track_entry_type type);
|
||||
enum matroska_track_subtitle_codec_id get_track_subtitle_codec_id(char* codec_id);
|
||||
char* generate_filename_from_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* track);
|
||||
char* ass_ssa_sentence_erase_read_order(char* text);
|
||||
void save_sub_track(struct matroska_ctx* mkv_ctx, struct matroska_sub_track* track);
|
||||
void free_sub_track(struct matroska_sub_track* track);
|
||||
void matroska_save_all(struct matroska_ctx* mkv_ctx);
|
||||
void matroska_save_all(struct matroska_ctx* mkv_ctx,char* lang);
|
||||
void matroska_free_all(struct matroska_ctx* mkv_ctx);
|
||||
void matroska_parse(struct matroska_ctx* mkv_ctx);
|
||||
FILE* create_file(struct lib_ccx_ctx *ctx);
|
||||
|
@ -7,6 +7,8 @@
|
||||
#include "ccx_decoders_708.h"
|
||||
#include "compile_info.h"
|
||||
#include "../lib_hash/sha2.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#ifdef ENABLE_HARDSUBX
|
||||
#include "hardsubx.h"
|
||||
#endif
|
||||
@ -497,7 +499,7 @@ void print_usage (void)
|
||||
mprint (" less or equal than the max allowed..\n");
|
||||
mprint (" -levdistmincnt value: Minimum distance we always allow regardless\n");
|
||||
mprint (" of the length of the strings.Default 2. \n");
|
||||
mprint (" This means that if the calculated distance \n");
|
||||
mprint (" This means that if the calculated distance \n");
|
||||
mprint (" is 0,1 or 2, we consider the strings to be equivalent.\n");
|
||||
mprint (" -levdistmaxpct value: Maximum distance we allow, as a percentage of\n");
|
||||
mprint (" the shortest string length. Default 10%.\n");
|
||||
@ -591,6 +593,12 @@ void print_usage (void)
|
||||
mprint (" 0: OEM_TESSERACT_ONLY - default value, the fastest mode.\n");
|
||||
mprint (" 1: OEM_LSTM_ONLY - use LSTM algorithm for recognition.\n");
|
||||
mprint (" 2: OEM_TESSERACT_LSTM_COMBINED - both algorithms.\n");
|
||||
mprint (" -mkvlang: For MKV subtitles, select which language's caption\n");
|
||||
mprint (" stream will be processed. e.g. 'eng' for English.\n");
|
||||
mprint (" Language codes can be either the 3 letters bibliographic\n");
|
||||
mprint (" ISO-639-2 form (like \"fre\" for french) or a language\n");
|
||||
mprint (" code followed by a dash and a country code for specialities\n");
|
||||
mprint (" in languages (like \"fre-ca\" for Canadian French).\n");
|
||||
|
||||
mprint ("\n");
|
||||
mprint ("Options that affect how ccextractor reads and writes (buffering):\n");
|
||||
@ -1021,6 +1029,58 @@ int atoi_hex (char *s)
|
||||
}
|
||||
}
|
||||
|
||||
void mkvlang_params_check(char* lang){
|
||||
int initial=0, present=0;
|
||||
for(int char_index=0; char_index < strlen(lang);char_index++){
|
||||
lang[char_index] = cctolower(lang[char_index]);
|
||||
if (lang[char_index]==','){
|
||||
present=char_index;
|
||||
if ((present-initial<6)&&(present-initial!=3))
|
||||
fatal(EXIT_MALFORMED_PARAMETER, "language codes should be xxx,xxx,xxx,....\n");
|
||||
|
||||
else if ((present-initial>3)&&(present-initial!=6))
|
||||
fatal(EXIT_MALFORMED_PARAMETER, "language codes should be xxx-xx,xxx-xx,xxx-xx,....\n");
|
||||
|
||||
if ((present-initial>3)&&(present-initial==6)){
|
||||
size_t length = present-initial;
|
||||
char* block=calloc(length+1,sizeof(char));
|
||||
strncpy(block,lang+initial,length);
|
||||
char* hiphen_pointer = strstr(block,"-");
|
||||
if (!hiphen_pointer)
|
||||
fatal(EXIT_MALFORMED_PARAMETER, "language code is not of the form xxx-xx\n");
|
||||
free(block);
|
||||
}
|
||||
initial=present+1;
|
||||
}
|
||||
}
|
||||
|
||||
//Steps to check for the last lang of multiple mkvlangs provided by the user.
|
||||
present = strlen(lang)-1;
|
||||
|
||||
for(int char_index=strlen(lang)-1; char_index >=0 ;char_index--)
|
||||
if (lang[char_index]==','){
|
||||
initial=char_index+1;
|
||||
break;
|
||||
}
|
||||
|
||||
if ((present-initial<5)&&(present-initial!=2))
|
||||
fatal(EXIT_MALFORMED_PARAMETER, "last language code should be xxx.\n");
|
||||
|
||||
else if ((present-initial>2)&&(present-initial!=5))
|
||||
fatal(EXIT_MALFORMED_PARAMETER, "last language code should be xxx-xx.\n");
|
||||
|
||||
if ((present-initial>2)&&(present-initial==5)){
|
||||
size_t length = present-initial;
|
||||
char* block=calloc(length+1,sizeof(char));
|
||||
strncpy(block,lang+initial,length);
|
||||
char* hiphen_pointer = strstr(block,"-");
|
||||
if (!hiphen_pointer)
|
||||
fatal(EXIT_MALFORMED_PARAMETER, "last language code is not of the form xxx-xx\n");
|
||||
free(block);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
|
||||
{
|
||||
|
||||
@ -1225,7 +1285,7 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
if (strcmp(argv[i], "-chapters") == 0){
|
||||
opt->extract_chapters= 1;
|
||||
continue;
|
||||
@ -1412,6 +1472,15 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
|
||||
continue;
|
||||
}
|
||||
|
||||
if(strcmp(argv[i],"-mkvlang")==0 && i < argc-1)
|
||||
{
|
||||
i++;
|
||||
opt->mkvlang = (char *)malloc(sizeof(argv[i]));
|
||||
sprintf(opt->mkvlang,"%s",argv[i]);
|
||||
mkvlang_params_check(opt->mkvlang);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Output file formats */
|
||||
if (strcmp (argv[i],"-srt")==0 ||
|
||||
strcmp (argv[i],"-dvdraw")==0 ||
|
||||
@ -2233,7 +2302,7 @@ int parse_parameters (struct ccx_s_options *opt, int argc, char *argv[])
|
||||
{
|
||||
fatal (EXIT_INCOMPATIBLE_PARAMETERS, "MP4 requires an actual file, it's not possible to read from a stream, including stdin.\n");
|
||||
}
|
||||
|
||||
|
||||
if(opt->extract_chapters)
|
||||
{
|
||||
mprint("Request to extract chapters recieved.\n");
|
||||
|
Loading…
Reference in New Issue
Block a user