From 66393a80f2b06565080319231df381e033c93f7f Mon Sep 17 00:00:00 2001 From: maxkoryukov Date: Fri, 2 Dec 2016 13:36:33 +0500 Subject: [PATCH] Break incoming subs into sentences (through a buffer), and remove duplicates --- .gitignore | 6 + src/lib_ccx/ccx_encoders_common.c | 377 ++++++++------- src/lib_ccx/ccx_encoders_common.h | 20 +- src/lib_ccx/ccx_encoders_splitbysentence.c | 508 +++++++++++++++++---- src/lib_ccx/debug_def.h | 11 + tests/Makefile | 59 +++ tests/README.md | 43 ++ tests/ccx_encoders_splitbysentence_suite.c | 305 +++++++++++++ tests/ccx_encoders_splitbysentence_suite.h | 4 + tests/runtest.c | 21 + 10 files changed, 1060 insertions(+), 294 deletions(-) create mode 100644 src/lib_ccx/debug_def.h create mode 100644 tests/Makefile create mode 100644 tests/README.md create mode 100644 tests/ccx_encoders_splitbysentence_suite.c create mode 100644 tests/ccx_encoders_splitbysentence_suite.h create mode 100644 tests/runtest.c diff --git a/.gitignore b/.gitignore index 9feb3b13..ae4e5092 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +#### +# Ignore tests tmp files and results +tests/runtest +tests/**/*.gcda +tests/**/*.gcno + #### # Ignore CVS related files diff --git a/src/lib_ccx/ccx_encoders_common.c b/src/lib_ccx/ccx_encoders_common.c index 4c32e210..140a3612 100644 --- a/src/lib_ccx/ccx_encoders_common.c +++ b/src/lib_ccx/ccx_encoders_common.c @@ -957,14 +957,10 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt) ctx->force_flush = opt->force_flush; ctx->ucla = opt->ucla; ctx->splitbysentence = opt->splitbysentence; - ctx->sbs_newblock_start_time = -1; - ctx->sbs_newblock_end_time = -1; - ctx->sbs_newblock = NULL; - ctx->sbs_newblock_capacity = 0; - ctx->sbs_newblock_size = 0; + ctx->sbs_time_from = -1; + ctx->sbs_time_trim = -1; + ctx->sbs_capacity = 0; ctx->sbs_buffer = NULL; - ctx->sbs_buffer_capacity = 0; - ctx->sbs_buffer_size = 0; ctx->subline = (unsigned char *) malloc (SUBLINESIZE); if(!ctx->subline) @@ -1045,203 +1041,204 
@@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub) // Write to a buffer that is later s+plit to generate split // in sentences if (sub->type == CC_BITMAP) - wrote_something = write_cc_bitmap_to_sentence_buffer(sub, context); + sub = reformat_cc_bitmap_through_sentence_buffer(sub, context); + + if (NULL==sub) + return wrote_something; } - else + // Write subtitles as they come + if (sub->type == CC_608) { - // Write subtitles as they come - if (sub->type == CC_608) + struct eia608_screen *data = NULL; + struct ccx_s_write *out; + for (data = sub->data; sub->nb_data; sub->nb_data--, data++) { - struct eia608_screen *data = NULL; - struct ccx_s_write *out; - for (data = sub->data; sub->nb_data; sub->nb_data--, data++) + // Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example) + out = get_output_ctx(context, data->my_field); + + if (data->format == SFORMAT_XDS) { - // Determine context based on channel. 
This replaces the code that was above, as this was incomplete (for cases where -12 was used for example) - out = get_output_ctx(context, data->my_field); - - if (data->format == SFORMAT_XDS) - { - data->end_time = data->end_time + context->subs_delay; - xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class); - if (data->xds_len > 0) - { - ret = write(out->fh, data->xds_str, data->xds_len); - if (ret < data->xds_len) - { - mprint("WARNING:Loss of data\n"); - } - } - freep(&data->xds_str); - write_newline(context, 0); - continue; - } - data->end_time = data->end_time + context->subs_delay; - switch (context->write_format) + xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class); + if (data->xds_len > 0) { - case CCX_OF_SRT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, data->start_time); - wrote_something = write_cc_buffer_as_srt(data, context); - break; - case CCX_OF_SSA: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, data->start_time); - wrote_something = write_cc_buffer_as_ssa(data, context); - break; - case CCX_OF_G608: - wrote_something = write_cc_buffer_as_g608(data, context); - break; - case CCX_OF_WEBVTT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, data->start_time); - wrote_something = write_cc_buffer_as_webvtt(data, context); - break; - case CCX_OF_SAMI: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, data->start_time); - wrote_something = write_cc_buffer_as_sami(data, context); - break; - case CCX_OF_SMPTETT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, data->start_time); - wrote_something = 
write_cc_buffer_as_smptett(data, context); - break; - case CCX_OF_TRANSCRIPT: - wrote_something = write_cc_buffer_as_transcript2(data, context); - break; - case CCX_OF_SPUPNG: - wrote_something = write_cc_buffer_as_spupng(data, context); - break; - case CCX_OF_SIMPLE_XML: - if (ccx_options.keep_output_closed && context->out->temporarily_closed) - { - temporarily_open_output(context->out); - write_subtitle_file_header(context, context->out); - } - wrote_something = write_cc_buffer_as_simplexml(data, context); - if (ccx_options.keep_output_closed) - { - write_subtitle_file_footer(context, context->out); - temporarily_close_output(context->out); - } - break; - default: - break; + ret = write(out->fh, data->xds_str, data->xds_len); + if (ret < data->xds_len) + { + mprint("WARNING:Loss of data\n"); + } } - if (wrote_something) - context->last_displayed_subs_ms = data->end_time; - - if (context->gui_mode_reports) - write_cc_buffer_to_gui(sub->data, context); - } - freep(&sub->data); - } - if (sub->type == CC_BITMAP) - { - switch (context->write_format) - { - case CCX_OF_SRT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_bitmap_as_srt(sub, context); - break; - case CCX_OF_SSA: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_bitmap_as_ssa(sub, context); - break; - case CCX_OF_WEBVTT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_bitmap_as_webvtt(sub, context); - break; - case CCX_OF_SAMI: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_bitmap_as_sami(sub, context); - break; - case CCX_OF_SMPTETT: - if 
(!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_bitmap_as_smptett(sub, context); - break; - case CCX_OF_TRANSCRIPT: - wrote_something = write_cc_bitmap_as_transcript(sub, context); - break; - case CCX_OF_SPUPNG: - wrote_something = write_cc_bitmap_as_spupng(sub, context); - break; - case CCX_OF_SIMPLE_XML: - wrote_something = write_cc_bitmap_as_simplexml(sub, context); - break; -#ifdef WITH_LIBCURL - case CCX_OF_CURL: - wrote_something = write_cc_bitmap_as_libcurl(sub, context); - break; -#endif - default: - break; + freep(&data->xds_str); + write_newline(context, 0); + continue; } - } - if (sub->type == CC_RAW) - { - if (context->send_to_srv) - net_send_header(sub->data, sub->nb_data); - else - { - ret = write(context->out->fh, sub->data, sub->nb_data); - if (ret < sub->nb_data) { - mprint("WARNING: Loss of data\n"); - } - } - sub->nb_data = 0; - } - if (sub->type == CC_TEXT) - { + data->end_time = data->end_time + context->subs_delay; switch (context->write_format) { - case CCX_OF_SRT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_subtitle_as_srt(sub, context); - break; - case CCX_OF_SSA: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_subtitle_as_ssa(sub, context); - break; - case CCX_OF_WEBVTT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_subtitle_as_webvtt(sub, context); - break; - case CCX_OF_SAMI: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_subtitle_as_sami(sub, context); - break; - case 
CCX_OF_SMPTETT: - if (!context->startcredits_displayed && context->start_credits_text != NULL) - try_to_add_start_credits(context, sub->start_time); - wrote_something = write_cc_subtitle_as_smptett(sub, context); - break; - case CCX_OF_TRANSCRIPT: - wrote_something = write_cc_subtitle_as_transcript(sub, context); - break; - case CCX_OF_SPUPNG: - wrote_something = write_cc_subtitle_as_spupng(sub, context); - break; - case CCX_OF_SIMPLE_XML: - wrote_something = write_cc_subtitle_as_simplexml(sub, context); - break; - default: - break; + case CCX_OF_SRT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, data->start_time); + wrote_something = write_cc_buffer_as_srt(data, context); + break; + case CCX_OF_SSA: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, data->start_time); + wrote_something = write_cc_buffer_as_ssa(data, context); + break; + case CCX_OF_G608: + wrote_something = write_cc_buffer_as_g608(data, context); + break; + case CCX_OF_WEBVTT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, data->start_time); + wrote_something = write_cc_buffer_as_webvtt(data, context); + break; + case CCX_OF_SAMI: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, data->start_time); + wrote_something = write_cc_buffer_as_sami(data, context); + break; + case CCX_OF_SMPTETT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, data->start_time); + wrote_something = write_cc_buffer_as_smptett(data, context); + break; + case CCX_OF_TRANSCRIPT: + wrote_something = write_cc_buffer_as_transcript2(data, context); + break; + case CCX_OF_SPUPNG: + wrote_something = write_cc_buffer_as_spupng(data, context); + break; + case CCX_OF_SIMPLE_XML: + if 
(ccx_options.keep_output_closed && context->out->temporarily_closed) + { + temporarily_open_output(context->out); + write_subtitle_file_header(context, context->out); + } + wrote_something = write_cc_buffer_as_simplexml(data, context); + if (ccx_options.keep_output_closed) + { + write_subtitle_file_footer(context, context->out); + temporarily_close_output(context->out); + } + break; + default: + break; } - sub->nb_data = 0; + if (wrote_something) + context->last_displayed_subs_ms = data->end_time; + + if (context->gui_mode_reports) + write_cc_buffer_to_gui(sub->data, context); } + freep(&sub->data); } + if (sub->type == CC_BITMAP) + { + switch (context->write_format) + { + case CCX_OF_SRT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_bitmap_as_srt(sub, context); + break; + case CCX_OF_SSA: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_bitmap_as_ssa(sub, context); + break; + case CCX_OF_WEBVTT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_bitmap_as_webvtt(sub, context); + break; + case CCX_OF_SAMI: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_bitmap_as_sami(sub, context); + break; + case CCX_OF_SMPTETT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_bitmap_as_smptett(sub, context); + break; + case CCX_OF_TRANSCRIPT: + wrote_something = write_cc_bitmap_as_transcript(sub, context); + break; + case CCX_OF_SPUPNG: + wrote_something = write_cc_bitmap_as_spupng(sub, context); + break; + case 
CCX_OF_SIMPLE_XML: + wrote_something = write_cc_bitmap_as_simplexml(sub, context); + break; +#ifdef WITH_LIBCURL + case CCX_OF_CURL: + wrote_something = write_cc_bitmap_as_libcurl(sub, context); + break; +#endif + default: + break; + } + + } + if (sub->type == CC_RAW) + { + if (context->send_to_srv) + net_send_header(sub->data, sub->nb_data); + else + { + ret = write(context->out->fh, sub->data, sub->nb_data); + if (ret < sub->nb_data) { + mprint("WARNING: Loss of data\n"); + } + } + sub->nb_data = 0; + } + if (sub->type == CC_TEXT) + { + switch (context->write_format) + { + case CCX_OF_SRT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_subtitle_as_srt(sub, context); + break; + case CCX_OF_SSA: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_subtitle_as_ssa(sub, context); + break; + case CCX_OF_WEBVTT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_subtitle_as_webvtt(sub, context); + break; + case CCX_OF_SAMI: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_subtitle_as_sami(sub, context); + break; + case CCX_OF_SMPTETT: + if (!context->startcredits_displayed && context->start_credits_text != NULL) + try_to_add_start_credits(context, sub->start_time); + wrote_something = write_cc_subtitle_as_smptett(sub, context); + break; + case CCX_OF_TRANSCRIPT: + wrote_something = write_cc_subtitle_as_transcript(sub, context); + break; + case CCX_OF_SPUPNG: + wrote_something = write_cc_subtitle_as_spupng(sub, context); + break; + case CCX_OF_SIMPLE_XML: + wrote_something = write_cc_subtitle_as_simplexml(sub, context); + break; + 
default: + break; + } + sub->nb_data = 0; + } + if (!sub->nb_data) freep(&sub->data); if (wrote_something && context->force_flush) diff --git a/src/lib_ccx/ccx_encoders_common.h b/src/lib_ccx/ccx_encoders_common.h index 13cba38b..2af17d87 100644 --- a/src/lib_ccx/ccx_encoders_common.h +++ b/src/lib_ccx/ccx_encoders_common.h @@ -62,7 +62,7 @@ struct encoder_ctx /* Input file format used in Teletext for exceptional output */ unsigned int in_fileformat; //1 =Normal, 2=Teletext /* Keep output file closed when not actually writing to it and start over each time (add headers, etc) */ - unsigned int keep_output_closed; + unsigned int keep_output_closed; /* Force a flush on the file buffer whenever content is written */ int force_flush; /* Keep track of whether -UCLA used */ @@ -118,15 +118,14 @@ struct encoder_ctx /* split-by-sentence stuff */ int splitbysentence; - LLONG sbs_newblock_start_time; // Used by the split-by-sentence code to know when the current block starts... - LLONG sbs_newblock_end_time; // ... and ends - ccx_sbs_utf8_character *sbs_newblock; - int sbs_newblock_capacity; - int sbs_newblock_size; - ccx_sbs_utf8_character *sbs_buffer; - int sbs_buffer_capacity; - int sbs_buffer_size; + unsigned char * sbs_buffer; /// Storage for sentence-split buffer + size_t sbs_handled_len; /// The length of the string in the SBS-buffer, already handled, but preserved for DUP-detection. + + //ccx_sbs_utf8_character *sbs_newblock; + LLONG sbs_time_from; // Used by the split-by-sentence code to know when the current block starts... + LLONG sbs_time_trim; // ... 
/**
 * Tells whether `current` points at a character that ends a sentence.
 *
 * The end of the string always counts as a break. One of '.', '!' or '?'
 * counts as a break only when the following character is not also one of
 * them, so a run such as "..." or "?!" breaks once, on its last character.
 *
 * @param start   Start of the sentence being scanned. Not needed for the
 *                decision; kept so existing callers keep compiling.
 * @param current Position to test; must point inside a NUL-terminated string.
 * @return 1 if `current` is on a sentence breaker, 0 otherwise.
 */
int sbs_is_pointer_on_sentence_breaker(char * start, char * current)
{
	char c = *current;
	char n;

	(void)start;

	// End of data terminates whatever sentence is in progress.
	if (0 == c)
		return 1;

	if ('.' != c && '!' != c && '?' != c)
		return 0;

	// c is not the terminator here, so current + 1 is still inside the
	// string (at worst it is the terminator itself).
	n = *(current + 1);

	if ('.' == n || '!' == n || '?' == n)
		return 0;

	return 1;
}
/**
 * Compares at most `n` characters of two strings, treating any two
 * whitespace characters as equal.
 *
 * Fuzzy matching is not implemented yet: `maxerr` is accepted but ignored.
 *
 * @param a      First string.
 * @param b      Second string.
 * @param n      Maximum number of characters to compare.
 * @param maxerr Intended mismatch tolerance (currently unused).
 * @return 0 when the first `n` characters match or both strings end together;
 *         1 when `a` ends first or `a` holds the greater mismatching character;
 *         -1 when `b` ends first or `b` holds the greater mismatching character.
 *         NOTE(review): the sign for "a ends first" is the opposite of
 *         strcmp(); the caller visible in this file only tests for 0.
 */
int sbs_fuzzy_strncmp(const char * a, const char * b, size_t n, const size_t maxerr)
{
	size_t i;

	// TODO: implement fuzzy comparing (the error counter does not exist yet)
	(void)maxerr;

	for (i = 0; i < n; i++)
	{
		const char A = a[i];
		const char B = b[i];

		// bound check (line endings)
		if (A == 0)
			return (B == 0) ? 0 : 1;
		if (B == 0)
			return -1;

		if (A == B)
			continue;

		// <ctype.h> functions need a value representable as unsigned char;
		// OCR text may contain bytes >= 0x80.
		if (isspace((unsigned char)A) && isspace((unsigned char)B))
			continue;

		return (A > B) ? 1 : -1;
	}

	// all n characters matched
	return 0;
}
+ + int i; + //int err; + char A, B; + + i = -1; + do + { + i++; + + // Bound check (compare to N) + if (i == n) return 0; + + A = a[i]; + B = b[i]; + + // bound check (line endings) + if (A == 0) + { + if (B == 0) return 0; + return 1; + } + else + { + if (B == 0) return -1; + } + + if (A == B) continue; + if (isspace(A) && isspace(B)) continue; + + if (A > B) return 1; + return -1; + + } while(1); } -int write_cc_bitmap_to_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context) +void sbs_strcpy_without_dup(const unsigned char * str, struct encoder_ctx * context) +{ + int intersect_len; + unsigned char * suffix; + const unsigned char * prefix = str; + + unsigned long sbs_len; + unsigned long str_len; + + str_len = strlen(str); + sbs_len = strlen(context->sbs_buffer); + + intersect_len = str_len; + if (sbs_len < intersect_len) + intersect_len = sbs_len; + + while (intersect_len>0) + { + suffix = context->sbs_buffer + sbs_len - intersect_len; + if (0 == sbs_fuzzy_strncmp(prefix, suffix, intersect_len, 1)) + { + break; + } + intersect_len--; + } + + LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, intersection len [%4d]\n", intersect_len); + + // check, that new string does not contain data, from + // already handled sentence: + LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, sbslen [%4d] handled len [%4d]\n", sbs_len, context->sbs_handled_len); + if ( (sbs_len - intersect_len) >= context->sbs_handled_len) + { + // there is no intersection. + // It is time to clean the buffer. 
/**
 * Normalises OCR text in place before it enters the sentence buffer.
 *
 * - Every whitespace character becomes a plain space.
 * - A '|' that stands alone is turned into 'I'. "Alone" means it is at the
 *   start of the string or preceded by whitespace, and is followed by the end
 *   of the string, whitespace or an apostrophe.
 *
 * @param str NUL-terminated string to fix; NULL is ignored.
 */
void sbs_str_autofix(unsigned char * str)
{
	size_t i;

	if (NULL == str)
		return;

	for (i = 0; str[i] != 0; i++)
	{
		// replace all whitespaces with spaces
		if (isspace(str[i]))
		{
			str[i] = ' ';
		}

		if (
			str[i] == '|'
			&& (i == 0 || isspace(str[i - 1]))
			&& (str[i + 1] == 0 || isspace(str[i + 1]) || str[i + 1] == '\'')
			)
		{
			// try to convert to "I"
			str[i] = 'I';
		}
	}
}
+ */ +struct cc_subtitle * sbs_append_string(unsigned char * str, const LLONG time_from, const LLONG time_trim, struct encoder_ctx * context) +{ + struct cc_subtitle * resub; + struct cc_subtitle * tmpsub; + + unsigned char * bp_current; + unsigned char * bp_last_break; + unsigned char * sbs_undone_start; + + int is_buf_initialized; + int required_capacity; + int new_capacity; + + LLONG alphanum_total; + LLONG alphanum_cur; + + LLONG anychar_total; + LLONG anychar_cur; + + LLONG duration; + LLONG available_time; + int use_alphanum_counters; + + if (! str) + return NULL; + + sbs_str_autofix(str); + + is_buf_initialized = (NULL == context->sbs_buffer || context->sbs_capacity == 0) + ? 0 + : 1; + + // =============================== + // grow sentence buffer + // =============================== + required_capacity = + (is_buf_initialized ? strlen(context->sbs_buffer) : 0) // existing data in buf + + strlen(str) // length of new string + + 1 // trailing \0 + + 1 // space control (will add one space , if required) + ; + + if (required_capacity >= context->sbs_capacity) + { + new_capacity = context->sbs_capacity; + if (! is_buf_initialized) new_capacity = 16; + + while (new_capacity < required_capacity) + { + // increase NEW_capacity, and check, that increment + // is less than 8 Mb. Because 8Mb - it is a lot + // for a TEXT buffer. It is weird... + new_capacity += (new_capacity > 1048576 * 8) + ? 1048576 * 8 + : new_capacity; + } + + context->sbs_buffer = (unsigned char *)realloc( + context->sbs_buffer, + new_capacity * sizeof(/*unsigned char*/ context->sbs_buffer[0] ) + ); + + if (!context->sbs_buffer) + fatal(EXIT_NOT_ENOUGH_MEMORY, "Not enough memory in sbs_append_string"); + + context->sbs_capacity = new_capacity; + + // if buffer wasn't initialized, we will se trash in buffer. + // but we need just empty string, so here we will get it: + if (! 
is_buf_initialized) + { + // INIT SBS + context->sbs_buffer[0] = 0; + context->sbs_handled_len = 0; + } + + } + + // =============================== + // append to buffer + // + // will update sbs_buffer, sbs_handled_len + // =============================== + sbs_strcpy_without_dup(str, context); + + // =============================== + // break to sentences + // =============================== + resub = NULL; + tmpsub = NULL; + + alphanum_total = 0; + alphanum_cur = 0; + + anychar_total = 0; + anychar_cur = 0; + + sbs_undone_start = context->sbs_buffer + context->sbs_handled_len; + bp_last_break = sbs_undone_start; + + LOG_DEBUG("Sentence Buffer: BEFORE sentence break. Last break: [%s] sbs_undone_start: [%d], sbs_undone: [%s]\n", + bp_last_break, context->sbs_handled_len, sbs_undone_start + ); + + for (bp_current = sbs_undone_start; bp_current && *bp_current; bp_current++) + { + if ( + 0 < anychar_cur // skip empty! + && sbs_is_pointer_on_sentence_breaker(bp_last_break, bp_current) ) + { + // it is new sentence! + tmpsub = malloc(sizeof(struct cc_subtitle)); + + tmpsub->type = CC_TEXT; + // length of new string: + tmpsub->nb_data = + bp_current - bp_last_break + + 1 // terminating '\0' + + 1 // skip '.' + ; + tmpsub->data = strndup(bp_last_break, tmpsub->nb_data - 1); + tmpsub->got_output = 1; + + tmpsub->start_time = alphanum_cur; + alphanum_cur = 0; + tmpsub->end_time = anychar_cur; + anychar_cur = 0; + + bp_last_break = bp_current + 1; + + // tune last break: + while ( + *bp_last_break + && isspace(*bp_last_break) + ) + { + bp_last_break++; + } + + // ??? 
+ // tmpsub->info = NULL; + // tmpsub->mode = NULL; + + // link with prev sub: + tmpsub->next = NULL; + tmpsub->prev = resub; + if (NULL != resub) + { + resub->next = tmpsub; + } + + resub = tmpsub; + } + + if (*bp_current && isalnum(*bp_current)) + { + alphanum_total++; + alphanum_cur++; + } + anychar_total++; + anychar_cur++; + } + + // =============================== + // okay, we have extracted several sentences, now we should + // save the position of the "remainder" - start of the last + // incomplete sentece + // =============================== + if (bp_last_break != sbs_undone_start) + { + context->sbs_handled_len = bp_last_break - sbs_undone_start; + } + + LOG_DEBUG("Sentence Buffer: AFTER sentence break: Handled Len [%4d]\n", context->sbs_handled_len); + + LOG_DEBUG("Sentence Buffer: Alphanum Total: [%4d] Overall chars: [%4d] STRING:[%20s] BUFFER:[%20s]\n", alphanum_total, anychar_total, str, context->sbs_buffer); + + // =============================== + // Calculate time spans + // =============================== + if (!is_buf_initialized) + { + context->sbs_time_from = time_from; + context->sbs_time_trim = time_trim; + } + + available_time = time_trim - context->sbs_time_from; + use_alphanum_counters = alphanum_total > 0 ? 
1 : 0; + + tmpsub = resub; + while (tmpsub) + { + alphanum_cur = tmpsub->start_time; + anychar_cur = tmpsub->end_time; + + if (use_alphanum_counters) + { + duration = available_time * alphanum_cur / alphanum_total; + } + else + { + duration = available_time * anychar_cur / anychar_total; + } + + tmpsub->start_time = context->sbs_time_from; + tmpsub->end_time = tmpsub->start_time + duration; + + context->sbs_time_from = tmpsub->end_time + 1; + + tmpsub = tmpsub->next; + } + + return resub; +} + +struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context) { - int ret = 0; -#ifdef ENABLE_OCR struct cc_bitmap* rect; - LLONG ms_start, ms_end; + int used; + int i = 0; + char *str; - if (context->prev_start != -1 && (sub->flags & SUB_EOD_MARKER)) + // this is a sub with a full sentence (or chain of such subs) + struct cc_subtitle * resub = NULL; + +#ifdef ENABLE_OCR + + if (sub->flags & SUB_EOD_MARKER) { - ms_start = context->prev_start; - ms_end = sub->start_time; + // the last sub from input + + if (context->prev_start == -1) + { + ms_start = 1; + ms_end = sub->start_time; + } + else + { + ms_start = context->prev_start; + ms_end = sub->start_time; + } } - else if (!(sub->flags & SUB_EOD_MARKER)) + else { + // not the last sub from input ms_start = sub->start_time; ms_end = sub->end_time; } - else if (context->prev_start == -1 && (sub->flags & SUB_EOD_MARKER)) - { - ms_start = 1; - ms_end = sub->start_time; - } if (sub->nb_data == 0) - return ret; - rect = sub->data; + return 0; if (sub->flags & SUB_EOD_MARKER) context->prev_start = sub->start_time; - - if (rect[0].ocr_text && *(rect[0].ocr_text)) + str = paraof_ocrtext(sub, " ", 1); + if (str) { - lbl_start_block(ms_start, context); if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER)) { - char *token = NULL; - token = paraof_ocrtext(sub, " ", 1); // Get text with spaces instead of newlines - uint32_t offset=0; - utf8proc_ssize_t ls; // Last size - 
char *s = token; - int32_t uc; - while ((ls=utf8proc_iterate(s, -1, &uc))) - { - ccx_sbs_utf8_character sbsc; - // Note: We don't care about uc here, since we will be writing the encoded bytes, not the code points in binary. - //TODO: Deal with ls < 0 - if (!uc) // End of string - break; - printf("%3ld | %08X | %c %c %c %c\n", ls, uc, ((uc & 0xFF000000) >> 24), ((uc & 0xFF0000) >> 16), - ((uc & 0xFF00) >> 8), ( uc & 0xFF)); - sbsc.ch = uc; - sbsc.encoded[0] = 0; sbsc.encoded[1] = 0; sbsc.encoded[2] = 0; sbsc.encoded[3] = 0; - memcpy(sbsc.encoded, s, ls); - sbsc.enc_len = ls; - sbsc.ts = 0; // We don't know yet - lbl_add_character(context, sbsc); - s += ls; - - // TO-DO: Add each of these characters to the buffer, splitting the timestamps. Remember to add character length to the array - } - printf("-------\n"); - - /* - while (token) - { - char *newline_pos = strstr(token, context->encoded_crlf); - if (!newline_pos) - { - fdprintf(context->out->fh, "%s", token); - break; - } - else - { - while (token != newline_pos) - { - fdprintf(context->out->fh, "%c", *token); - token++; - } - token += context->encoded_crlf_length; - fdprintf(context->out->fh, "%c", ' '); - } - }*/ - + resub = sbs_append_string(str, ms_start, ms_end, context); } - lbl_end_block(ms_end, context); + freep(&str); + } + + for(i = 0, rect = sub->data; i < sub->nb_data; i++, rect++) + { + freep(rect->data); + freep(rect->data+1); } #endif - sub->nb_data = 0; freep(&sub->data); - return ret; + return resub; } diff --git a/src/lib_ccx/debug_def.h b/src/lib_ccx/debug_def.h new file mode 100644 index 00000000..c8ea402b --- /dev/null +++ b/src/lib_ccx/debug_def.h @@ -0,0 +1,11 @@ +#ifndef _DEBUG_DEF_H_ +#define _DEBUG_DEF_H_ + +#ifdef DEBUG +#define LOG_DEBUG(...) 
printf(__VA_ARGS__) +#else +#define LOG_DEBUG ; +#endif + + +#endif \ No newline at end of file diff --git a/tests/Makefile b/tests/Makefile new file mode 100644 index 00000000..d3f66557 --- /dev/null +++ b/tests/Makefile @@ -0,0 +1,59 @@ +SHELL = /bin/sh + +CC=gcc +# SYS := $(shell gcc -dumpmachine) +CFLAGS=-O0 -std=gnu99 -D ENABLE_OCR -g -ggdb -rdynamic +#-Q -da -v + +# enable COVERAGE +# CFLAGS+=-fprofile-arcs -ftest-coverage + +# add debug flag +ifdef DEBUG +CFLAGS+=-DDEBUG +endif + +#ALL_FLAGS = -Wno-write-strings -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT +LDFLAGS=-lm -g + +CFLAGS+=$(shell pkg-config --cflags check) +LDFLAGS+=$(shell pkg-config --libs check) + +# TODO: need to rewrite this. Need new way to load sources for testing +SRC=$(wildcard ../src/lib_ccx/ccx_encoders_splitbysentence.c) +OBJS= + +SRC_SUITE=$(wildcard *_suite.c) +OBJ_SUITE=$(patsubst %_suite.c, %_suite.o, $(SRC_SUITE)) + +OBJS+=$(OBJ_SUITE) + +all: clean test + +%.o: %.c + # explicit output name : -o $@ + $(CC) -c $(ALL_FLAGS) $(CFLAGS) $< + +runtest: $(OBJS) + @echo "+----------------------------------------------+" + @echo "| BUILD TESTS |" + @echo "+----------------------------------------------+" + $(CC) -c $(ALL_FLAGS) $(CFLAGS) $@.c + $(CC) $(SRC) $@.o $^ $(ALL_FLAGS) $(CFLAGS) $(LDFLAGS) -o $@ + +.PHONY: test +test: runtest + @echo "+----------------------------------------------+" + @echo "| START TESTS |" + @echo "+----------------------------------------------+" + ./runtest + +.PHONY: clean +clean: + rm runtest || true + rm *.o || true + # coverage info + rm *.gcda || true + rm *.gcno || true + # debug info + rm *.c.* || true \ No newline at end of file diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..455b716f --- /dev/null +++ b/tests/README.md @@ -0,0 +1,43 @@ +# UNIT TESTING + +This folder contains an archetype and several unit tests for CCExtractor. + +## RUN TESTS + +```shell +cd tests +make +``` + +This will build and run all
test suites. + +If you want MORE output: + +```shell +DEBUG=1 make +``` + +Where `DEBUG` is just an environment variable. + +## DEBUGGING + +If tests fail after your changes, you can debug them (almost all flags needed for this are set in `tests/Makefile`). + +Run: + +```shell +# build test runner +make +# load the test runner into the debugger: +gdb runtest + +# run under debugger: +(gdb) run + +# on segfault: +(gdb) where +``` + +## DEPENDENCIES + +Tests are built around this library: [**libcheck**](https://github.com/libcheck/check), here is [**documentation**](https://libcheck.github.io/check/) diff --git a/tests/ccx_encoders_splitbysentence_suite.c b/tests/ccx_encoders_splitbysentence_suite.c new file mode 100644 index 00000000..c36dac07 --- /dev/null +++ b/tests/ccx_encoders_splitbysentence_suite.c @@ -0,0 +1,305 @@ +#include <check.h> +#include "ccx_encoders_splitbysentence_suite.h" + +// ------------------------------------- +// MOCKS +// ------------------------------------- +typedef int64_t LLONG; +#include "../src/lib_ccx/ccx_encoders_common.h" + +// ------------------------------------- +// Private SBS-functions (for testing only) +// ------------------------------------- +struct cc_subtitle * sbs_append_string(unsigned char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context); + +// ------------------------------------- +// Helpers +// ------------------------------------- +struct cc_subtitle * helper_create_sub(char * str, LLONG time_from, LLONG time_trim) +{ + struct cc_subtitle * sub = (struct cc_subtitle *)malloc(sizeof(struct cc_subtitle)); + sub->type = CC_BITMAP; + sub->start_time = 1; + sub->end_time = 100; + sub->data = strdup(str); + sub->nb_data = strlen(sub->data); + + return sub; +} + +struct cc_subtitle * helper_sbs_append_string(char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context) +{ + char * str1; + struct cc_subtitle * sub; + + str1 = strdup(str); + sub = sbs_append_string(str1, time_from, time_trim, context); +
free(str1); + return sub; +} + +// ------------------------------------- +// MOCKS +// ------------------------------------- +struct encoder_ctx * context; + +void freep(void * obj){ +} +void fatal(int x, void * obj){ +} + +unsigned char * paraof_ocrtext(void * sub) { + // this is OCR -> text converter. + // now, in our test cases, we will pass TEXT instead of OCR. + // and will return passed text as result + + return ((struct cc_subtitle *)sub)->data; +} + +// ------------------------------------- +// TEST preparations +// ------------------------------------- +void setup(void) +{ + context = (struct encoder_ctx *)malloc(sizeof(struct encoder_ctx)); + context->sbs_buffer = NULL; + context->sbs_capacity = 0; +} + +void teardown(void) +{ + free(context); +} + +// ------------------------------------- +// TESTS +// ------------------------------------- +START_TEST(test_sbs_one_simple_sentence) +{ + struct cc_subtitle * sub = helper_create_sub("Simple sentence.", 1, 100); + struct cc_subtitle * out = reformat_cc_bitmap_through_sentence_buffer(sub, context); + + ck_assert_ptr_ne(out, NULL); + ck_assert_str_eq(out->data, "Simple sentence."); + ck_assert_ptr_eq(out->next, NULL); + ck_assert_ptr_eq(out->prev, NULL); +} +END_TEST + + +START_TEST(test_sbs_two_sentences_with_rep) +{ + struct cc_subtitle * sub1 = helper_create_sub("asdf", 1, 100); + struct cc_subtitle * out1 = reformat_cc_bitmap_through_sentence_buffer(sub1, context); + ck_assert_ptr_eq(out1, NULL); + + // second sub: + struct cc_subtitle * sub2 = helper_create_sub("asdf Hello.", 101, 200); + struct cc_subtitle * out2 = reformat_cc_bitmap_through_sentence_buffer(sub2, context); + + ck_assert_ptr_ne(out2, NULL); + ck_assert_str_eq(out2->data, "asdf Hello."); + ck_assert_ptr_eq(out2->next, NULL); + ck_assert_ptr_eq(out2->prev, NULL);} +END_TEST + + +START_TEST(test_sbs_append_string_two_separate) +{ + unsigned char * test_strings[] = { + "First string.", + "Second string." 
+ }; + struct cc_subtitle * sub; + unsigned char * str; + + // first string + str = strdup(test_strings[0]); + sub = NULL; + sub = sbs_append_string(str, 1, 20, context); + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, test_strings[0]); + ck_assert_int_eq(sub->start_time, 1); + ck_assert_int_eq(sub->end_time, 20); + + // second string: + str = strdup(test_strings[1]); + sub = NULL; + sub = sbs_append_string(str, 21, 40, context); + + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, test_strings[1]); + ck_assert_int_eq(sub->start_time, 21); + ck_assert_int_eq(sub->end_time, 40); +} +END_TEST + +START_TEST(test_sbs_append_string_two_with_broken_sentence) +{ + // important !! + // summary len == 32 + char * test_strings[] = { + "First string", + " ends here, deabbea." + }; + struct cc_subtitle * sub; + char * str; + + // first string + str = strdup(test_strings[0]); + sub = sbs_append_string(str, 1, 3, context); + + ck_assert_ptr_eq(sub, NULL); + + // second string: + str = strdup(test_strings[1]); + sub = sbs_append_string(str, 4, 5, context); + + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, "First string ends here, deabbea."); + ck_assert_int_eq(sub->start_time, 1); + ck_assert_int_eq(sub->end_time, 5); +} +END_TEST + +START_TEST(test_sbs_append_string_two_intersecting) +{ + char * test_strings[] = { + "First string", + "First string ends here." 
+ }; + struct cc_subtitle * sub; + char * str; + + // first string + str = strdup(test_strings[0]); + sub = sbs_append_string(str, 1, 20, context); + + ck_assert_ptr_eq(sub, NULL); + free(sub); + + // second string: + str = strdup(test_strings[1]); + //printf("second string: [%s]\n", str); + sub = sbs_append_string(str, 21, 40, context); + + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, "First string ends here."); + ck_assert_int_eq(sub->start_time, 1); + ck_assert_int_eq(sub->end_time, 40); +} +END_TEST + + +START_TEST(test_sbs_append_string_real_data_1) +{ + struct cc_subtitle * sub; + + // 1 + sub = helper_sbs_append_string("Oleon", + 1, 0, context); + ck_assert_ptr_eq(sub, NULL); + + // 2 + sub = helper_sbs_append_string("Oleon costs.", + 1, 189, context); + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, "Oleon costs."); + + // 3 + sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\ +Didn't", + 190, 889, context); + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, "buried in the annex, 95 Oleon costs."); + ck_assert_int_eq(sub->start_time, 190); // = + ck_assert_int_eq(sub->end_time, 783); // = + * / + ck_assert_ptr_eq(sub->next, NULL); + + // 4 + sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\ +Didn't want", + 890, 1129, context); + ck_assert_ptr_eq(sub, NULL); + + // 5 + sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\ +Didn't want to", + 1130, 1359, context); + ck_assert_ptr_eq(sub, NULL); + + // 6 + sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\ +Didn't want to acknowledge", + 1360, 2059, context); + ck_assert_ptr_eq(sub, NULL); + + // 7 + sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\ +Didn't want to acknowledge the", + 2060, 2299, context); + ck_assert_ptr_eq(sub, NULL); + + // 9 + sub = helper_sbs_append_string("Didn't want to acknowledge the\n\ +pressures on hospitals, schools and", + 2300, 5019, 
context); + ck_assert_ptr_eq(sub, NULL); + + // 13 + sub = helper_sbs_append_string("pressures on hospitals, schools and\n\ +infrastructure.", + 5020, 5159, context); + ck_assert_ptr_ne(sub, NULL); + ck_assert_str_eq(sub->data, "Didn't want to acknowledge the pressures on hospitals, schools and infrastructure."); + ck_assert_int_eq(sub->start_time, 784); + ck_assert_int_eq(sub->end_time, 5159); + ck_assert_ptr_eq(sub->next, NULL); + + // 14 + sub = helper_sbs_append_string("pressures on hospitals, schools and\n\ +infrastructure. If", + 5160, 5529, context); + ck_assert_ptr_eq(sub, NULL); + + // 16 + sub = helper_sbs_append_string("pressures on hospitals, schools and\n\ +infrastructure. If we go", + 5530, 6559, context); + ck_assert_ptr_eq(sub, NULL); + + // ck_assert_int_eq(sub->start_time, 1); + // ck_assert_int_eq(sub->end_time, 40); +} +END_TEST + + +Suite * ccx_encoders_splitbysentence_suite(void) +{ + Suite *s; + TCase *tc_core; + + s = suite_create("Sentence Buffer"); + + /* Overall tests */ + tc_core = tcase_create("SB: Overall"); + + tcase_add_checked_fixture(tc_core, setup, teardown); + tcase_add_test(tc_core, test_sbs_one_simple_sentence); + tcase_add_test(tc_core, test_sbs_two_sentences_with_rep); + suite_add_tcase(s, tc_core); + + /**/ + TCase *tc_append_string; + tc_append_string = tcase_create("SB: append_string"); + tcase_add_checked_fixture(tc_append_string, setup, teardown); + + tcase_add_test(tc_append_string, test_sbs_append_string_two_separate); + tcase_add_test(tc_append_string, test_sbs_append_string_two_with_broken_sentence); + tcase_add_test(tc_append_string, test_sbs_append_string_two_intersecting); + tcase_add_test(tc_append_string, test_sbs_append_string_real_data_1); + + suite_add_tcase(s, tc_append_string); + + return s; +} diff --git a/tests/ccx_encoders_splitbysentence_suite.h b/tests/ccx_encoders_splitbysentence_suite.h new file mode 100644 index 00000000..88c21044 --- /dev/null +++ b/tests/ccx_encoders_splitbysentence_suite.h @@ 
-0,0 +1,4 @@ +// ------------------------------------- +// SUITE +// ------------------------------------- +Suite * ccx_encoders_splitbysentence_suite(void); diff --git a/tests/runtest.c b/tests/runtest.c new file mode 100644 index 00000000..a9898e42 --- /dev/null +++ b/tests/runtest.c @@ -0,0 +1,21 @@ +#include <check.h> + +// TESTS: +#include "ccx_encoders_splitbysentence_suite.h" + + +int main(void) +{ + int number_failed; + Suite *s; + SRunner *sr; + + s = ccx_encoders_splitbysentence_suite(); + sr = srunner_create(s); + srunner_set_fork_status(sr, CK_NOFORK); + + srunner_run_all(sr, CK_NORMAL); + number_failed = srunner_ntests_failed(sr); + srunner_free(sr); + return (number_failed == 0) ? 0 : 1; +}