Break incoming subs into sentences (through a buffer), and remove duplicates

This commit is contained in:
maxkoryukov 2016-12-02 13:36:33 +05:00
parent d453d9327e
commit 66393a80f2
10 changed files with 1060 additions and 294 deletions

6
.gitignore vendored
View File

@ -1,3 +1,9 @@
####
# Ignore tests tmp files and results
tests/runtest
tests/**/*.gcda
tests/**/*.gcno
#### ####
# Ignore CVS related files # Ignore CVS related files

View File

@ -957,14 +957,10 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt)
ctx->force_flush = opt->force_flush; ctx->force_flush = opt->force_flush;
ctx->ucla = opt->ucla; ctx->ucla = opt->ucla;
ctx->splitbysentence = opt->splitbysentence; ctx->splitbysentence = opt->splitbysentence;
ctx->sbs_newblock_start_time = -1; ctx->sbs_time_from = -1;
ctx->sbs_newblock_end_time = -1; ctx->sbs_time_trim = -1;
ctx->sbs_newblock = NULL; ctx->sbs_capacity = 0;
ctx->sbs_newblock_capacity = 0;
ctx->sbs_newblock_size = 0;
ctx->sbs_buffer = NULL; ctx->sbs_buffer = NULL;
ctx->sbs_buffer_capacity = 0;
ctx->sbs_buffer_size = 0;
ctx->subline = (unsigned char *) malloc (SUBLINESIZE); ctx->subline = (unsigned char *) malloc (SUBLINESIZE);
if(!ctx->subline) if(!ctx->subline)
@ -1045,203 +1041,204 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
// Write to a buffer that is later s+plit to generate split // Write to a buffer that is later s+plit to generate split
// in sentences // in sentences
if (sub->type == CC_BITMAP) if (sub->type == CC_BITMAP)
wrote_something = write_cc_bitmap_to_sentence_buffer(sub, context); sub = reformat_cc_bitmap_through_sentence_buffer(sub, context);
if (NULL==sub)
return wrote_something;
} }
else // Write subtitles as they come
if (sub->type == CC_608)
{ {
// Write subtitles as they come struct eia608_screen *data = NULL;
if (sub->type == CC_608) struct ccx_s_write *out;
for (data = sub->data; sub->nb_data; sub->nb_data--, data++)
{ {
struct eia608_screen *data = NULL; // Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example)
struct ccx_s_write *out; out = get_output_ctx(context, data->my_field);
for (data = sub->data; sub->nb_data; sub->nb_data--, data++)
if (data->format == SFORMAT_XDS)
{ {
// Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example)
out = get_output_ctx(context, data->my_field);
if (data->format == SFORMAT_XDS)
{
data->end_time = data->end_time + context->subs_delay;
xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class);
if (data->xds_len > 0)
{
ret = write(out->fh, data->xds_str, data->xds_len);
if (ret < data->xds_len)
{
mprint("WARNING:Loss of data\n");
}
}
freep(&data->xds_str);
write_newline(context, 0);
continue;
}
data->end_time = data->end_time + context->subs_delay; data->end_time = data->end_time + context->subs_delay;
switch (context->write_format) xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class);
if (data->xds_len > 0)
{ {
case CCX_OF_SRT: ret = write(out->fh, data->xds_str, data->xds_len);
if (!context->startcredits_displayed && context->start_credits_text != NULL) if (ret < data->xds_len)
try_to_add_start_credits(context, data->start_time); {
wrote_something = write_cc_buffer_as_srt(data, context); mprint("WARNING:Loss of data\n");
break; }
case CCX_OF_SSA:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_buffer_as_ssa(data, context);
break;
case CCX_OF_G608:
wrote_something = write_cc_buffer_as_g608(data, context);
break;
case CCX_OF_WEBVTT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_buffer_as_webvtt(data, context);
break;
case CCX_OF_SAMI:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_buffer_as_sami(data, context);
break;
case CCX_OF_SMPTETT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_buffer_as_smptett(data, context);
break;
case CCX_OF_TRANSCRIPT:
wrote_something = write_cc_buffer_as_transcript2(data, context);
break;
case CCX_OF_SPUPNG:
wrote_something = write_cc_buffer_as_spupng(data, context);
break;
case CCX_OF_SIMPLE_XML:
if (ccx_options.keep_output_closed && context->out->temporarily_closed)
{
temporarily_open_output(context->out);
write_subtitle_file_header(context, context->out);
}
wrote_something = write_cc_buffer_as_simplexml(data, context);
if (ccx_options.keep_output_closed)
{
write_subtitle_file_footer(context, context->out);
temporarily_close_output(context->out);
}
break;
default:
break;
} }
if (wrote_something) freep(&data->xds_str);
context->last_displayed_subs_ms = data->end_time; write_newline(context, 0);
continue;
if (context->gui_mode_reports)
write_cc_buffer_to_gui(sub->data, context);
}
freep(&sub->data);
}
if (sub->type == CC_BITMAP)
{
switch (context->write_format)
{
case CCX_OF_SRT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_srt(sub, context);
break;
case CCX_OF_SSA:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_ssa(sub, context);
break;
case CCX_OF_WEBVTT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_webvtt(sub, context);
break;
case CCX_OF_SAMI:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_sami(sub, context);
break;
case CCX_OF_SMPTETT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_smptett(sub, context);
break;
case CCX_OF_TRANSCRIPT:
wrote_something = write_cc_bitmap_as_transcript(sub, context);
break;
case CCX_OF_SPUPNG:
wrote_something = write_cc_bitmap_as_spupng(sub, context);
break;
case CCX_OF_SIMPLE_XML:
wrote_something = write_cc_bitmap_as_simplexml(sub, context);
break;
#ifdef WITH_LIBCURL
case CCX_OF_CURL:
wrote_something = write_cc_bitmap_as_libcurl(sub, context);
break;
#endif
default:
break;
} }
} data->end_time = data->end_time + context->subs_delay;
if (sub->type == CC_RAW)
{
if (context->send_to_srv)
net_send_header(sub->data, sub->nb_data);
else
{
ret = write(context->out->fh, sub->data, sub->nb_data);
if (ret < sub->nb_data) {
mprint("WARNING: Loss of data\n");
}
}
sub->nb_data = 0;
}
if (sub->type == CC_TEXT)
{
switch (context->write_format) switch (context->write_format)
{ {
case CCX_OF_SRT: case CCX_OF_SRT:
if (!context->startcredits_displayed && context->start_credits_text != NULL) if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time); try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_subtitle_as_srt(sub, context); wrote_something = write_cc_buffer_as_srt(data, context);
break; break;
case CCX_OF_SSA: case CCX_OF_SSA:
if (!context->startcredits_displayed && context->start_credits_text != NULL) if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time); try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_subtitle_as_ssa(sub, context); wrote_something = write_cc_buffer_as_ssa(data, context);
break; break;
case CCX_OF_WEBVTT: case CCX_OF_G608:
if (!context->startcredits_displayed && context->start_credits_text != NULL) wrote_something = write_cc_buffer_as_g608(data, context);
try_to_add_start_credits(context, sub->start_time); break;
wrote_something = write_cc_subtitle_as_webvtt(sub, context); case CCX_OF_WEBVTT:
break; if (!context->startcredits_displayed && context->start_credits_text != NULL)
case CCX_OF_SAMI: try_to_add_start_credits(context, data->start_time);
if (!context->startcredits_displayed && context->start_credits_text != NULL) wrote_something = write_cc_buffer_as_webvtt(data, context);
try_to_add_start_credits(context, sub->start_time); break;
wrote_something = write_cc_subtitle_as_sami(sub, context); case CCX_OF_SAMI:
break; if (!context->startcredits_displayed && context->start_credits_text != NULL)
case CCX_OF_SMPTETT: try_to_add_start_credits(context, data->start_time);
if (!context->startcredits_displayed && context->start_credits_text != NULL) wrote_something = write_cc_buffer_as_sami(data, context);
try_to_add_start_credits(context, sub->start_time); break;
wrote_something = write_cc_subtitle_as_smptett(sub, context); case CCX_OF_SMPTETT:
break; if (!context->startcredits_displayed && context->start_credits_text != NULL)
case CCX_OF_TRANSCRIPT: try_to_add_start_credits(context, data->start_time);
wrote_something = write_cc_subtitle_as_transcript(sub, context); wrote_something = write_cc_buffer_as_smptett(data, context);
break; break;
case CCX_OF_SPUPNG: case CCX_OF_TRANSCRIPT:
wrote_something = write_cc_subtitle_as_spupng(sub, context); wrote_something = write_cc_buffer_as_transcript2(data, context);
break; break;
case CCX_OF_SIMPLE_XML: case CCX_OF_SPUPNG:
wrote_something = write_cc_subtitle_as_simplexml(sub, context); wrote_something = write_cc_buffer_as_spupng(data, context);
break; break;
default: case CCX_OF_SIMPLE_XML:
break; if (ccx_options.keep_output_closed && context->out->temporarily_closed)
{
temporarily_open_output(context->out);
write_subtitle_file_header(context, context->out);
}
wrote_something = write_cc_buffer_as_simplexml(data, context);
if (ccx_options.keep_output_closed)
{
write_subtitle_file_footer(context, context->out);
temporarily_close_output(context->out);
}
break;
default:
break;
} }
sub->nb_data = 0; if (wrote_something)
context->last_displayed_subs_ms = data->end_time;
if (context->gui_mode_reports)
write_cc_buffer_to_gui(sub->data, context);
} }
freep(&sub->data);
} }
if (sub->type == CC_BITMAP)
{
switch (context->write_format)
{
case CCX_OF_SRT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_srt(sub, context);
break;
case CCX_OF_SSA:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_ssa(sub, context);
break;
case CCX_OF_WEBVTT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_webvtt(sub, context);
break;
case CCX_OF_SAMI:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_sami(sub, context);
break;
case CCX_OF_SMPTETT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_bitmap_as_smptett(sub, context);
break;
case CCX_OF_TRANSCRIPT:
wrote_something = write_cc_bitmap_as_transcript(sub, context);
break;
case CCX_OF_SPUPNG:
wrote_something = write_cc_bitmap_as_spupng(sub, context);
break;
case CCX_OF_SIMPLE_XML:
wrote_something = write_cc_bitmap_as_simplexml(sub, context);
break;
#ifdef WITH_LIBCURL
case CCX_OF_CURL:
wrote_something = write_cc_bitmap_as_libcurl(sub, context);
break;
#endif
default:
break;
}
}
if (sub->type == CC_RAW)
{
if (context->send_to_srv)
net_send_header(sub->data, sub->nb_data);
else
{
ret = write(context->out->fh, sub->data, sub->nb_data);
if (ret < sub->nb_data) {
mprint("WARNING: Loss of data\n");
}
}
sub->nb_data = 0;
}
if (sub->type == CC_TEXT)
{
switch (context->write_format)
{
case CCX_OF_SRT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_subtitle_as_srt(sub, context);
break;
case CCX_OF_SSA:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_subtitle_as_ssa(sub, context);
break;
case CCX_OF_WEBVTT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_subtitle_as_webvtt(sub, context);
break;
case CCX_OF_SAMI:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_subtitle_as_sami(sub, context);
break;
case CCX_OF_SMPTETT:
if (!context->startcredits_displayed && context->start_credits_text != NULL)
try_to_add_start_credits(context, sub->start_time);
wrote_something = write_cc_subtitle_as_smptett(sub, context);
break;
case CCX_OF_TRANSCRIPT:
wrote_something = write_cc_subtitle_as_transcript(sub, context);
break;
case CCX_OF_SPUPNG:
wrote_something = write_cc_subtitle_as_spupng(sub, context);
break;
case CCX_OF_SIMPLE_XML:
wrote_something = write_cc_subtitle_as_simplexml(sub, context);
break;
default:
break;
}
sub->nb_data = 0;
}
if (!sub->nb_data) if (!sub->nb_data)
freep(&sub->data); freep(&sub->data);
if (wrote_something && context->force_flush) if (wrote_something && context->force_flush)

View File

@ -118,15 +118,14 @@ struct encoder_ctx
/* split-by-sentence stuff */ /* split-by-sentence stuff */
int splitbysentence; int splitbysentence;
LLONG sbs_newblock_start_time; // Used by the split-by-sentence code to know when the current block starts...
LLONG sbs_newblock_end_time; // ... and ends
ccx_sbs_utf8_character *sbs_newblock;
int sbs_newblock_capacity;
int sbs_newblock_size;
ccx_sbs_utf8_character *sbs_buffer;
int sbs_buffer_capacity;
int sbs_buffer_size;
unsigned char * sbs_buffer; /// Storage for sentence-split buffer
size_t sbs_handled_len; /// The length of the string in the SBS-buffer, already handled, but preserved for DUP-detection.
//ccx_sbs_utf8_character *sbs_newblock;
LLONG sbs_time_from; // Used by the split-by-sentence code to know when the current block starts...
LLONG sbs_time_trim; // ... and ends
size_t sbs_capacity;
}; };
#define INITIAL_ENC_BUFFER_CAPACITY 2048 #define INITIAL_ENC_BUFFER_CAPACITY 2048
@ -196,10 +195,9 @@ int write_cc_bitmap_as_sami (struct cc_subtitle *sub, struct encoder_
int write_cc_bitmap_as_smptett (struct cc_subtitle *sub, struct encoder_ctx *context); int write_cc_bitmap_as_smptett (struct cc_subtitle *sub, struct encoder_ctx *context);
int write_cc_bitmap_as_spupng (struct cc_subtitle *sub, struct encoder_ctx *context); int write_cc_bitmap_as_spupng (struct cc_subtitle *sub, struct encoder_ctx *context);
int write_cc_bitmap_as_transcript (struct cc_subtitle *sub, struct encoder_ctx *context); int write_cc_bitmap_as_transcript (struct cc_subtitle *sub, struct encoder_ctx *context);
int write_cc_bitmap_to_sentence_buffer (struct cc_subtitle *sub, struct encoder_ctx *context);
int write_cc_bitmap_as_libcurl (struct cc_subtitle *sub, struct encoder_ctx *context); int write_cc_bitmap_as_libcurl (struct cc_subtitle *sub, struct encoder_ctx *context);
struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer (struct cc_subtitle *sub, struct encoder_ctx *context);
void set_encoder_last_displayed_subs_ms(struct encoder_ctx *ctx, LLONG last_displayed_subs_ms); void set_encoder_last_displayed_subs_ms(struct encoder_ctx *ctx, LLONG last_displayed_subs_ms);
void set_encoder_subs_delay(struct encoder_ctx *ctx, LLONG subs_delay); void set_encoder_subs_delay(struct encoder_ctx *ctx, LLONG subs_delay);

View File

@ -1,135 +1,457 @@
#include "ccx_decoders_common.h" #include "ccx_common_platform.h"
#include "ccx_encoders_common.h" #include "ccx_encoders_common.h"
#include "spupng_encoder.h" #include "lib_ccx.h"
#include "ccx_encoders_spupng.h"
#include "utility.h"
#include "ocr.h" #include "ocr.h"
#include "ccx_decoders_608.h" #include "debug_def.h"
#include "ccx_decoders_708.h"
#include "ccx_decoders_708_output.h"
#include "ccx_encoders_xds.h"
#include "ccx_encoders_helpers.h"
#include "utf8proc.h"
#ifdef ENABLE_SHARING #ifdef ENABLE_SHARING
#include "ccx_share.h" #include "ccx_share.h"
#endif //ENABLE_SHARING #endif //ENABLE_SHARING
void lbl_start_block(LLONG start_time, struct encoder_ctx *context) int sbs_is_pointer_on_sentence_breaker(char * start, char * current)
{ {
context->sbs_newblock_start_time = start_time; char c = *current;
} char n = *(current + 1);
char p = *(current - 1);
void lbl_add_character(struct encoder_ctx *context, ccx_sbs_utf8_character ch) if (0 == c) n = 0;
{ if (current == start) p = 0;
if (context->sbs_newblock_capacity == context->sbs_newblock_size)
if (0 == c) return 1;
if ('.' == c
|| '!' == c
|| '?' == c
)
{ {
int newcapacity = (context->sbs_newblock_capacity < 512) ? 1024 : context->sbs_newblock_capacity * 2; if ('.' == n
context->sbs_newblock = (ccx_sbs_utf8_character *)realloc(context->sbs_newblock, newcapacity*sizeof(ccx_sbs_utf8_character)); || '!' == n
if (!context->sbs_newblock) || '?' == n
fatal(EXIT_NOT_ENOUGH_MEMORY, "Not enough memory in lbl_add_character"); )
context->sbs_newblock_capacity = newcapacity; {
return 0;
}
return 1;
} }
memcpy(&context->sbs_newblock[context->sbs_newblock_size++], &ch, sizeof ch);
return 0;
} }
void lbl_end_block(LLONG end_time, struct encoder_ctx *context) int sbs_fuzzy_strncmp(const char * a, const char * b, size_t n, const size_t maxerr)
{ {
context->sbs_newblock_end_time = end_time; // TODO: implement fuzzy comparing
// Error counter DOES NOT WORK!!!
int i;
//int err;
char A, B;
i = -1;
do
{
i++;
// Bound check (compare to N)
if (i == n) return 0;
A = a[i];
B = b[i];
// bound check (line endings)
if (A == 0)
{
if (B == 0) return 0;
return 1;
}
else
{
if (B == 0) return -1;
}
if (A == B) continue;
if (isspace(A) && isspace(B)) continue;
if (A > B) return 1;
return -1;
} while(1);
} }
int write_cc_bitmap_to_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context) void sbs_strcpy_without_dup(const unsigned char * str, struct encoder_ctx * context)
{
int intersect_len;
unsigned char * suffix;
const unsigned char * prefix = str;
unsigned long sbs_len;
unsigned long str_len;
str_len = strlen(str);
sbs_len = strlen(context->sbs_buffer);
intersect_len = str_len;
if (sbs_len < intersect_len)
intersect_len = sbs_len;
while (intersect_len>0)
{
suffix = context->sbs_buffer + sbs_len - intersect_len;
if (0 == sbs_fuzzy_strncmp(prefix, suffix, intersect_len, 1))
{
break;
}
intersect_len--;
}
LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, intersection len [%4d]\n", intersect_len);
// check, that new string does not contain data, from
// already handled sentence:
LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, sbslen [%4d] handled len [%4d]\n", sbs_len, context->sbs_handled_len);
if ( (sbs_len - intersect_len) >= context->sbs_handled_len)
{
// there is no intersection.
// It is time to clean the buffer. Excepting the last uncomplete sentence
strcpy(context->sbs_buffer, context->sbs_buffer + context->sbs_handled_len);
context->sbs_handled_len = 0;
sbs_len = strlen(context->sbs_buffer);
LOG_DEBUG("Sentence Buffer: Clean buffer, after BUF [%s]\n\n\n", context->sbs_buffer);
}
if (intersect_len > 0)
{
// there is a common part (suffix of old sentence equals to prefix of new str)
//
// remove dup from buffer
// we will use an appropriate part from the new string
context->sbs_buffer[sbs_len-intersect_len] = 0;
}
sbs_len = strlen(context->sbs_buffer);
// whitespace control. Add space between subs
if (
!isspace(str[0]) // not a space char in the beginning of new str
&& context->sbs_handled_len >0 // buffer is not empty (there is uncomplete sentence)
&& !isspace(context->sbs_buffer[sbs_len-1]) // not a space char at the end of existing buf
)
{
//strcat(context->sbs_buffer, " ");
}
strcat(context->sbs_buffer, str);
}
/**
 * Normalises a subtitle string in place before it enters the sentence buffer.
 *
 * Two fixes are applied in a single left-to-right pass:
 *  - every whitespace character (as reported by isspace) becomes a plain ' ';
 *  - a stand-alone '|' becomes 'I'. "Stand-alone" means it is
 *    at the start of the string or preceded by whitespace, AND followed by the
 *    end of the string, whitespace or an apostrophe (as in "|'m").
 *
 * @param str NUL-terminated string, modified in place.
 */
void sbs_str_autofix(unsigned char * str)
{
	unsigned char * cur;
	int starts_word;
	int ends_word;

	for (cur = str; *cur != 0; cur++)
	{
		if (isspace(*cur))
		{
			*cur = ' ';
			continue;
		}

		if (*cur != '|')
			continue;

		// Characters before `cur` were already normalised, characters after
		// it were not yet; isspace() accepts both forms.
		starts_word = (cur == str) || isspace(cur[-1]);
		ends_word = (cur[1] == 0) || isspace(cur[1]) || (cur[1] == '\'');

		if (starts_word && ends_word)
		{
			// try to convert to "I"
			*cur = 'I';
		}
	}
}
/**
* Appends the function to the sentence buffer, and returns a list of full sentences (if there are any), or NULL
*
* @param str Partial (or full) sub to append.
* @param time_from Starting timestamp
* @param time_trim Ending timestamp
* @param context Encoder context
* @return New <struct cc_subtitle *> subtitle, or NULL, if <str> doesn't contain the ending part of the sentence. If there are more than one sentence, the remaining sentences will be chained using <result->next> reference.
*/
struct cc_subtitle * sbs_append_string(unsigned char * str, const LLONG time_from, const LLONG time_trim, struct encoder_ctx * context)
{
struct cc_subtitle * resub;
struct cc_subtitle * tmpsub;
unsigned char * bp_current;
unsigned char * bp_last_break;
unsigned char * sbs_undone_start;
int is_buf_initialized;
int required_capacity;
int new_capacity;
LLONG alphanum_total;
LLONG alphanum_cur;
LLONG anychar_total;
LLONG anychar_cur;
LLONG duration;
LLONG available_time;
int use_alphanum_counters;
if (! str)
return NULL;
sbs_str_autofix(str);
is_buf_initialized = (NULL == context->sbs_buffer || context->sbs_capacity == 0)
? 0
: 1;
// ===============================
// grow sentence buffer
// ===============================
required_capacity =
(is_buf_initialized ? strlen(context->sbs_buffer) : 0) // existing data in buf
+ strlen(str) // length of new string
+ 1 // trailing \0
+ 1 // space control (will add one space , if required)
;
if (required_capacity >= context->sbs_capacity)
{
new_capacity = context->sbs_capacity;
if (! is_buf_initialized) new_capacity = 16;
while (new_capacity < required_capacity)
{
// increase NEW_capacity, and check, that increment
// is less than 8 Mb. Because 8Mb - it is a lot
// for a TEXT buffer. It is weird...
new_capacity += (new_capacity > 1048576 * 8)
? 1048576 * 8
: new_capacity;
}
context->sbs_buffer = (unsigned char *)realloc(
context->sbs_buffer,
new_capacity * sizeof(/*unsigned char*/ context->sbs_buffer[0] )
);
if (!context->sbs_buffer)
fatal(EXIT_NOT_ENOUGH_MEMORY, "Not enough memory in sbs_append_string");
context->sbs_capacity = new_capacity;
// if buffer wasn't initialized, we will se trash in buffer.
// but we need just empty string, so here we will get it:
if (! is_buf_initialized)
{
// INIT SBS
context->sbs_buffer[0] = 0;
context->sbs_handled_len = 0;
}
}
// ===============================
// append to buffer
//
// will update sbs_buffer, sbs_handled_len
// ===============================
sbs_strcpy_without_dup(str, context);
// ===============================
// break to sentences
// ===============================
resub = NULL;
tmpsub = NULL;
alphanum_total = 0;
alphanum_cur = 0;
anychar_total = 0;
anychar_cur = 0;
sbs_undone_start = context->sbs_buffer + context->sbs_handled_len;
bp_last_break = sbs_undone_start;
LOG_DEBUG("Sentence Buffer: BEFORE sentence break. Last break: [%s] sbs_undone_start: [%d], sbs_undone: [%s]\n",
bp_last_break, context->sbs_handled_len, sbs_undone_start
);
for (bp_current = sbs_undone_start; bp_current && *bp_current; bp_current++)
{
if (
0 < anychar_cur // skip empty!
&& sbs_is_pointer_on_sentence_breaker(bp_last_break, bp_current) )
{
// it is new sentence!
tmpsub = malloc(sizeof(struct cc_subtitle));
tmpsub->type = CC_TEXT;
// length of new string:
tmpsub->nb_data =
bp_current - bp_last_break
+ 1 // terminating '\0'
+ 1 // skip '.'
;
tmpsub->data = strndup(bp_last_break, tmpsub->nb_data - 1);
tmpsub->got_output = 1;
tmpsub->start_time = alphanum_cur;
alphanum_cur = 0;
tmpsub->end_time = anychar_cur;
anychar_cur = 0;
bp_last_break = bp_current + 1;
// tune last break:
while (
*bp_last_break
&& isspace(*bp_last_break)
)
{
bp_last_break++;
}
// ???
// tmpsub->info = NULL;
// tmpsub->mode = NULL;
// link with prev sub:
tmpsub->next = NULL;
tmpsub->prev = resub;
if (NULL != resub)
{
resub->next = tmpsub;
}
resub = tmpsub;
}
if (*bp_current && isalnum(*bp_current))
{
alphanum_total++;
alphanum_cur++;
}
anychar_total++;
anychar_cur++;
}
// ===============================
// okay, we have extracted several sentences, now we should
// save the position of the "remainder" - start of the last
// incomplete sentece
// ===============================
if (bp_last_break != sbs_undone_start)
{
context->sbs_handled_len = bp_last_break - sbs_undone_start;
}
LOG_DEBUG("Sentence Buffer: AFTER sentence break: Handled Len [%4d]\n", context->sbs_handled_len);
LOG_DEBUG("Sentence Buffer: Alphanum Total: [%4d] Overall chars: [%4d] STRING:[%20s] BUFFER:[%20s]\n", alphanum_total, anychar_total, str, context->sbs_buffer);
// ===============================
// Calculate time spans
// ===============================
if (!is_buf_initialized)
{
context->sbs_time_from = time_from;
context->sbs_time_trim = time_trim;
}
available_time = time_trim - context->sbs_time_from;
use_alphanum_counters = alphanum_total > 0 ? 1 : 0;
tmpsub = resub;
while (tmpsub)
{
alphanum_cur = tmpsub->start_time;
anychar_cur = tmpsub->end_time;
if (use_alphanum_counters)
{
duration = available_time * alphanum_cur / alphanum_total;
}
else
{
duration = available_time * anychar_cur / anychar_total;
}
tmpsub->start_time = context->sbs_time_from;
tmpsub->end_time = tmpsub->start_time + duration;
context->sbs_time_from = tmpsub->end_time + 1;
tmpsub = tmpsub->next;
}
return resub;
}
struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context)
{ {
int ret = 0;
#ifdef ENABLE_OCR
struct cc_bitmap* rect; struct cc_bitmap* rect;
LLONG ms_start, ms_end; LLONG ms_start, ms_end;
int used;
int i = 0;
char *str;
if (context->prev_start != -1 && (sub->flags & SUB_EOD_MARKER)) // this is a sub with a full sentence (or chain of such subs)
struct cc_subtitle * resub = NULL;
#ifdef ENABLE_OCR
if (sub->flags & SUB_EOD_MARKER)
{ {
ms_start = context->prev_start; // the last sub from input
ms_end = sub->start_time;
if (context->prev_start == -1)
{
ms_start = 1;
ms_end = sub->start_time;
}
else
{
ms_start = context->prev_start;
ms_end = sub->start_time;
}
} }
else if (!(sub->flags & SUB_EOD_MARKER)) else
{ {
// not the last sub from input
ms_start = sub->start_time; ms_start = sub->start_time;
ms_end = sub->end_time; ms_end = sub->end_time;
} }
else if (context->prev_start == -1 && (sub->flags & SUB_EOD_MARKER))
{
ms_start = 1;
ms_end = sub->start_time;
}
if (sub->nb_data == 0) if (sub->nb_data == 0)
return ret; return 0;
rect = sub->data;
if (sub->flags & SUB_EOD_MARKER) if (sub->flags & SUB_EOD_MARKER)
context->prev_start = sub->start_time; context->prev_start = sub->start_time;
str = paraof_ocrtext(sub, " ", 1);
if (rect[0].ocr_text && *(rect[0].ocr_text)) if (str)
{ {
lbl_start_block(ms_start, context);
if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER)) if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER))
{ {
char *token = NULL; resub = sbs_append_string(str, ms_start, ms_end, context);
token = paraof_ocrtext(sub, " ", 1); // Get text with spaces instead of newlines
uint32_t offset=0;
utf8proc_ssize_t ls; // Last size
char *s = token;
int32_t uc;
while ((ls=utf8proc_iterate(s, -1, &uc)))
{
ccx_sbs_utf8_character sbsc;
// Note: We don't care about uc here, since we will be writing the encoded bytes, not the code points in binary.
//TODO: Deal with ls < 0
if (!uc) // End of string
break;
printf("%3ld | %08X | %c %c %c %c\n", ls, uc, ((uc & 0xFF000000) >> 24), ((uc & 0xFF0000) >> 16),
((uc & 0xFF00) >> 8), ( uc & 0xFF));
sbsc.ch = uc;
sbsc.encoded[0] = 0; sbsc.encoded[1] = 0; sbsc.encoded[2] = 0; sbsc.encoded[3] = 0;
memcpy(sbsc.encoded, s, ls);
sbsc.enc_len = ls;
sbsc.ts = 0; // We don't know yet
lbl_add_character(context, sbsc);
s += ls;
// TO-DO: Add each of these characters to the buffer, splitting the timestamps. Remember to add character length to the array
}
printf("-------\n");
/*
while (token)
{
char *newline_pos = strstr(token, context->encoded_crlf);
if (!newline_pos)
{
fdprintf(context->out->fh, "%s", token);
break;
}
else
{
while (token != newline_pos)
{
fdprintf(context->out->fh, "%c", *token);
token++;
}
token += context->encoded_crlf_length;
fdprintf(context->out->fh, "%c", ' ');
}
}*/
} }
lbl_end_block(ms_end, context); freep(&str);
}
for(i = 0, rect = sub->data; i < sub->nb_data; i++, rect++)
{
freep(rect->data);
freep(rect->data+1);
} }
#endif #endif
sub->nb_data = 0; sub->nb_data = 0;
freep(&sub->data); freep(&sub->data);
return ret; return resub;
} }

11
src/lib_ccx/debug_def.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef _DEBUG_DEF_H_
#define _DEBUG_DEF_H_

/*
 * LOG_DEBUG(fmt, ...): printf-style trace output.
 *
 * With DEBUG defined it forwards to printf. Without DEBUG it expands to a
 * single no-op expression, so the arguments are not evaluated and the macro
 * is safe in any statement position (e.g. `if (x) LOG_DEBUG(...); else ...`).
 */
#ifdef DEBUG
#include <stdio.h>
#define LOG_DEBUG(...) printf(__VA_ARGS__)
#else
#define LOG_DEBUG(...) ((void)0)
#endif

#endif

59
tests/Makefile Normal file
View File

@ -0,0 +1,59 @@
SHELL = /bin/sh
CC=gcc
# SYS := $(shell gcc -dumpmachine)
CFLAGS=-O0 -std=gnu99 -D ENABLE_OCR -g -ggdb -rdynamic
#-Q -da -v
# enable COVERAGE
# CFLAGS+=-fprofile-arcs -ftest-coverage
# add debug flag
ifdef DEBUG
CFLAGS+=-DDEBUG
endif
#ALL_FLAGS = -Wno-write-strings -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT
LDFLAGS=-lm -g
CFLAGS+=$(shell pkg-config --cflags check)
LDFLAGS+=$(shell pkg-config --libs check)
# TODO: need to rewrite this. Need new way to load sources for testing
SRC=$(wildcard ../src/lib_ccx/ccx_encoders_splitbysentence.c)
OBJS=
SRC_SUITE=$(wildcard *_suite.c)
OBJ_SUITE=$(patsubst %_suite.c, %_suite.o, $(SRC_SUITE))
OBJS+=$(OBJ_SUITE)
all: clean test
%.o: %.c
# explicit output name : -o $@
$(CC) -c $(ALL_FLAGS) $(CFLAGS) $<
runtest: $(OBJS)
@echo "+----------------------------------------------+"
@echo "| BUILD TESTS |"
@echo "+----------------------------------------------+"
$(CC) -c $(ALL_FLAGS) $(CFLAGS) $@.c
$(CC) $(SRC) $@.o $^ $(ALL_FLAGS) $(CFLAGS) $(LDFLAGS) -o $@
.PHONY: test
test: runtest
@echo "+----------------------------------------------+"
@echo "| START TESTS |"
@echo "+----------------------------------------------+"
./runtest
.PHONY: clean
clean:
rm runtest || true
rm *.o || true
# coverage info
rm *.gcda || true
rm *.gcno || true
# debug info
rm *.c.* || true

43
tests/README.md Normal file
View File

@ -0,0 +1,43 @@
# UNIT TESTING
This folder contains an archetype and several unit tests for CCExtractor.
## RUN TESTS
```shell
cd tests
make
```
This will build and run all test suites.
If you want MORE output:
```shell
DEBUG=1 make
```
Where `DEBUG` is just an environment variable.
## DEBUGGING
If tests fail after your changes, you can debug them (almost all the flags needed for this are already set in `tests/Makefile`).
Run:
```shell
# build test runner
make
# load the test runner into the debugger:
gdb runtest
# run under debugger:
(gdb) run
# on segfault:
(gdb) where
```
## DEPENDENCIES
Tests are built around this library: [**libcheck**](https://github.com/libcheck/check), here is [**documentation**](https://libcheck.github.io/check/)

View File

@ -0,0 +1,305 @@
#include <check.h>
#include "ccx_encoders_splitbysentence_suite.h"
// -------------------------------------
// MOCKS
// -------------------------------------
typedef int64_t LLONG;
#include "../src/lib_ccx/ccx_encoders_common.h"
// -------------------------------------
// Private SBS-functions (for testing only)
// -------------------------------------
struct cc_subtitle * sbs_append_string(unsigned char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context);
// -------------------------------------
// Helpers
// -------------------------------------
/**
 * Build a heap-allocated CC_BITMAP subtitle whose payload is a copy of `str`.
 *
 * The paraof_ocrtext() mock in this file returns `data` unchanged, so the
 * text stored here is what the code under test receives as "OCR output".
 *
 * @param str        NUL-terminated text to store (copied with strdup)
 * @param time_from  value stored in start_time
 * @param time_trim  value stored in end_time
 * @return the new subtitle; nothing in this file frees it
 */
struct cc_subtitle * helper_create_sub(char * str, LLONG time_from, LLONG time_trim)
{
	// calloc: every field not set below (next, prev, ...) starts out zeroed
	struct cc_subtitle * sub = calloc(1, sizeof *sub);
	ck_assert_ptr_ne(sub, NULL);

	sub->type = CC_BITMAP;
	sub->start_time = time_from;
	sub->end_time = time_trim;
	sub->data = strdup(str);
	ck_assert_ptr_ne(sub->data, NULL);
	sub->nb_data = strlen(sub->data);
	return sub;
}
/**
 * Call sbs_append_string() on a private, writable copy of `str`.
 *
 * The copy is released as soon as the call returns, so callers may pass
 * string literals and do not have to free anything themselves.
 *
 * @return whatever sbs_append_string() returned
 */
struct cc_subtitle * helper_sbs_append_string(char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context)
{
	char * scratch = strdup(str);
	struct cc_subtitle * result = sbs_append_string((unsigned char *)scratch, time_from, time_trim, context);

	free(scratch);
	return result;
}
// -------------------------------------
// MOCKS
// -------------------------------------
// Encoder context shared by all tests: allocated in setup() and released in
// teardown().
struct encoder_ctx * context;
// Mock of freep(): an empty stand-in that leaves its argument untouched.
void freep(void * obj)
{
	(void)obj;
}
// Mock of fatal(): an empty stand-in that simply returns to the caller.
void fatal(int x, void * obj)
{
	(void)x;
	(void)obj;
}
// Mock of the OCR -> text converter.
// The tests store plain text in the subtitle's `data` field instead of OCR
// output, so the mock just hands that pointer back.
unsigned char * paraof_ocrtext(void * sub)
{
	struct cc_subtitle * subtitle = (struct cc_subtitle *)sub;

	return subtitle->data;
}
// -------------------------------------
// TEST preparations
// -------------------------------------
/**
 * Fixture: allocate a fresh encoder context for a test.
 *
 * The whole struct is zeroed so no field is ever read uninitialized; the
 * sentence-buffer fields then get the same starting values init_encoder()
 * assigns (empty buffer, both timestamps at -1).
 */
void setup(void)
{
	context = calloc(1, sizeof *context);
	ck_assert_ptr_ne(context, NULL);

	context->sbs_buffer = NULL;
	context->sbs_capacity = 0;
	context->sbs_time_from = -1;
	context->sbs_time_trim = -1;
}
/**
 * Fixture: release the context allocated by setup().
 *
 * The global is reset to NULL so that any access outside a setup()/teardown()
 * pair hits a null pointer instead of freed memory.
 * NOTE(review): context->sbs_buffer is not released here -- confirm who owns it.
 */
void teardown(void)
{
	free(context);
	context = NULL;
}
// -------------------------------------
// TESTS
// -------------------------------------
// One complete sentence must come back unchanged, as a single unlinked subtitle.
START_TEST(test_sbs_one_simple_sentence)
{
	struct cc_subtitle * input = helper_create_sub("Simple sentence.", 1, 100);
	struct cc_subtitle * result = reformat_cc_bitmap_through_sentence_buffer(input, context);

	ck_assert_ptr_ne(result, NULL);
	ck_assert_str_eq(result->data, "Simple sentence.");
	ck_assert_ptr_eq(result->next, NULL);
	ck_assert_ptr_eq(result->prev, NULL);
}
END_TEST
// The second subtitle repeats the text of the first and then finishes the
// sentence: nothing is returned for the first one, and the second one yields
// the sentence with the repeated part appearing only once.
START_TEST(test_sbs_two_sentences_with_rep)
{
	struct cc_subtitle * first_in = helper_create_sub("asdf", 1, 100);
	struct cc_subtitle * first_out = reformat_cc_bitmap_through_sentence_buffer(first_in, context);

	ck_assert_ptr_eq(first_out, NULL);

	// second sub:
	struct cc_subtitle * second_in = helper_create_sub("asdf Hello.", 101, 200);
	struct cc_subtitle * second_out = reformat_cc_bitmap_through_sentence_buffer(second_in, context);

	ck_assert_ptr_ne(second_out, NULL);
	ck_assert_str_eq(second_out->data, "asdf Hello.");
	ck_assert_ptr_eq(second_out->next, NULL);
	ck_assert_ptr_eq(second_out->prev, NULL);
}
END_TEST
// Two complete, unrelated sentences: each call must return its own sentence
// with exactly the time range that was passed in.
START_TEST(test_sbs_append_string_two_separate)
{
	char * test_strings[] = {
		"First string.",
		"Second string."
	};
	struct cc_subtitle * sub;

	// first string
	// (the helper makes and frees the writable copy, so nothing leaks here)
	sub = helper_sbs_append_string(test_strings[0], 1, 20, context);

	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, test_strings[0]);
	ck_assert_int_eq(sub->start_time, 1);
	ck_assert_int_eq(sub->end_time, 20);

	// second string:
	sub = helper_sbs_append_string(test_strings[1], 21, 40, context);

	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, test_strings[1]);
	ck_assert_int_eq(sub->start_time, 21);
	ck_assert_int_eq(sub->end_time, 40);
}
END_TEST
// One sentence split across two calls: the first call returns nothing, the
// second returns the joined sentence spanning both time ranges.
START_TEST(test_sbs_append_string_two_with_broken_sentence)
{
	// important !!
	// the two strings together are exactly 32 characters long (12 + 20)
	// NOTE(review): presumably chosen to hit a buffer-size boundary -- confirm.
	char * test_strings[] = {
		"First string",
		" ends here, deabbea."
	};
	struct cc_subtitle * sub;

	// first string
	// (the helper makes and frees the writable copy, so nothing leaks here)
	sub = helper_sbs_append_string(test_strings[0], 1, 3, context);

	ck_assert_ptr_eq(sub, NULL);

	// second string:
	sub = helper_sbs_append_string(test_strings[1], 4, 5, context);

	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, "First string ends here, deabbea.");
	ck_assert_int_eq(sub->start_time, 1);
	ck_assert_int_eq(sub->end_time, 5);
}
END_TEST
// The second string starts with the full text of the first one: the overlap
// must appear only once, and the sentence spans both time ranges.
START_TEST(test_sbs_append_string_two_intersecting)
{
	char * test_strings[] = {
		"First string",
		"First string ends here."
	};
	struct cc_subtitle * sub;

	// first string
	// (the helper makes and frees the writable copy, so nothing leaks here)
	sub = helper_sbs_append_string(test_strings[0], 1, 20, context);

	ck_assert_ptr_eq(sub, NULL);

	// second string:
	sub = helper_sbs_append_string(test_strings[1], 21, 40, context);

	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, "First string ends here.");
	ck_assert_int_eq(sub->start_time, 1);
	ck_assert_int_eq(sub->end_time, 40);
}
END_TEST
// Replays a sequence of captions in which later frames repeat text from
// earlier ones, and checks which calls return a finished sentence (with its
// text and timing) and which return NULL.
// NOTE(review): the step numbers below have gaps (7 -> 9 -> 13 ...); they
// presumably index frames of the original capture -- confirm.
// The continued string literals must keep their second line at column 0:
// any leading whitespace there would become part of the string.
START_TEST(test_sbs_append_string_real_data_1)
{
	struct cc_subtitle * sub;

	// 1: no complete sentence yet
	sub = helper_sbs_append_string("Oleon",
		1, 0, context);
	ck_assert_ptr_eq(sub, NULL);

	// 2: the sentence is completed
	sub = helper_sbs_append_string("Oleon costs.",
		1, 189, context);
	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, "Oleon costs.");

	// 3: one complete sentence plus the beginning of the next one
	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
Didn't",
		190, 889, context);
	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, "buried in the annex, 95 Oleon costs.");
	ck_assert_int_eq(sub->start_time, 190); // = <sub start>
	ck_assert_int_eq(sub->end_time, 783); // = <sub start> + <available time,889-190=699 > * <sentence alphanum, 28> / <sub alphanum, 33>
	ck_assert_ptr_eq(sub->next, NULL);

	// 4: the unfinished sentence keeps growing, nothing is returned
	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
Didn't want",
		890, 1129, context);
	ck_assert_ptr_eq(sub, NULL);

	// 5
	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
Didn't want to",
		1130, 1359, context);
	ck_assert_ptr_eq(sub, NULL);

	// 6
	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
Didn't want to acknowledge",
		1360, 2059, context);
	ck_assert_ptr_eq(sub, NULL);

	// 7
	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
Didn't want to acknowledge the",
		2060, 2299, context);
	ck_assert_ptr_eq(sub, NULL);

	// 9
	sub = helper_sbs_append_string("Didn't want to acknowledge the\n\
pressures on hospitals, schools and",
		2300, 5019, context);
	ck_assert_ptr_eq(sub, NULL);

	// 13: the long sentence is finally completed
	sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
infrastructure.",
		5020, 5159, context);
	ck_assert_ptr_ne(sub, NULL);
	ck_assert_str_eq(sub->data, "Didn't want to acknowledge the pressures on hospitals, schools and infrastructure.");
	ck_assert_int_eq(sub->start_time, 784); // one past the end_time (783) asserted in step 3
	ck_assert_int_eq(sub->end_time, 5159);
	ck_assert_ptr_eq(sub->next, NULL);

	// 14: a new sentence starts, nothing is returned yet
	sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
infrastructure. If",
		5160, 5529, context);
	ck_assert_ptr_eq(sub, NULL);

	// 16
	sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
infrastructure. If we go",
		5530, 6559, context);
	ck_assert_ptr_eq(sub, NULL);
	// ck_assert_int_eq(sub->start_time, 1);
	// ck_assert_int_eq(sub->end_time, 40);
}
END_TEST
/**
 * Assemble the "Sentence Buffer" suite.
 *
 * It holds two test cases, both using the setup()/teardown() fixture:
 * one driving reformat_cc_bitmap_through_sentence_buffer(), the other
 * calling sbs_append_string() directly.
 */
Suite * ccx_encoders_splitbysentence_suite(void)
{
	Suite *suite = suite_create("Sentence Buffer");

	/* Overall tests */
	TCase *overall = tcase_create("SB: Overall");
	tcase_add_checked_fixture(overall, setup, teardown);
	tcase_add_test(overall, test_sbs_one_simple_sentence);
	tcase_add_test(overall, test_sbs_two_sentences_with_rep);
	suite_add_tcase(suite, overall);

	/* Direct sbs_append_string() tests */
	TCase *append = tcase_create("SB: append_string");
	tcase_add_checked_fixture(append, setup, teardown);
	tcase_add_test(append, test_sbs_append_string_two_separate);
	tcase_add_test(append, test_sbs_append_string_two_with_broken_sentence);
	tcase_add_test(append, test_sbs_append_string_two_intersecting);
	tcase_add_test(append, test_sbs_append_string_real_data_1);
	suite_add_tcase(suite, append);

	return suite;
}

View File

@ -0,0 +1,4 @@
#ifndef CCX_ENCODERS_SPLITBYSENTENCE_SUITE_H
#define CCX_ENCODERS_SPLITBYSENTENCE_SUITE_H

// -------------------------------------
// SUITE
// -------------------------------------
// Returns the "Sentence Buffer" test suite.
// `Suite` is not declared in this header: include <check.h> before it.
Suite * ccx_encoders_splitbysentence_suite(void);

#endif /* CCX_ENCODERS_SPLITBYSENTENCE_SUITE_H */

21
tests/runtest.c Normal file
View File

@ -0,0 +1,21 @@
#include <check.h>
// TESTS:
#include "ccx_encoders_splitbysentence_suite.h"
/**
 * Test runner entry point: runs the split-by-sentence suite in no-fork mode
 * with normal verbosity.
 *
 * @return 0 when every test passed, 1 otherwise
 */
int main(void)
{
	Suite *suite = ccx_encoders_splitbysentence_suite();
	SRunner *runner = srunner_create(suite);
	int failures;

	// CK_NOFORK: tests run inside this process instead of forked children
	srunner_set_fork_status(runner, CK_NOFORK);
	srunner_run_all(runner, CK_NORMAL);

	failures = srunner_ntests_failed(runner);
	srunner_free(runner);

	if (failures != 0)
		return 1;
	return 0;
}