mirror of
https://github.com/CCExtractor/ccextractor.git
synced 2024-12-25 04:11:38 +00:00
Break incoming subs into sentences (through a buffer), and remove duplicates
This commit is contained in:
parent
d453d9327e
commit
66393a80f2
6
.gitignore
vendored
6
.gitignore
vendored
@ -1,3 +1,9 @@
|
||||
####
|
||||
# Ignore tests tmp files and results
|
||||
tests/runtest
|
||||
tests/**/*.gcda
|
||||
tests/**/*.gcno
|
||||
|
||||
####
|
||||
# Ignore CVS related files
|
||||
|
||||
|
@ -957,14 +957,10 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt)
|
||||
ctx->force_flush = opt->force_flush;
|
||||
ctx->ucla = opt->ucla;
|
||||
ctx->splitbysentence = opt->splitbysentence;
|
||||
ctx->sbs_newblock_start_time = -1;
|
||||
ctx->sbs_newblock_end_time = -1;
|
||||
ctx->sbs_newblock = NULL;
|
||||
ctx->sbs_newblock_capacity = 0;
|
||||
ctx->sbs_newblock_size = 0;
|
||||
ctx->sbs_time_from = -1;
|
||||
ctx->sbs_time_trim = -1;
|
||||
ctx->sbs_capacity = 0;
|
||||
ctx->sbs_buffer = NULL;
|
||||
ctx->sbs_buffer_capacity = 0;
|
||||
ctx->sbs_buffer_size = 0;
|
||||
|
||||
ctx->subline = (unsigned char *) malloc (SUBLINESIZE);
|
||||
if(!ctx->subline)
|
||||
@ -1045,203 +1041,204 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
|
||||
// Write to a buffer that is later split to generate subtitles
|
||||
// in sentences
|
||||
if (sub->type == CC_BITMAP)
|
||||
wrote_something = write_cc_bitmap_to_sentence_buffer(sub, context);
|
||||
sub = reformat_cc_bitmap_through_sentence_buffer(sub, context);
|
||||
|
||||
if (NULL==sub)
|
||||
return wrote_something;
|
||||
}
|
||||
else
|
||||
// Write subtitles as they come
|
||||
if (sub->type == CC_608)
|
||||
{
|
||||
// Write subtitles as they come
|
||||
if (sub->type == CC_608)
|
||||
struct eia608_screen *data = NULL;
|
||||
struct ccx_s_write *out;
|
||||
for (data = sub->data; sub->nb_data; sub->nb_data--, data++)
|
||||
{
|
||||
struct eia608_screen *data = NULL;
|
||||
struct ccx_s_write *out;
|
||||
for (data = sub->data; sub->nb_data; sub->nb_data--, data++)
|
||||
// Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example)
|
||||
out = get_output_ctx(context, data->my_field);
|
||||
|
||||
if (data->format == SFORMAT_XDS)
|
||||
{
|
||||
// Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example)
|
||||
out = get_output_ctx(context, data->my_field);
|
||||
|
||||
if (data->format == SFORMAT_XDS)
|
||||
{
|
||||
data->end_time = data->end_time + context->subs_delay;
|
||||
xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class);
|
||||
if (data->xds_len > 0)
|
||||
{
|
||||
ret = write(out->fh, data->xds_str, data->xds_len);
|
||||
if (ret < data->xds_len)
|
||||
{
|
||||
mprint("WARNING:Loss of data\n");
|
||||
}
|
||||
}
|
||||
freep(&data->xds_str);
|
||||
write_newline(context, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
data->end_time = data->end_time + context->subs_delay;
|
||||
switch (context->write_format)
|
||||
xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class);
|
||||
if (data->xds_len > 0)
|
||||
{
|
||||
case CCX_OF_SRT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_srt(data, context);
|
||||
break;
|
||||
case CCX_OF_SSA:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_ssa(data, context);
|
||||
break;
|
||||
case CCX_OF_G608:
|
||||
wrote_something = write_cc_buffer_as_g608(data, context);
|
||||
break;
|
||||
case CCX_OF_WEBVTT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_webvtt(data, context);
|
||||
break;
|
||||
case CCX_OF_SAMI:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_sami(data, context);
|
||||
break;
|
||||
case CCX_OF_SMPTETT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_smptett(data, context);
|
||||
break;
|
||||
case CCX_OF_TRANSCRIPT:
|
||||
wrote_something = write_cc_buffer_as_transcript2(data, context);
|
||||
break;
|
||||
case CCX_OF_SPUPNG:
|
||||
wrote_something = write_cc_buffer_as_spupng(data, context);
|
||||
break;
|
||||
case CCX_OF_SIMPLE_XML:
|
||||
if (ccx_options.keep_output_closed && context->out->temporarily_closed)
|
||||
{
|
||||
temporarily_open_output(context->out);
|
||||
write_subtitle_file_header(context, context->out);
|
||||
}
|
||||
wrote_something = write_cc_buffer_as_simplexml(data, context);
|
||||
if (ccx_options.keep_output_closed)
|
||||
{
|
||||
write_subtitle_file_footer(context, context->out);
|
||||
temporarily_close_output(context->out);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
ret = write(out->fh, data->xds_str, data->xds_len);
|
||||
if (ret < data->xds_len)
|
||||
{
|
||||
mprint("WARNING:Loss of data\n");
|
||||
}
|
||||
}
|
||||
if (wrote_something)
|
||||
context->last_displayed_subs_ms = data->end_time;
|
||||
|
||||
if (context->gui_mode_reports)
|
||||
write_cc_buffer_to_gui(sub->data, context);
|
||||
}
|
||||
freep(&sub->data);
|
||||
}
|
||||
if (sub->type == CC_BITMAP)
|
||||
{
|
||||
switch (context->write_format)
|
||||
{
|
||||
case CCX_OF_SRT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_srt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SSA:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_ssa(sub, context);
|
||||
break;
|
||||
case CCX_OF_WEBVTT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_webvtt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SAMI:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_sami(sub, context);
|
||||
break;
|
||||
case CCX_OF_SMPTETT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_smptett(sub, context);
|
||||
break;
|
||||
case CCX_OF_TRANSCRIPT:
|
||||
wrote_something = write_cc_bitmap_as_transcript(sub, context);
|
||||
break;
|
||||
case CCX_OF_SPUPNG:
|
||||
wrote_something = write_cc_bitmap_as_spupng(sub, context);
|
||||
break;
|
||||
case CCX_OF_SIMPLE_XML:
|
||||
wrote_something = write_cc_bitmap_as_simplexml(sub, context);
|
||||
break;
|
||||
#ifdef WITH_LIBCURL
|
||||
case CCX_OF_CURL:
|
||||
wrote_something = write_cc_bitmap_as_libcurl(sub, context);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
freep(&data->xds_str);
|
||||
write_newline(context, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
}
|
||||
if (sub->type == CC_RAW)
|
||||
{
|
||||
if (context->send_to_srv)
|
||||
net_send_header(sub->data, sub->nb_data);
|
||||
else
|
||||
{
|
||||
ret = write(context->out->fh, sub->data, sub->nb_data);
|
||||
if (ret < sub->nb_data) {
|
||||
mprint("WARNING: Loss of data\n");
|
||||
}
|
||||
}
|
||||
sub->nb_data = 0;
|
||||
}
|
||||
if (sub->type == CC_TEXT)
|
||||
{
|
||||
data->end_time = data->end_time + context->subs_delay;
|
||||
switch (context->write_format)
|
||||
{
|
||||
case CCX_OF_SRT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_srt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SSA:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_ssa(sub, context);
|
||||
break;
|
||||
case CCX_OF_WEBVTT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_webvtt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SAMI:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_sami(sub, context);
|
||||
break;
|
||||
case CCX_OF_SMPTETT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_smptett(sub, context);
|
||||
break;
|
||||
case CCX_OF_TRANSCRIPT:
|
||||
wrote_something = write_cc_subtitle_as_transcript(sub, context);
|
||||
break;
|
||||
case CCX_OF_SPUPNG:
|
||||
wrote_something = write_cc_subtitle_as_spupng(sub, context);
|
||||
break;
|
||||
case CCX_OF_SIMPLE_XML:
|
||||
wrote_something = write_cc_subtitle_as_simplexml(sub, context);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case CCX_OF_SRT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_srt(data, context);
|
||||
break;
|
||||
case CCX_OF_SSA:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_ssa(data, context);
|
||||
break;
|
||||
case CCX_OF_G608:
|
||||
wrote_something = write_cc_buffer_as_g608(data, context);
|
||||
break;
|
||||
case CCX_OF_WEBVTT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_webvtt(data, context);
|
||||
break;
|
||||
case CCX_OF_SAMI:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_sami(data, context);
|
||||
break;
|
||||
case CCX_OF_SMPTETT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, data->start_time);
|
||||
wrote_something = write_cc_buffer_as_smptett(data, context);
|
||||
break;
|
||||
case CCX_OF_TRANSCRIPT:
|
||||
wrote_something = write_cc_buffer_as_transcript2(data, context);
|
||||
break;
|
||||
case CCX_OF_SPUPNG:
|
||||
wrote_something = write_cc_buffer_as_spupng(data, context);
|
||||
break;
|
||||
case CCX_OF_SIMPLE_XML:
|
||||
if (ccx_options.keep_output_closed && context->out->temporarily_closed)
|
||||
{
|
||||
temporarily_open_output(context->out);
|
||||
write_subtitle_file_header(context, context->out);
|
||||
}
|
||||
wrote_something = write_cc_buffer_as_simplexml(data, context);
|
||||
if (ccx_options.keep_output_closed)
|
||||
{
|
||||
write_subtitle_file_footer(context, context->out);
|
||||
temporarily_close_output(context->out);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
sub->nb_data = 0;
|
||||
if (wrote_something)
|
||||
context->last_displayed_subs_ms = data->end_time;
|
||||
|
||||
if (context->gui_mode_reports)
|
||||
write_cc_buffer_to_gui(sub->data, context);
|
||||
}
|
||||
freep(&sub->data);
|
||||
}
|
||||
if (sub->type == CC_BITMAP)
|
||||
{
|
||||
switch (context->write_format)
|
||||
{
|
||||
case CCX_OF_SRT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_srt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SSA:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_ssa(sub, context);
|
||||
break;
|
||||
case CCX_OF_WEBVTT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_webvtt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SAMI:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_sami(sub, context);
|
||||
break;
|
||||
case CCX_OF_SMPTETT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_bitmap_as_smptett(sub, context);
|
||||
break;
|
||||
case CCX_OF_TRANSCRIPT:
|
||||
wrote_something = write_cc_bitmap_as_transcript(sub, context);
|
||||
break;
|
||||
case CCX_OF_SPUPNG:
|
||||
wrote_something = write_cc_bitmap_as_spupng(sub, context);
|
||||
break;
|
||||
case CCX_OF_SIMPLE_XML:
|
||||
wrote_something = write_cc_bitmap_as_simplexml(sub, context);
|
||||
break;
|
||||
#ifdef WITH_LIBCURL
|
||||
case CCX_OF_CURL:
|
||||
wrote_something = write_cc_bitmap_as_libcurl(sub, context);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
if (sub->type == CC_RAW)
|
||||
{
|
||||
if (context->send_to_srv)
|
||||
net_send_header(sub->data, sub->nb_data);
|
||||
else
|
||||
{
|
||||
ret = write(context->out->fh, sub->data, sub->nb_data);
|
||||
if (ret < sub->nb_data) {
|
||||
mprint("WARNING: Loss of data\n");
|
||||
}
|
||||
}
|
||||
sub->nb_data = 0;
|
||||
}
|
||||
if (sub->type == CC_TEXT)
|
||||
{
|
||||
switch (context->write_format)
|
||||
{
|
||||
case CCX_OF_SRT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_srt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SSA:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_ssa(sub, context);
|
||||
break;
|
||||
case CCX_OF_WEBVTT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_webvtt(sub, context);
|
||||
break;
|
||||
case CCX_OF_SAMI:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_sami(sub, context);
|
||||
break;
|
||||
case CCX_OF_SMPTETT:
|
||||
if (!context->startcredits_displayed && context->start_credits_text != NULL)
|
||||
try_to_add_start_credits(context, sub->start_time);
|
||||
wrote_something = write_cc_subtitle_as_smptett(sub, context);
|
||||
break;
|
||||
case CCX_OF_TRANSCRIPT:
|
||||
wrote_something = write_cc_subtitle_as_transcript(sub, context);
|
||||
break;
|
||||
case CCX_OF_SPUPNG:
|
||||
wrote_something = write_cc_subtitle_as_spupng(sub, context);
|
||||
break;
|
||||
case CCX_OF_SIMPLE_XML:
|
||||
wrote_something = write_cc_subtitle_as_simplexml(sub, context);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
sub->nb_data = 0;
|
||||
}
|
||||
|
||||
if (!sub->nb_data)
|
||||
freep(&sub->data);
|
||||
if (wrote_something && context->force_flush)
|
||||
|
@ -118,15 +118,14 @@ struct encoder_ctx
|
||||
|
||||
/* split-by-sentence stuff */
|
||||
int splitbysentence;
|
||||
LLONG sbs_newblock_start_time; // Used by the split-by-sentence code to know when the current block starts...
|
||||
LLONG sbs_newblock_end_time; // ... and ends
|
||||
ccx_sbs_utf8_character *sbs_newblock;
|
||||
int sbs_newblock_capacity;
|
||||
int sbs_newblock_size;
|
||||
ccx_sbs_utf8_character *sbs_buffer;
|
||||
int sbs_buffer_capacity;
|
||||
int sbs_buffer_size;
|
||||
|
||||
unsigned char * sbs_buffer; /// Storage for sentence-split buffer
|
||||
size_t sbs_handled_len; /// The length of the string in the SBS-buffer, already handled, but preserved for DUP-detection.
|
||||
|
||||
//ccx_sbs_utf8_character *sbs_newblock;
|
||||
LLONG sbs_time_from; // Used by the split-by-sentence code to know when the current block starts...
|
||||
LLONG sbs_time_trim; // ... and ends
|
||||
size_t sbs_capacity;
|
||||
};
|
||||
|
||||
#define INITIAL_ENC_BUFFER_CAPACITY 2048
|
||||
@ -196,10 +195,9 @@ int write_cc_bitmap_as_sami (struct cc_subtitle *sub, struct encoder_
|
||||
int write_cc_bitmap_as_smptett (struct cc_subtitle *sub, struct encoder_ctx *context);
|
||||
int write_cc_bitmap_as_spupng (struct cc_subtitle *sub, struct encoder_ctx *context);
|
||||
int write_cc_bitmap_as_transcript (struct cc_subtitle *sub, struct encoder_ctx *context);
|
||||
int write_cc_bitmap_to_sentence_buffer (struct cc_subtitle *sub, struct encoder_ctx *context);
|
||||
int write_cc_bitmap_as_libcurl (struct cc_subtitle *sub, struct encoder_ctx *context);
|
||||
|
||||
|
||||
struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer (struct cc_subtitle *sub, struct encoder_ctx *context);
|
||||
|
||||
void set_encoder_last_displayed_subs_ms(struct encoder_ctx *ctx, LLONG last_displayed_subs_ms);
|
||||
void set_encoder_subs_delay(struct encoder_ctx *ctx, LLONG subs_delay);
|
||||
|
@ -1,135 +1,457 @@
|
||||
#include "ccx_decoders_common.h"
|
||||
#include "ccx_common_platform.h"
|
||||
#include "ccx_encoders_common.h"
|
||||
#include "spupng_encoder.h"
|
||||
#include "ccx_encoders_spupng.h"
|
||||
#include "utility.h"
|
||||
#include "lib_ccx.h"
|
||||
#include "ocr.h"
|
||||
#include "ccx_decoders_608.h"
|
||||
#include "ccx_decoders_708.h"
|
||||
#include "ccx_decoders_708_output.h"
|
||||
#include "ccx_encoders_xds.h"
|
||||
#include "ccx_encoders_helpers.h"
|
||||
#include "utf8proc.h"
|
||||
#include "debug_def.h"
|
||||
|
||||
#ifdef ENABLE_SHARING
|
||||
#include "ccx_share.h"
|
||||
#endif //ENABLE_SHARING
|
||||
|
||||
void lbl_start_block(LLONG start_time, struct encoder_ctx *context)
|
||||
/**
 * Tells whether <current> points at a character that terminates a sentence.
 *
 * A position is a sentence breaker when it holds:
 *   - the string terminator ('\0'), or
 *   - one of '.', '!' or '?' that is NOT immediately followed by another
 *     '.', '!' or '?' (so in "..." or "?!" only the last mark breaks).
 *
 * @param start   Start of the region being scanned. Currently unused; kept
 *                so that the signature stays stable for callers.
 * @param current Position to test; must point inside a NUL-terminated string.
 * @return 1 if <current> is on a sentence breaker, 0 otherwise.
 */
int sbs_is_pointer_on_sentence_breaker(char * start, char * current)
{
	char c = *current;
	char n;

	// The previous character was never used for the decision, so it is no
	// longer read: reading *(current - 1) when current == start is an
	// out-of-bounds access.
	(void)start;

	// End of string always closes the sentence. Return before looking at
	// the next byte, which would be past the terminator.
	if (0 == c) return 1;

	if ('.' != c && '!' != c && '?' != c)
		return 0;

	// c is non-zero here, so current + 1 is still inside the string
	// (at worst it is the terminator).
	n = *(current + 1);

	// Part of a run of punctuation ("...", "?!"): only the last one breaks.
	if ('.' == n || '!' == n || '?' == n)
		return 0;

	return 1;
}
|
||||
|
||||
void lbl_end_block(LLONG end_time, struct encoder_ctx *context)
|
||||
/**
 * Compares at most <n> characters of two strings, treating any two
 * whitespace characters as equal.
 *
 * TODO: implement real fuzzy comparing. The error counter is NOT
 * implemented: <maxerr> is accepted but ignored.
 *
 * @param a      First NUL-terminated string.
 * @param b      Second NUL-terminated string.
 * @param n      Maximum number of characters to compare.
 * @param maxerr Reserved for the allowed number of mismatches (unused).
 * @return 0 when the first <n> characters match (or both strings end
 *         together before that); non-zero otherwise. NOTE: when one string
 *         ends first, the sign is the opposite of strcmp()'s (1 if <a> is
 *         the shorter one, -1 if <b> is); callers should only test for 0.
 */
int sbs_fuzzy_strncmp(const char * a, const char * b, size_t n, const size_t maxerr)
{
	size_t i;
	char A, B;

	(void)maxerr;

	for (i = 0; i < n; i++)
	{
		A = a[i];
		B = b[i];

		// bound check (line endings)
		if (A == 0)
			return (B == 0) ? 0 : 1;
		if (B == 0)
			return -1;

		if (A == B) continue;

		// <ctype.h> functions need a value representable as unsigned char;
		// a plain (possibly negative) char from UTF-8 text is undefined.
		if (isspace((unsigned char)A) && isspace((unsigned char)B)) continue;

		return (A > B) ? 1 : -1;
	}

	// Reached the bound without finding a difference.
	return 0;
}
|
||||
|
||||
int write_cc_bitmap_to_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context)
|
||||
void sbs_strcpy_without_dup(const unsigned char * str, struct encoder_ctx * context)
|
||||
{
|
||||
int intersect_len;
|
||||
unsigned char * suffix;
|
||||
const unsigned char * prefix = str;
|
||||
|
||||
unsigned long sbs_len;
|
||||
unsigned long str_len;
|
||||
|
||||
str_len = strlen(str);
|
||||
sbs_len = strlen(context->sbs_buffer);
|
||||
|
||||
intersect_len = str_len;
|
||||
if (sbs_len < intersect_len)
|
||||
intersect_len = sbs_len;
|
||||
|
||||
while (intersect_len>0)
|
||||
{
|
||||
suffix = context->sbs_buffer + sbs_len - intersect_len;
|
||||
if (0 == sbs_fuzzy_strncmp(prefix, suffix, intersect_len, 1))
|
||||
{
|
||||
break;
|
||||
}
|
||||
intersect_len--;
|
||||
}
|
||||
|
||||
LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, intersection len [%4d]\n", intersect_len);
|
||||
|
||||
// check, that new string does not contain data, from
|
||||
// already handled sentence:
|
||||
LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, sbslen [%4d] handled len [%4d]\n", sbs_len, context->sbs_handled_len);
|
||||
if ( (sbs_len - intersect_len) >= context->sbs_handled_len)
|
||||
{
|
||||
// there is no intersection.
|
||||
// It is time to clean the buffer. Excepting the last uncomplete sentence
|
||||
strcpy(context->sbs_buffer, context->sbs_buffer + context->sbs_handled_len);
|
||||
context->sbs_handled_len = 0;
|
||||
sbs_len = strlen(context->sbs_buffer);
|
||||
|
||||
LOG_DEBUG("Sentence Buffer: Clean buffer, after BUF [%s]\n\n\n", context->sbs_buffer);
|
||||
}
|
||||
|
||||
if (intersect_len > 0)
|
||||
{
|
||||
// there is a common part (suffix of old sentence equals to prefix of new str)
|
||||
//
|
||||
// remove dup from buffer
|
||||
// we will use an appropriate part from the new string
|
||||
context->sbs_buffer[sbs_len-intersect_len] = 0;
|
||||
}
|
||||
|
||||
sbs_len = strlen(context->sbs_buffer);
|
||||
|
||||
// whitespace control. Add space between subs
|
||||
if (
|
||||
!isspace(str[0]) // not a space char in the beginning of new str
|
||||
&& context->sbs_handled_len >0 // buffer is not empty (there is uncomplete sentence)
|
||||
&& !isspace(context->sbs_buffer[sbs_len-1]) // not a space char at the end of existing buf
|
||||
)
|
||||
{
|
||||
//strcat(context->sbs_buffer, " ");
|
||||
}
|
||||
|
||||
strcat(context->sbs_buffer, str);
|
||||
}
|
||||
|
||||
/**
 * Normalizes a subtitle string in place:
 *   - every whitespace character (tab, newline, ...) becomes a plain space;
 *   - a stand-alone '|' (at the start of the string or after whitespace,
 *     and followed by end of string, whitespace or an apostrophe) is
 *     replaced with 'I', e.g. "| am" -> "I am", "|'m" -> "I'm".
 *
 * @param str NUL-terminated string to fix; NULL is accepted and ignored.
 */
void sbs_str_autofix(unsigned char * str)
{
	size_t i;

	if (NULL == str)
		return;

	for (i = 0; str[i] != 0; i++)
	{
		// replace all whitespaces with spaces:
		if (isspace(str[i]))
		{
			str[i] = ' ';
		}

		if (str[i] != '|')
			continue;

		// '|' must stand alone on the left ...
		if (i != 0 && !isspace(str[i-1]))
			continue;

		// ... and be followed by end of string, whitespace or "'"
		if (str[i+1] == 0 || isspace(str[i+1]) || str[i+1] == '\'')
		{
			// try to convert to "I"
			str[i] = 'I';
		}
	}
}
|
||||
|
||||
/**
|
||||
* Appends the function to the sentence buffer, and returns a list of full sentences (if there are any), or NULL
|
||||
*
|
||||
* @param str Partial (or full) sub to append.
|
||||
* @param time_from Starting timestamp
|
||||
* @param time_trim Ending timestamp
|
||||
* @param context Encoder context
|
||||
* @return New <struct cc_subtitle *> subtitle, or NULL, if <str> doesn't contain the ending part of the sentence. If there are more than one sentence, the remaining sentences will be chained using <result->next> reference.
|
||||
*/
|
||||
struct cc_subtitle * sbs_append_string(unsigned char * str, const LLONG time_from, const LLONG time_trim, struct encoder_ctx * context)
|
||||
{
|
||||
struct cc_subtitle * resub;
|
||||
struct cc_subtitle * tmpsub;
|
||||
|
||||
unsigned char * bp_current;
|
||||
unsigned char * bp_last_break;
|
||||
unsigned char * sbs_undone_start;
|
||||
|
||||
int is_buf_initialized;
|
||||
int required_capacity;
|
||||
int new_capacity;
|
||||
|
||||
LLONG alphanum_total;
|
||||
LLONG alphanum_cur;
|
||||
|
||||
LLONG anychar_total;
|
||||
LLONG anychar_cur;
|
||||
|
||||
LLONG duration;
|
||||
LLONG available_time;
|
||||
int use_alphanum_counters;
|
||||
|
||||
if (! str)
|
||||
return NULL;
|
||||
|
||||
sbs_str_autofix(str);
|
||||
|
||||
is_buf_initialized = (NULL == context->sbs_buffer || context->sbs_capacity == 0)
|
||||
? 0
|
||||
: 1;
|
||||
|
||||
// ===============================
|
||||
// grow sentence buffer
|
||||
// ===============================
|
||||
required_capacity =
|
||||
(is_buf_initialized ? strlen(context->sbs_buffer) : 0) // existing data in buf
|
||||
+ strlen(str) // length of new string
|
||||
+ 1 // trailing \0
|
||||
+ 1 // space control (will add one space , if required)
|
||||
;
|
||||
|
||||
if (required_capacity >= context->sbs_capacity)
|
||||
{
|
||||
new_capacity = context->sbs_capacity;
|
||||
if (! is_buf_initialized) new_capacity = 16;
|
||||
|
||||
while (new_capacity < required_capacity)
|
||||
{
|
||||
// increase NEW_capacity, and check, that increment
|
||||
// is less than 8 Mb. Because 8Mb - it is a lot
|
||||
// for a TEXT buffer. It is weird...
|
||||
new_capacity += (new_capacity > 1048576 * 8)
|
||||
? 1048576 * 8
|
||||
: new_capacity;
|
||||
}
|
||||
|
||||
context->sbs_buffer = (unsigned char *)realloc(
|
||||
context->sbs_buffer,
|
||||
new_capacity * sizeof(/*unsigned char*/ context->sbs_buffer[0] )
|
||||
);
|
||||
|
||||
if (!context->sbs_buffer)
|
||||
fatal(EXIT_NOT_ENOUGH_MEMORY, "Not enough memory in sbs_append_string");
|
||||
|
||||
context->sbs_capacity = new_capacity;
|
||||
|
||||
// if buffer wasn't initialized, we will se trash in buffer.
|
||||
// but we need just empty string, so here we will get it:
|
||||
if (! is_buf_initialized)
|
||||
{
|
||||
// INIT SBS
|
||||
context->sbs_buffer[0] = 0;
|
||||
context->sbs_handled_len = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ===============================
|
||||
// append to buffer
|
||||
//
|
||||
// will update sbs_buffer, sbs_handled_len
|
||||
// ===============================
|
||||
sbs_strcpy_without_dup(str, context);
|
||||
|
||||
// ===============================
|
||||
// break to sentences
|
||||
// ===============================
|
||||
resub = NULL;
|
||||
tmpsub = NULL;
|
||||
|
||||
alphanum_total = 0;
|
||||
alphanum_cur = 0;
|
||||
|
||||
anychar_total = 0;
|
||||
anychar_cur = 0;
|
||||
|
||||
sbs_undone_start = context->sbs_buffer + context->sbs_handled_len;
|
||||
bp_last_break = sbs_undone_start;
|
||||
|
||||
LOG_DEBUG("Sentence Buffer: BEFORE sentence break. Last break: [%s] sbs_undone_start: [%d], sbs_undone: [%s]\n",
|
||||
bp_last_break, context->sbs_handled_len, sbs_undone_start
|
||||
);
|
||||
|
||||
for (bp_current = sbs_undone_start; bp_current && *bp_current; bp_current++)
|
||||
{
|
||||
if (
|
||||
0 < anychar_cur // skip empty!
|
||||
&& sbs_is_pointer_on_sentence_breaker(bp_last_break, bp_current) )
|
||||
{
|
||||
// it is new sentence!
|
||||
tmpsub = malloc(sizeof(struct cc_subtitle));
|
||||
|
||||
tmpsub->type = CC_TEXT;
|
||||
// length of new string:
|
||||
tmpsub->nb_data =
|
||||
bp_current - bp_last_break
|
||||
+ 1 // terminating '\0'
|
||||
+ 1 // skip '.'
|
||||
;
|
||||
tmpsub->data = strndup(bp_last_break, tmpsub->nb_data - 1);
|
||||
tmpsub->got_output = 1;
|
||||
|
||||
tmpsub->start_time = alphanum_cur;
|
||||
alphanum_cur = 0;
|
||||
tmpsub->end_time = anychar_cur;
|
||||
anychar_cur = 0;
|
||||
|
||||
bp_last_break = bp_current + 1;
|
||||
|
||||
// tune last break:
|
||||
while (
|
||||
*bp_last_break
|
||||
&& isspace(*bp_last_break)
|
||||
)
|
||||
{
|
||||
bp_last_break++;
|
||||
}
|
||||
|
||||
// ???
|
||||
// tmpsub->info = NULL;
|
||||
// tmpsub->mode = NULL;
|
||||
|
||||
// link with prev sub:
|
||||
tmpsub->next = NULL;
|
||||
tmpsub->prev = resub;
|
||||
if (NULL != resub)
|
||||
{
|
||||
resub->next = tmpsub;
|
||||
}
|
||||
|
||||
resub = tmpsub;
|
||||
}
|
||||
|
||||
if (*bp_current && isalnum(*bp_current))
|
||||
{
|
||||
alphanum_total++;
|
||||
alphanum_cur++;
|
||||
}
|
||||
anychar_total++;
|
||||
anychar_cur++;
|
||||
}
|
||||
|
||||
// ===============================
|
||||
// okay, we have extracted several sentences, now we should
|
||||
// save the position of the "remainder" - start of the last
|
||||
// incomplete sentece
|
||||
// ===============================
|
||||
if (bp_last_break != sbs_undone_start)
|
||||
{
|
||||
context->sbs_handled_len = bp_last_break - sbs_undone_start;
|
||||
}
|
||||
|
||||
LOG_DEBUG("Sentence Buffer: AFTER sentence break: Handled Len [%4d]\n", context->sbs_handled_len);
|
||||
|
||||
LOG_DEBUG("Sentence Buffer: Alphanum Total: [%4d] Overall chars: [%4d] STRING:[%20s] BUFFER:[%20s]\n", alphanum_total, anychar_total, str, context->sbs_buffer);
|
||||
|
||||
// ===============================
|
||||
// Calculate time spans
|
||||
// ===============================
|
||||
if (!is_buf_initialized)
|
||||
{
|
||||
context->sbs_time_from = time_from;
|
||||
context->sbs_time_trim = time_trim;
|
||||
}
|
||||
|
||||
available_time = time_trim - context->sbs_time_from;
|
||||
use_alphanum_counters = alphanum_total > 0 ? 1 : 0;
|
||||
|
||||
tmpsub = resub;
|
||||
while (tmpsub)
|
||||
{
|
||||
alphanum_cur = tmpsub->start_time;
|
||||
anychar_cur = tmpsub->end_time;
|
||||
|
||||
if (use_alphanum_counters)
|
||||
{
|
||||
duration = available_time * alphanum_cur / alphanum_total;
|
||||
}
|
||||
else
|
||||
{
|
||||
duration = available_time * anychar_cur / anychar_total;
|
||||
}
|
||||
|
||||
tmpsub->start_time = context->sbs_time_from;
|
||||
tmpsub->end_time = tmpsub->start_time + duration;
|
||||
|
||||
context->sbs_time_from = tmpsub->end_time + 1;
|
||||
|
||||
tmpsub = tmpsub->next;
|
||||
}
|
||||
|
||||
return resub;
|
||||
}
|
||||
|
||||
struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context)
|
||||
{
|
||||
int ret = 0;
|
||||
#ifdef ENABLE_OCR
|
||||
struct cc_bitmap* rect;
|
||||
|
||||
LLONG ms_start, ms_end;
|
||||
int used;
|
||||
int i = 0;
|
||||
char *str;
|
||||
|
||||
if (context->prev_start != -1 && (sub->flags & SUB_EOD_MARKER))
|
||||
// this is a sub with a full sentence (or chain of such subs)
|
||||
struct cc_subtitle * resub = NULL;
|
||||
|
||||
#ifdef ENABLE_OCR
|
||||
|
||||
if (sub->flags & SUB_EOD_MARKER)
|
||||
{
|
||||
ms_start = context->prev_start;
|
||||
ms_end = sub->start_time;
|
||||
// the last sub from input
|
||||
|
||||
if (context->prev_start == -1)
|
||||
{
|
||||
ms_start = 1;
|
||||
ms_end = sub->start_time;
|
||||
}
|
||||
else
|
||||
{
|
||||
ms_start = context->prev_start;
|
||||
ms_end = sub->start_time;
|
||||
}
|
||||
}
|
||||
else if (!(sub->flags & SUB_EOD_MARKER))
|
||||
else
|
||||
{
|
||||
// not the last sub from input
|
||||
ms_start = sub->start_time;
|
||||
ms_end = sub->end_time;
|
||||
}
|
||||
else if (context->prev_start == -1 && (sub->flags & SUB_EOD_MARKER))
|
||||
{
|
||||
ms_start = 1;
|
||||
ms_end = sub->start_time;
|
||||
}
|
||||
|
||||
if (sub->nb_data == 0)
|
||||
return ret;
|
||||
rect = sub->data;
|
||||
return 0;
|
||||
|
||||
if (sub->flags & SUB_EOD_MARKER)
|
||||
context->prev_start = sub->start_time;
|
||||
|
||||
|
||||
if (rect[0].ocr_text && *(rect[0].ocr_text))
|
||||
str = paraof_ocrtext(sub, " ", 1);
|
||||
if (str)
|
||||
{
|
||||
lbl_start_block(ms_start, context);
|
||||
if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER))
|
||||
{
|
||||
char *token = NULL;
|
||||
token = paraof_ocrtext(sub, " ", 1); // Get text with spaces instead of newlines
|
||||
uint32_t offset=0;
|
||||
utf8proc_ssize_t ls; // Last size
|
||||
char *s = token;
|
||||
int32_t uc;
|
||||
while ((ls=utf8proc_iterate(s, -1, &uc)))
|
||||
{
|
||||
ccx_sbs_utf8_character sbsc;
|
||||
// Note: We don't care about uc here, since we will be writing the encoded bytes, not the code points in binary.
|
||||
//TODO: Deal with ls < 0
|
||||
if (!uc) // End of string
|
||||
break;
|
||||
printf("%3ld | %08X | %c %c %c %c\n", ls, uc, ((uc & 0xFF000000) >> 24), ((uc & 0xFF0000) >> 16),
|
||||
((uc & 0xFF00) >> 8), ( uc & 0xFF));
|
||||
sbsc.ch = uc;
|
||||
sbsc.encoded[0] = 0; sbsc.encoded[1] = 0; sbsc.encoded[2] = 0; sbsc.encoded[3] = 0;
|
||||
memcpy(sbsc.encoded, s, ls);
|
||||
sbsc.enc_len = ls;
|
||||
sbsc.ts = 0; // We don't know yet
|
||||
lbl_add_character(context, sbsc);
|
||||
s += ls;
|
||||
|
||||
// TO-DO: Add each of these characters to the buffer, splitting the timestamps. Remember to add character length to the array
|
||||
}
|
||||
printf("-------\n");
|
||||
|
||||
/*
|
||||
while (token)
|
||||
{
|
||||
char *newline_pos = strstr(token, context->encoded_crlf);
|
||||
if (!newline_pos)
|
||||
{
|
||||
fdprintf(context->out->fh, "%s", token);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
while (token != newline_pos)
|
||||
{
|
||||
fdprintf(context->out->fh, "%c", *token);
|
||||
token++;
|
||||
}
|
||||
token += context->encoded_crlf_length;
|
||||
fdprintf(context->out->fh, "%c", ' ');
|
||||
}
|
||||
}*/
|
||||
|
||||
resub = sbs_append_string(str, ms_start, ms_end, context);
|
||||
}
|
||||
lbl_end_block(ms_end, context);
|
||||
freep(&str);
|
||||
}
|
||||
|
||||
for(i = 0, rect = sub->data; i < sub->nb_data; i++, rect++)
|
||||
{
|
||||
freep(rect->data);
|
||||
freep(rect->data+1);
|
||||
}
|
||||
#endif
|
||||
|
||||
sub->nb_data = 0;
|
||||
freep(&sub->data);
|
||||
return ret;
|
||||
return resub;
|
||||
|
||||
}
|
||||
|
11
src/lib_ccx/debug_def.h
Normal file
11
src/lib_ccx/debug_def.h
Normal file
@ -0,0 +1,11 @@
|
||||
#ifndef _DEBUG_DEF_H_
|
||||
#define _DEBUG_DEF_H_
|
||||
|
||||
#ifdef DEBUG
|
||||
#define LOG_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
/* Non-debug build: expand to a no-op expression that swallows its arguments,
 * so `LOG_DEBUG(...);` is exactly one statement (safe in an unbraced if/else)
 * and the arguments are not evaluated. */
#define LOG_DEBUG(...) ((void)0)
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
59
tests/Makefile
Normal file
59
tests/Makefile
Normal file
@ -0,0 +1,59 @@
|
||||
SHELL = /bin/sh
|
||||
|
||||
CC=gcc
|
||||
# SYS := $(shell gcc -dumpmachine)
|
||||
CFLAGS=-O0 -std=gnu99 -D ENABLE_OCR -g -ggdb -rdynamic
|
||||
#-Q -da -v
|
||||
|
||||
# enable COVERAGE
|
||||
# CFLAGS+=-fprofile-arcs -ftest-coverage
|
||||
|
||||
# add debug flag
|
||||
ifdef DEBUG
|
||||
CFLAGS+=-DDEBUG
|
||||
endif
|
||||
|
||||
#ALL_FLAGS = -Wno-write-strings -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT
|
||||
LDFLAGS=-lm -g
|
||||
|
||||
CFLAGS+=$(shell pkg-config --cflags check)
|
||||
LDFLAGS+=$(shell pkg-config --libs check)
|
||||
|
||||
# TODO: need to rewrite this. Need new way to load sources for testing
|
||||
SRC=$(wildcard ../src/lib_ccx/ccx_encoders_splitbysentence.c)
|
||||
OBJS=
|
||||
|
||||
SRC_SUITE=$(wildcard *_suite.c)
|
||||
OBJ_SUITE=$(patsubst %_suite.c, %_suite.o, $(SRC_SUITE))
|
||||
|
||||
OBJS+=$(OBJ_SUITE)
|
||||
|
||||
all: clean test
|
||||
|
||||
%.o: %.c
|
||||
# explicit output name : -o $@
|
||||
$(CC) -c $(ALL_FLAGS) $(CFLAGS) $<
|
||||
|
||||
runtest: $(OBJS)
|
||||
@echo "+----------------------------------------------+"
|
||||
@echo "| BUILD TESTS |"
|
||||
@echo "+----------------------------------------------+"
|
||||
$(CC) -c $(ALL_FLAGS) $(CFLAGS) $@.c
|
||||
$(CC) $(SRC) $@.o $^ $(ALL_FLAGS) $(CFLAGS) $(LDFLAGS) -o $@
|
||||
|
||||
.PHONY: test
|
||||
test: runtest
|
||||
@echo "+----------------------------------------------+"
|
||||
@echo "| START TESTS |"
|
||||
@echo "+----------------------------------------------+"
|
||||
./runtest
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
rm runtest || true
|
||||
rm *.o || true
|
||||
# coverage info
|
||||
rm *.gcda || true
|
||||
rm *.gcno || true
|
||||
# debug info
|
||||
rm *.c.* || true
|
43
tests/README.md
Normal file
43
tests/README.md
Normal file
@ -0,0 +1,43 @@
|
||||
# UNIT TESTING
|
||||
|
||||
This folder contains an archetype and several unit tests for CCExtractor.
|
||||
|
||||
## RUN TESTS
|
||||
|
||||
```shell
|
||||
cd tests
|
||||
make
|
||||
```
|
||||
|
||||
This will build and run all test suites.
|
||||
|
||||
If you want MORE output:
|
||||
|
||||
```shell
|
||||
DEBUG=1 make
|
||||
```
|
||||
|
||||
Where `DEBUG` is just an environment variable.
|
||||
|
||||
## DEBUGGING
|
||||
|
||||
If tests fail after your changes, you can debug them (almost all flags needed for this are already set in `tests/Makefile`).
|
||||
|
||||
Run:
|
||||
|
||||
```shell
|
||||
# build test runner
|
||||
make
|
||||
# load the test runner into the debugger:
|
||||
gdb runtest
|
||||
|
||||
# run under debugger:
|
||||
(gdb) run
|
||||
|
||||
# on segfault:
|
||||
(gdb) where
|
||||
```
|
||||
|
||||
## DEPENDENCIES
|
||||
|
||||
Tests are built around this library: [**libcheck**](https://github.com/libcheck/check), here is [**documentation**](https://libcheck.github.io/check/)
|
305
tests/ccx_encoders_splitbysentence_suite.c
Normal file
305
tests/ccx_encoders_splitbysentence_suite.c
Normal file
@ -0,0 +1,305 @@
|
||||
#include <check.h>
|
||||
#include "ccx_encoders_splitbysentence_suite.h"
|
||||
|
||||
// -------------------------------------
|
||||
// MOCKS
|
||||
// -------------------------------------
|
||||
typedef int64_t LLONG;
|
||||
#include "../src/lib_ccx/ccx_encoders_common.h"
|
||||
|
||||
// -------------------------------------
|
||||
// Private SBS-functions (for testing only)
|
||||
// -------------------------------------
|
||||
struct cc_subtitle * sbs_append_string(unsigned char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context);
|
||||
|
||||
// -------------------------------------
|
||||
// Helpers
|
||||
// -------------------------------------
|
||||
/**
 * Test helper: build a heap-allocated CC_BITMAP subtitle that carries plain text.
 *
 * The text is duplicated into sub->data (the paraof_ocrtext() mock in this file
 * returns sub->data as the "OCR" result) and nb_data is set to the text length.
 *
 * @param str        NUL-terminated text to store in the subtitle.
 * @param time_from  Value stored in sub->start_time.
 * @param time_trim  Value stored in sub->end_time.
 * @return Newly allocated subtitle, or NULL if an allocation fails.
 */
struct cc_subtitle * helper_create_sub(char * str, LLONG time_from, LLONG time_trim)
{
	// calloc: fields not assigned below (flags, next, prev, ...) start out
	// zeroed instead of holding indeterminate values.
	struct cc_subtitle * sub = calloc(1, sizeof *sub);
	if (!sub)
		return NULL;

	sub->data = strdup(str);
	if (!sub->data)
	{
		free(sub);
		return NULL;
	}

	sub->type = CC_BITMAP;
	// Honour the caller's timestamps (they used to be ignored in favour of a fixed 1..100).
	sub->start_time = time_from;
	sub->end_time = time_trim;
	sub->nb_data = strlen(sub->data);

	return sub;
}
|
||||
|
||||
struct cc_subtitle * helper_sbs_append_string(char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context)
|
||||
{
|
||||
char * str1;
|
||||
struct cc_subtitle * sub;
|
||||
|
||||
str1 = strdup(str);
|
||||
sub = sbs_append_string(str1, time_from, time_trim, context);
|
||||
free(str1);
|
||||
return sub;
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
// MOCKS
|
||||
// -------------------------------------
|
||||
struct encoder_ctx * context;
|
||||
|
||||
void freep(void * obj){
|
||||
}
|
||||
void fatal(int x, void * obj){
|
||||
}
|
||||
|
||||
unsigned char * paraof_ocrtext(void * sub) {
|
||||
// this is OCR -> text converter.
|
||||
// now, in our test cases, we will pass TEXT instead of OCR.
|
||||
// and will return passed text as result
|
||||
|
||||
return ((struct cc_subtitle *)sub)->data;
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
// TEST preparations
|
||||
// -------------------------------------
|
||||
void setup(void)
|
||||
{
|
||||
context = (struct encoder_ctx *)malloc(sizeof(struct encoder_ctx));
|
||||
context->sbs_buffer = NULL;
|
||||
context->sbs_capacity = 0;
|
||||
}
|
||||
|
||||
void teardown(void)
|
||||
{
|
||||
free(context);
|
||||
}
|
||||
|
||||
// -------------------------------------
|
||||
// TESTS
|
||||
// -------------------------------------
|
||||
START_TEST(test_sbs_one_simple_sentence)
|
||||
{
|
||||
struct cc_subtitle * sub = helper_create_sub("Simple sentence.", 1, 100);
|
||||
struct cc_subtitle * out = reformat_cc_bitmap_through_sentence_buffer(sub, context);
|
||||
|
||||
ck_assert_ptr_ne(out, NULL);
|
||||
ck_assert_str_eq(out->data, "Simple sentence.");
|
||||
ck_assert_ptr_eq(out->next, NULL);
|
||||
ck_assert_ptr_eq(out->prev, NULL);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
|
||||
START_TEST(test_sbs_two_sentences_with_rep)
|
||||
{
|
||||
struct cc_subtitle * sub1 = helper_create_sub("asdf", 1, 100);
|
||||
struct cc_subtitle * out1 = reformat_cc_bitmap_through_sentence_buffer(sub1, context);
|
||||
ck_assert_ptr_eq(out1, NULL);
|
||||
|
||||
// second sub:
|
||||
struct cc_subtitle * sub2 = helper_create_sub("asdf Hello.", 101, 200);
|
||||
struct cc_subtitle * out2 = reformat_cc_bitmap_through_sentence_buffer(sub2, context);
|
||||
|
||||
ck_assert_ptr_ne(out2, NULL);
|
||||
ck_assert_str_eq(out2->data, "asdf Hello.");
|
||||
ck_assert_ptr_eq(out2->next, NULL);
|
||||
ck_assert_ptr_eq(out2->prev, NULL);}
|
||||
END_TEST
|
||||
|
||||
|
||||
START_TEST(test_sbs_append_string_two_separate)
|
||||
{
|
||||
unsigned char * test_strings[] = {
|
||||
"First string.",
|
||||
"Second string."
|
||||
};
|
||||
struct cc_subtitle * sub;
|
||||
unsigned char * str;
|
||||
|
||||
// first string
|
||||
str = strdup(test_strings[0]);
|
||||
sub = NULL;
|
||||
sub = sbs_append_string(str, 1, 20, context);
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, test_strings[0]);
|
||||
ck_assert_int_eq(sub->start_time, 1);
|
||||
ck_assert_int_eq(sub->end_time, 20);
|
||||
|
||||
// second string:
|
||||
str = strdup(test_strings[1]);
|
||||
sub = NULL;
|
||||
sub = sbs_append_string(str, 21, 40, context);
|
||||
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, test_strings[1]);
|
||||
ck_assert_int_eq(sub->start_time, 21);
|
||||
ck_assert_int_eq(sub->end_time, 40);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_sbs_append_string_two_with_broken_sentence)
|
||||
{
|
||||
// important !!
|
||||
// summary len == 32
|
||||
char * test_strings[] = {
|
||||
"First string",
|
||||
" ends here, deabbea."
|
||||
};
|
||||
struct cc_subtitle * sub;
|
||||
char * str;
|
||||
|
||||
// first string
|
||||
str = strdup(test_strings[0]);
|
||||
sub = sbs_append_string(str, 1, 3, context);
|
||||
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// second string:
|
||||
str = strdup(test_strings[1]);
|
||||
sub = sbs_append_string(str, 4, 5, context);
|
||||
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, "First string ends here, deabbea.");
|
||||
ck_assert_int_eq(sub->start_time, 1);
|
||||
ck_assert_int_eq(sub->end_time, 5);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
START_TEST(test_sbs_append_string_two_intersecting)
|
||||
{
|
||||
char * test_strings[] = {
|
||||
"First string",
|
||||
"First string ends here."
|
||||
};
|
||||
struct cc_subtitle * sub;
|
||||
char * str;
|
||||
|
||||
// first string
|
||||
str = strdup(test_strings[0]);
|
||||
sub = sbs_append_string(str, 1, 20, context);
|
||||
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
free(sub);
|
||||
|
||||
// second string:
|
||||
str = strdup(test_strings[1]);
|
||||
//printf("second string: [%s]\n", str);
|
||||
sub = sbs_append_string(str, 21, 40, context);
|
||||
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, "First string ends here.");
|
||||
ck_assert_int_eq(sub->start_time, 1);
|
||||
ck_assert_int_eq(sub->end_time, 40);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
|
||||
START_TEST(test_sbs_append_string_real_data_1)
|
||||
{
|
||||
struct cc_subtitle * sub;
|
||||
|
||||
// 1
|
||||
sub = helper_sbs_append_string("Oleon",
|
||||
1, 0, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 2
|
||||
sub = helper_sbs_append_string("Oleon costs.",
|
||||
1, 189, context);
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, "Oleon costs.");
|
||||
|
||||
// 3
|
||||
sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
|
||||
Didn't",
|
||||
190, 889, context);
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, "buried in the annex, 95 Oleon costs.");
|
||||
ck_assert_int_eq(sub->start_time, 190); // = <sub start>
|
||||
ck_assert_int_eq(sub->end_time, 783); // = <sub start> + <available time,889-190=699 > * <sentence alphanum, 28> / <sub alphanum, 33>
|
||||
ck_assert_ptr_eq(sub->next, NULL);
|
||||
|
||||
// 4
|
||||
sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
|
||||
Didn't want",
|
||||
890, 1129, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 5
|
||||
sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
|
||||
Didn't want to",
|
||||
1130, 1359, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 6
|
||||
sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
|
||||
Didn't want to acknowledge",
|
||||
1360, 2059, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 7
|
||||
sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
|
||||
Didn't want to acknowledge the",
|
||||
2060, 2299, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 9
|
||||
sub = helper_sbs_append_string("Didn't want to acknowledge the\n\
|
||||
pressures on hospitals, schools and",
|
||||
2300, 5019, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 13
|
||||
sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
|
||||
infrastructure.",
|
||||
5020, 5159, context);
|
||||
ck_assert_ptr_ne(sub, NULL);
|
||||
ck_assert_str_eq(sub->data, "Didn't want to acknowledge the pressures on hospitals, schools and infrastructure.");
|
||||
ck_assert_int_eq(sub->start_time, 784);
|
||||
ck_assert_int_eq(sub->end_time, 5159);
|
||||
ck_assert_ptr_eq(sub->next, NULL);
|
||||
|
||||
// 14
|
||||
sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
|
||||
infrastructure. If",
|
||||
5160, 5529, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// 16
|
||||
sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
|
||||
infrastructure. If we go",
|
||||
5530, 6559, context);
|
||||
ck_assert_ptr_eq(sub, NULL);
|
||||
|
||||
// ck_assert_int_eq(sub->start_time, 1);
|
||||
// ck_assert_int_eq(sub->end_time, 40);
|
||||
}
|
||||
END_TEST
|
||||
|
||||
|
||||
Suite * ccx_encoders_splitbysentence_suite(void)
|
||||
{
|
||||
Suite *s;
|
||||
TCase *tc_core;
|
||||
|
||||
s = suite_create("Sentence Buffer");
|
||||
|
||||
/* Overall tests */
|
||||
tc_core = tcase_create("SB: Overall");
|
||||
|
||||
tcase_add_checked_fixture(tc_core, setup, teardown);
|
||||
tcase_add_test(tc_core, test_sbs_one_simple_sentence);
|
||||
tcase_add_test(tc_core, test_sbs_two_sentences_with_rep);
|
||||
suite_add_tcase(s, tc_core);
|
||||
|
||||
/**/
|
||||
TCase *tc_append_string;
|
||||
tc_append_string = tcase_create("SB: append_string");
|
||||
tcase_add_checked_fixture(tc_append_string, setup, teardown);
|
||||
|
||||
tcase_add_test(tc_append_string, test_sbs_append_string_two_separate);
|
||||
tcase_add_test(tc_append_string, test_sbs_append_string_two_with_broken_sentence);
|
||||
tcase_add_test(tc_append_string, test_sbs_append_string_two_intersecting);
|
||||
tcase_add_test(tc_append_string, test_sbs_append_string_real_data_1);
|
||||
|
||||
suite_add_tcase(s, tc_append_string);
|
||||
|
||||
return s;
|
||||
}
|
4
tests/ccx_encoders_splitbysentence_suite.h
Normal file
4
tests/ccx_encoders_splitbysentence_suite.h
Normal file
@ -0,0 +1,4 @@
|
||||
// -------------------------------------
|
||||
// SUITE
|
||||
// -------------------------------------
|
||||
Suite * ccx_encoders_splitbysentence_suite(void);
|
21
tests/runtest.c
Normal file
21
tests/runtest.c
Normal file
@ -0,0 +1,21 @@
|
||||
#include <check.h>
|
||||
|
||||
// TESTS:
|
||||
#include "ccx_encoders_splitbysentence_suite.h"
|
||||
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int number_failed;
|
||||
Suite *s;
|
||||
SRunner *sr;
|
||||
|
||||
s = ccx_encoders_splitbysentence_suite();
|
||||
sr = srunner_create(s);
|
||||
srunner_set_fork_status(sr, CK_NOFORK);
|
||||
|
||||
srunner_run_all(sr, CK_NORMAL);
|
||||
number_failed = srunner_ntests_failed(sr);
|
||||
srunner_free(sr);
|
||||
return (number_failed == 0) ? 0 : 1;
|
||||
}
|
Loading…
Reference in New Issue
Block a user