Break incoming subs into sentences (through a buffer), and remove duplicates

2024-12-24 20:01:42 +00:00 · 2016-12-02 13:36:33 +05:00 · 2016-12-02 13:36:33 +05:00 · 66393a80f2
commit 66393a80f2
parent d453d9327e
10 changed files with 1060 additions and 294 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,9 @@
+####
+# Ignore tests tmp files and results
+tests/runtest
+tests/**/*.gcda
+tests/**/*.gcno
+
 ####
 # Ignore CVS related files

--- a/src/lib_ccx/ccx_encoders_common.c
+++ b/src/lib_ccx/ccx_encoders_common.c
@ -957,14 +957,10 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt)
 	ctx->force_flush = opt->force_flush;
 	ctx->ucla = opt->ucla;
 	ctx->splitbysentence = opt->splitbysentence;
-	ctx->sbs_newblock_start_time = -1;
-	ctx->sbs_newblock_end_time = -1;
-	ctx->sbs_newblock = NULL;
-	ctx->sbs_newblock_capacity = 0;
-	ctx->sbs_newblock_size = 0;
+	ctx->sbs_time_from = -1;
+	ctx->sbs_time_trim = -1;
+	ctx->sbs_capacity = 0;
 	ctx->sbs_buffer = NULL;
-	ctx->sbs_buffer_capacity = 0;
-	ctx->sbs_buffer_size = 0;

 	ctx->subline = (unsigned char *) malloc (SUBLINESIZE);
 	if(!ctx->subline)
@ -1045,203 +1041,204 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
 		// Write to a buffer that is later s+plit to generate split
 		// in sentences
 		if (sub->type == CC_BITMAP)
-			wrote_something = write_cc_bitmap_to_sentence_buffer(sub, context);
+			sub = reformat_cc_bitmap_through_sentence_buffer(sub, context);
+
+		if (NULL==sub)
+			return wrote_something;
 	}
-	else
+	// Write subtitles as they come
+	if (sub->type == CC_608)
 	{
-		// Write subtitles as they come
-		if (sub->type == CC_608)
+		struct eia608_screen *data = NULL;
+		struct ccx_s_write *out;
+		for (data = sub->data; sub->nb_data; sub->nb_data--, data++)
 		{
-			struct eia608_screen *data = NULL;
-			struct ccx_s_write *out;
-			for (data = sub->data; sub->nb_data; sub->nb_data--, data++)
+			// Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example)
+			out = get_output_ctx(context, data->my_field);
+
+			if (data->format == SFORMAT_XDS)
 			{
-				// Determine context based on channel. This replaces the code that was above, as this was incomplete (for cases where -12 was used for example)
-				out = get_output_ctx(context, data->my_field);
-
-				if (data->format == SFORMAT_XDS)
-				{
-					data->end_time = data->end_time + context->subs_delay;
-					xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class);
-					if (data->xds_len > 0)
-					{
-						ret = write(out->fh, data->xds_str, data->xds_len);
-						if (ret < data->xds_len)
-						{
-							mprint("WARNING:Loss of data\n");
-						}
-					}
-					freep(&data->xds_str);
-					write_newline(context, 0);
-					continue;
-				}
-
 				data->end_time = data->end_time + context->subs_delay;
-				switch (context->write_format)
+				xds_write_transcript_line_prefix(context, out, data->start_time, data->end_time, data->cur_xds_packet_class);
+				if (data->xds_len > 0)
 				{
-					case CCX_OF_SRT:
-						if (!context->startcredits_displayed && context->start_credits_text != NULL)
-							try_to_add_start_credits(context, data->start_time);
-						wrote_something = write_cc_buffer_as_srt(data, context);
-						break;
-					case CCX_OF_SSA:
-						if (!context->startcredits_displayed && context->start_credits_text != NULL)
-							try_to_add_start_credits(context, data->start_time);
-						wrote_something = write_cc_buffer_as_ssa(data, context);
-						break;
-					case CCX_OF_G608:
-						wrote_something = write_cc_buffer_as_g608(data, context);
-						break;
-					case CCX_OF_WEBVTT:
-						if (!context->startcredits_displayed && context->start_credits_text != NULL)
-							try_to_add_start_credits(context, data->start_time);
-						wrote_something = write_cc_buffer_as_webvtt(data, context);
-						break;
-					case CCX_OF_SAMI:
-						if (!context->startcredits_displayed && context->start_credits_text != NULL)
-							try_to_add_start_credits(context, data->start_time);
-						wrote_something = write_cc_buffer_as_sami(data, context);
-						break;
-					case CCX_OF_SMPTETT:
-						if (!context->startcredits_displayed && context->start_credits_text != NULL)
-							try_to_add_start_credits(context, data->start_time);
-						wrote_something = write_cc_buffer_as_smptett(data, context);
-						break;
-					case CCX_OF_TRANSCRIPT:
-						wrote_something = write_cc_buffer_as_transcript2(data, context);
-						break;
-					case CCX_OF_SPUPNG:
-						wrote_something = write_cc_buffer_as_spupng(data, context);
-						break;
-					case CCX_OF_SIMPLE_XML:
-						if (ccx_options.keep_output_closed && context->out->temporarily_closed)
-						{
-							temporarily_open_output(context->out);
-							write_subtitle_file_header(context, context->out);
-						}
-						wrote_something = write_cc_buffer_as_simplexml(data, context);
-						if (ccx_options.keep_output_closed)
-						{
-							write_subtitle_file_footer(context, context->out);
-							temporarily_close_output(context->out);
-						}
-						break;
-					default:
-						break;
+					ret = write(out->fh, data->xds_str, data->xds_len);
+					if (ret < data->xds_len)
+					{
+						mprint("WARNING:Loss of data\n");
+					}
 				}
-				if (wrote_something)
-					context->last_displayed_subs_ms = data->end_time;
-
-				if (context->gui_mode_reports)
-					write_cc_buffer_to_gui(sub->data, context);
-			}
-			freep(&sub->data);
-		}
-		if (sub->type == CC_BITMAP)
-		{
-			switch (context->write_format)
-			{
-			case CCX_OF_SRT:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_bitmap_as_srt(sub, context);
-				break;
-			case CCX_OF_SSA:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-					wrote_something = write_cc_bitmap_as_ssa(sub, context);
-				break;
-			case CCX_OF_WEBVTT:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_bitmap_as_webvtt(sub, context);
-				break;
-			case CCX_OF_SAMI:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_bitmap_as_sami(sub, context);
-				break;
-			case CCX_OF_SMPTETT:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_bitmap_as_smptett(sub, context);
-				break;
-			case CCX_OF_TRANSCRIPT:
-				wrote_something = write_cc_bitmap_as_transcript(sub, context);
-				break;
-			case CCX_OF_SPUPNG:
-				wrote_something = write_cc_bitmap_as_spupng(sub, context);
-				break;
-			case CCX_OF_SIMPLE_XML:
-				wrote_something = write_cc_bitmap_as_simplexml(sub, context);
-				break;
-#ifdef WITH_LIBCURL
-			case CCX_OF_CURL:
-				wrote_something = write_cc_bitmap_as_libcurl(sub, context);
-				break;
-#endif
-			default:
-				break;
+				freep(&data->xds_str);
+				write_newline(context, 0);
+				continue;
 			}

-		}
-		if (sub->type == CC_RAW)
-		{
-			if (context->send_to_srv)
-				net_send_header(sub->data, sub->nb_data);
-			else
-			{
-				ret = write(context->out->fh, sub->data, sub->nb_data);
-				if (ret < sub->nb_data) {
-					mprint("WARNING: Loss of data\n");
-				}
-			}
-			sub->nb_data = 0;
-		}
-		if (sub->type == CC_TEXT)
-		{
+			data->end_time = data->end_time + context->subs_delay;
 			switch (context->write_format)
 			{
-			case CCX_OF_SRT:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_subtitle_as_srt(sub, context);
-				break;
-			case CCX_OF_SSA:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_subtitle_as_ssa(sub, context);
-				break;
-			case CCX_OF_WEBVTT:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_subtitle_as_webvtt(sub, context);
-				break;
-			case CCX_OF_SAMI:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_subtitle_as_sami(sub, context);
-				break;
-			case CCX_OF_SMPTETT:
-				if (!context->startcredits_displayed && context->start_credits_text != NULL)
-					try_to_add_start_credits(context, sub->start_time);
-				wrote_something = write_cc_subtitle_as_smptett(sub, context);
-				break;
-			case CCX_OF_TRANSCRIPT:
-				wrote_something = write_cc_subtitle_as_transcript(sub, context);
-				break;
-			case CCX_OF_SPUPNG:
-				wrote_something = write_cc_subtitle_as_spupng(sub, context);
-				break;
-			case CCX_OF_SIMPLE_XML:
-				wrote_something = write_cc_subtitle_as_simplexml(sub, context);
-				break;
-			default:
-				break;
+				case CCX_OF_SRT:
+					if (!context->startcredits_displayed && context->start_credits_text != NULL)
+						try_to_add_start_credits(context, data->start_time);
+					wrote_something = write_cc_buffer_as_srt(data, context);
+					break;
+				case CCX_OF_SSA:
+					if (!context->startcredits_displayed && context->start_credits_text != NULL)
+						try_to_add_start_credits(context, data->start_time);
+					wrote_something = write_cc_buffer_as_ssa(data, context);
+					break;
+				case CCX_OF_G608:
+					wrote_something = write_cc_buffer_as_g608(data, context);
+					break;
+				case CCX_OF_WEBVTT:
+					if (!context->startcredits_displayed && context->start_credits_text != NULL)
+						try_to_add_start_credits(context, data->start_time);
+					wrote_something = write_cc_buffer_as_webvtt(data, context);
+					break;
+				case CCX_OF_SAMI:
+					if (!context->startcredits_displayed && context->start_credits_text != NULL)
+						try_to_add_start_credits(context, data->start_time);
+					wrote_something = write_cc_buffer_as_sami(data, context);
+					break;
+				case CCX_OF_SMPTETT:
+					if (!context->startcredits_displayed && context->start_credits_text != NULL)
+						try_to_add_start_credits(context, data->start_time);
+					wrote_something = write_cc_buffer_as_smptett(data, context);
+					break;
+				case CCX_OF_TRANSCRIPT:
+					wrote_something = write_cc_buffer_as_transcript2(data, context);
+					break;
+				case CCX_OF_SPUPNG:
+					wrote_something = write_cc_buffer_as_spupng(data, context);
+					break;
+				case CCX_OF_SIMPLE_XML:
+					if (ccx_options.keep_output_closed && context->out->temporarily_closed)
+					{
+						temporarily_open_output(context->out);
+						write_subtitle_file_header(context, context->out);
+					}
+					wrote_something = write_cc_buffer_as_simplexml(data, context);
+					if (ccx_options.keep_output_closed)
+					{
+						write_subtitle_file_footer(context, context->out);
+						temporarily_close_output(context->out);
+					}
+					break;
+				default:
+					break;
 			}
-			sub->nb_data = 0;
+			if (wrote_something)
+				context->last_displayed_subs_ms = data->end_time;
+
+			if (context->gui_mode_reports)
+				write_cc_buffer_to_gui(sub->data, context);
 		}
+		freep(&sub->data);
 	}
+	if (sub->type == CC_BITMAP)
+	{
+		switch (context->write_format)
+		{
+		case CCX_OF_SRT:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_bitmap_as_srt(sub, context);
+			break;
+		case CCX_OF_SSA:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+				wrote_something = write_cc_bitmap_as_ssa(sub, context);
+			break;
+		case CCX_OF_WEBVTT:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_bitmap_as_webvtt(sub, context);
+			break;
+		case CCX_OF_SAMI:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_bitmap_as_sami(sub, context);
+			break;
+		case CCX_OF_SMPTETT:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_bitmap_as_smptett(sub, context);
+			break;
+		case CCX_OF_TRANSCRIPT:
+			wrote_something = write_cc_bitmap_as_transcript(sub, context);
+			break;
+		case CCX_OF_SPUPNG:
+			wrote_something = write_cc_bitmap_as_spupng(sub, context);
+			break;
+		case CCX_OF_SIMPLE_XML:
+			wrote_something = write_cc_bitmap_as_simplexml(sub, context);
+			break;
+#ifdef WITH_LIBCURL
+		case CCX_OF_CURL:
+			wrote_something = write_cc_bitmap_as_libcurl(sub, context);
+			break;
+#endif
+		default:
+			break;
+		}
+
+	}
+	if (sub->type == CC_RAW)
+	{
+		if (context->send_to_srv)
+			net_send_header(sub->data, sub->nb_data);
+		else
+		{
+			ret = write(context->out->fh, sub->data, sub->nb_data);
+			if (ret < sub->nb_data) {
+				mprint("WARNING: Loss of data\n");
+			}
+		}
+		sub->nb_data = 0;
+	}
+	if (sub->type == CC_TEXT)
+	{
+		switch (context->write_format)
+		{
+		case CCX_OF_SRT:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_subtitle_as_srt(sub, context);
+			break;
+		case CCX_OF_SSA:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_subtitle_as_ssa(sub, context);
+			break;
+		case CCX_OF_WEBVTT:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_subtitle_as_webvtt(sub, context);
+			break;
+		case CCX_OF_SAMI:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_subtitle_as_sami(sub, context);
+			break;
+		case CCX_OF_SMPTETT:
+			if (!context->startcredits_displayed && context->start_credits_text != NULL)
+				try_to_add_start_credits(context, sub->start_time);
+			wrote_something = write_cc_subtitle_as_smptett(sub, context);
+			break;
+		case CCX_OF_TRANSCRIPT:
+			wrote_something = write_cc_subtitle_as_transcript(sub, context);
+			break;
+		case CCX_OF_SPUPNG:
+			wrote_something = write_cc_subtitle_as_spupng(sub, context);
+			break;
+		case CCX_OF_SIMPLE_XML:
+			wrote_something = write_cc_subtitle_as_simplexml(sub, context);
+			break;
+		default:
+			break;
+		}
+		sub->nb_data = 0;
+	}
+
 	if (!sub->nb_data)
 		freep(&sub->data);
 	if (wrote_something && context->force_flush)
--- a/src/lib_ccx/ccx_encoders_common.h
+++ b/src/lib_ccx/ccx_encoders_common.h
@ -62,7 +62,7 @@ struct encoder_ctx
 	/* Input file format used in Teletext for exceptional output */
 	unsigned int in_fileformat; //1 =Normal, 2=Teletext
 	/* Keep output file closed when not actually writing to it and start over each time (add headers, etc) */
-	unsigned int keep_output_closed; 
+	unsigned int keep_output_closed;
 	/* Force a flush on the file buffer whenever content is written */
 	int force_flush;
 	/* Keep track of whether -UCLA used */
@ -118,15 +118,14 @@ struct encoder_ctx

 	/* split-by-sentence stuff */
 	int splitbysentence;
-	LLONG sbs_newblock_start_time; // Used by the split-by-sentence code to know when the current block starts...
-	LLONG sbs_newblock_end_time; // ... and ends
-	ccx_sbs_utf8_character *sbs_newblock;
-	int sbs_newblock_capacity;
-	int sbs_newblock_size;
-	ccx_sbs_utf8_character *sbs_buffer;
-	int sbs_buffer_capacity;
-	int sbs_buffer_size;

+	unsigned char * sbs_buffer; /// Storage for sentence-split buffer
+	size_t sbs_handled_len; /// The length of the string in the SBS-buffer, already handled, but preserved for DUP-detection.
+
+	//ccx_sbs_utf8_character *sbs_newblock;
+	LLONG sbs_time_from; // Used by the split-by-sentence code to know when the current block starts...
+	LLONG sbs_time_trim; // ... and ends
+	size_t sbs_capacity;
 };

 #define INITIAL_ENC_BUFFER_CAPACITY	2048
@ -196,10 +195,9 @@ int write_cc_bitmap_as_sami            (struct cc_subtitle *sub, struct encoder_
 int write_cc_bitmap_as_smptett         (struct cc_subtitle *sub, struct encoder_ctx *context);
 int write_cc_bitmap_as_spupng          (struct cc_subtitle *sub, struct encoder_ctx *context);
 int write_cc_bitmap_as_transcript      (struct cc_subtitle *sub, struct encoder_ctx *context);
-int write_cc_bitmap_to_sentence_buffer (struct cc_subtitle *sub, struct encoder_ctx *context);
 int write_cc_bitmap_as_libcurl         (struct cc_subtitle *sub, struct encoder_ctx *context);

-
+struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer (struct cc_subtitle *sub, struct encoder_ctx *context);

 void set_encoder_last_displayed_subs_ms(struct encoder_ctx *ctx, LLONG last_displayed_subs_ms);
 void set_encoder_subs_delay(struct encoder_ctx *ctx, LLONG subs_delay);
--- a/src/lib_ccx/ccx_encoders_splitbysentence.c
+++ b/src/lib_ccx/ccx_encoders_splitbysentence.c
@ -1,135 +1,457 @@
-#include "ccx_decoders_common.h"
+#include "ccx_common_platform.h"
 #include "ccx_encoders_common.h"
-#include "spupng_encoder.h"
-#include "ccx_encoders_spupng.h"
-#include "utility.h"
+#include "lib_ccx.h"
 #include "ocr.h"
-#include "ccx_decoders_608.h"
-#include "ccx_decoders_708.h"
-#include "ccx_decoders_708_output.h"
-#include "ccx_encoders_xds.h"
-#include "ccx_encoders_helpers.h"
-#include "utf8proc.h"
+#include "debug_def.h"

 #ifdef ENABLE_SHARING
 #include "ccx_share.h"
 #endif //ENABLE_SHARING

-void lbl_start_block(LLONG start_time, struct encoder_ctx *context)
+int sbs_is_pointer_on_sentence_breaker(char * start, char * current)
 {
-	context->sbs_newblock_start_time = start_time;
-}
+	char c = *current;
+	char n = *(current + 1);
+	char p = *(current - 1);

-void lbl_add_character(struct encoder_ctx *context, ccx_sbs_utf8_character ch)
-{
-	if (context->sbs_newblock_capacity == context->sbs_newblock_size)
+	if (0 == c) n = 0;
+	if (current == start) p = 0;
+
+	if (0 == c) return 1;
+
+	if ('.' == c
+		|| '!' == c
+		|| '?' == c
+	)
 	{
-		int newcapacity = (context->sbs_newblock_capacity < 512) ? 1024 : context->sbs_newblock_capacity * 2;
-		context->sbs_newblock = (ccx_sbs_utf8_character *)realloc(context->sbs_newblock, newcapacity*sizeof(ccx_sbs_utf8_character));		
-		if (!context->sbs_newblock)
-			fatal(EXIT_NOT_ENOUGH_MEMORY, "Not enough memory in lbl_add_character");
-		context->sbs_newblock_capacity = newcapacity;
+		if ('.' == n
+			|| '!' == n
+			|| '?' == n
+		)
+		{
+			return 0;
+		}
+
+		return 1;
 	}
-	memcpy(&context->sbs_newblock[context->sbs_newblock_size++], &ch, sizeof ch);
+
+	return 0;
 }

-void lbl_end_block(LLONG end_time, struct encoder_ctx *context)
+int sbs_fuzzy_strncmp(const char * a, const char * b, size_t n, const size_t maxerr)
 {
-	context->sbs_newblock_end_time = end_time;
+	// TODO: implement fuzzy comparison.
+	// NOTE: the error counter is not implemented yet, so maxerr is ignored.
+
+	int i;
+	//int err;
+	char A, B;
+
+	i = -1;
+	do
+	{
+		i++;
+
+		// Bound check (compare to N)
+		if (i == n) return 0;
+
+		A = a[i];
+		B = b[i];
+
+		// bound check (line endings)
+		if (A == 0)
+		{
+			if (B == 0) return 0;
+			return 1;
+		}
+		else
+		{
+			if (B == 0) return -1;
+		}
+
+		if (A == B) continue;
+		if (isspace(A) && isspace(B)) continue;
+
+		if (A > B) return 1;
+		return -1;
+
+	} while(1);
 }

-int write_cc_bitmap_to_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context)
+void sbs_strcpy_without_dup(const unsigned char * str, struct encoder_ctx * context)
+{
+	int intersect_len;
+	unsigned char * suffix;
+	const unsigned char * prefix = str;
+
+	unsigned long sbs_len;
+	unsigned long str_len;
+
+	str_len = strlen(str);
+	sbs_len = strlen(context->sbs_buffer);
+
+	intersect_len = str_len;
+	if (sbs_len < intersect_len)
+		intersect_len = sbs_len;
+
+	while (intersect_len>0)
+	{
+		suffix = context->sbs_buffer + sbs_len - intersect_len;
+		if (0 == sbs_fuzzy_strncmp(prefix, suffix, intersect_len, 1))
+		{
+			break;
+		}
+		intersect_len--;
+	}
+
+	LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, intersection len [%4d]\n", intersect_len);
+
+	// check that the new string does not contain data from
+	// an already handled sentence:
+	LOG_DEBUG("Sentence Buffer: sbs_strcpy_without_dup, sbslen [%4d] handled len [%4d]\n", sbs_len, context->sbs_handled_len);
+	if ( (sbs_len - intersect_len) >= context->sbs_handled_len)
+	{
+		// there is no intersection.
+		// It is time to clean the buffer, keeping only the last incomplete sentence
+		strcpy(context->sbs_buffer, context->sbs_buffer + context->sbs_handled_len);
+		context->sbs_handled_len = 0;
+		sbs_len = strlen(context->sbs_buffer);
+
+		LOG_DEBUG("Sentence Buffer: Clean buffer, after BUF [%s]\n\n\n", context->sbs_buffer);
+	}
+
+	if (intersect_len > 0)
+	{
+		// there is a common part (suffix of old sentence equals to prefix of new str)
+		//
+		// remove dup from buffer
+		// we will use an appropriate part from the new string
+		context->sbs_buffer[sbs_len-intersect_len] = 0;
+	}
+
+	sbs_len = strlen(context->sbs_buffer);
+
+	// whitespace control. Add space between subs
+	if (
+		!isspace(str[0])                // not a space char in the beginning of new str
+		&& context->sbs_handled_len >0  // buffer is not empty (there is uncomplete sentence)
+		&& !isspace(context->sbs_buffer[sbs_len-1])  // not a space char at the end of existing buf
+	)
+	{
+		//strcat(context->sbs_buffer, " ");
+	}
+
+	strcat(context->sbs_buffer, str);
+}
+
+void sbs_str_autofix(unsigned char * str)
+{
+	int i;
+
+	// replace all whitespace characters with spaces:
+	for (i = 0; str[i] != 0; i++)
+	{
+		if (isspace(str[i]))
+		{
+			str[i] = ' ';
+		}
+
+		if (
+			str[i] == '|'
+			&& (i==0 || isspace(str[i-1]))
+			&& (str[i+1] == 0 || isspace(str[i+1]) || str[i+1]=='\'')
+		)
+		{
+			// try to convert to "I"
+			str[i] = 'I';
+		}
+	}
+
+}
+
+/**
+ * Appends the string to the sentence buffer, and returns a list of full sentences (if there are any), or NULL
+ *
+ * @param  str       Partial (or full) sub to append.
+ * @param  time_from Starting timestamp
+ * @param  time_trim Ending timestamp
+ * @param  context   Encoder context
+ * @return           New <struct cc_subtitle *> subtitle, or NULL, if <str> doesn't contain the ending part of the sentence. If there are more than one sentence, the remaining sentences will be chained using <result->next> reference.
+ */
+struct cc_subtitle * sbs_append_string(unsigned char * str, const LLONG time_from, const LLONG time_trim, struct encoder_ctx * context)
+{
+	struct cc_subtitle * resub;
+	struct cc_subtitle * tmpsub;
+
+	unsigned char * bp_current;
+	unsigned char * bp_last_break;
+	unsigned char * sbs_undone_start;
+
+	int is_buf_initialized;
+	int required_capacity;
+	int new_capacity;
+
+	LLONG alphanum_total;
+	LLONG alphanum_cur;
+
+	LLONG anychar_total;
+	LLONG anychar_cur;
+
+	LLONG duration;
+	LLONG available_time;
+	int use_alphanum_counters;
+
+	if (! str)
+		return NULL;
+
+	sbs_str_autofix(str);
+
+	is_buf_initialized = (NULL == context->sbs_buffer || context->sbs_capacity == 0)
+		? 0
+		: 1;
+
+	// ===============================
+	// grow sentence buffer
+	// ===============================
+	required_capacity =
+		(is_buf_initialized ? strlen(context->sbs_buffer) : 0)    // existing data in buf
+		+ strlen(str)     // length of new string
+		+ 1               // trailing \0
+		+ 1               // space control (will add one space , if required)
+	;
+
+	if (required_capacity >= context->sbs_capacity)
+	{
+		new_capacity = context->sbs_capacity;
+		if (! is_buf_initialized) new_capacity = 16;
+
+		while (new_capacity < required_capacity)
+		{
+			// Double new_capacity, but cap each increment at 8 MiB:
+			// growing by more than that in one step would be
+			// excessive for a text buffer.
+			new_capacity += (new_capacity > 1048576 * 8)
+				? 1048576 * 8
+				: new_capacity;
+		}
+
+		context->sbs_buffer = (unsigned char *)realloc(
+			context->sbs_buffer,
+			new_capacity * sizeof(/*unsigned char*/ context->sbs_buffer[0] )
+		);
+
+		if (!context->sbs_buffer)
+			fatal(EXIT_NOT_ENOUGH_MEMORY, "Not enough memory in sbs_append_string");
+
+		context->sbs_capacity = new_capacity;
+
+		// if the buffer wasn't initialized, it holds garbage after realloc,
+		// but we need an empty string, so set that up here:
+		if (! is_buf_initialized)
+		{
+			// INIT SBS
+			context->sbs_buffer[0] = 0;
+			context->sbs_handled_len = 0;
+		}
+
+	}
+
+	// ===============================
+	// append to buffer
+	//
+	// will update sbs_buffer, sbs_handled_len
+	// ===============================
+	sbs_strcpy_without_dup(str, context);
+
+	// ===============================
+	// break to sentences
+	// ===============================
+	resub = NULL;
+	tmpsub = NULL;
+
+	alphanum_total = 0;
+	alphanum_cur = 0;
+
+	anychar_total = 0;
+	anychar_cur = 0;
+
+	sbs_undone_start = context->sbs_buffer + context->sbs_handled_len;
+	bp_last_break = sbs_undone_start;
+
+	LOG_DEBUG("Sentence Buffer: BEFORE sentence break. Last break: [%s]  sbs_undone_start: [%d], sbs_undone: [%s]\n",
+		bp_last_break, context->sbs_handled_len, sbs_undone_start
+	);
+
+	for (bp_current = sbs_undone_start; bp_current && *bp_current; bp_current++)
+	{
+		if (
+			0 < anychar_cur	// skip empty!
+			&& sbs_is_pointer_on_sentence_breaker(bp_last_break, bp_current) )
+		{
+			// it is new sentence!
+			tmpsub = malloc(sizeof(struct cc_subtitle));
+
+			tmpsub->type = CC_TEXT;
+			// length of new string:
+			tmpsub->nb_data =
+				bp_current - bp_last_break
+				+ 1	 // terminating '\0'
+				+ 1  // skip '.'
+			;
+			tmpsub->data = strndup(bp_last_break, tmpsub->nb_data - 1);
+			tmpsub->got_output = 1;
+
+			tmpsub->start_time = alphanum_cur;
+			alphanum_cur = 0;
+			tmpsub->end_time = anychar_cur;
+			anychar_cur = 0;
+
+			bp_last_break = bp_current + 1;
+
+			// tune last break:
+			while (
+				*bp_last_break
+				&& isspace(*bp_last_break)
+			)
+			{
+				bp_last_break++;
+			}
+
+			// ???
+			// tmpsub->info = NULL;
+			// tmpsub->mode = NULL;
+
+			// link with prev sub:
+			tmpsub->next = NULL;
+			tmpsub->prev = resub;
+			if (NULL != resub)
+			{
+				resub->next = tmpsub;
+			}
+
+			resub = tmpsub;
+		}
+
+		if (*bp_current && isalnum(*bp_current))
+		{
+			alphanum_total++;
+			alphanum_cur++;
+		}
+		anychar_total++;
+		anychar_cur++;
+	}
+
+	// ===============================
+	// okay, we have extracted several sentences, now we should
+	// save the position of the "remainder" - start of the last
+	// incomplete sentence
+	// ===============================
+	if (bp_last_break != sbs_undone_start)
+	{
+		context->sbs_handled_len = bp_last_break - sbs_undone_start;
+	}
+
+	LOG_DEBUG("Sentence Buffer: AFTER sentence break: Handled Len [%4d]\n", context->sbs_handled_len);
+
+	LOG_DEBUG("Sentence Buffer: Alphanum Total: [%4d]  Overall chars: [%4d]  STRING:[%20s]  BUFFER:[%20s]\n", alphanum_total, anychar_total, str, context->sbs_buffer);
+
+	// ===============================
+	// Calculate time spans
+	// ===============================
+	if (!is_buf_initialized)
+	{
+		context->sbs_time_from = time_from;
+		context->sbs_time_trim = time_trim;
+	}
+
+	available_time = time_trim - context->sbs_time_from;
+	use_alphanum_counters = alphanum_total > 0 ? 1 : 0;
+
+	tmpsub = resub;
+	while (tmpsub)
+	{
+		alphanum_cur = tmpsub->start_time;
+		anychar_cur = tmpsub->end_time;
+
+		if (use_alphanum_counters)
+		{
+			duration = available_time * alphanum_cur / alphanum_total;
+		}
+		else
+		{
+			duration = available_time * anychar_cur / anychar_total;
+		}
+
+		tmpsub->start_time = context->sbs_time_from;
+		tmpsub->end_time = tmpsub->start_time + duration;
+
+		context->sbs_time_from = tmpsub->end_time + 1;
+
+		tmpsub = tmpsub->next;
+	}
+
+	return resub;
+}
+
+struct cc_subtitle * reformat_cc_bitmap_through_sentence_buffer(struct cc_subtitle *sub, struct encoder_ctx *context)
 {
-	int ret = 0;
-#ifdef ENABLE_OCR
 	struct cc_bitmap* rect;
-
 	LLONG ms_start, ms_end;
+	int used;
+	int i = 0;
+	char *str;

-	if (context->prev_start != -1 && (sub->flags & SUB_EOD_MARKER))
+	// this is a sub with a full sentence (or chain of such subs)
+	struct cc_subtitle * resub = NULL;
+
+#ifdef ENABLE_OCR
+
+	if (sub->flags & SUB_EOD_MARKER)
 	{
-		ms_start = context->prev_start;
-		ms_end = sub->start_time;
+		// the last sub from input
+
+		if (context->prev_start == -1)
+		{
+			ms_start = 1;
+			ms_end = sub->start_time;
+		}
+		else
+		{
+			ms_start = context->prev_start;
+			ms_end = sub->start_time;
+		}
 	}
-	else if (!(sub->flags & SUB_EOD_MARKER))
+	else
 	{
+		// not the last sub from input
 		ms_start = sub->start_time;
 		ms_end = sub->end_time;
 	}
-	else if (context->prev_start == -1 && (sub->flags & SUB_EOD_MARKER))
-	{
-		ms_start = 1;
-		ms_end = sub->start_time;
-	}

 	if (sub->nb_data == 0)
-		return ret;
-	rect = sub->data;
+		return 0;

 	if (sub->flags & SUB_EOD_MARKER)
 		context->prev_start = sub->start_time;

-
-	if (rect[0].ocr_text && *(rect[0].ocr_text))
+	str = paraof_ocrtext(sub, " ", 1);
+	if (str)
 	{
-		lbl_start_block(ms_start, context);
 		if (context->prev_start != -1 || !(sub->flags & SUB_EOD_MARKER))
 		{
-			char *token = NULL;
-			token = paraof_ocrtext(sub, " ", 1); // Get text with spaces instead of newlines
-			uint32_t offset=0;
-			utf8proc_ssize_t ls; // Last size
-			char *s = token;
-			int32_t uc;
-			while ((ls=utf8proc_iterate(s, -1, &uc))) 
-			{
-				ccx_sbs_utf8_character sbsc;
-				// Note: We don't care about uc here, since we will be writing the encoded bytes, not the code points in binary.
-				//TODO: Deal with ls < 0
-				if (!uc) // End of string
-					break; 
-				printf("%3ld | %08X | %c %c %c %c\n", ls, uc, ((uc & 0xFF000000) >> 24),  ((uc & 0xFF0000) >> 16), 
-					((uc & 0xFF00) >> 8), ( uc & 0xFF));				
-				sbsc.ch = uc;
-				sbsc.encoded[0] = 0; sbsc.encoded[1] = 0; sbsc.encoded[2] = 0; sbsc.encoded[3] = 0;
-				memcpy(sbsc.encoded, s, ls);
-				sbsc.enc_len = ls;
-				sbsc.ts = 0; // We don't know yet
-				lbl_add_character(context, sbsc);
-				s += ls;				
-				
-				// TO-DO: Add each of these characters to the buffer, splitting the timestamps. Remember to add character length to the array
-			}
-			printf("-------\n");
-
-			/*
-			while (token)
-			{
-				char *newline_pos = strstr(token, context->encoded_crlf);
-				if (!newline_pos)
-				{
-					fdprintf(context->out->fh, "%s", token);
-					break;
-				}
-				else
-				{
-					while (token != newline_pos)
-					{
-						fdprintf(context->out->fh, "%c", *token);
-						token++;
-					}
-					token += context->encoded_crlf_length;
-					fdprintf(context->out->fh, "%c", ' ');
-				}
-			}*/
-
+			resub = sbs_append_string(str, ms_start, ms_end, context);
 		}
-		lbl_end_block(ms_end, context);
+		freep(&str);
+	}
+
+	for(i = 0, rect = sub->data; i < sub->nb_data; i++, rect++)
+	{
+		freep(rect->data);
+		freep(rect->data+1);
 	}
 #endif
-
 	sub->nb_data = 0;
 	freep(&sub->data);
-	return ret;
+	return resub;

 }
--- a/src/lib_ccx/debug_def.h
+++ b/src/lib_ccx/debug_def.h
@ -0,0 +1,11 @@
+#ifndef _DEBUG_DEF_H_
+#define _DEBUG_DEF_H_
+
+#ifdef DEBUG
+#define LOG_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define LOG_DEBUG ;
+#endif
+
+
+#endif
--- a/tests/Makefile
+++ b/tests/Makefile
@ -0,0 +1,59 @@
+# Builds and runs the libcheck-based unit tests.
+#   make            clean, build ./runtest, run it
+#   DEBUG=1 make    same, but compiled with -DDEBUG
+SHELL = /bin/sh
+
+CC=gcc
+# SYS := $(shell gcc -dumpmachine)
+CFLAGS=-O0 -std=gnu99 -D ENABLE_OCR -g -ggdb -rdynamic
+#-Q -da -v
+
+# enable COVERAGE
+# CFLAGS+=-fprofile-arcs -ftest-coverage
+
+# add debug flag
+ifdef DEBUG
+CFLAGS+=-DDEBUG
+endif
+
+#ALL_FLAGS = -Wno-write-strings -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT
+LDFLAGS=-lm -g
+
+CFLAGS+=$(shell pkg-config --cflags check)
+LDFLAGS+=$(shell pkg-config --libs check)
+
+# TODO: need to rewrite this. Need new way to load sources for testing
+SRC=$(wildcard ../src/lib_ccx/ccx_encoders_splitbysentence.c)
+OBJS=
+
+SRC_SUITE=$(wildcard *_suite.c)
+OBJ_SUITE=$(patsubst %_suite.c, %_suite.o, $(SRC_SUITE))
+
+OBJS+=$(OBJ_SUITE)
+
+.PHONY: all
+all: clean test
+
+%.o: %.c
+	# explicit output name :  -o $@
+	$(CC) -c $(ALL_FLAGS) $(CFLAGS) $<
+
+runtest: $(OBJS)
+	@echo "+----------------------------------------------+"
+	@echo "|                 BUILD TESTS                  |"
+	@echo "+----------------------------------------------+"
+	$(CC) -c $(ALL_FLAGS) $(CFLAGS) $@.c
+	$(CC) $(SRC) $@.o $^ $(ALL_FLAGS) $(CFLAGS) $(LDFLAGS) -o $@
+
+.PHONY: test
+test: runtest
+	@echo "+----------------------------------------------+"
+	@echo "|                 START TESTS                  |"
+	@echo "+----------------------------------------------+"
+	./runtest
+
+# rm -f: a missing file is not an error and prints nothing
+.PHONY: clean
+clean:
+	rm -f runtest
+	rm -f *.o
+	# coverage info
+	rm -f *.gcda
+	rm -f *.gcno
+	# debug info
+	rm -f *.c.*
--- a/tests/README.md
+++ b/tests/README.md
@ -0,0 +1,43 @@
+# UNIT TESTING
+
+This folder contains an archetype and several unit tests for CCExtractor.
+
+## RUN TESTS
+
+```shell
+cd tests
+make
+```
+
+This will build and run all test suites.
+
+If you want MORE output:
+
+```shell
+DEBUG=1 make
+```
+
+Where `DEBUG` is just an environment variable.
+
+## DEBUGGING
+
+If tests fail after your changes, you can debug them (almost all flags needed for this are already set in `tests/Makefile`).
+
+Run:
+
+```shell
+# build test runner
+make
+# load the test runner into the debugger:
+gdb runtest
+
+# run under debugger:
+(gdb) run
+
+# on segfault:
+(gdb) where
+```
+
+## DEPENDENCIES
+
+Tests are built around this library: [**libcheck**](https://github.com/libcheck/check), here is [**documentation**](https://libcheck.github.io/check/)
--- a/tests/ccx_encoders_splitbysentence_suite.c
+++ b/tests/ccx_encoders_splitbysentence_suite.c
@ -0,0 +1,305 @@
+#include <check.h>
+#include "ccx_encoders_splitbysentence_suite.h"
+
+// -------------------------------------
+// MOCKS
+// -------------------------------------
+typedef int64_t LLONG;
+#include "../src/lib_ccx/ccx_encoders_common.h"
+
+// -------------------------------------
+// Private SBS-functions (for testing only)
+// -------------------------------------
+struct cc_subtitle * sbs_append_string(unsigned char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context);
+
+// -------------------------------------
+// Helpers
+// -------------------------------------
+/*
+ * Build a heap-allocated CC_BITMAP subtitle for the tests.
+ *
+ * str       - text stored (as a strdup() copy) in sub->data
+ * time_from - value stored in sub->start_time
+ * time_trim - value stored in sub->end_time
+ *
+ * Returns the new subtitle. All fields not set here are zero.
+ * nb_data is set to the length of the text.
+ */
+struct cc_subtitle * helper_create_sub(char * str, LLONG time_from, LLONG time_trim)
+{
+	// zero-fill so that fields not assigned below (e.g. next/prev) are not garbage
+	struct cc_subtitle * sub = calloc(1, sizeof(*sub));
+	ck_assert_ptr_ne(sub, NULL);
+
+	sub->type = CC_BITMAP;
+	sub->start_time = time_from;
+	sub->end_time = time_trim;
+	sub->data = strdup(str);
+	ck_assert_ptr_ne(sub->data, NULL);
+	sub->nb_data = strlen(sub->data);
+
+	return sub;
+}
+
+// Calls sbs_append_string() on a temporary heap copy of `str` and frees that
+// copy afterwards. Returns whatever sbs_append_string() returned.
+struct cc_subtitle * helper_sbs_append_string(char * str, LLONG time_from, LLONG time_trim, struct encoder_ctx * context)
+{
+	char * copy = strdup(str);
+	struct cc_subtitle * result = sbs_append_string(copy, time_from, time_trim, context);
+
+	free(copy);
+	return result;
+}
+
+// -------------------------------------
+// MOCKS
+// -------------------------------------
+struct encoder_ctx * context;
+
+// Test stub for freep(): intentionally does nothing with its argument.
+void freep(void * obj)
+{
+	(void)obj;
+}
+// Test stub for fatal(): ignores both arguments and simply returns.
+void fatal(int x, void * obj)
+{
+	(void)x;
+	(void)obj;
+}
+
+// Test stand-in for the OCR-to-text converter.
+// The test cases store plain text in sub->data instead of OCR input, so the
+// stub just hands that same pointer back (no copy is made).
+unsigned char * paraof_ocrtext(void * sub)
+{
+	struct cc_subtitle * subtitle = sub;
+
+	return subtitle->data;
+}
+
+// -------------------------------------
+// TEST preparations
+// -------------------------------------
+// Checked fixture: give every test a fresh encoder context.
+void setup(void)
+{
+	// zero-fill so that no field of the context is left uninitialised
+	context = calloc(1, sizeof(*context));
+	ck_assert_ptr_ne(context, NULL);
+
+	// sentence-buffer state, with the same starting values init_encoder() assigns
+	context->sbs_time_from = -1;
+	context->sbs_time_trim = -1;
+	context->sbs_capacity = 0;
+	context->sbs_buffer = NULL;
+}
+
+// Checked fixture: release the context allocated by setup().
+// NOTE(review): context->sbs_buffer is not released here; if the code under
+// test heap-allocates it, it leaks once per test - confirm ownership.
+void teardown(void)
+{
+	free(context);
+	context = NULL; // do not leave the global dangling between tests
+}
+
+// -------------------------------------
+// TESTS
+// -------------------------------------
+// One subtitle holding one complete sentence: the sentence buffer must hand
+// back a subtitle with the same text and no neighbours (next/prev are NULL).
+START_TEST(test_sbs_one_simple_sentence)
+{
+	struct cc_subtitle * sub = helper_create_sub("Simple sentence.", 1, 100);
+	struct cc_subtitle * out = reformat_cc_bitmap_through_sentence_buffer(sub, context);
+
+	ck_assert_ptr_ne(out, NULL);
+	ck_assert_str_eq(out->data, "Simple sentence.");
+	ck_assert_ptr_eq(out->next, NULL);
+	ck_assert_ptr_eq(out->prev, NULL);
+}
+END_TEST
+
+
+// The second subtitle repeats the text of the first and then finishes the
+// sentence. Nothing is expected after the first call (no sentence end yet);
+// the second call must yield the sentence once, without the repeated part
+// being duplicated.
+START_TEST(test_sbs_two_sentences_with_rep)
+{
+	struct cc_subtitle * sub1 = helper_create_sub("asdf", 1, 100);
+	struct cc_subtitle * out1 = reformat_cc_bitmap_through_sentence_buffer(sub1, context);
+	ck_assert_ptr_eq(out1, NULL);
+
+	// second sub:
+	struct cc_subtitle * sub2 = helper_create_sub("asdf Hello.", 101, 200);
+	struct cc_subtitle * out2 = reformat_cc_bitmap_through_sentence_buffer(sub2, context);
+
+	ck_assert_ptr_ne(out2, NULL);
+	ck_assert_str_eq(out2->data, "asdf Hello.");
+	ck_assert_ptr_eq(out2->next, NULL);
+	ck_assert_ptr_eq(out2->prev, NULL);}
+END_TEST
+
+
+// Two complete sentences appended one after the other: each call must return
+// its own sentence, carrying exactly the time range passed to that call.
+START_TEST(test_sbs_append_string_two_separate)
+{
+	char * test_strings[] = {
+		"First string.",
+		"Second string."
+	};
+	struct cc_subtitle * sub;
+
+	// first string (the helper owns and frees the temporary copy of the text)
+	sub = helper_sbs_append_string(test_strings[0], 1, 20, context);
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, test_strings[0]);
+	ck_assert_int_eq(sub->start_time, 1);
+	ck_assert_int_eq(sub->end_time, 20);
+
+	// second string:
+	sub = helper_sbs_append_string(test_strings[1], 21, 40, context);
+
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, test_strings[1]);
+	ck_assert_int_eq(sub->start_time, 21);
+	ck_assert_int_eq(sub->end_time, 40);
+}
+END_TEST
+
+// One sentence delivered in two fragments: the first call must return nothing,
+// the second must return the joined sentence spanning both time ranges.
+START_TEST(test_sbs_append_string_two_with_broken_sentence)
+{
+	// important !!
+	// the two fragments total 32 characters (12 + 20)
+	// NOTE(review): presumably chosen to land on a buffer-capacity boundary - confirm
+	char * test_strings[] = {
+		"First string",
+		" ends here, deabbea."
+	};
+	struct cc_subtitle * sub;
+
+	// first string (the helper owns and frees the temporary copy of the text)
+	sub = helper_sbs_append_string(test_strings[0], 1, 3, context);
+
+	ck_assert_ptr_eq(sub, NULL);
+
+	// second string:
+	sub = helper_sbs_append_string(test_strings[1], 4, 5, context);
+
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, "First string ends here, deabbea.");
+	ck_assert_int_eq(sub->start_time, 1);
+	ck_assert_int_eq(sub->end_time, 5);
+}
+END_TEST
+
+// The second input starts with the full text of the first one: the overlap
+// must not be duplicated, and the resulting sentence must span from the start
+// of the first input to the end of the second.
+START_TEST(test_sbs_append_string_two_intersecting)
+{
+	char * test_strings[] = {
+		"First string",
+		"First string ends here."
+	};
+	struct cc_subtitle * sub;
+
+	// first string (the helper owns and frees the temporary copy of the text)
+	sub = helper_sbs_append_string(test_strings[0], 1, 20, context);
+
+	ck_assert_ptr_eq(sub, NULL);
+
+	// second string:
+	sub = helper_sbs_append_string(test_strings[1], 21, 40, context);
+
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, "First string ends here.");
+	ck_assert_int_eq(sub->start_time, 1);
+	ck_assert_int_eq(sub->end_time, 40);
+}
+END_TEST
+
+
+// Replays a sequence of captions in which each input repeats part of the
+// previous one and adds a little more text. The numbered steps are not
+// contiguous (8, 10-12 and 15 are absent). For every step the test checks
+// whether a finished sentence is returned (non-NULL) or not (NULL), and for
+// some returned sentences also the text, the time range and that no further
+// subtitle is chained via ->next.
+START_TEST(test_sbs_append_string_real_data_1)
+{
+	struct cc_subtitle * sub;
+
+	// 1
+	sub = helper_sbs_append_string("Oleon",
+		1, 0, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 2
+	sub = helper_sbs_append_string("Oleon costs.",
+		1, 189, context);
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, "Oleon costs.");
+
+	// 3
+	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
+Didn't",
+		190, 889, context);
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, "buried in the annex, 95 Oleon costs.");
+	ck_assert_int_eq(sub->start_time, 190);    // = <sub start>
+	ck_assert_int_eq(sub->end_time, 783);      // = <sub start>  +  <available time,889-190=699 > * <sentence alphanum, 28>  /  <sub alphanum, 33>
+	ck_assert_ptr_eq(sub->next, NULL);
+
+	// 4
+	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
+Didn't want",
+		890, 1129, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 5
+	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
+Didn't want to",
+		1130, 1359, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 6
+	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
+Didn't want to acknowledge",
+		1360, 2059, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 7
+	sub = helper_sbs_append_string("buried in the annex, 95 Oleon costs.\n\
+Didn't want to acknowledge the",
+		2060, 2299, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 9
+	sub = helper_sbs_append_string("Didn't want to acknowledge the\n\
+pressures on hospitals, schools and",
+		2300, 5019, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 13
+	// the finished sentence starts right after the end time (783) of the previous one
+	sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
+infrastructure.",
+		5020, 5159, context);
+	ck_assert_ptr_ne(sub, NULL);
+	ck_assert_str_eq(sub->data, "Didn't want to acknowledge the pressures on hospitals, schools and infrastructure.");
+	ck_assert_int_eq(sub->start_time, 784);
+	ck_assert_int_eq(sub->end_time, 5159);
+	ck_assert_ptr_eq(sub->next, NULL);
+
+	// 14
+	sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
+infrastructure. If",
+		5160, 5529, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// 16
+	sub = helper_sbs_append_string("pressures on hospitals, schools and\n\
+infrastructure. If we go",
+		5530, 6559, context);
+	ck_assert_ptr_eq(sub, NULL);
+
+	// ck_assert_int_eq(sub->start_time, 1);
+	// ck_assert_int_eq(sub->end_time, 40);
+}
+END_TEST
+
+
+// Assembles the "Sentence Buffer" suite from two test cases, each wrapped in
+// the setup()/teardown() checked fixture:
+//   "SB: Overall"       - tests going through reformat_cc_bitmap_through_sentence_buffer()
+//   "SB: append_string" - tests calling sbs_append_string() directly
+Suite * ccx_encoders_splitbysentence_suite(void)
+{
+	Suite * suite = suite_create("Sentence Buffer");
+
+	/* Overall tests */
+	TCase * overall = tcase_create("SB: Overall");
+	tcase_add_checked_fixture(overall, setup, teardown);
+	tcase_add_test(overall, test_sbs_one_simple_sentence);
+	tcase_add_test(overall, test_sbs_two_sentences_with_rep);
+	suite_add_tcase(suite, overall);
+
+	/* sbs_append_string() tests */
+	TCase * append_string = tcase_create("SB: append_string");
+	tcase_add_checked_fixture(append_string, setup, teardown);
+	tcase_add_test(append_string, test_sbs_append_string_two_separate);
+	tcase_add_test(append_string, test_sbs_append_string_two_with_broken_sentence);
+	tcase_add_test(append_string, test_sbs_append_string_two_intersecting);
+	tcase_add_test(append_string, test_sbs_append_string_real_data_1);
+	suite_add_tcase(suite, append_string);
+
+	return suite;
+}
--- a/tests/ccx_encoders_splitbysentence_suite.h
+++ b/tests/ccx_encoders_splitbysentence_suite.h
@ -0,0 +1,4 @@
+// -------------------------------------
+// SUITE
+// -------------------------------------
+Suite * ccx_encoders_splitbysentence_suite(void);
--- a/tests/runtest.c
+++ b/tests/runtest.c
@ -0,0 +1,21 @@
+#include <check.h>
+
+// TESTS:
+#include "ccx_encoders_splitbysentence_suite.h"
+
+
+// Runs the split-by-sentence suite and maps the outcome to the exit status:
+// 0 when no test failed, 1 otherwise.
+int main(void)
+{
+	SRunner * runner = srunner_create(ccx_encoders_splitbysentence_suite());
+
+	// CK_NOFORK: ask libcheck to run the tests without forking
+	srunner_set_fork_status(runner, CK_NOFORK);
+	srunner_run_all(runner, CK_NORMAL);
+
+	int failures = srunner_ntests_failed(runner);
+	srunner_free(runner);
+
+	if (failures != 0)
+		return 1;
+	return 0;
+}