Italic Detection and improved documentation

2024-12-25 04:11:38 +00:00 · 2016-08-10 09:33:08 -07:00 · 2016-08-10 09:33:08 -07:00 · e2f850192f
commit e2f850192f
parent 5a6dfd0c18
4 changed files with 131 additions and 1 deletions
--- a/docs/HARDSUBX.txt
+++ b/docs/HARDSUBX.txt
@ -1,6 +1,9 @@

 Overview
 ========
+Subtitles which are burned into the video (or hard subbed) can be extracted using the -hardsubx flag.
+The system works by processing video frames and extracting only the subtitles from them, followed
+by optical character recognition (OCR) using Tesseract.

 Dependencies
 ============
@ -17,7 +20,23 @@ Linux
 Make sure Tesseract, Leptonica and FFMPeg are installed, and that their libraries can be found using pkg-config.
 Refer to OCR.txt for installation details.

-Run:-
+To install FFmpeg (which provides the libav* libraries), follow the steps at:-
+https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu - For Ubuntu, Debian and Linux Mint
+https://trac.ffmpeg.org/wiki/CompilationGuide/Generic - For generic Linux compilation
+
+To validate your FFmpeg installation, make sure you can run the following commands in your terminal:-
+pkg-config --cflags libavcodec
+pkg-config --cflags libavformat
+pkg-config --cflags libavutil
+pkg-config --cflags libswscale
+pkg-config --libs libavcodec
+pkg-config --libs libavformat
+pkg-config --libs libavutil
+pkg-config --libs libswscale
+
+On success, each --cflags command should print the include directory path and each --libs command should print the linker flags for that library.
+
+To build the program with hardsubx support, from the linux directory run:-
 make ENABLE_HARDSUBX=yes

 Windows
--- a/src/lib_ccx/hardsubx_classifier.c
+++ b/src/lib_ccx/hardsubx_classifier.c
@ -45,6 +45,8 @@ char *get_ocr_text_wordwise(struct lib_hardsubx_ctx *ctx, PIX *image)
 	TessResultIterator *it = TessBaseAPIGetIterator(ctx->tess_handle);
 	TessPageIteratorLevel level = RIL_WORD;

+	int prev_ital = 0;
+
 	if(it!=0)
 	{
 		do
@ -54,11 +56,53 @@ char *get_ocr_text_wordwise(struct lib_hardsubx_ctx *ctx, PIX *image)
 				continue;
 			if(text_out == NULL)
 			{
+				if(ctx->detect_italics)
+				{
+					int italic=0;
+					int dummy=0;
+					TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
+					if(italic==1 && prev_ital==0)
+					{
+						char *word_copy = strdup(word);
+						word = realloc(word, strlen(word)+strlen("<i>")+2);
+						strcpy(word,"<i>");
+						strcat(word, word_copy);
+						free(word_copy);
+						prev_ital = 1;
+					}
+					else if(italic == 0 && prev_ital == 1)
+					{
+						word = realloc(word, strlen(word)+strlen("</i>")+2);
+						strcat(word, "</i>");
+						prev_ital = 0;
+					}	
+				}
 				text_out = strdup(word);
 				text_out = realloc(text_out, strlen(text_out)+2);
 				strcat(text_out, " ");
 				continue;
 			}
+			if(ctx->detect_italics)
+			{
+				int italic=0;
+				int dummy=0;
+				TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
+				if(italic==1 && prev_ital==0)
+				{
+					char *word_copy = strdup(word);
+					word = realloc(word, strlen(word)+strlen("<i>")+2);
+					strcpy(word,"<i>");
+					strcat(word, word_copy);
+					free(word_copy);
+					prev_ital = 1;
+				}
+				else if(italic == 0 && prev_ital == 1)
+				{
+					word = realloc(word, strlen(word)+strlen("</i>")+2);
+					strcat(word, "</i>");
+					prev_ital = 0;
+				}
+			}
 			text_out = realloc(text_out, strlen(text_out)+strlen(word)+2);
 			strcat(text_out, word);
 			strcat(text_out, " ");
@ -66,6 +110,12 @@ char *get_ocr_text_wordwise(struct lib_hardsubx_ctx *ctx, PIX *image)
 		} while(TessPageIteratorNext((TessPageIterator *)it, level));
 	}

+	if(ctx->detect_italics && prev_ital == 1)
+	{
+		text_out = realloc(text_out, strlen(text_out)+strlen("</i>")+2);
+		strcat(text_out, "</i>");
+	}
+
 	TessResultIteratorDelete(it);

 	return text_out;
@ -141,6 +191,8 @@ char *get_ocr_text_wordwise_threshold(struct lib_hardsubx_ctx *ctx, PIX *image,
 	TessResultIterator *it = TessBaseAPIGetIterator(ctx->tess_handle);
 	TessPageIteratorLevel level = RIL_WORD;

+	int prev_ital = 0;
+
 	if(it!=0)
 	{
 		do
@ -153,11 +205,53 @@ char *get_ocr_text_wordwise_threshold(struct lib_hardsubx_ctx *ctx, PIX *image,
 				continue;
 			if(text_out == NULL)
 			{
+				if(ctx->detect_italics)
+				{
+					int italic=0;
+					int dummy=0;
+					TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
+					if(italic==1 && prev_ital==0)
+					{
+						char *word_copy = strdup(word);
+						word = realloc(word, strlen(word)+strlen("<i>")+2);
+						strcpy(word,"<i>");
+						strcat(word, word_copy);
+						free(word_copy);
+						prev_ital = 1;
+					}
+					else if(italic == 0 && prev_ital == 1)
+					{
+						word = realloc(word, strlen(word)+strlen("</i>")+2);
+						strcat(word, "</i>");
+						prev_ital = 0;
+					}	
+				}
 				text_out = strdup(word);
 				text_out = realloc(text_out, strlen(text_out)+2);
 				strcat(text_out, " ");
 				continue;
 			}
+			if(ctx->detect_italics)
+			{
+				int italic=0;
+				int dummy=0;
+				TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
+				if(italic==1 && prev_ital==0)
+				{
+					char *word_copy = strdup(word);
+					word = realloc(word, strlen(word)+strlen("<i>")+2);
+					strcpy(word,"<i>");
+					strcat(word, word_copy);
+					free(word_copy);
+					prev_ital = 1;
+				}
+				else if(italic == 0 && prev_ital == 1)
+				{
+					word = realloc(word, strlen(word)+strlen("</i>")+2);
+					strcat(word, "</i>");
+					prev_ital = 0;
+				}
+			}
 			text_out = realloc(text_out, strlen(text_out)+strlen(word)+2);
 			strcat(text_out, word);
 			strcat(text_out, " ");
@ -165,6 +259,12 @@ char *get_ocr_text_wordwise_threshold(struct lib_hardsubx_ctx *ctx, PIX *image,
 		} while(TessPageIteratorNext((TessPageIterator *)it, level));
 	}

+	if(ctx->detect_italics && prev_ital == 1)
+	{
+		text_out = realloc(text_out, strlen(text_out)+strlen("</i>")+2);
+		strcat(text_out, "</i>");
+	}
+
 	TessResultIteratorDelete(it);

 	return text_out;
--- a/src/lib_ccx/hardsubx_decoder.c
+++ b/src/lib_ccx/hardsubx_decoder.c
@ -40,6 +40,11 @@ char* _process_frame_white_basic(struct lib_hardsubx_ctx *ctx, AVFrame *frame, i
 		}
 	}

+	if(ctx->detect_italics)
+	{
+		ctx->ocr_mode = HARDSUBX_OCRMODE_WORD;
+	}
+
 	// TESSERACT OCR FOR THE FRAME HERE
 	switch(ctx->ocr_mode)
 	{
@ -117,6 +122,11 @@ char *_process_frame_color_basic(struct lib_hardsubx_ctx *ctx, AVFrame *frame, i
 		}
 	}

+	if(ctx->detect_italics)
+	{
+		ctx->ocr_mode = HARDSUBX_OCRMODE_WORD;
+	}
+
 	// TESSERACT OCR FOR THE FRAME HERE
 	switch(ctx->ocr_mode)
 	{
--- a/src/lib_ccx/params.c
+++ b/src/lib_ccx/params.c
@ -752,6 +752,7 @@ void usage (void)
 	mprint("                     e.g. -min_sub_duration 1.0 (for a duration of 1 second)\n");
 	mprint("\n");
 	mprint("   -detect_italics : Specify whether italics are to be detected from the OCR text.\n");
+	mprint("                     Italic detection automatically enforces the OCR mode to be word-wise");
 	mprint("\n");
 	mprint("      -conf_thresh : Specify the classifier confidence threshold between 1 and 100.\n");
 	mprint("                     Try and use a threshold which works for you if you get a lot of garbage text.\n");