mirror of
https://github.com/CCExtractor/ccextractor.git
synced 2024-12-25 12:23:59 +00:00
Italic Detection and improved documentation
This commit is contained in:
parent
5a6dfd0c18
commit
e2f850192f
@ -1,6 +1,9 @@
|
|||||||
|
|
||||||
Overview
|
Overview
|
||||||
========
|
========
|
||||||
|
Subtitles which are burned into the video (or hard subbed) can be extracted using the -hardsubx flag.
|
||||||
|
The system works by processing video frames and extracting only the subtitles from them, followed
|
||||||
|
by optical character recognition (OCR) using Tesseract.
|
||||||
|
|
||||||
Dependencies
|
Dependencies
|
||||||
============
|
============
|
||||||
@ -17,7 +20,23 @@ Linux
|
|||||||
Make sure Tesseract, Leptonica and FFmpeg are installed, and that their libraries can be found using pkg-config.
|
Make sure Tesseract, Leptonica and FFmpeg are installed, and that their libraries can be found using pkg-config.
|
||||||
Refer to OCR.txt for installation details.
|
Refer to OCR.txt for installation details.
|
||||||
|
|
||||||
Run:-
|
To install FFmpeg (libav), follow the steps at:-
|
||||||
|
https://trac.ffmpeg.org/wiki/CompilationGuide/Ubuntu - For Ubuntu, Debian and Linux Mint
|
||||||
|
https://trac.ffmpeg.org/wiki/CompilationGuide/Generic - For generic Linux compilation
|
||||||
|
|
||||||
|
To validate your FFmpeg installation, make sure you can run the following commands in your terminal:-
|
||||||
|
pkg-config --cflags libavcodec
|
||||||
|
pkg-config --cflags libavformat
|
||||||
|
pkg-config --cflags libavutil
|
||||||
|
pkg-config --cflags libswscale
|
||||||
|
pkg-config --libs libavcodec
|
||||||
|
pkg-config --libs libavformat
|
||||||
|
pkg-config --libs libavutil
|
||||||
|
pkg-config --libs libswscale
|
||||||
|
|
||||||
|
On success, you should see the correct include directory path and the linker flags.
|
||||||
|
|
||||||
|
To build the program with hardsubx support, from the linux directory run:-
|
||||||
make ENABLE_HARDSUBX=yes
|
make ENABLE_HARDSUBX=yes
|
||||||
|
|
||||||
Windows
|
Windows
|
||||||
|
@ -45,6 +45,8 @@ char *get_ocr_text_wordwise(struct lib_hardsubx_ctx *ctx, PIX *image)
|
|||||||
TessResultIterator *it = TessBaseAPIGetIterator(ctx->tess_handle);
|
TessResultIterator *it = TessBaseAPIGetIterator(ctx->tess_handle);
|
||||||
TessPageIteratorLevel level = RIL_WORD;
|
TessPageIteratorLevel level = RIL_WORD;
|
||||||
|
|
||||||
|
int prev_ital = 0;
|
||||||
|
|
||||||
if(it!=0)
|
if(it!=0)
|
||||||
{
|
{
|
||||||
do
|
do
|
||||||
@ -54,11 +56,53 @@ char *get_ocr_text_wordwise(struct lib_hardsubx_ctx *ctx, PIX *image)
|
|||||||
continue;
|
continue;
|
||||||
if(text_out == NULL)
|
if(text_out == NULL)
|
||||||
{
|
{
|
||||||
|
if(ctx->detect_italics)
|
||||||
|
{
|
||||||
|
int italic=0;
|
||||||
|
int dummy=0;
|
||||||
|
TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
|
||||||
|
if(italic==1 && prev_ital==0)
|
||||||
|
{
|
||||||
|
char *word_copy = strdup(word);
|
||||||
|
word = realloc(word, strlen(word)+strlen("<i>")+2);
|
||||||
|
strcpy(word,"<i>");
|
||||||
|
strcat(word, word_copy);
|
||||||
|
free(word_copy);
|
||||||
|
prev_ital = 1;
|
||||||
|
}
|
||||||
|
else if(italic == 0 && prev_ital == 1)
|
||||||
|
{
|
||||||
|
word = realloc(word, strlen(word)+strlen("</i>")+2);
|
||||||
|
strcat(word, "</i>");
|
||||||
|
prev_ital = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
text_out = strdup(word);
|
text_out = strdup(word);
|
||||||
text_out = realloc(text_out, strlen(text_out)+2);
|
text_out = realloc(text_out, strlen(text_out)+2);
|
||||||
strcat(text_out, " ");
|
strcat(text_out, " ");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if(ctx->detect_italics)
|
||||||
|
{
|
||||||
|
int italic=0;
|
||||||
|
int dummy=0;
|
||||||
|
TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
|
||||||
|
if(italic==1 && prev_ital==0)
|
||||||
|
{
|
||||||
|
char *word_copy = strdup(word);
|
||||||
|
word = realloc(word, strlen(word)+strlen("<i>")+2);
|
||||||
|
strcpy(word,"<i>");
|
||||||
|
strcat(word, word_copy);
|
||||||
|
free(word_copy);
|
||||||
|
prev_ital = 1;
|
||||||
|
}
|
||||||
|
else if(italic == 0 && prev_ital == 1)
|
||||||
|
{
|
||||||
|
word = realloc(word, strlen(word)+strlen("</i>")+2);
|
||||||
|
strcat(word, "</i>");
|
||||||
|
prev_ital = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
text_out = realloc(text_out, strlen(text_out)+strlen(word)+2);
|
text_out = realloc(text_out, strlen(text_out)+strlen(word)+2);
|
||||||
strcat(text_out, word);
|
strcat(text_out, word);
|
||||||
strcat(text_out, " ");
|
strcat(text_out, " ");
|
||||||
@ -66,6 +110,12 @@ char *get_ocr_text_wordwise(struct lib_hardsubx_ctx *ctx, PIX *image)
|
|||||||
} while(TessPageIteratorNext((TessPageIterator *)it, level));
|
} while(TessPageIteratorNext((TessPageIterator *)it, level));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(ctx->detect_italics && prev_ital == 1)
|
||||||
|
{
|
||||||
|
text_out = realloc(text_out, strlen(text_out)+strlen("</i>")+2);
|
||||||
|
strcat(text_out, "</i>");
|
||||||
|
}
|
||||||
|
|
||||||
TessResultIteratorDelete(it);
|
TessResultIteratorDelete(it);
|
||||||
|
|
||||||
return text_out;
|
return text_out;
|
||||||
@ -141,6 +191,8 @@ char *get_ocr_text_wordwise_threshold(struct lib_hardsubx_ctx *ctx, PIX *image,
|
|||||||
TessResultIterator *it = TessBaseAPIGetIterator(ctx->tess_handle);
|
TessResultIterator *it = TessBaseAPIGetIterator(ctx->tess_handle);
|
||||||
TessPageIteratorLevel level = RIL_WORD;
|
TessPageIteratorLevel level = RIL_WORD;
|
||||||
|
|
||||||
|
int prev_ital = 0;
|
||||||
|
|
||||||
if(it!=0)
|
if(it!=0)
|
||||||
{
|
{
|
||||||
do
|
do
|
||||||
@ -153,11 +205,53 @@ char *get_ocr_text_wordwise_threshold(struct lib_hardsubx_ctx *ctx, PIX *image,
|
|||||||
continue;
|
continue;
|
||||||
if(text_out == NULL)
|
if(text_out == NULL)
|
||||||
{
|
{
|
||||||
|
if(ctx->detect_italics)
|
||||||
|
{
|
||||||
|
int italic=0;
|
||||||
|
int dummy=0;
|
||||||
|
TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
|
||||||
|
if(italic==1 && prev_ital==0)
|
||||||
|
{
|
||||||
|
char *word_copy = strdup(word);
|
||||||
|
word = realloc(word, strlen(word)+strlen("<i>")+2);
|
||||||
|
strcpy(word,"<i>");
|
||||||
|
strcat(word, word_copy);
|
||||||
|
free(word_copy);
|
||||||
|
prev_ital = 1;
|
||||||
|
}
|
||||||
|
else if(italic == 0 && prev_ital == 1)
|
||||||
|
{
|
||||||
|
word = realloc(word, strlen(word)+strlen("</i>")+2);
|
||||||
|
strcat(word, "</i>");
|
||||||
|
prev_ital = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
text_out = strdup(word);
|
text_out = strdup(word);
|
||||||
text_out = realloc(text_out, strlen(text_out)+2);
|
text_out = realloc(text_out, strlen(text_out)+2);
|
||||||
strcat(text_out, " ");
|
strcat(text_out, " ");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if(ctx->detect_italics)
|
||||||
|
{
|
||||||
|
int italic=0;
|
||||||
|
int dummy=0;
|
||||||
|
TessResultIteratorWordFontAttributes(it, &dummy, &italic,&dummy, &dummy, &dummy,&dummy, &dummy, &dummy);
|
||||||
|
if(italic==1 && prev_ital==0)
|
||||||
|
{
|
||||||
|
char *word_copy = strdup(word);
|
||||||
|
word = realloc(word, strlen(word)+strlen("<i>")+2);
|
||||||
|
strcpy(word,"<i>");
|
||||||
|
strcat(word, word_copy);
|
||||||
|
free(word_copy);
|
||||||
|
prev_ital = 1;
|
||||||
|
}
|
||||||
|
else if(italic == 0 && prev_ital == 1)
|
||||||
|
{
|
||||||
|
word = realloc(word, strlen(word)+strlen("</i>")+2);
|
||||||
|
strcat(word, "</i>");
|
||||||
|
prev_ital = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
text_out = realloc(text_out, strlen(text_out)+strlen(word)+2);
|
text_out = realloc(text_out, strlen(text_out)+strlen(word)+2);
|
||||||
strcat(text_out, word);
|
strcat(text_out, word);
|
||||||
strcat(text_out, " ");
|
strcat(text_out, " ");
|
||||||
@ -165,6 +259,12 @@ char *get_ocr_text_wordwise_threshold(struct lib_hardsubx_ctx *ctx, PIX *image,
|
|||||||
} while(TessPageIteratorNext((TessPageIterator *)it, level));
|
} while(TessPageIteratorNext((TessPageIterator *)it, level));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(ctx->detect_italics && prev_ital == 1)
|
||||||
|
{
|
||||||
|
text_out = realloc(text_out, strlen(text_out)+strlen("</i>")+2);
|
||||||
|
strcat(text_out, "</i>");
|
||||||
|
}
|
||||||
|
|
||||||
TessResultIteratorDelete(it);
|
TessResultIteratorDelete(it);
|
||||||
|
|
||||||
return text_out;
|
return text_out;
|
||||||
|
@ -40,6 +40,11 @@ char* _process_frame_white_basic(struct lib_hardsubx_ctx *ctx, AVFrame *frame, i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(ctx->detect_italics)
|
||||||
|
{
|
||||||
|
ctx->ocr_mode = HARDSUBX_OCRMODE_WORD;
|
||||||
|
}
|
||||||
|
|
||||||
// TESSERACT OCR FOR THE FRAME HERE
|
// TESSERACT OCR FOR THE FRAME HERE
|
||||||
switch(ctx->ocr_mode)
|
switch(ctx->ocr_mode)
|
||||||
{
|
{
|
||||||
@ -117,6 +122,11 @@ char *_process_frame_color_basic(struct lib_hardsubx_ctx *ctx, AVFrame *frame, i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(ctx->detect_italics)
|
||||||
|
{
|
||||||
|
ctx->ocr_mode = HARDSUBX_OCRMODE_WORD;
|
||||||
|
}
|
||||||
|
|
||||||
// TESSERACT OCR FOR THE FRAME HERE
|
// TESSERACT OCR FOR THE FRAME HERE
|
||||||
switch(ctx->ocr_mode)
|
switch(ctx->ocr_mode)
|
||||||
{
|
{
|
||||||
|
@ -752,6 +752,7 @@ void usage (void)
|
|||||||
mprint(" e.g. -min_sub_duration 1.0 (for a duration of 1 second)\n");
|
mprint(" e.g. -min_sub_duration 1.0 (for a duration of 1 second)\n");
|
||||||
mprint("\n");
|
mprint("\n");
|
||||||
mprint(" -detect_italics : Specify whether italics are to be detected from the OCR text.\n");
|
mprint(" -detect_italics : Specify whether italics are to be detected from the OCR text.\n");
|
||||||
|
mprint(" Italic detection automatically enforces the OCR mode to be word-wise");
|
||||||
mprint("\n");
|
mprint("\n");
|
||||||
mprint(" -conf_thresh : Specify the classifier confidence threshold between 1 and 100.\n");
|
mprint(" -conf_thresh : Specify the classifier confidence threshold between 1 and 100.\n");
|
||||||
mprint(" Try and use a threshold which works for you if you get a lot of garbage text.\n");
|
mprint(" Try and use a threshold which works for you if you get a lot of garbage text.\n");
|
||||||
|
Loading…
Reference in New Issue
Block a user