From 5dbbe654f05f1b3e5fcdfd6633e6258bed216345 Mon Sep 17 00:00:00 2001 From: Anshul Maheshwari Date: Wed, 7 Nov 2018 12:53:26 +0530 Subject: [PATCH] Add support for 4.0 tesseract --- docs/CHANGES.TXT | 1 + src/lib_ccx/dvb_subtitle_decoder.c | 8 +- src/lib_ccx/ocr.c | 170 ++++++++++++++++++----------- src/lib_ccx/params.c | 2 +- 4 files changed, 116 insertions(+), 65 deletions(-) diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT index c002c940..a61d9e5f 100644 --- a/docs/CHANGES.TXT +++ b/docs/CHANGES.TXT @@ -1,5 +1,6 @@ 0.88(2018-10-24) (unreleased) ----------------- +- New: Add support for tesseract 4.0 - Optimize: Remove multiple RGB to grey conversion in OCR. - Fix: Update UTF8Proc to 2.2.0 - Fix: Warn instead of fatal when a 0xFF marker is missing diff --git a/src/lib_ccx/dvb_subtitle_decoder.c b/src/lib_ccx/dvb_subtitle_decoder.c index c9ea9942..2e33f00d 100644 --- a/src/lib_ccx/dvb_subtitle_decoder.c +++ b/src/lib_ccx/dvb_subtitle_decoder.c @@ -1661,7 +1661,8 @@ static int write_dvb_sub(struct lib_cc_decode *dec_ctx, struct cc_subtitle *sub) // Perform OCR #ifdef ENABLE_OCR char *ocr_str = NULL; - if (ctx->ocr_ctx) { + if (ctx->ocr_ctx) + { ret = ocr_rect(ctx->ocr_ctx, rect, &ocr_str, region->bgcolor, dec_ctx->ocr_quantmode); if (ret >= 0) rect->ocr_text = ocr_str; @@ -1669,7 +1670,8 @@ static int write_dvb_sub(struct lib_cc_decode *dec_ctx, struct cc_subtitle *sub) rect->ocr_text = NULL; dbg_print(CCX_DMT_DVB, "\nOCR Result: %s\n", rect->ocr_text ? 
rect->ocr_text : "NULL"); } - else { + else + { rect->ocr_text = NULL; } #endif @@ -1889,7 +1891,7 @@ int parse_dvb_description(struct dvb_config* cfg, unsigned char*data, /* setting language to undefined if not found in language lkup table */ char lang_name[4]; dbg_print(CCX_DMT_DVB, "DVBSUB - LANGUAGE \""); - + for(int char_index = 0; char_index < 3; char_index++) { lang_name[char_index] = cctolower(data[char_index]); diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c index daea418d..bf7936e3 100644 --- a/src/lib_ccx/ocr.c +++ b/src/lib_ccx/ocr.c @@ -1,7 +1,7 @@ #include "png.h" #include "lib_ccx.h" #ifdef ENABLE_OCR -#include "capi.h" +#include "tesseract/capi.h" #include "ccx_common_constants.h" #include "allheaders.h" #include @@ -28,14 +28,14 @@ static int check_trans_tn_intensity(const void *p1, const void *p2, void *arg) unsigned char act_i; /** TODO verify that RGB follow ITU-R BT.709 * Below formula is valid only for 709 standard - * Y = 0.2126 R + 0.7152 G + 0.0722 B - */ + * Y = 0.2126 R + 0.7152 G + 0.0722 B + */ tmp_i = (0.2126 * ti->palette[*tmp].red) + (0.7152 * ti->palette[*tmp].green) + (0.0722 * ti->palette[*tmp].blue); act_i = (0.2126 * ti->palette[*act].red) + (0.7152 * ti->palette[*act].green) + (0.0722 * ti->palette[*act].blue); - if (ti->t[*tmp] < ti->t[*act] || (ti->t[*tmp] == ti->t[*act] && tmp_i < act_i)) + if (ti->t[*tmp] < ti->t[*act] || (ti->t[*tmp] == ti->t[*act] && tmp_i < act_i)) return -1; - else if (ti->t[*tmp] == ti->t[*act] && tmp_i == act_i) + else if (ti->t[*tmp] == ti->t[*act] && tmp_i == act_i) return 0; return 1; @@ -43,10 +43,13 @@ static int check_trans_tn_intensity(const void *p1, const void *p2, void *arg) static int search_language_pack(const char *dir_name,const char *lang_name) { + if (!dir_name) + return -1; + //Search for a tessdata folder in the specified directory char *dirname = strdup(dir_name); - dirname = realloc(dirname,strlen(dirname)+strlen("/tessdata/")+1); - strcat(dirname,"/tessdata/"); + dirname = 
realloc(dirname,strlen(dirname)+strlen("tessdata/")+1); + strcat(dirname,"tessdata/"); DIR *dp; struct dirent *dirp; @@ -79,15 +82,62 @@ void delete_ocr (void** arg) freep(arg); } +/** + * probe_tessdata_location + * + * This function probes for the tesseract data location + * + * Priority of Tesseract traineddata file search paths:- + * 1. tessdata in TESSDATA_PREFIX, if it is specified. Overrides others + * 2. tessdata in current working directory + * 3. tessdata in /usr/share, /usr/local/share, /usr/share/tesseract-ocr, then /usr/share/tesseract-ocr/4.00 + */ +char* probe_tessdata_location(int lang_index) +{ + int ret = 0; + char *tessdata_dir_path = getenv("TESSDATA_PREFIX"); + + ret = search_language_pack(tessdata_dir_path, language[lang_index]); + if (!ret) + return tessdata_dir_path; + + tessdata_dir_path = "./"; + ret = search_language_pack(tessdata_dir_path,language[lang_index]); + if (!ret) + return tessdata_dir_path; + + tessdata_dir_path = "/usr/share/"; + ret = search_language_pack(tessdata_dir_path, language[lang_index]); + if (!ret) + return tessdata_dir_path; + + tessdata_dir_path = "/usr/local/share/"; + ret = search_language_pack(tessdata_dir_path, language[lang_index]); + if (!ret) + return tessdata_dir_path; + + tessdata_dir_path = "/usr/share/tesseract-ocr/"; + ret = search_language_pack(tessdata_dir_path, language[lang_index]); + if (!ret) + return tessdata_dir_path; + + tessdata_dir_path = "/usr/share/tesseract-ocr/4.00/"; + ret = search_language_pack(tessdata_dir_path, language[lang_index]); + if (!ret) + return tessdata_dir_path; + + return NULL; +} + void* init_ocr(int lang_index) { int ret = -1; struct ocrCtx* ctx; + const char* lang = NULL, *tessdata_path = NULL; ctx = (struct ocrCtx*)malloc(sizeof(struct ocrCtx)); if(!ctx) return NULL; - ctx->api = TessBaseAPICreate(); /* if language was undefined use english */ if(lang_index == 0) @@ -102,53 +152,53 @@ void* init_ocr(int lang_index) goto fail; } - /*Priority of Tesseract traineddata file search paths:- - 1. tessdata in TESSDATA_PREFIX, if it is specified.
Overrides others - 2. tessdata in current working directory - 3. tessdata in /usr/share - */ - int data_location = 0; - char *tessdata_dir_path="."; - if(!getenv("TESSDATA_PREFIX")) - { - ret = search_language_pack(tessdata_dir_path,language[lang_index]); - } - if(ret < 0) - { - data_location = 1; - if(getenv("TESSDATA_PREFIX")) - ret = search_language_pack(getenv("TESSDATA_PREFIX"), language[lang_index]); - else - ret = search_language_pack("/usr/share", language[lang_index]); - } - if(ret < 0 && lang_index != 1 && ccx_options.ocrlang==NULL) + tessdata_path = probe_tessdata_location(lang_index); + if(!tessdata_path) { + if (lang_index == 1) + { + mprint("eng.traineddata not found! No Switching Possible\n"); + return NULL; + } mprint("%s.traineddata not found! Switching to English\n",language[lang_index]); - /* select english */ lang_index = 1; + tessdata_path = probe_tessdata_location(lang_index); + if(!tessdata_path) + { + mprint("eng.traineddata not found! No Switching Possible\n"); + return NULL; + } } - const char* lang = NULL, *tessdata_path = NULL; if (ccx_options.ocrlang) lang = ccx_options.ocrlang; - else if (data_location == 1) + else lang = language[lang_index]; - else { - lang = language[lang_index]; - tessdata_path = tessdata_dir_path; - } char* pars_vec = strdup("debug_file"); - char* pars_values = strdup("/dev/null"); + char* pars_values = strdup("tess.log"); - ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, - &pars_values, 1, false); + ctx->api = TessBaseAPICreate(); + if (!strncmp("4.", TessVersion(), 2)) + { + char tess_path [1024]; + snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); + //ccx_options.ocr_oem is deprecated here; the only supported mode is OEM_LSTM_ONLY + ret = TessBaseAPIInit4(ctx->api, tess_path, lang, 1, NULL, 0, &pars_vec, + &pars_values, 1, false); + } + else + { + ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, +
&pars_values, 1, false); + } free(pars_vec); free(pars_values); if(ret < 0) { + mprint("Failed TessBaseAPIInit4 %d\n", ret); goto fail; } return ctx; @@ -297,18 +347,21 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i TessPageIteratorLevel level = RIL_WORD; TessBaseAPISetImage2(ctx->api, color_pix_out); tess_ret = TessBaseAPIRecognize(ctx->api, NULL); - if (tess_ret != 0) { + if (tess_ret != 0) + { mprint("\nTessBaseAPIRecognize returned %d, skipping this bitmap.\n", tess_ret); } - else + else + { ri = TessBaseAPIGetIterator(ctx->api); + } if(!tess_ret && ri!=0) { do { char* word = TessResultIteratorGetUTF8Text(ri,level); - float conf = TessResultIteratorConfidence(ri,level); + // float conf = TessResultIteratorConfidence(ri,level); int x1, y1, x2, y2; if (!TessPageIteratorBoundingBox((TessPageIterator *)ri, level, &x1, &y1, &x2, &y2)) continue; @@ -325,7 +378,6 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i uint32_t *histogram = NULL; uint8_t *iot = NULL; uint32_t *mcit = NULL; - int ret = 0; int max_color=2; histogram = (uint32_t*) malloc(copy->nb_colors * sizeof(uint32_t)); @@ -334,7 +386,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i struct transIntensity ti = {copy->alpha,copy->palette}; memset(histogram, 0, copy->nb_colors * sizeof(uint32_t)); - /* initializing intensity ordered table with serial order of unsorted color table */ + /* initializing intensity ordered table with serial order of unsorted color table */ for (int i = 0; i < copy->nb_colors; i++) { iot[i] = i; @@ -342,7 +394,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i memset(mcit, 0, copy->nb_colors * sizeof(uint32_t)); /* calculate histogram of image */ - int firstpixel = copy->data[0]; //TODO: Verify this border pixel assumption holds + int firstpixel = copy->data[0]; //TODO: Verify this border pixel assumption holds for(int i=y1;i<=y2;i++) { 
for(int j=x1;j<=x2;j++) @@ -360,7 +412,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i // i, iot[i], histogram[iot[i]]); // } /** - * using selection sort since need to find only max_color + * using selection sort since need to find only max_color * Histogram becomes invalid in this loop */ for (int i = 0; i < max_color; i++) @@ -390,7 +442,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i palette[i].blue = copy->palette[i].blue; alpha[i]=copy->alpha[i]; } - + for (int i = 0, mxi = 0; i < copy->nb_colors; i++) { int step, inc; @@ -419,13 +471,13 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i } } - - // Detecting the color present in quantized word image + + // Detecting the color present in quantized word image int r_avg=0,g_avg=0,b_avg=0,denom=0; for (int i = 0; i < copy->nb_colors; i++) { if(palette[i].red == ((copy->bgcolor >> 16) & 0xff) && - palette[i].green == ((copy->bgcolor >> 8) & 0xff) && + palette[i].green == ((copy->bgcolor >> 8) & 0xff) && palette[i].blue == ((copy->bgcolor >> 0) & 0xff)) continue; denom++; @@ -510,7 +562,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i int length_closing_font = 7; // exclude '\0' char *line_start = text_out; - int length = strlen(text_out) + length_closing_font * 10; // usually enough + int length = strlen(text_out) + length_closing_font * 10; // usually enough char *new_text_out = malloc(length); char *new_text_out_iter = new_text_out; @@ -543,7 +595,6 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i long diff = new_text_out_iter - new_text_out; new_text_out = realloc(new_text_out, length); new_text_out_iter = new_text_out + diff; - } // Add to the beginning of the line if it is missing @@ -575,7 +626,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i // Add if it is indeed missing if (line_end - 
line_start < length_closing_font || strncmp(line_start, closing_font, length_closing_font)) { - + memcpy(new_text_out_iter, closing_font, length_closing_font); new_text_out_iter += length_closing_font; @@ -600,7 +651,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i pixDestroy(&cpix_gs); pixDestroy(&color_pix); pixDestroy(&color_pix_out); - + return text_out; } @@ -653,7 +704,7 @@ static int quantize_map(png_byte *alpha, png_color *palette, memset(histogram, 0, nb_color * sizeof(uint32_t)); - /* initializing intensity ordered table with serial order of unsorted color table */ + /* initializing intensity ordered table with serial order of unsorted color table */ for (int i = 0; i < nb_color; i++) { iot[i] = i; @@ -677,7 +728,7 @@ static int quantize_map(png_byte *alpha, png_color *palette, } #endif /** - * using selection sort since need to find only max_color + * using selection sort since need to find only max_color * Histogram becomes invalid in this loop */ for (int i = 0; i < max_color; i++) @@ -702,7 +753,7 @@ static int quantize_map(png_byte *alpha, png_color *palette, } #ifdef OCR_DEBUG - ccx_common_logging.log_ftn("max redundant intensities table\n"); + ccx_common_logging.log_ftn("max redundant intensities table\n"); for (int i = 0; i < max_color; i++) { ccx_common_logging.log_ftn("%02d) mcit %02d\n", @@ -756,7 +807,7 @@ int ocr_rect(void* arg, struct cc_bitmap *rect, char **str, int bgcolor, int ocr int ret = 0; png_color *palette = NULL; png_byte *alpha = NULL; - + struct image_copy *copy; copy = (struct image_copy *)malloc(sizeof(struct image_copy)); copy->nb_colors = rect->nb_colors; @@ -843,10 +894,7 @@ int compare_rect_by_ypos(const void*p1, const void *p2, void*arg) if(r1->x > r2->x) return 1; } - else - { - return -1; - } + return -1; } void add_ocrtext2str(char *dest, char *src, const char *crlf, unsigned crlf_length) diff --git a/src/lib_ccx/params.c b/src/lib_ccx/params.c index eb8e6c82..42f7f70f 100644 --- 
a/src/lib_ccx/params.c +++ b/src/lib_ccx/params.c @@ -16,7 +16,7 @@ #include "utf8proc/utf8proc.h" #ifdef ENABLE_OCR -#include "capi.h" +#include "tesseract/capi.h" #include "allheaders.h" #endif