Add support for Tesseract 4.0

This commit is contained in:
Anshul Maheshwari 2018-11-07 12:53:26 +05:30
parent 5df1dbb922
commit 5dbbe654f0
4 changed files with 116 additions and 65 deletions

View File

@ -1,5 +1,6 @@
0.88(2018-10-24) (unreleased)
-----------------
- New: Add support for tesseract 4.0
- Optimize: Remove multiple RGB to grey conversion in OCR.
- Fix: Update UTF8Proc to 2.2.0
- Fix: Warn instead of fatal when a 0xFF marker is missing

View File

@ -1661,7 +1661,8 @@ static int write_dvb_sub(struct lib_cc_decode *dec_ctx, struct cc_subtitle *sub)
// Perform OCR
#ifdef ENABLE_OCR
char *ocr_str = NULL;
if (ctx->ocr_ctx) {
if (ctx->ocr_ctx)
{
ret = ocr_rect(ctx->ocr_ctx, rect, &ocr_str, region->bgcolor, dec_ctx->ocr_quantmode);
if (ret >= 0)
rect->ocr_text = ocr_str;
@ -1669,7 +1670,8 @@ static int write_dvb_sub(struct lib_cc_decode *dec_ctx, struct cc_subtitle *sub)
rect->ocr_text = NULL;
dbg_print(CCX_DMT_DVB, "\nOCR Result: %s\n", rect->ocr_text ? rect->ocr_text : "NULL");
}
else {
else
{
rect->ocr_text = NULL;
}
#endif
@ -1889,7 +1891,7 @@ int parse_dvb_description(struct dvb_config* cfg, unsigned char*data,
/* setting language to undefined if not found in language lkup table */
char lang_name[4];
dbg_print(CCX_DMT_DVB, "DVBSUB - LANGUAGE \"");
for(int char_index = 0; char_index < 3; char_index++)
{
lang_name[char_index] = cctolower(data[char_index]);

View File

@ -1,7 +1,7 @@
#include "png.h"
#include "lib_ccx.h"
#ifdef ENABLE_OCR
#include "capi.h"
#include "tesseract/capi.h"
#include "ccx_common_constants.h"
#include "allheaders.h"
#include <dirent.h>
@ -28,14 +28,14 @@ static int check_trans_tn_intensity(const void *p1, const void *p2, void *arg)
unsigned char act_i;
/** TODO verify that RGB follow ITU-R BT.709
* Below formula is valid only for 709 standard
* Y = 0.2126 R + 0.7152 G + 0.0722 B
*/
* Y = 0.2126 R + 0.7152 G + 0.0722 B
*/
tmp_i = (0.2126 * ti->palette[*tmp].red) + (0.7152 * ti->palette[*tmp].green) + (0.0722 * ti->palette[*tmp].blue);
act_i = (0.2126 * ti->palette[*act].red) + (0.7152 * ti->palette[*act].green) + (0.0722 * ti->palette[*act].blue);
if (ti->t[*tmp] < ti->t[*act] || (ti->t[*tmp] == ti->t[*act] && tmp_i < act_i))
if (ti->t[*tmp] < ti->t[*act] || (ti->t[*tmp] == ti->t[*act] && tmp_i < act_i))
return -1;
else if (ti->t[*tmp] == ti->t[*act] && tmp_i == act_i)
else if (ti->t[*tmp] == ti->t[*act] && tmp_i == act_i)
return 0;
return 1;
@ -43,10 +43,13 @@ static int check_trans_tn_intensity(const void *p1, const void *p2, void *arg)
static int search_language_pack(const char *dir_name,const char *lang_name)
{
if (!dir_name)
return -1;
//Search for a tessdata folder in the specified directory
char *dirname = strdup(dir_name);
dirname = realloc(dirname,strlen(dirname)+strlen("/tessdata/")+1);
strcat(dirname,"/tessdata/");
dirname = realloc(dirname,strlen(dirname)+strlen("tessdata/")+1);
strcat(dirname,"tessdata/");
DIR *dp;
struct dirent *dirp;
@ -79,15 +82,62 @@ void delete_ocr (void** arg)
freep(arg);
}
/**
 * probe_tessdata_location
 *
 * Find the directory whose "tessdata" sub-folder holds the language pack
 * for language[lang_index].
 *
 * Candidate directories are tried in the order below; the first one for
 * which search_language_pack() returns 0 is returned:
 *   1. the value of the TESSDATA_PREFIX environment variable (NULL if unset)
 *   2. ./
 *   3. /usr/share/
 *   4. /usr/local/share/
 *   5. /usr/share/tesseract-ocr/
 *   6. /usr/share/tesseract-ocr/4.00/
 *
 * @param lang_index index into the language[] table
 * @return the matching candidate directory, or NULL when none matched.
 *         The result is either the string returned by getenv() or a string
 *         literal, so it must not be freed or modified by the caller.
 */
char* probe_tessdata_location(int lang_index)
{
	/* Probe order matters: the environment override is always tried first. */
	char *candidates[] = {
		getenv("TESSDATA_PREFIX"),
		"./",
		"/usr/share/",
		"/usr/local/share/",
		"/usr/share/tesseract-ocr/",
		"/usr/share/tesseract-ocr/4.00/"
	};
	int candidate_count = (int)(sizeof(candidates) / sizeof(candidates[0]));

	for (int idx = 0; idx < candidate_count; idx++)
	{
		/* A NULL candidate (unset TESSDATA_PREFIX) is passed through unchanged,
		 * leaving it to search_language_pack() to reject it. */
		if (search_language_pack(candidates[idx], language[lang_index]) == 0)
			return candidates[idx];
	}

	return NULL;
}
void* init_ocr(int lang_index)
{
int ret = -1;
struct ocrCtx* ctx;
const char* lang = NULL, *tessdata_path = NULL;
ctx = (struct ocrCtx*)malloc(sizeof(struct ocrCtx));
if(!ctx)
return NULL;
ctx->api = TessBaseAPICreate();
/* if language was undefined use english */
if(lang_index == 0)
@ -102,53 +152,53 @@ void* init_ocr(int lang_index)
goto fail;
}
/*Priority of Tesseract traineddata file search paths:-
1. tessdata in TESSDATA_PREFIX, if it is specified. Overrides others
2. tessdata in current working directory
3. tessdata in /usr/share
*/
int data_location = 0;
char *tessdata_dir_path=".";
if(!getenv("TESSDATA_PREFIX"))
{
ret = search_language_pack(tessdata_dir_path,language[lang_index]);
}
if(ret < 0)
{
data_location = 1;
if(getenv("TESSDATA_PREFIX"))
ret = search_language_pack(getenv("TESSDATA_PREFIX"), language[lang_index]);
else
ret = search_language_pack("/usr/share", language[lang_index]);
}
if(ret < 0 && lang_index != 1 && ccx_options.ocrlang==NULL)
tessdata_path = probe_tessdata_location(lang_index);
if(!tessdata_path)
{
if (lang_index == 1)
{
mprint("eng.traineddata not found! No Switching Possible\n");
return NULL;
}
mprint("%s.traineddata not found! Switching to English\n",language[lang_index]);
/* select english */
lang_index = 1;
tessdata_path = probe_tessdata_location(lang_index);
if(!tessdata_path)
{
mprint("eng.traineddata not found! No Switching Possible\n");
return NULL;
}
}
const char* lang = NULL, *tessdata_path = NULL;
if (ccx_options.ocrlang)
lang = ccx_options.ocrlang;
else if (data_location == 1)
else
lang = language[lang_index];
else {
lang = language[lang_index];
tessdata_path = tessdata_dir_path;
}
char* pars_vec = strdup("debug_file");
char* pars_values = strdup("/dev/null");
char* pars_values = strdup("tess.log");
ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
&pars_values, 1, false);
ctx->api = TessBaseAPICreate();
if (!strncmp("4.", TessVersion(), 2))
{
char tess_path [1024];
snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata");
//ccx_options.ocr_oem is deprecated; the only supported mode is OEM_LSTM_ONLY
ret = TessBaseAPIInit4(ctx->api, tess_path, lang, 1, NULL, 0, &pars_vec,
&pars_values, 1, false);
}
else
{
ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
&pars_values, 1, false);
}
free(pars_vec);
free(pars_values);
if(ret < 0)
{
mprint("Failed TessBaseAPIInit4 %d\n", ret);
goto fail;
}
return ctx;
@ -297,18 +347,21 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
TessPageIteratorLevel level = RIL_WORD;
TessBaseAPISetImage2(ctx->api, color_pix_out);
tess_ret = TessBaseAPIRecognize(ctx->api, NULL);
if (tess_ret != 0) {
if (tess_ret != 0)
{
mprint("\nTessBaseAPIRecognize returned %d, skipping this bitmap.\n", tess_ret);
}
else
else
{
ri = TessBaseAPIGetIterator(ctx->api);
}
if(!tess_ret && ri!=0)
{
do
{
char* word = TessResultIteratorGetUTF8Text(ri,level);
float conf = TessResultIteratorConfidence(ri,level);
// float conf = TessResultIteratorConfidence(ri,level);
int x1, y1, x2, y2;
if (!TessPageIteratorBoundingBox((TessPageIterator *)ri, level, &x1, &y1, &x2, &y2))
continue;
@ -325,7 +378,6 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
uint32_t *histogram = NULL;
uint8_t *iot = NULL;
uint32_t *mcit = NULL;
int ret = 0;
int max_color=2;
histogram = (uint32_t*) malloc(copy->nb_colors * sizeof(uint32_t));
@ -334,7 +386,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
struct transIntensity ti = {copy->alpha,copy->palette};
memset(histogram, 0, copy->nb_colors * sizeof(uint32_t));
/* initializing intensity ordered table with serial order of unsorted color table */
/* initializing intensity ordered table with serial order of unsorted color table */
for (int i = 0; i < copy->nb_colors; i++)
{
iot[i] = i;
@ -342,7 +394,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
memset(mcit, 0, copy->nb_colors * sizeof(uint32_t));
/* calculate histogram of image */
int firstpixel = copy->data[0]; //TODO: Verify this border pixel assumption holds
int firstpixel = copy->data[0]; //TODO: Verify this border pixel assumption holds
for(int i=y1;i<=y2;i++)
{
for(int j=x1;j<=x2;j++)
@ -360,7 +412,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
// i, iot[i], histogram[iot[i]]);
// }
/**
* using selection sort since need to find only max_color
* using selection sort since need to find only max_color
* Histogram becomes invalid in this loop
*/
for (int i = 0; i < max_color; i++)
@ -390,7 +442,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
palette[i].blue = copy->palette[i].blue;
alpha[i]=copy->alpha[i];
}
for (int i = 0, mxi = 0; i < copy->nb_colors; i++)
{
int step, inc;
@ -419,13 +471,13 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
}
}
// Detecting the color present in quantized word image
// Detecting the color present in quantized word image
int r_avg=0,g_avg=0,b_avg=0,denom=0;
for (int i = 0; i < copy->nb_colors; i++)
{
if(palette[i].red == ((copy->bgcolor >> 16) & 0xff) &&
palette[i].green == ((copy->bgcolor >> 8) & 0xff) &&
palette[i].green == ((copy->bgcolor >> 8) & 0xff) &&
palette[i].blue == ((copy->bgcolor >> 0) & 0xff))
continue;
denom++;
@ -510,7 +562,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
int length_closing_font = 7; // exclude '\0'
char *line_start = text_out;
int length = strlen(text_out) + length_closing_font * 10; // usually enough
int length = strlen(text_out) + length_closing_font * 10; // usually enough
char *new_text_out = malloc(length);
char *new_text_out_iter = new_text_out;
@ -543,7 +595,6 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
long diff = new_text_out_iter - new_text_out;
new_text_out = realloc(new_text_out, length);
new_text_out_iter = new_text_out + diff;
}
// Add <font> to the beginning of the line if it is missing
@ -575,7 +626,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
// Add </font> if it is indeed missing
if (line_end - line_start < length_closing_font ||
strncmp(line_start, closing_font, length_closing_font)) {
memcpy(new_text_out_iter, closing_font, length_closing_font);
new_text_out_iter += length_closing_font;
@ -600,7 +651,7 @@ char* ocr_bitmap(void* arg, png_color *palette,png_byte *alpha, unsigned char* i
pixDestroy(&cpix_gs);
pixDestroy(&color_pix);
pixDestroy(&color_pix_out);
return text_out;
}
@ -653,7 +704,7 @@ static int quantize_map(png_byte *alpha, png_color *palette,
memset(histogram, 0, nb_color * sizeof(uint32_t));
/* initializing intensity ordered table with serial order of unsorted color table */
/* initializing intensity ordered table with serial order of unsorted color table */
for (int i = 0; i < nb_color; i++)
{
iot[i] = i;
@ -677,7 +728,7 @@ static int quantize_map(png_byte *alpha, png_color *palette,
}
#endif
/**
* using selection sort since need to find only max_color
* using selection sort since need to find only max_color
* Histogram becomes invalid in this loop
*/
for (int i = 0; i < max_color; i++)
@ -702,7 +753,7 @@ static int quantize_map(png_byte *alpha, png_color *palette,
}
#ifdef OCR_DEBUG
ccx_common_logging.log_ftn("max redundant intensities table\n");
ccx_common_logging.log_ftn("max redundant intensities table\n");
for (int i = 0; i < max_color; i++)
{
ccx_common_logging.log_ftn("%02d) mcit %02d\n",
@ -756,7 +807,7 @@ int ocr_rect(void* arg, struct cc_bitmap *rect, char **str, int bgcolor, int ocr
int ret = 0;
png_color *palette = NULL;
png_byte *alpha = NULL;
struct image_copy *copy;
copy = (struct image_copy *)malloc(sizeof(struct image_copy));
copy->nb_colors = rect->nb_colors;
@ -843,10 +894,7 @@ int compare_rect_by_ypos(const void*p1, const void *p2, void*arg)
if(r1->x > r2->x)
return 1;
}
else
{
return -1;
}
return -1;
}
void add_ocrtext2str(char *dest, char *src, const char *crlf, unsigned crlf_length)

View File

@ -16,7 +16,7 @@
#include "utf8proc/utf8proc.h"
#ifdef ENABLE_OCR
#include "capi.h"
#include "tesseract/capi.h"
#include "allheaders.h"
#endif