#include "lib_ccx.h" #include "utility.h" #ifdef ENABLE_HARDSUBX //TODO: Correct FFMpeg integration #include #include #include #include #include "allheaders.h" #include "hardsubx.h" #include "capi.h" char* _process_frame_white_basic(struct lib_hardsubx_ctx *ctx, AVFrame *frame, int width, int height, int index) { //printf("frame : %04d\n", index); PIX *im; PIX *edge_im; PIX *lum_im; PIX *feat_im; char *subtitle_text=NULL; im = pixCreate(width,height,32); lum_im = pixCreate(width,height,32); feat_im = pixCreate(width,height,32); int i,j; for(i=(3*height)/4;ilinesize[0]; int r=frame->data[0][p]; int g=frame->data[0][p+1]; int b=frame->data[0][p+2]; pixSetRGBPixel(im,j,i,r,g,b); float L,A,B; rgb_to_lab((float)r,(float)g,(float)b,&L,&A,&B); if(L > ctx->lum_thresh) pixSetRGBPixel(lum_im,j,i,255,255,255); else pixSetRGBPixel(lum_im,j,i,0,0,0); } } //Handle the edge image edge_im = pixCreate(width,height,8); edge_im = pixConvertRGBToGray(im,0.0,0.0,0.0); edge_im = pixSobelEdgeFilter(edge_im, L_VERTICAL_EDGES); edge_im = pixDilateGray(edge_im, 21, 11); edge_im = pixThresholdToBinary(edge_im,50); for(i=3*(height/4);i0) pixSetRGBPixel(feat_im,j,i,255,255,255); else pixSetRGBPixel(feat_im,j,i,0,0,0); } } if(ctx->detect_italics) { ctx->ocr_mode = HARDSUBX_OCRMODE_WORD; } // TESSERACT OCR FOR THE FRAME HERE switch(ctx->ocr_mode) { case HARDSUBX_OCRMODE_WORD: if(ctx->conf_thresh > 0) subtitle_text = get_ocr_text_wordwise_threshold(ctx, lum_im, ctx->conf_thresh); else subtitle_text = get_ocr_text_wordwise(ctx, lum_im); break; case HARDSUBX_OCRMODE_LETTER: if(ctx->conf_thresh > 0) subtitle_text = get_ocr_text_letterwise_threshold(ctx, lum_im, ctx->conf_thresh); else subtitle_text = get_ocr_text_letterwise(ctx, lum_im); break; case HARDSUBX_OCRMODE_FRAME: if(ctx->conf_thresh > 0) subtitle_text = get_ocr_text_simple_threshold(ctx, lum_im, ctx->conf_thresh); else subtitle_text = get_ocr_text_simple(ctx, lum_im); break; default: fatal(EXIT_MALFORMED_PARAMETER,"Invalid OCR Mode"); } pixDestroy(&lum_im); pixDestroy(&im); pixDestroy(&edge_im); pixDestroy(&feat_im); return subtitle_text; } char *_process_frame_color_basic(struct lib_hardsubx_ctx *ctx, AVFrame *frame, int width, int height, int index) { char *subtitle_text=NULL; PIX *im; im = pixCreate(width,height,32); PIX *hue_im = pixCreate(width,height,32); int i,j; for(i=0;ilinesize[0]; int r=frame->data[0][p]; int g=frame->data[0][p+1]; int b=frame->data[0][p+2]; pixSetRGBPixel(im,j,i,r,g,b); float H,S,V; rgb_to_hsv((float)r,(float)g,(float)b,&H,&S,&V); if(abs(H-ctx->hue)<20) { pixSetRGBPixel(hue_im,j,i,r,g,b); } } } PIX *edge_im = pixCreate(width,height,8),*edge_im_2 = pixCreate(width,height,8); edge_im = pixConvertRGBToGray(im,0.0,0.0,0.0); edge_im = pixSobelEdgeFilter(edge_im, L_VERTICAL_EDGES); edge_im = pixDilateGray(edge_im, 21, 1); edge_im = pixThresholdToBinary(edge_im,50); PIX *pixd = pixCreate(width,height,1); pixSauvolaBinarize(pixConvertRGBToGray(hue_im,0.0,0.0,0.0), 15, 0.3, 1, NULL, NULL, NULL, &pixd); edge_im_2 = pixConvertRGBToGray(hue_im,0.0,0.0,0.0); edge_im_2 = pixDilateGray(edge_im_2, 5, 5); PIX *feat_im = pixCreate(width,height,32); for(i=3*(height/4);i0)//if(p4>0&&p1==0)//if(p2==0&&p1==0&&p3>0) { pixSetRGBPixel(feat_im,j,i,255,255,255); } } } if(ctx->detect_italics) { ctx->ocr_mode = HARDSUBX_OCRMODE_WORD; } // TESSERACT OCR FOR THE FRAME HERE switch(ctx->ocr_mode) { case HARDSUBX_OCRMODE_WORD: if(ctx->conf_thresh > 0) subtitle_text = get_ocr_text_wordwise_threshold(ctx, feat_im, ctx->conf_thresh); else subtitle_text = get_ocr_text_wordwise(ctx, feat_im); break; case HARDSUBX_OCRMODE_LETTER: if(ctx->conf_thresh > 0) subtitle_text = get_ocr_text_letterwise_threshold(ctx, feat_im, ctx->conf_thresh); else subtitle_text = get_ocr_text_letterwise(ctx, feat_im); break; case HARDSUBX_OCRMODE_FRAME: if(ctx->conf_thresh > 0) subtitle_text = get_ocr_text_simple_threshold(ctx, feat_im, ctx->conf_thresh); else subtitle_text = get_ocr_text_simple(ctx, feat_im); break; default: fatal(EXIT_MALFORMED_PARAMETER,"Invalid OCR Mode"); } pixDestroy(&feat_im); pixDestroy(&im); pixDestroy(&edge_im); pixDestroy(&hue_im); return subtitle_text; } void _display_frame(struct lib_hardsubx_ctx *ctx, AVFrame *frame, int width, int height, int timestamp) { // Debug: Display the frame after processing PIX *im; im = pixCreate(width,height,32); PIX *hue_im = pixCreate(width,height,32); int i,j; for(i=0;ilinesize[0]; int r=frame->data[0][p]; int g=frame->data[0][p+1]; int b=frame->data[0][p+2]; pixSetRGBPixel(im,j,i,r,g,b); float H,S,V; rgb_to_hsv((float)r,(float)g,(float)b,&H,&S,&V); if(abs(H-ctx->hue)<20) { pixSetRGBPixel(hue_im,j,i,r,g,b); } } } PIX *edge_im = pixCreate(width,height,8),*edge_im_2 = pixCreate(width,height,8); edge_im = pixConvertRGBToGray(im,0.0,0.0,0.0); edge_im = pixSobelEdgeFilter(edge_im, L_VERTICAL_EDGES); edge_im = pixDilateGray(edge_im, 21, 1); edge_im = pixThresholdToBinary(edge_im,50); PIX *pixd = pixCreate(width,height,1); pixSauvolaBinarize(pixConvertRGBToGray(hue_im,0.0,0.0,0.0), 15, 0.3, 1, NULL, NULL, NULL, &pixd); edge_im_2 = pixConvertRGBToGray(hue_im,0.0,0.0,0.0); edge_im_2 = pixDilateGray(edge_im_2, 5, 5); PIX *feat_im = pixCreate(width,height,32); for(i=3*(height/4);i0)//if(p4>0&&p1==0)//if(p2==0&&p1==0&&p3>0) { pixSetRGBPixel(feat_im,j,i,255,255,255); } } } char *txt=NULL; // txt = get_ocr_text_simple(ctx, feat_im); // txt=get_ocr_text_wordwise_threshold(ctx, feat_im, ctx->conf_thresh); // if(txt != NULL)printf("%s\n", txt); pixDestroy(&im); pixDestroy(&edge_im); pixDestroy(&feat_im); pixDestroy(&edge_im_2); pixDestroy(&pixd); } int hardsubx_process_frames_linear(struct lib_hardsubx_ctx *ctx, struct encoder_ctx *enc_ctx) { // Do an exhaustive linear search over the video int got_frame; int dist; int cur_sec,total_sec,progress; int frame_number = 0; int64_t begin_time = 0,end_time = 0,prev_packet_pts = 0; char *subtitle_text=NULL; char *prev_subtitle_text=NULL; while(av_read_frame(ctx->format_ctx, &ctx->packet)>=0) { if(ctx->packet.stream_index == ctx->video_stream_id) { frame_number++; //Decode the video stream packet avcodec_decode_video2(ctx->codec_ctx, ctx->frame, &got_frame, &ctx->packet); if(got_frame && frame_number % 25 == 0) { float diff = (float)convert_pts_to_ms(ctx->packet.pts - prev_packet_pts, ctx->format_ctx->streams[ctx->video_stream_id]->time_base); if(abs(diff) < 1000*ctx->min_sub_duration) //If the minimum duration of a subtitle line is exceeded, process packet continue; // sws_scale is used to convert the pixel format to RGB24 from all other cases sws_scale( ctx->sws_ctx, (uint8_t const * const *)ctx->frame->data, ctx->frame->linesize, 0, ctx->codec_ctx->height, ctx->rgb_frame->data, ctx->rgb_frame->linesize ); // Send the frame to other functions for processing if(ctx->subcolor==HARDSUBX_COLOR_WHITE) { subtitle_text = _process_frame_white_basic(ctx,ctx->rgb_frame,ctx->codec_ctx->width,ctx->codec_ctx->height,frame_number); } else { subtitle_text = _process_frame_color_basic(ctx, ctx->rgb_frame, ctx->codec_ctx->width,ctx->codec_ctx->height,frame_number); } _display_frame(ctx, ctx->rgb_frame,ctx->codec_ctx->width,ctx->codec_ctx->height,frame_number); cur_sec = (int)convert_pts_to_s(ctx->packet.pts, ctx->format_ctx->streams[ctx->video_stream_id]->time_base); total_sec = (int)convert_pts_to_s(ctx->format_ctx->duration, AV_TIME_BASE_Q); progress = (cur_sec*100)/total_sec; activity_progress(progress,cur_sec/60,cur_sec%60); if(subtitle_text==NULL) continue; if(!strlen(subtitle_text)) continue; char *double_enter = strstr(subtitle_text,"\n\n"); if(double_enter!=NULL) *(double_enter)='\0'; //subtitle_text = prune_string(subtitle_text); end_time = convert_pts_to_ms(ctx->packet.pts, ctx->format_ctx->streams[ctx->video_stream_id]->time_base); if(prev_subtitle_text) { //TODO: Encode text with highest confidence dist = edit_distance(subtitle_text, prev_subtitle_text, strlen(subtitle_text), strlen(prev_subtitle_text)); if(dist > (0.2 * fmin(strlen(subtitle_text), strlen(prev_subtitle_text)))) { add_cc_sub_text(ctx->dec_sub, prev_subtitle_text, begin_time, end_time, "", "BURN", CCX_ENC_UTF_8); encode_sub(enc_ctx, ctx->dec_sub); begin_time = end_time + 1; } } // if(ctx->conf_thresh > 0) // { // if(ctx->cur_conf >= ctx->prev_conf) // { // prev_subtitle_text = strdup(subtitle_text); // ctx->prev_conf = ctx->cur_conf; // } // } // else // { // prev_subtitle_text = strdup(subtitle_text); // } prev_subtitle_text = strdup(subtitle_text); prev_packet_pts = ctx->packet.pts; } } av_packet_unref(&ctx->packet); } add_cc_sub_text(ctx->dec_sub, prev_subtitle_text, begin_time, end_time, "", "BURN", CCX_ENC_UTF_8); encode_sub(enc_ctx, ctx->dec_sub); activity_progress(100,cur_sec/60,cur_sec%60); } int hardsubx_process_frames_binary(struct lib_hardsubx_ctx *ctx) { // Do a binary search over the input video for faster processing // printf("Duration: %d\n", (int)ctx->format_ctx->duration); int got_frame; int seconds_time = 0; for(seconds_time=0;seconds_time<20;seconds_time++){ int64_t seek_time = (int64_t)(seconds_time*AV_TIME_BASE); seek_time = av_rescale_q(seek_time, AV_TIME_BASE_Q, ctx->format_ctx->streams[ctx->video_stream_id]->time_base); int ret = av_seek_frame(ctx->format_ctx, ctx->video_stream_id, seek_time, AVSEEK_FLAG_BACKWARD); // printf("%d\n", ret); // if(ret < 0) // { // printf("seeking back\n"); // ret = av_seek_frame(ctx->format_ctx, -1, seek_time, AVSEEK_FLAG_BACKWARD); // } if(ret >= 0) { while(av_read_frame(ctx->format_ctx, &ctx->packet)>=0) { if(ctx->packet.stream_index == ctx->video_stream_id) { avcodec_decode_video2(ctx->codec_ctx, ctx->frame, &got_frame, &ctx->packet); if(got_frame) { // printf("%d\n", seek_time); if(ctx->packet.pts < seek_time) continue; // printf("GOT FRAME: %d\n",ctx->packet.pts); // sws_scale is used to convert the pixel format to RGB24 from all other cases sws_scale( ctx->sws_ctx, (uint8_t const * const *)ctx->frame->data, ctx->frame->linesize, 0, ctx->codec_ctx->height, ctx->rgb_frame->data, ctx->rgb_frame->linesize ); // Send the frame to other functions for processing _display_frame(ctx, ctx->rgb_frame,ctx->codec_ctx->width,ctx->codec_ctx->height,seconds_time); break; } } } } else { printf("Seeking to timestamp failed\n"); } } } #endif