#!/bin/sh -ex #################################################################### # setup by tracey apr 2012 # updated version dec 2016 # see: http://www.ccextractor.org/doku.php #################################################################### # build it static! # simplest way is with linux alpine # hop onto box with docker on it and cd to dir of the file you are staring at # You will get a static-compiled binary and english language library file in the end. if [ ! -e /tmp/cc/ccextractor-README.txt ]; then rm -rf /tmp/cc; mkdir -p -m777 /tmp/cc; mkdir -p -m777 ../lib/tessdata/; cp ccextractor-README.txt /tmp/cc/; sudo docker run -v /tmp/cc:/tmp/cc --rm -it alpine:latest /tmp/cc/ccextractor-README.txt; # NOTE: _AFTER_ testing/validating, you can promote it from "ccextractor.next" to "ccextractor"... ;-) cp /tmp/cc/*traineddata ../lib/tessdata/; chmod go-w ../lib/tessdata/; exit 0; fi # NOW we are inside docker container... cd /tmp/cc; # we want tesseract (for OCR) echo ' http://dl-cdn.alpinelinux.org/alpine/v3.5/main http://dl-cdn.alpinelinux.org/alpine/v3.5/community ' >| /etc/apk/repositories; apk update; apk upgrade; apk add --update bash zsh alpine-sdk perl; # (needed by various static builds below) # Even though we're going to (re)builid tesseract from source statically, get its dependencies setup by # installing it now, too. apk add autoconf automake libtool tesseract-ocr-dev; # Now comes the not-so-fun parts... Many packages _only_ provide .so files in their distros -- not the .a # needed files for building something with it statically. Step through them now... # libgif wget https://sourceforge.net/projects/giflib/files/giflib-5.1.4.tar.gz; zcat giflib*tar.gz | tar xf -; cd giflib*/; ./configure --disable-shared --enable-static; make; make install; hash -r; cd -; # libwebp git clone https://github.com/webmproject/libwebp; cd libwebp; ./autogen.sh; ./configure --disable-shared --enable-static; make; make install; cd -; # leptonica wget http://www.leptonica.org/source/leptonica-1.73.tar.gz; zcat leptonica*tar.gz | tar xf -; cd leptonica*/; ./configure --disable-shared --enable-static; make; make install; hash -r; cd -; # tesseract git clone https://github.com/tesseract-ocr/tesseract; cd tesseract; ./autogen.sh; ./configure --disable-shared --enable-static; make; make install; cd -; # ccextractor -- build static git clone https://github.com/CCExtractor/ccextractor; cd ccextractor/linux/; # wget https://sourceforge.net/projects/ccextractor/files/ccextractor/0.82/ccextractor.src.0.82.zip; # unzip ccextractor*.zip; # cd ccextractor.*/linux/; perl -i -pe 's/O3 /O3 -static /' Makefile; # quick patch: perl -i -pe 's/(strchr|strstr)\(/$1((char *)/' ../src/gpacmp4/url.c ../src/gpacmp4/error.c; set +e; # this _will_ FAIL at the end.. make ENABLE_OCR=yes; set -e; # I confess hand-compiling (cherrypicking which .a to use when there are 2, etc.) is fragile... # But it was the _only_ way I could get a fully static build after hours of thrashing... gcc -Wno-write-strings -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -O3 -std=gnu99 -s -DGPAC_CONFIG_LINUX -DENABLE_OCR -DPNG_NO_CONFIG_H -I/usr/local/include/tesseract -I/usr/local/include/leptonica objs/*.o -o ccextractor \ --static -lm \ /usr/local/lib/libtesseract.a \ /usr/local/lib/liblept.a \ /usr/local/lib/libgif.a \ /usr/local/lib/libwebp.a \ /usr/lib/libjpeg.a \ /usr/lib/libtiff.a \ /usr/lib/libgomp.a \ -lstdc++; cp ccextractor /tmp/cc/ccextractor.next; cd -; # get english lang trained data wget https://github.com/tesseract-ocr/tessdata/raw/master/eng.traineddata;