# compile-tesseract.sh
# Builds Leptonica and Tesseract from source inside the AWS Lambda Node.js 18 (arm64)
# container image and packages the result as tesseract.tar.gz.
# Spin up and enter the docker container on your machine with the following command:
#   docker run -it --entrypoint /bin/bash public.ecr.aws/lambda/nodejs:18-arm64
# Then run the rest of the commands inside the container.

# Install the build toolchain and the -devel packages required for compilation.
# cmake and gcc are two separate packages. aclocal is not listed because it is
# not a package of its own: it is shipped as part of automake.
yum install -y \
  autoconf autogen automake cmake gcc gcc-c++ git libtool make \
  freetype-devel lcms2-devel libjpeg-devel libjpeg-turbo-devel \
  libpng-devel libtiff-devel libwebp-devel libzip-devel zlib-devel
# leptonica
# Clone, build and install Leptonica from source.
# Pinned to 1.78.0: newer versions crash the tesseract build for now.
# See https://github.com/tesseract-ocr/tesseract/issues/3815
# The steps are chained with && so a failed step stops the build instead of
# letting later steps run in the wrong directory or against a broken tree.
# The clone is skipped when a checkout already exists, so the section can be re-run.
cd ~ &&
  { [ -d leptonica/.git ] || git clone https://github.com/DanBloomberg/leptonica.git; } &&
  cd leptonica/ &&
  git checkout 1.78.0 &&
  ./autogen.sh &&
  ./configure &&
  make &&
  make install
# tesseract
# Clone, build and install Tesseract 5.3.3 from source.
# PKG_CONFIG_PATH points pkg-config at /usr/local/lib/pkgconfig so that
# ./configure can pick up libraries installed under /usr/local.
# The steps are chained with && so a failed step stops the build instead of
# letting later steps run in the wrong directory or against a broken tree.
# The clone is skipped when a checkout already exists, so the section can be re-run.
cd ~ &&
  { [ -d tesseract/.git ] || git clone https://github.com/tesseract-ocr/tesseract.git; } &&
  cd tesseract &&
  git checkout 5.3.3 &&
  export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig &&
  ./autogen.sh &&
  ./configure &&
  make &&
  make install
# Assemble the standalone bundle in ~/tesseract-standalone: the tesseract
# executable, a lib/ directory of shared libraries and a tessdata/ directory
# with the English training data.
# Explicit destination paths are used instead of a chain of cd commands, so a
# single failed cd cannot send the copies to the wrong place.
bundle_dir=~/tesseract-standalone
# -p keeps a re-run from failing when the directories already exist.
mkdir -p "$bundle_dir/lib" "$bundle_dir/tessdata"

# copy files
cp /usr/local/bin/tesseract "$bundle_dir/"
cp \
  /usr/local/lib/libtesseract.so.5 \
  /lib64/libpng15.so.15 \
  /lib64/libtiff.so.5 \
  /lib64/libgomp.so.1 \
  /lib64/libjbig.so.2.0 \
  /usr/local/lib/liblept.so.5 \
  /usr/lib64/libjpeg.so.62 \
  /usr/lib64/libwebp.so.4 \
  /usr/lib64/libstdc++.so.6 \
  "$bundle_dir/lib/"

# copy training data
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as eng.traineddata.
curl --fail -L https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main/eng.traineddata \
  --output "$bundle_dir/tessdata/eng.traineddata"
# archive
cd ~
# trim unneeded ~ 15 MB
# The strip targets are listed explicitly: "**" only recurses when bash's
# globstar option is enabled, so a "**/*" pattern would skip the top-level
# tesseract executable and hand the training data (not an object file) to strip.
strip tesseract-standalone/tesseract tesseract-standalone/lib/*
tar -zcvf tesseract.tar.gz tesseract-standalone
# download from docker to local machine
# Run this OUTSIDE of the docker container, on the host machine.
# d1c431e8c85e is an example docker container id; look yours up by running
# "docker ps" and set CONTAINER_ID to it before running the command below.
# When CONTAINER_ID is unset, the example id is used.
container_id="${CONTAINER_ID:-d1c431e8c85e}"
docker cp "${container_id}:/root/tesseract.tar.gz" tt.tar.gz