blob: 8454b669d31e314e81177752ad321ba89e8a48dc (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
#!/bin/sh
# Training (language) set handed to `tesseract -l` in the OCR stage.
TRAINING=rescribealphav5

# Help text printed when the script is not called with exactly one argument.
usage="Usage: $0 dir
Runs preprocessing and OCR over a directory of images, saving a
report on the quality of each page.
The preprocessing is done several different ways, and the best
quality option is selected for each page, as determined by the
OCR engine confidence level. The best quality OCR is then saved
into the dir/best directory."

# Exactly one argument (the image directory) is required.
if [ "$#" -ne 1 ]; then
  echo "$usage" && exit 1
fi
# External tools used by the stages below; stop before doing any work if one
# of them cannot be found on PATH.
prereqs="bookgraph pgconf preprocmulti tesseract"
for tool in $prereqs; do
  command -v $tool > /dev/null && continue
  echo "Error: no $tool tool found"
  exit 1
done

# The single argument must name an existing directory.
[ -d "$1" ] || {
  echo "Error: $1 does not exist"
  exit 1
}
# Stage "Preprocess": run preprocmulti on every .jpg directly inside "$1",
# passing "<dir>/<name>" (the image path without .jpg) as second argument.
# A page is skipped when "<name>_bin0.2.png" already exists, which lets an
# interrupted run be resumed without redoing finished pages.
echo "Preprocess"
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")
  if test -f "$d/${b}_bin0.2.png"; then
    echo "Skipping preprocessing for $b; .${b}_bin0.2.png already exists"
    continue
  fi
  preprocmulti "$f" "$d/$b" || exit 1
# The loop is the last element of a pipeline, which dash and bash run in a
# subshell: the `exit 1` above only ends that subshell. Checking the pipeline
# status here is what actually aborts the script on a preprocmulti failure.
done || exit 1
# Stage "OCR": run tesseract (training set $TRAINING, hocr output) on every
# "*_bin?.?.png" file directly inside "$1". The output base is the image path
# without .png; an image is skipped when "<name>.hocr" already exists.
echo "OCR"
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
  b=$(basename "$f" .png)
  d=$(dirname "$f")
  if test -f "$d/$b.hocr"; then
    echo "Skipping tesseract for $b; .hocr already exists"
    continue
  fi
  tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
# The loop is the last element of a pipeline, which dash and bash run in a
# subshell: the `exit 1` above only ends that subshell. Checking the pipeline
# status here is what actually aborts the script on a tesseract failure.
done || exit 1
# Stage "Confidence": for every .hocr file directly inside "$1", store the
# standard output of pgconf in "<name>.conf" next to it. Files that already
# have a .conf are skipped.
echo "Confidence"
# `IFS= read -r` keeps each path exactly as find printed it (no backslash
# processing, no trimming of leading/trailing whitespace).
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
  b=$(basename "$f" .hocr)
  d=$(dirname "$f")
  if test -f "$d/$b.conf"; then
    echo "Skipping pgconf for $b; ${b}.conf already exists"
    continue
  fi
  # Deliberately best effort: a pgconf failure is not fatal. Its error output
  # is discarded and the partial .conf is removed, so the file is not mistaken
  # for a finished result by the skip test above on a later run.
  pgconf "$f" > "$d/$b.conf" 2>/dev/null || rm -f "$d/$b.conf"
done
# Stage "Best": for every page (.jpg directly inside "$1"), look at the .conf
# files of its preprocessed variants, pick the one holding the highest value,
# and copy the matching .hocr into "$1/best/". Pages with no .conf whose
# value is greater than 0 are left out.
echo "Best"
mkdir -p "$1/best" || exit 1
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")
  best=0
  bestfn=""
  # Variants are named "<page>_bin?.?" by the earlier stages, so anchor the
  # glob on "_bin"; a bare "<page>*" would also pick up other pages that
  # merely share the prefix (e.g. page "1" matching "10_bin0.2.conf").
  for c in "$d/${b}_bin"*.conf; do
    # An unmatched glob stays as the literal pattern; skip it.
    test -f "$c" || continue
    conf=$(cat "$c")
    # Ignore empty or non-integer contents instead of letting `test` fail
    # with an error message. NOTE(review): assumes pgconf writes a plain
    # non-negative integer - confirm.
    case "$conf" in
      '' | *[!0-9]*) continue ;;
    esac
    if test "$conf" -gt "$best"; then
      best=$conf
      bestfn=$c
    fi
  done
  test -n "$bestfn" || continue
  # Same path with the .conf suffix swapped for .hocr.
  cp "${bestfn%.conf}.hocr" "$1/best/" || exit 1
# The loop is the last element of a pipeline, which dash and bash run in a
# subshell: the `exit 1` above only ends that subshell. Checking the pipeline
# status here is what actually aborts the script when the copy fails.
done || exit 1
# Stage "Worst": move every selected page in "$1/best" whose confidence is
# below 40 into "$1/best/worst". The confidence is read from the .conf file
# of the same base name in "$1".
echo "Worst"
mkdir -p "$1/best/worst" || exit 1
find "$1/best" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
  b=$(basename "$f" .hocr)
  d=$(dirname "$f")
  conffn="$1/$b.conf"
  # No confidence recorded for this page: nothing to compare, leave it alone.
  test -f "$conffn" || continue
  # Compare the file's contents, not its path.
  conf=$(cat "$conffn")
  # Skip empty or non-integer contents rather than letting `test` error out.
  case "$conf" in
    '' | *[!0-9]*) continue ;;
  esac
  if test "$conf" -lt 40; then
    # Move every file in best/ that belongs to this page ("<base>.*").
    mv "$d/$b".* "$1/best/worst" || exit 1
  fi
# The loop is the last element of a pipeline, which dash and bash run in a
# subshell: the `exit 1` above only ends that subshell. Checking the pipeline
# status here is what actually aborts the script when the move fails.
done || exit 1
# Stage "Graphing": announce the stage, then run bookgraph on the directory.
printf '%s\n' "Graphing"
bookgraph "$1"
|