blob: bac8121c75c0562e01859babac1a33f6dd27c2fc (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
#!/bin/sh
# Driver script: preprocesses the page images in a directory, OCRs each
# preprocessed variant, scores the results and copies the best-scoring
# hOCR per page into dir/best. This first section only validates the
# invocation and the environment.
usage="Usage: $0 dir
Runs preprocessing and OCR over a directory of images, saving a
report on the quality of each page.
The preprocessing is done several different ways, and the best
quality option is selected for each page, as determined by the
OCR engine confidence level. The best quality OCR is then saved
into the dir/best directory."

# Language/training name passed to `tesseract -l` in the OCR stage.
TRAINING=rescribealphav5

# Exactly one argument (the image directory) is accepted.
if [ $# -ne 1 ]; then
  echo "$usage" && exit 1
fi

# Stop early unless every external tool the pipeline calls is on PATH.
prereqs="bookgraph pgconf preprocmulti tesseract"
for tool in $prereqs; do
  command -v "$tool" > /dev/null && continue
  echo "Error: no $tool tool found"
  exit 1
done

# The argument has to name an existing directory.
[ -d "$1" ] || {
  echo "Error: $1 does not exist"
  exit 1
}
# Preprocess stage: run preprocmulti on every *.jpg directly inside "$1",
# using the image path minus its .jpg suffix as the output base name.
echo "Preprocess"
# IFS= and -r keep each path exactly as find printed it (no backslash
# processing, no trimming of leading/trailing whitespace).
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")
  preprocmulti "$f" "$d/$b" || exit 1
# The loop is the tail of a pipeline and typically runs in a subshell, so
# the `exit 1` above only leaves that subshell; re-check the pipeline
# status here so a preprocmulti failure really aborts the script.
done || exit 1
# OCR stage: run tesseract on every preprocessed variant (*_bin?.?.png)
# directly inside "$1", writing hOCR output next to the image under the
# same base name.
echo "OCR"
# IFS= and -r keep each path exactly as find printed it.
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
  b=$(basename "$f" .png)
  d=$(dirname "$f")
  # TODO: ensure to run the correct command here
  tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
# The loop is the tail of a pipeline and typically runs in a subshell, so
# the `exit 1` above only leaves that subshell; propagate the failure so
# the script stops instead of continuing with missing OCR output.
done || exit 1
# Confidence stage: for every *.hocr directly inside "$1", store pgconf's
# output in a sibling file with the .hocr suffix replaced by .conf.
echo "Confidence"
# IFS= and -r keep each path exactly as find printed it (no backslash
# processing, no trimming of leading/trailing whitespace).
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
  b=$(basename "$f" .hocr)
  d=$(dirname "$f")
  # Deliberately best effort: pgconf's stderr is discarded and, when it
  # fails, the partial .conf is removed so the file is simply left
  # without a score rather than aborting the whole run.
  pgconf "$f" > "$d/$b.conf" 2>/dev/null || rm -f "$d/$b.conf"
done
# Best stage: for every page image (*.jpg) pick the .conf file with the
# highest integer score and copy the matching .hocr into "$1/best".
# Pages with no score above 0 are skipped.
echo "Best"
mkdir -p "$1/best" || exit 1
# IFS= and -r keep each path exactly as find printed it.
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")
  best=0
  bestfn=""
  # NOTE(review): this is a prefix match, so page "1" also picks up the
  # scores of page "10"; confirm the page naming scheme rules that out.
  for c in "$d/$b"*.conf; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [ -f "$c" ] || continue
    conf=$(cat "$c")
    # Ignore empty or non-numeric scores instead of handing them to the
    # numeric comparison, which would fail with a syntax error.
    case $conf in
      '' | *[!0-9]*) continue ;;
    esac
    if [ "$conf" -gt "$best" ]; then
      best=$conf
      bestfn=$c
    fi
  done
  # Written as an `if` (not `... || continue`) so the loop body ends with
  # status 0 whenever nothing went wrong.
  if [ -n "$bestfn" ]; then
    # Strip the literal ".conf" suffix to get back to the hOCR file.
    cp "${bestfn%.conf}.hocr" "$1/best/" || exit 1
  fi
# The loop is the tail of a pipeline and typically runs in a subshell, so
# the `exit 1` above only leaves that subshell; propagate the failure.
done || exit 1
# Final stage: announce it, then hand the directory to bookgraph. This is
# the last command, so its exit status becomes the script's exit status.
printf '%s\n' "Graphing"
bookgraph "$1"
|