summary refs log tree commit diff
path: root/fullocrdir.sh
blob: 8454b669d31e314e81177752ad321ba89e8a48dc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/bin/sh
# fullocrdir.sh: preprocess and OCR a directory of page images, then pick
# the best result per page (see the usage text below for details).
usage="Usage: $0 dir

Runs preprocessing and OCR over a directory of images, saving a
report on the quality of each page.

The preprocessing is done several different ways, and the best
quality option is selected for each page, as determined by the
OCR engine confidence level. The best quality OCR is then saved
into the dir/best directory."

# Value passed to tesseract's -l option in the OCR stage.
TRAINING=rescribealphav5

# Exactly one argument (the directory) is required. A wrong invocation is an
# error, so the usage text goes to stderr rather than stdout.
if test $# -ne 1; then
	echo "$usage" >&2
	exit 1
fi

# Make sure every external tool used by the later stages can be found
# before doing any work.
prereqs="bookgraph pgconf preprocmulti tesseract"
# $prereqs is intentionally unquoted so it splits into one word per tool.
for i in $prereqs; do
	if ! command -v "$i" > /dev/null; then
		# Diagnostics belong on stderr so they are not mixed into stdout.
		echo "Error: no $i tool found" >&2
		exit 1
	fi
done

# The argument must name an existing directory; report the problem on
# stderr and stop otherwise.
if ! test -d "$1"; then
	echo "Error: $1 does not exist" >&2
	exit 1
fi

# Preprocess stage: run preprocmulti on every .jpg directly inside the
# directory. A page whose ${b}_bin0.2.png already exists is skipped.
echo "Preprocess"
# IFS= and -r make read return each path exactly as find printed it
# (no backslash processing, no trimming of surrounding blanks).
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
	b=$(basename "$f" .jpg)
	d=$(dirname "$f")

	if test -f "$d/${b}_bin0.2.png"; then
		echo "Skipping preprocessing for $b; ${b}_bin0.2.png already exists"
		continue
	fi

	preprocmulti "$f" "$d/$b" || exit 1
	# The loop is the last stage of a pipeline and may run in a subshell,
	# where the `exit 1` above only ends the loop. The status check after
	# `done` is what actually aborts the script.
done || exit 1

# OCR stage: run tesseract (hocr output) on every *_bin?.?.png directly
# inside the directory. An image whose .hocr already exists is skipped.
echo "OCR"
# IFS= and -r make read return each path exactly as find printed it.
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
	b=$(basename "$f" .png)
	d=$(dirname "$f")

	if test -f "$d/$b.hocr"; then
		echo "Skipping tesseract for $b; .hocr already exists"
		continue
	fi

	tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
	# The loop is the last stage of a pipeline and may run in a subshell,
	# where the `exit 1` above only ends the loop. The status check after
	# `done` is what actually aborts the script.
done || exit 1

# Confidence stage: run pgconf on every .hocr directly inside the directory
# and store its output next to it as <name>.conf. A file whose .conf already
# exists is skipped.
echo "Confidence"
# IFS= and -r make read return each path exactly as find printed it
# (no backslash processing, no trimming of surrounding blanks).
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
	b=$(basename "$f" .hocr)
	d=$(dirname "$f")

	if test -f "$d/${b}.conf"; then
		echo "Skipping pgconf for $b; ${b}.conf already exists"
		continue
	fi

	# Best effort: if pgconf fails, drop the .conf it was writing and carry
	# on with the next file instead of aborting the whole run.
	pgconf "$f" > "$d/$b.conf" 2>/dev/null || rm -f "$d/$b.conf"
done

# Best stage: for every .jpg page, compare the numbers stored in the .conf
# files whose names start with the page's base name, and copy the .hocr
# belonging to the highest one into dir/best. Pages with no usable .conf
# (or none above 0) are skipped.
echo "Best"
mkdir -p "$1/best" || exit 1
# IFS= and -r make read return each path exactly as find printed it.
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
	b=$(basename "$f" .jpg)
	d=$(dirname "$f")

	best=0
	bestfn=""
	for c in "$d/$b"*conf; do
		# An unmatched glob stays as the literal pattern; skip it.
		test -f "$c" || continue
		conf=$(cat "$c")
		# Empty or non-integer contents would make the numeric
		# comparison below fail with an error, so ignore such files.
		case $conf in
			'' | *[!0-9]*) continue ;;
		esac
		if test "$conf" -gt "$best"; then
			best=$conf
			bestfn=$c
		fi
	done
	if test -z "$bestfn"; then
		continue
	fi

	# Swap the literal .conf suffix for .hocr (a sed pattern of `.conf$`
	# would treat the dot as "any character").
	hocrfn="${bestfn%.conf}.hocr"
	cp "$hocrfn" "$1/best/" || exit 1
	# The loop is the last stage of a pipeline and may run in a subshell,
	# where the `exit 1` above only ends the loop. The status check after
	# `done` is what actually aborts the script.
done || exit 1

# Worst stage: for every .hocr in dir/best, read the number stored in the
# matching dir/<name>.conf and, when it is below 40, move that page's files
# from dir/best into dir/best/worst.
echo "Worst"
mkdir -p "$1/best/worst" || exit 1
# IFS= and -r make read return each path exactly as find printed it.
find "$1/best" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
	b=$(basename "$f" .hocr)
	d=$(dirname "$f")

	conffn="$1/$b.conf"
	if test ! -f "$conffn"; then
		continue
	fi
	# Compare the number stored in the file, not the file's path.
	conf=$(cat "$conffn")
	# Empty or non-integer contents cannot be compared numerically.
	case $conf in
		'' | *[!0-9]*) continue ;;
	esac

	if test "$conf" -lt 40; then
		# Move every file of this page ($d/$b.*) out of dir/best.
		mv "$d/$b".* "$1/best/worst" || exit 1
	fi
	# The loop is the last stage of a pipeline and may run in a subshell,
	# where the `exit 1` above only ends the loop. The status check after
	# `done` is what actually aborts the script.
done || exit 1

# Graphing stage: announce the step, then hand the directory to bookgraph.
# As the last command, bookgraph's exit status becomes the script's.
printf '%s\n' "Graphing"
bookgraph "$1"