summaryrefslogtreecommitdiff
path: root/fullocrdir.sh
blob: bac8121c75c0562e01859babac1a33f6dd27c2fc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/bin/sh
# Preprocess and OCR a directory of page images, then keep the
# highest-confidence OCR result for each page.
#
# Arguments:
#   $1 - directory holding the page images
# Exit status:
#   1 if the script is not called with exactly one argument.
usage="Usage: $0 dir

Runs preprocessing and OCR over a directory of images, saving a
report on the quality of each page.

The preprocessing is done several different ways, and the best
quality option is selected for each page, as determined by the
OCR engine confidence level. The best quality OCR is then saved
into the dir/best directory."

# Value handed to tesseract's -l option by the OCR stage.
TRAINING=rescribealphav5

# Exactly one argument is required. An explicit if is used rather than an
# "a && b && c" chain so that the exit still happens when printing the
# usage text fails; the text goes to stderr because this is an error path.
if test $# -ne 1; then
	printf '%s\n' "$usage" >&2
	exit 1
fi

# Check up front that every external tool the later stages call can be
# found, so a missing tool stops the run before any work is done.
prereqs="bookgraph pgconf preprocmulti tesseract"
# $prereqs is left unquoted on purpose: it is split into one word per tool.
for i in $prereqs; do
	if ! command -v "$i" > /dev/null ; then
		# Diagnostics go to stderr so they are not mixed into stdout.
		printf 'Error: no %s tool found\n' "$i" >&2
		exit 1
	fi
done

# The single argument must name a directory. test -d is also false for a
# path that exists but is not a directory, so the message covers both cases.
if ! test -d "$1"; then
	printf 'Error: %s does not exist or is not a directory\n' "$1" >&2
	exit 1
fi

# Preprocess stage: run preprocmulti on every .jpg file directly inside the
# directory. Its second argument is the image path with the .jpg suffix
# removed.
echo "Preprocess"
# IFS= and -r keep leading/trailing blanks and backslashes in file names.
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
	b=$(basename "$f" .jpg)
	d=$(dirname "$f")

	preprocmulti "$f" "$d/$b" || exit 1
# The loop is a pipeline stage, which most sh implementations run in a
# subshell, so the exit above only ends that subshell. Re-check the
# pipeline status here to actually stop the script on failure.
done || exit 1

# OCR stage: run tesseract on every *_bin?.?.png file directly inside the
# directory, requesting hocr output with an output base equal to the image
# path minus its .png suffix.
echo "OCR"
# IFS= and -r keep leading/trailing blanks and backslashes in file names.
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
	b=$(basename "$f" .png)
	d=$(dirname "$f")

	# TODO: ensure to run the correct command here
	tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
# The loop is a pipeline stage, which most sh implementations run in a
# subshell, so the exit above only ends that subshell. Re-check the
# pipeline status here to actually stop the script on failure.
done || exit 1

# Confidence stage: for every .hocr file directly inside the directory,
# store the standard output of pgconf in a .conf file of the same base name.
echo "Confidence"
# IFS= and -r keep leading/trailing blanks and backslashes in file names.
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
	b=$(basename "$f" .hocr)
	d=$(dirname "$f")

	# Best effort: pgconf's error output is discarded, and when it fails
	# the (possibly partial) .conf file is removed instead of aborting.
	pgconf "$f" > "$d/$b.conf" 2>/dev/null || rm -f "$d/$b.conf"
done

# Best stage: for each .jpg page, compare the integer stored in each of the
# page's .conf files and copy the .hocr file belonging to the highest one
# into "$1/best". A page whose values are all 0 (or that has no usable
# .conf file) gets nothing copied.
echo "Best"
mkdir -p "$1/best" || exit 1
# IFS= and -r keep leading/trailing blanks and backslashes in file names.
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
	b=$(basename "$f" .jpg)
	d=$(dirname "$f")

	best=0
	bestfn=""
	# Only this page's variants: the OCR stage works on *_bin?.?.png files,
	# so the matching confidence files are <page>_bin?.?.conf. A bare
	# "$b"*conf glob would also pick up other pages sharing the prefix
	# (page "1" would match "10_bin0.1.conf").
	for c in "$d/$b"_bin?.?.conf; do
		# Also skips the literal pattern left behind when nothing matches.
		test -f "$c" || continue

		# Take the first line with surrounding blanks stripped. read
		# reports failure when the final newline is missing even though
		# it still fills conf, hence the "|| :".
		conf=""
		read -r conf < "$c" || :

		# test -gt needs an integer; skip anything else instead of
		# letting test fail with a syntax error.
		case $conf in
		'' | *[!0-9]*)
			printf 'Warning: ignoring non-numeric confidence in %s\n' "$c" >&2
			continue
			;;
		esac

		if test "$conf" -gt "$best"; then
			best=$conf
			bestfn=$c
		fi
	done

	if test -n "$bestfn"; then
		# Swap the literal .conf suffix for .hocr.
		cp "${bestfn%.conf}.hocr" "$1/best/" || exit 1
	fi
# The loop is a pipeline stage, which most sh implementations run in a
# subshell, so the exit above only ends that subshell. Re-check the
# pipeline status here to actually stop the script on failure.
done || exit 1

# Last stage: announce it, then hand the directory to bookgraph. As the
# final command, bookgraph's exit status becomes the script's exit status.
printf '%s\n' "Graphing"
bookgraph "$1"