summaryrefslogtreecommitdiff
path: root/format-for-hocr-pdf.sh
blob: 0059f81b5af32ce23dfd22b04e14d7522df4b96d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/sh
# Help text shown when the script is not called with exactly two arguments.
usage="Usage: $0 indir outdir

Creates a new directory with image and hocr files appropriately named
and formatted for use with the hocr-pdf tool from hocr-tools."

# Exactly two arguments (indir and outdir) are required. The usage text is a
# diagnostic, so it goes to stderr, and the exit no longer depends on the
# print succeeding.
if test $# -ne 2; then
	printf '%s\n' "$usage" >&2
	exit 1
fi

# Space-separated training names; for every image an hocr file named after
# one of these is looked for
TRAININGS='rescribealphav4 rescribealphav5'

# Values handed to gm convert as -quality and -density (DPI in both directions)
QUALITY='0'
DPI='600'

# The input ($1) must be an existing directory. The message covers both the
# "missing" and the "exists but is not a directory" case and is sent to stderr.
if ! test -d "$1"; then
	printf '%s\n' "Error: $1 does not exist or is not a directory" >&2
	exit 1
fi
# Create the output directory ($2), including missing parents; give up if that fails.
mkdir -p "$2" || exit 1

# hocr_for_image IMAGE
#
# IMAGE is a path ending in .binarized.png. For each name T in $TRAININGS the
# sibling file <IMAGE without .binarized.png>_T.hocr is checked; the path of
# the last one that exists is printed. Prints nothing and returns non-zero
# when none exists.
# Only the trailing suffix is stripped, so other parts of the path that happen
# to look like "binarized.png" are left alone.
# (POSIX sh has no "local"; the underscore-prefixed names are helper-private.)
hocr_for_image() {
	_stem=${1%.binarized.png}
	_found=""
	for _t in $TRAININGS; do
		test -f "${_stem}_$_t.hocr" && _found="${_stem}_$_t.hocr"
	done
	test -n "$_found" && printf '%s\n' "$_found"
}

echo "Copying hocrs and converting pngs from $1 to $2"
# IFS= and -r keep leading/trailing blanks and backslashes in file names intact.
find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while IFS= read -r i; do
	# Base name of the image without directory and without .binarized.png
	b=${i##*/}
	b=${b%.binarized.png}

	hocr=$(hocr_for_image "$i")
	if test -z "$hocr"; then
		echo "Warning: no corresponding hocr file found for $i, skipping." >&2
		continue
	fi

	gm convert -quality "$QUALITY" -density "${DPI}x${DPI}" "$i" "$2/$b.jpg" || exit 1
	cp "$hocr" "$2/$b.hocr" || exit 1
# As the tail of a pipeline the loop usually runs in a subshell, so the
# "exit 1" above only ends that subshell; propagate the failure to the script.
done || exit 1

# count_files DIR PATTERN
#
# Prints how many regular files below DIR have a name matching the find
# -name PATTERN. Whitespace is stripped because some wc implementations pad
# the number with blanks.
count_files() {
	find "$1" -type f -name "$2" | wc -l | tr -d '[:space:]'
}

# Report what ended up in the output directory. The patterns include the dot
# so that only real .hocr / .jpg extensions are counted.
nhocr=$(count_files "$2" '*.hocr')
njpg=$(count_files "$2" '*.jpg')
echo "Done. There are $nhocr hocr files and $njpg jpg files in $2"