blob: 0059f81b5af32ce23dfd22b04e14d7522df4b96d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
#!/bin/sh
# Prepares a directory of page images and hOCR files for hocr-pdf.
#
# For every "*.binarized.png" found directly inside indir, looks next to it
# for an hOCR file named "<base>_<training>.hocr", trying each name listed
# in TRAININGS (if several exist, the last one listed wins). The png is
# converted to "<outdir>/<base>.jpg" with GraphicsMagick and the hOCR file
# is copied to "<outdir>/<base>.hocr". Images with no hOCR are skipped
# with a warning.
#
# Arguments: $1 - input directory, $2 - output directory (created if missing)
# Exit status: 0 on success; 1 on bad usage, missing input directory, or
# when any conversion/copy fails.
usage="Usage: $0 indir outdir
Creates a new directory with image and hocr files appropriately named
and formatted for use with the hocr-pdf tool from hocr-tools."

if test $# -ne 2; then
	echo "$usage" >&2
	exit 1
fi

# All possible training files to check for
TRAININGS="rescribealphav4 rescribealphav5"

# Set image compression and dpi
QUALITY=0
DPI=600

if ! test -d "$1"; then
	echo "Error: $1 does not exist" >&2
	exit 1
fi

mkdir -p "$2" || exit 1

echo "Copying hocrs and converting pngs from $1 to $2"

# File names are read one per line, so names containing newlines are not
# supported. IFS= and -r keep leading/trailing blanks and backslashes intact.
find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while IFS= read -r i; do
	d=$(dirname "$i")
	b=$(basename "$i" .binarized.png)

	# Build the candidate hOCR path from the directory and stripped base name
	# rather than pattern-substituting on the whole path, so that directory
	# names resembling ".binarized.png" cannot be rewritten by mistake.
	hocr=""
	for t in $TRAININGS; do
		n="$d/${b}_$t.hocr"
		test -f "$n" && hocr="$n"
	done

	if test -z "$hocr"; then
		echo "Warning: no corresponding hocr file found for $i, skipping." >&2
		continue
	fi

	gm convert -quality "$QUALITY" -density "${DPI}x${DPI}" "$i" "$2/$b.jpg" || exit 1
	cp "$hocr" "$2/$b.hocr" || exit 1
# The loop runs in a subshell (it is part of a pipeline), so an "exit 1"
# inside it only ends the loop; propagate the failure to the script itself.
done || exit 1

nhocr=$(find "$2" -type f -name '*hocr' | wc -l)
njpg=$(find "$2" -type f -name '*jpg' | wc -l)
echo "Done. There are $nhocr hocr files and $njpg jpg files in $2"
|