diff options
Diffstat (limited to 'format-for-hocr-pdf.sh')
-rw-r--r-- | format-for-hocr-pdf.sh | 39 |
1 files changed, 39 insertions, 0 deletions
#!/bin/sh
# format-for-hocr-pdf.sh - prepare a directory for hocr-pdf (hocr-tools).
#
# For every INDIR/NAME.binarized.png that has a matching
# INDIR/NAME_<training>.hocr file, this writes:
#   OUTDIR/NAME.jpg   (converted from the png with GraphicsMagick)
#   OUTDIR/NAME.hocr  (copied from the matching hocr file)
# Images with no matching hocr file are skipped with a warning.
#
# Exit status: 0 on success, 1 on bad usage, missing INDIR, or any
# failed mkdir / convert / copy.
#
# Limitation: file names containing newlines are not supported, as the
# list of images is passed line by line from find to the loop.
usage="Usage: $0 indir outdir

Creates a new directory with image and hocr files appropriately named
and formatted for use with the hocr-pdf tool from hocr-tools."

if test $# -ne 2; then
  echo "$usage" >&2
  exit 1
fi

# All possible training files to check for. When more than one exists
# for an image, the one listed last here is used.
TRAININGS="rescribealphav4 rescribealphav5"

if ! test -d "$1"; then
  echo "Error: $1 does not exist" >&2
  exit 1
fi
mkdir -p "$2" || exit 1

echo "Copying hocrs and converting pngs from $1 to $2"
# The loop is the right-hand side of a pipe, so it runs in a subshell:
# an 'exit 1' inside it only ends the loop. The '|| exit 1' after 'done'
# turns that into a failure of the whole script.
# 'IFS= read -r' keeps backslashes and surrounding blanks in file names.
find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while IFS= read -r i; do
  b=$(basename "$i" .binarized.png)

  hocr=""
  for t in $TRAININGS; do
    # Strip the suffix from the end of the path only, so that a
    # lookalike string earlier in the path is never rewritten.
    n="${i%.binarized.png}_$t.hocr"
    test -f "$n" && hocr="$n"
  done
  if test -z "$hocr"; then
    echo "Warning: no corresponding hocr file found for $i, skipping." >&2
    continue
  fi

  gm convert "$i" "$2/$b.jpg" || exit 1
  cp "$hocr" "$2/$b.hocr" || exit 1
done || exit 1

nhocr=$(find "$2" -type f -name '*hocr' | wc -l)
njpg=$(find "$2" -type f -name '*jpg' | wc -l)
echo "Done. There are $nhocr hocr files and $njpg jpg files in $2"