#!/bin/sh
# Prepare a directory of page images and hocr files for hocr-pdf.
#
# For every INDIR/NAME.binarized.png that has a sibling
# INDIR/NAME_<training>.hocr (for a training listed in TRAININGS), this
# writes OUTDIR/NAME.jpg (converted with "gm convert") and OUTDIR/NAME.hocr
# (a copy of the hocr file). Images without any matching hocr file are
# skipped with a warning.
#
# Exit status: 0 on success; 1 on wrong argument count, a missing input
# directory, or a failed mkdir / gm convert / cp.

usage="Usage: $0 indir outdir

Creates a new directory with image and hocr files appropriately
named and formatted for use with the hocr-pdf tool from
hocr-tools."

if test $# -ne 2; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

# All possible training files to check for
TRAININGS="rescribealphav4 rescribealphav5"

# Set image compression and dpi
QUALITY=0
DPI=600

if ! test -d "$1"; then
  printf '%s\n' "Error: $1 does not exist" >&2
  exit 1
fi

mkdir -p "$2" || exit 1

printf '%s\n' "Copying hocrs and converting pngs from $1 to $2"

# The loop runs in a subshell because it is the tail of a pipeline, so an
# "exit 1" inside it only leaves that subshell. The trailing "|| exit 1"
# after "done" propagates the failure to the script itself.
# IFS= and -r keep leading/trailing blanks and backslashes in file names
# intact (names containing newlines are still not supported).
find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while IFS= read -r img; do
  base=$(basename "$img" .binarized.png)
  # Strip only the trailing suffix; a regex substitution over the whole
  # path could hit an earlier match (e.g. in a directory name).
  stem=${img%.binarized.png}

  # Intentionally unquoted: TRAININGS is a space-separated list. When
  # several trainings have a hocr file, the last one listed wins.
  hocr=""
  for t in $TRAININGS; do
    candidate="${stem}_$t.hocr"
    if test -f "$candidate"; then
      hocr="$candidate"
    fi
  done

  if test -z "$hocr"; then
    printf '%s\n' "Warning: no corresponding hocr file found for $img, skipping." >&2
    continue
  fi

  gm convert -quality "$QUALITY" -density "${DPI}x${DPI}" "$img" "$2/$base.jpg" || exit 1
  cp "$hocr" "$2/$base.hocr" || exit 1
done || exit 1

# Some wc implementations pad the count with blanks; strip them so the
# summary line reads cleanly.
nhocr=$(find "$2" -type f -name '*hocr' | wc -l | tr -d ' ')
njpg=$(find "$2" -type f -name '*jpg' | wc -l | tr -d ' ')
printf '%s\n' "Done. There are $nhocr hocr files and $njpg jpg files in $2"