diff options
| -rwxr-xr-x | fullocrdir.sh | 74 | 
1 files changed, 74 insertions, 0 deletions
#!/bin/sh
# fullocrdir.sh - preprocess and OCR a directory of page images, then copy
# the highest-confidence OCR result for each page into dir/best.
#
# Stages (each looks only at files directly inside dir, not subdirectories):
#   1. Preprocess: run preprocmulti on every *.jpg.
#   2. OCR:        run tesseract (hOCR output) on every *_bin?.?.png.
#   3. Confidence: run pgconf on every *.hocr, saving its output as *.conf.
#   4. Best:       for each *.jpg, copy the .hocr whose .conf value is the
#                  highest (and greater than 0) into dir/best.
#
# Arguments: $1 - directory containing the .jpg page images.
# Requires:  pgconf, preprocmulti and tesseract on PATH.
# Exit code: 0 on success; 1 on usage error, missing tool, missing
#            directory, or failure of preprocmulti, tesseract, mkdir or cp.
#
# File names are read line by line from find, so names containing newlines
# are not supported.
usage="Usage: $0 dir

Runs preprocessing and OCR over a directory of images, saving a
report on the quality of each page.

The preprocessing is done several different ways, and the best
quality option is selected for each page, as determined by the
OCR engine confidence level. The best quality OCR is then saved
into the dir/best directory."

# Name passed to tesseract's -l option.
TRAINING=rescribealphav5

if [ $# -ne 1 ]; then
	echo "$usage" >&2
	exit 1
fi

prereqs="pgconf preprocmulti tesseract"
for i in $prereqs; do
	if ! command -v "$i" > /dev/null; then
		echo "Error: no $i tool found" >&2
		exit 1
	fi
done

if [ ! -d "$1" ]; then
	echo "Error: $1 does not exist" >&2
	exit 1
fi

# Each "find | while" loop below runs in a subshell in most shells, so an
# "exit 1" inside the loop body only leaves that subshell. The "|| exit 1"
# after "done" propagates the failure so the whole script really stops.

echo "Preprocess"
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
	b=$(basename "$f" .jpg)
	d=$(dirname "$f")

	preprocmulti "$f" "$d/$b" || exit 1
done || exit 1

echo "OCR"
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
	b=$(basename "$f" .png)
	d=$(dirname "$f")

	# TODO: ensure to run the correct command here
	tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
done || exit 1

echo "Confidence"
# Best effort: a pgconf failure only removes that page's .conf file, so the
# "Best" stage ignores it; it does not abort the run.
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
	b=$(basename "$f" .hocr)
	d=$(dirname "$f")

	pgconf "$f" > "$d/$b.conf" || rm -f "$d/$b.conf"
done

echo "Best"
mkdir -p "$1/best" || exit 1
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
	b=$(basename "$f" .jpg)
	d=$(dirname "$f")

	best=0
	bestfn=""
	# NOTE(review): this glob also matches .conf files of any other page
	# whose name merely starts with $b (e.g. page1 vs page10) - confirm
	# preprocmulti's output naming before narrowing the pattern.
	for c in "$d/$b"*.conf; do
		# An unmatched glob is left as the literal pattern; skip it.
		[ -f "$c" ] || continue
		conf=$(cat "$c")
		# Only plain non-negative integers can be compared with -gt; this
		# also skips empty files and the "No lines found" output.
		case "$conf" in
			'' | *[!0-9]*) continue ;;
		esac
		if [ "$conf" -gt "$best" ]; then
			best=$conf
			bestfn="$c"
		fi
	done
	[ -n "$bestfn" ] || continue

	# The .hocr file sits next to its .conf file with the same base name.
	hocrfn="${bestfn%.conf}.hocr"
	cp "$hocrfn" "$1/best/" || exit 1
done || exit 1
