diff options
-rwxr-xr-x | fullocrdir.sh | 74 |
1 files changed, 74 insertions, 0 deletions
#!/bin/sh
# fullocrdir.sh - preprocess and OCR a directory of page images, then keep
# the highest-confidence OCR result for each page in dir/best.
#
# Phases (one loop each, all operating on the top level of dir only):
#   1. Preprocess: run preprocmulti on every *.jpg
#   2. OCR:        run tesseract on every *_bin?.?.png, writing .hocr output
#   3. Confidence: run pgconf on every *.hocr, saving its output as .conf
#   4. Best:       for each *.jpg, copy the .hocr whose .conf holds the
#                  highest integer value into dir/best
#
# Exits 1 on bad usage, a missing prerequisite tool, a missing directory,
# or a failure in preprocmulti, tesseract, mkdir or cp.
usage="Usage: $0 dir

Runs preprocessing and OCR over a directory of images, saving a
report on the quality of each page.

The preprocessing is done several different ways, and the best
quality option is selected for each page, as determined by the
OCR engine confidence level. The best quality OCR is then saved
into the dir/best directory."

# Name of the tesseract language/training set passed to -l.
TRAINING=rescribealphav5

if test $# -ne 1; then
  echo "$usage" >&2
  exit 1
fi

# Fail early if any external tool used below is not on PATH.
prereqs="pgconf preprocmulti tesseract"
for i in $prereqs; do
  if ! command -v "$i" > /dev/null; then
    echo "Error: no $i tool found" >&2
    exit 1
  fi
done

if ! test -d "$1"; then
  echo "Error: $1 does not exist" >&2
  exit 1
fi

# NOTE: each "find | while" loop body runs in a subshell, so an "exit 1"
# inside it only terminates the pipeline. The "|| exit 1" after "done"
# propagates that failure to the script itself.

echo "Preprocess"
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")

  # Second argument is the output base name (dir/page without extension).
  preprocmulti "$f" "$d/$b" || exit 1
done || exit 1

echo "OCR"
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
  b=$(basename "$f" .png)
  d=$(dirname "$f")

  # TODO: ensure to run the correct command here
  tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
done || exit 1

echo "Confidence"
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
  b=$(basename "$f" .hocr)
  d=$(dirname "$f")

  # Best effort: if pgconf fails, drop the partial .conf so the page
  # variant is simply ignored when picking the best result.
  pgconf "$f" > "$d/$b.conf" || rm -f "$d/$b.conf"
done

echo "Best"
mkdir -p "$1/best" || exit 1
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")

  best=0
  bestfn=""
  # NOTE(review): this is a prefix match, so page "1" also matches
  # "10_...conf"; tighten once the preprocmulti output naming is confirmed.
  for c in "$d/$b"*conf; do
    # An unmatched glob stays literal; skip it instead of cat-ing it.
    test -f "$c" || continue
    conf=$(cat "$c")
    # Only plain non-negative integers can be compared with -gt; this
    # also skips empty files and the "No lines found" marker.
    case "$conf" in
      '' | *[!0-9]*) continue ;;
    esac
    if test "$conf" -gt "$best"; then
      best=$conf
      bestfn="$c"
    fi
  done
  # No usable confidence for this page: nothing to copy.
  if test -z "$bestfn"; then
    continue
  fi

  # Swap the .conf suffix for .hocr to find the matching OCR output.
  hocrfn="${bestfn%.conf}.hocr"
  cp "$hocrfn" "$1/best/" || exit 1
done || exit 1