#!/bin/sh
# OCR pipeline driver for a directory of page images.
#
# Usage: <script> dir
#
# Stages, each run over the top level of dir (no recursion):
#   Preprocess  - preprocmulti on every *.jpg, producing *_bin?.?.png variants
#   OCR         - tesseract on every *_bin?.?.png, producing .hocr files
#   Confidence  - pgconf on every .hocr, writing its output to a .conf file
#   Best        - for each page, copy the .hocr with the highest confidence
#                 into dir/best
#   Worst       - move best results whose confidence is below
#                 WORST_THRESHOLD into dir/best/worst
#   Graphing    - bookgraph on dir
#
# Stages skip work whose output already exists, so the script can be re-run.
# Exit status: 1 on usage error, missing tool, missing dir, or a failed
# preprocess/OCR/copy/move step; otherwise the exit status of bookgraph.

usage="Usage: $0 dir

Runs preprocessing and OCR over a directory of images, saving
a report on the quality of each page. The preprocessing is done
several different ways, and the best quality option is selected
for each page, as determined by the OCR engine confidence level.
The best quality OCR is then saved into the dir/best directory."

# Tesseract language/training set passed to -l.
TRAINING=rescribealphav5
# Best results with a confidence below this value are moved to best/worst.
WORST_THRESHOLD=40

# Succeeds only when $1 is a non-empty string of decimal digits, so it is
# safe to hand to the numeric operators of test(1).
is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# Prints the contents of file $1 with all whitespace removed (a confidence
# value is expected to be a single integer, possibly followed by a newline).
read_conf() {
  tr -d '[:space:]' < "$1"
}

if test $# -ne 1; then
  echo "$usage" >&2
  exit 1
fi

prereqs="bookgraph pgconf preprocmulti tesseract"
# $prereqs is deliberately unquoted: it is a space-separated list of names.
for i in $prereqs; do
  if ! command -v "$i" > /dev/null; then
    echo "Error: no $i tool found" >&2
    exit 1
  fi
done

if ! test -d "$1"; then
  echo "Error: $1 does not exist" >&2
  exit 1
fi

# NOTE: each "find | while" loop body runs in a subshell, so an "exit 1"
# inside it only terminates the loop. The "|| exit 1" after "done" is what
# propagates the failure to the script itself.

echo "Preprocess"
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")
  if test -f "$d/${b}_bin0.2.png"; then
    echo "Skipping preprocessing for $b; .${b}_bin0.2.png already exists"
    continue
  fi
  preprocmulti "$f" "$d/$b" || exit 1
done || exit 1

echo "OCR"
find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
  b=$(basename "$f" .png)
  d=$(dirname "$f")
  if test -f "$d/$b.hocr"; then
    echo "Skipping tesseract for $b; .hocr already exists"
    continue
  fi
  tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
done || exit 1

echo "Confidence"
find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
  b=$(basename "$f" .hocr)
  d=$(dirname "$f")
  if test -f "$d/${b}.conf"; then
    echo "Skipping pgconf for $b; ${b}.conf already exists"
    continue
  fi
  # Best effort: a page pgconf cannot score is left without a .conf file
  # (and is therefore ignored by the later stages) instead of aborting.
  pgconf "$f" > "$d/$b.conf" 2>/dev/null || rm -f "$d/$b.conf"
done

echo "Best"
mkdir -p "$1/best" || exit 1
find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
  b=$(basename "$f" .jpg)
  d=$(dirname "$f")
  best=0
  bestfn=""
  # Match only this page's own variants (same _bin?.? naming the OCR stage
  # selects on); a bare "$b"* prefix would also match e.g. page 10 for page 1.
  for c in "$d/${b}_bin"?.?.conf; do
    # An unmatched glob is left as the literal pattern; skip it.
    test -f "$c" || continue
    conf=$(read_conf "$c")
    if ! is_uint "$conf"; then
      echo "Warning: ignoring $c; it does not contain an integer" >&2
      continue
    fi
    if test "$conf" -gt "$best"; then
      best=$conf
      bestfn=$c
    fi
  done
  # No variant scored above 0: nothing to copy for this page.
  test -n "$bestfn" || continue
  cp "${bestfn%.conf}.hocr" "$1/best/" || exit 1
done || exit 1

echo "Worst"
mkdir -p "$1/best/worst" || exit 1
find "$1/best" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
  b=$(basename "$f" .hocr)
  d=$(dirname "$f")
  # The confidence for a best result lives next to the original .hocr in dir.
  conffn="$1/$b.conf"
  test -f "$conffn" || continue
  conf=$(read_conf "$conffn")
  is_uint "$conf" || continue
  if test "$conf" -lt "$WORST_THRESHOLD"; then
    mv "$d/$b".* "$1/best/worst" || exit 1
  fi
done || exit 1

echo "Graphing"
bookgraph "$1"