summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-05-14 17:07:52 +0100
committerNick White <git@njw.name>2019-05-14 17:07:52 +0100
commit36fe81a6349977dfcb31feea2a00a25a791536a6 (patch)
treea29bc96fa60aaae166b1286b8ae741fb65375b21
parent58e51317883ebb4c66b92e6c1cc807e937f780cb (diff)
Add fullocrdir script, which does multiple binarisation options and picks the ones with the highest confidence
-rwxr-xr-xfullocrdir.sh74
1 files changed, 74 insertions, 0 deletions
diff --git a/fullocrdir.sh b/fullocrdir.sh
new file mode 100755
index 0000000..c7b162e
--- /dev/null
+++ b/fullocrdir.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# fullocrdir.sh: preprocess and OCR every page image in a directory,
+# then keep the highest-confidence OCR result for each page.
+usage="Usage: $0 dir
+
+Runs preprocessing and OCR over a directory of images, saving a
+report on the quality of each page.
+
+The preprocessing is done several different ways, and the best
+quality option is selected for each page, as determined by the
+OCR engine confidence level. The best quality OCR is then saved
+into the dir/best directory."
+
+# Name handed to tesseract's -l option in the OCR stage.
+TRAINING=rescribealphav5
+
+# Exactly one argument (the directory to process) is required.
+if test $# -ne 1; then
+  echo "$usage"
+  exit 1
+fi
+
+# Stop early unless every external tool used below can be found.
+prereqs="pgconf preprocmulti tesseract"
+for tool in $prereqs; do
+  command -v "$tool" > /dev/null && continue
+  echo "Error: no $tool tool found"
+  exit 1
+done
+
+# The single argument must name an existing directory.
+test -d "$1" || {
+  echo "Error: $1 does not exist"
+  exit 1
+}
+
+# Stage 1: run preprocmulti on every .jpg directly inside the directory.
+# It receives the image path and dir/basename (presumably an output
+# prefix, given the *_bin?.?.png names the OCR stage looks for).
+echo "Preprocess"
+# IFS= and -r keep leading/trailing blanks and backslashes in file names.
+find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
+  b=$(basename "$f" .jpg)
+  d=$(dirname "$f")
+
+  preprocmulti "$f" "$d/$b" || exit 1
+# The loop is the tail of a pipeline, so in most shells it runs in a
+# subshell and the exit above only leaves the loop. Re-check the status
+# here so that a preprocessing failure really stops the script.
+done || exit 1
+
+# Stage 2: OCR each image matching *_bin?.?.png with tesseract, using
+# dir/basename as the output base and the hocr config.
+echo "OCR"
+# IFS= and -r keep leading/trailing blanks and backslashes in file names.
+find "$1" -maxdepth 1 -type f -name '*_bin?.?.png' | while IFS= read -r f; do
+  b=$(basename "$f" .png)
+  d=$(dirname "$f")
+
+  # TODO: ensure to run the correct command here
+  tesseract -l "$TRAINING" "$f" "$d/$b" hocr || exit 1
+# The loop is the tail of a pipeline, so in most shells it runs in a
+# subshell and the exit above only leaves the loop. Re-check the status
+# here so that an OCR failure really stops the script.
+done || exit 1
+
+# Stage 3: run pgconf on every .hocr file and store its output in a
+# .conf file of the same base name.
+echo "Confidence"
+# IFS= and -r keep leading/trailing blanks and backslashes in file names.
+find "$1" -maxdepth 1 -type f -name '*.hocr' | while IFS= read -r f; do
+  b=$(basename "$f" .hocr)
+  d=$(dirname "$f")
+
+  # Best effort: if pgconf fails, remove the partial .conf and carry on.
+  pgconf "$f" > "$d/$b.conf" || rm -f "$d/$b.conf"
+done
+
+# Stage 4: for each .jpg page, find the .conf file with the highest
+# confidence value and copy the matching .hocr file into dir/best.
+echo "Best"
+mkdir -p "$1/best" || exit 1
+# IFS= and -r keep leading/trailing blanks and backslashes in file names.
+find "$1" -maxdepth 1 -type f -name '*.jpg' | while IFS= read -r f; do
+  b=$(basename "$f" .jpg)
+  d=$(dirname "$f")
+
+  best=0
+  bestfn=""
+  for c in "$d/$b"*conf; do
+    # An unmatched glob is left as the literal pattern; skip it.
+    test -f "$c" || continue
+    conf=$(cat "$c")
+    test "$conf" = "No lines found" && continue
+    # Quoted so an empty or odd value cannot break the test syntax.
+    if test "$conf" -gt "$best"; then
+      best=$conf
+      bestfn="$c"
+    fi
+  done
+  # No confidence value above 0 for this page: nothing to copy.
+  test -z "$bestfn" && continue
+
+  # Swap the .conf suffix for .hocr. The previous sed expression lacked
+  # its closing delimiter, so it always failed and nothing was copied.
+  hocrfn="${bestfn%.conf}.hocr"
+  cp "$hocrfn" "$1/best/" || exit 1
+# The loop is the tail of a pipeline, so in most shells it runs in a
+# subshell and the exit above only leaves the loop. Re-check the status
+# here so that a copy failure is reflected in the script's exit code.
+done || exit 1