summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2019-05-08 09:40:17 +0100
committer: Nick White <git@njw.name> 2019-05-08 09:40:17 +0100
commit: aa660900858adec7de2d2e85d2a9b0ae3ee01c4d (patch)
tree: 0c16e23c8d1931ae1a5093d31868e594355d62de
parent: 17f3e2287977895a0c127384472678be78cec022 (diff)
Add format-for-hocr-pdf.sh script
-rw-r--r-- format-for-hocr-pdf.sh 39
1 files changed, 39 insertions, 0 deletions
diff --git a/format-for-hocr-pdf.sh b/format-for-hocr-pdf.sh
new file mode 100644
index 0000000..89d4dd8
--- /dev/null
+++ b/format-for-hocr-pdf.sh
@@ -0,0 +1,39 @@
+#!/bin/sh
+# Converts each indir/NAME.binarized.png to outdir/NAME.jpg and copies its hocr.
+usage="Usage: $0 indir outdir
+
+Creates a new directory with image and hocr files appropriately named
+and formatted for use with the hocr-pdf tool from hocr-tools."
+
+test $# -ne 2 && echo "$usage" >&2 && exit 1
+
+# All possible training files to check for; when several exist the last wins
+TRAININGS="rescribealphav4 rescribealphav5"
+
+if ! test -d "$1"; then
+  echo "Error: $1 does not exist" >&2
+  exit 1
+fi
+mkdir -p "$2" || exit 1
+
+echo "Copying hocrs and converting pngs from $1 to $2"
+find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while IFS= read -r i; do
+  stem="${i%.binarized.png}"   # full path minus the image suffix
+  b="${stem##*/}"
+  hocr=""
+  for t in $TRAININGS; do
+    test -f "${stem}_$t.hocr" && hocr="${stem}_$t.hocr"
+  done
+  if test -z "$hocr"; then
+    echo "Warning: no corresponding hocr file found for $i, skipping." >&2
+    continue
+  fi
+  gm convert "$i" "$2/$b.jpg" || exit 1
+  cp "$hocr" "$2/$b.hocr" || exit 1
+# The loop is a pipeline subshell, so its exit must be re-raised here.
+done || exit 1
+
+nhocr=$(find "$2" -type f -name '*hocr' | wc -l)
+njpg=$(find "$2" -type f -name '*jpg' | wc -l)
+echo "Done. There are $nhocr hocr files and $njpg jpg files in $2"
+