summary refs log tree commit diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-05-08 15:23:30 +0100
committer Nick White <git@njw.name> 2019-05-08 15:23:30 +0100
commit 7bef4bc5eee200960ee0870974d922e763c16882 (patch)
tree 244bd93d952f71c4d424a831105b986a7de75853
parent 9673976e3a563ba3ebf183c7f18df2ae5c64b141 (diff)
Rename pdf prep tool as it creates the pdf too now
-rwxr-xr-x dir-to-pdf.sh 70
-rw-r--r-- format-for-hocr-pdf.sh 43
2 files changed, 70 insertions, 43 deletions
diff --git a/dir-to-pdf.sh b/dir-to-pdf.sh
new file mode 100755
index 0000000..399bc16
--- /dev/null
+++ b/dir-to-pdf.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+usage="Usage: $0 indir
+
+Creates a PDF from image and hocr files in indir, saving it to indir.pdf.
+
+The necessary files are first copied to a temporary directory where
+they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
+
+The PDF is then created with hocr-pdf, and the temporary directory is
+removed."
+
+test $# -eq 1 || { echo "$usage" >&2; exit 1; }
+
+# Training names to check for; the last one with a hocr file wins
+TRAININGS="rescribealphav4 rescribealphav5"
+
+# Set image compression and dpi
+QUALITY=0
+DPI=600
+
+if command -v gm > /dev/null ; then
+	convert="gm convert"
+elif command -v convert > /dev/null ; then
+	convert="convert"
+else
+	echo "Error: no graphicsmagick or imagemagick found" >&2
+	exit 1
+fi
+
+if ! command -v hocr-pdf > /dev/null ; then
+	echo "Error: no hocr-pdf tool found" >&2
+	exit 1
+fi
+
+if ! test -d "$1"; then
+	echo "Error: $1 does not exist" >&2
+	exit 1
+fi
+
+if ! tmpdir=$(mktemp -d); then
+	echo "Error: Failed to create temporary directory" >&2
+	exit 1
+fi
+# Remove the temporary directory however the script exits
+trap 'rm -rf "$tmpdir"' EXIT
+
+echo "Copying hocrs and converting pngs from $1 to $tmpdir"
+# The loop is a pipeline subshell, so its failure is re-raised after done
+find "$1" -maxdepth 1 -type f -name '*.unpapered.png' | while IFS= read -r i; do
+	b=$(basename "$i" .unpapered.png)
+
+	hocr=""
+	for t in $TRAININGS; do
+		n="${i%.unpapered.png}_unpapered_$t.hocr"
+		test -f "$n" && hocr="$n"
+	done
+	if test -z "$hocr"; then
+		echo "Warning: no corresponding hocr file found for $i, skipping." >&2
+		continue
+	fi
+
+	# $convert is deliberately unquoted so "gm convert" splits in two
+	$convert -quality "$QUALITY" -density "${DPI}x${DPI}" "$i" "$tmpdir/$b.jpg" || exit 1
+	cp "$hocr" "$tmpdir/$b.hocr" || exit 1
+done || exit 1
+# Strip a trailing slash so 'indir/' still produces indir.pdf
+echo "Creating PDF"
+hocr-pdf --savefile "${1%/}.pdf" "$tmpdir" || exit 1
+
+echo "Created a PDF at ${1%/}.pdf"
diff --git a/format-for-hocr-pdf.sh b/format-for-hocr-pdf.sh
deleted file mode 100644
index 0059f81..0000000
--- a/format-for-hocr-pdf.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/sh
-usage="Usage: $0 indir outdir
-
-Creates a new directory with image and hocr files appropriately named
-and formatted for use with the hocr-pdf tool from hocr-tools."
-
-test $# -eq 2 || { echo "$usage" >&2; exit 1; }
-
-# Training names to check for; the last one with a hocr file wins
-TRAININGS="rescribealphav4 rescribealphav5"
-
-# Set image compression and dpi
-QUALITY=0
-DPI=600
-
-if ! test -d "$1"; then
-	echo "Error: $1 does not exist" >&2
-	exit 1
-fi
-mkdir -p "$2" || exit 1
-
-echo "Copying hocrs and converting pngs from $1 to $2"
-find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while IFS= read -r i; do
-	b=$(basename "$i" .binarized.png)
-
-	hocr=""
-	for t in $TRAININGS; do
-		n="${i%.binarized.png}_$t.hocr"
-		test -f "$n" && hocr="$n"
-	done
-	if test -z "$hocr"; then
-		echo "Warning: no corresponding hocr file found for $i, skipping." >&2
-		continue
-	fi
-
-	gm convert -quality "$QUALITY" -density "${DPI}x${DPI}" "$i" "$2/$b.jpg" || exit 1
-	cp "$hocr" "$2/$b.hocr" || exit 1
-done || exit 1 # the loop is a pipeline subshell; re-raise its failure here
-
-nhocr=`find "$2" -type f -name '*hocr'|wc -l`
-njpg=`find "$2" -type f -name '*jpg'|wc -l`
-echo "Done. There are $nhocr hocr files and $njpg jpg files in $2"
-