summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-06-03 09:48:33 +0100
committer Nick White <git@njw.name> 2019-06-03 09:48:33 +0100
commit 439a0b73e12657f1241692505770e6161f811c74 (patch)
tree 10e913de64508911e50734ca8a4152665c88b551
parent d74a5297cffb867e64309451dd0cc526d2ebc3a2 (diff)
Add dir-to-pdfv2 script
-rwxr-xr-x dir-to-pdfv2.sh 102
1 files changed, 102 insertions, 0 deletions
diff --git a/dir-to-pdfv2.sh b/dir-to-pdfv2.sh
new file mode 100755
index 0000000..db2b3c3
--- /dev/null
+++ b/dir-to-pdfv2.sh
@@ -0,0 +1,102 @@
+#!/bin/sh
+usage="Usage: $0 indir [pdf]
+
+Creates a PDF from image and hocr files in indir, saving it to pdf, or
+indir.pdf if not specified.
+
+The necessary files are first copied to a temporary directory where
+they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
+
+The PDF is then created with hocr-pdf, and the temporary directory is
+removed."
+
+# All possible training files to check for
+TRAININGS="rescribealphav4 rescribealphav5"
+
+# Set image compression and dpi
+QUALITY=0
+DPI=600
+
+# Set resize ratio
+RESIZEPERC=25%
+RESIZEDIV=4
+DPI=`expr $DPI / $RESIZEDIV`
+
+test $# -lt 1 -o $# -gt 2 && echo "$usage" && exit 1
+
+if command -v gm > /dev/null ; then
+ convert="gm convert"
+elif command -v convert > /dev/null ; then
+ convert="convert"
+else
+ echo "Error: no graphicksmagick or imagemagick found"
+ exit 1
+fi
+
+if ! command -v hocr-pdf > /dev/null ; then
+ echo "Error: no hocr-pdf tool found"
+ exit 1
+fi
+
+if ! test -d "$1"; then
+ echo "Error: $1 does not exist"
+ exit 1
+fi
+
+if test $# -eq 2 ; then
+ outfile="$2"
+else
+ o=`echo "$1" | sed 's/\/$//'`
+ outfile="$o.pdf"
+fi
+
+tmpdir=`mktemp -d`
+if test $? -ne 0 ; then
+ echo "Error: Failed to create temporary directory"
+ exit 1
+fi
+
+mkdir -p "$tmpdir" || exit 1
+
+echo "Copying hocrs and converting pngs from $1 to $tmpdir"
+find "$1/best" -maxdepth 1 -type f -name '*.hocr' | sort | while read i; do
+ b=`basename "$i" .hocr`
+ img="$1/$b.png"
+
+ if ! test -f "$img"; then
+ echo "Warning: no image found for hocr file $i, skipping"
+ continue
+ fi
+
+ $convert -quality $QUALITY -geometry ${RESIZEPERC}x${RESIZEPERC} -density ${DPI}x${DPI} "$img" "$tmpdir/$b.jpg" || exit 1
+
+ # Adjust the bounding boxes to match the new geometry after resizing
+ cat "$i" | while read line; do
+ if `echo "$line" | grep -q -v "title=['\"]bbox"`; then
+ printf "%s\n" "$line"
+ continue
+ fi
+ # get original values
+ bbox=`echo "$line" | sed "s/.*title=['\"]bbox \(.[^;']*\)[;'\"].*/\1/"`
+ x1=`echo "$bbox"|awk '{print $1}'`
+ y1=`echo "$bbox"|awk '{print $2}'`
+ x2=`echo "$bbox"|awk '{print $3}'`
+ y2=`echo "$bbox"|awk '{print $4}'`
+
+ # first halve all values
+ x1=`echo "$x1 / $RESIZEDIV" | bc`
+ y1=`echo "$y1 / $RESIZEDIV" | bc`
+ x2=`echo "$x2 / $RESIZEDIV" | bc`
+ y2=`echo "$y2 / $RESIZEDIV" | bc`
+
+ newbbox="$x1 $y1 $x2 $y2"
+ newline=`echo "$line" | sed "s/$bbox/$newbbox/"`
+ printf "%s\n" "$newline"
+ done > "$tmpdir/$b.hocr"
+done
+
+echo "Creating PDF"
+hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1
+
+echo "Created a PDF at $outfile"
+rm -rf "$tmpdir"