summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-10-23 11:05:01 +0100
committer Nick White <git@njw.name> 2019-10-23 11:05:01 +0100
commit 7d151bbb85338bdf2022f815c73873f9eace7d38 (patch)
tree 61999f3e4008fb2b355b5c14fd1b0f90210db9e4
parent d475271449c9d5d9fb10850499288f450b03e71c (diff)
Add dir-to-pdfv3.sh, for use alongside bookpipeline
-rwxr-xr-x dir-to-pdfv3.sh 118
1 files changed, 118 insertions, 0 deletions
diff --git a/dir-to-pdfv3.sh b/dir-to-pdfv3.sh
new file mode 100755
index 0000000..4248be9
--- /dev/null
+++ b/dir-to-pdfv3.sh
@@ -0,0 +1,118 @@
+#!/bin/sh
+usage="Usage: $0 [-c] indir [pdf]
+
+Creates a PDF from image and hocr files in indir, saving it to pdf, or
+indir.pdf if not specified.
+
+The necessary files are first copied to a temporary directory where
+they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
+
+The PDF is then created with hocr-pdf, and the temporary directory is
+removed.
+
+This is designed to work with files which have gone through the
+rescribe.xyz/bookpipeline process, with a 'best' file which lists the
+best hocr files for each page.
+
+-c: colour output"
+
+# Output JPEG quality, and the DPI the input images are assumed to have
+QUALITY=20
+DPI=600
+
+# Downscale ratio: as a percentage for convert, as a divisor for DPI and bboxes
+RESIZEPERC=25%
+RESIZEDIV=4
+DPI=$((DPI / RESIZEDIV))
+
+colour=0
+test $# -gt 1 && test "$1" = "-c" && colour=1 && shift
+
+if [ $# -lt 1 ] || [ $# -gt 2 ]; then
+  echo "$usage" >&2
+  exit 1
+fi
+
+# Prefer GraphicsMagick, falling back to ImageMagick
+if command -v gm > /dev/null; then
+  convert="gm convert"
+elif command -v convert > /dev/null; then
+  convert="convert"
+else
+  echo "Error: no graphicsmagick or imagemagick found" >&2
+  exit 1
+fi
+
+if ! command -v hocr-pdf > /dev/null; then
+  echo "Error: no hocr-pdf tool found" >&2
+  exit 1
+fi
+
+if ! [ -d "$1" ]; then
+  echo "Error: $1 does not exist" >&2
+  exit 1
+fi
+
+# Default output name: indir with one trailing slash removed, plus .pdf
+outfile="${2-${1%/}.pdf}"
+
+tmpdir=$(mktemp -d) || {
+  echo "Error: Failed to create temporary directory" >&2
+  exit 1
+}
+# Remove the scratch directory on every exit, including errors and signals
+trap 'rm -rf -- "$tmpdir"' EXIT
+trap 'exit 1' HUP INT TERM
+
+# Every line of the 'best' file names the hocr file chosen for one page
+if ! [ -f "$1/best" ]; then
+  echo "Error: no best file found in $1" >&2
+  exit 1
+fi
+
+echo "Copying hocrs and converting jpgs from $1 to $tmpdir"
+while read -r i || [ -n "$i" ]; do
+  if ! [ -f "$1/$i" ]; then
+    echo "Warning: no hocr file found for $i, skipping" >&2
+    continue
+  fi
+
+  # n: page name with the _bin... suffix removed; b: hocr name minus .hocr
+  n=${i%%_bin*}
+  b=$(basename "$i" .hocr)
+  if [ "$colour" -eq 1 ]; then
+    img="$1/$n.jpg" # colour
+  else
+    img="$1/$b.png" # binarised
+  fi
+  if ! [ -f "$img" ]; then
+    echo "Warning: no image found for hocr file $i, skipping" >&2
+    continue
+  fi
+
+  # $convert is deliberately unquoted: it may hold the two words "gm convert"
+  $convert -quality "$QUALITY" -geometry "${RESIZEPERC}x${RESIZEPERC}" \
+    -density "${DPI}x${DPI}" "$img" "$tmpdir/$n.jpg" || exit 1
+
+  # Adjust the bounding boxes to match the new geometry after resizing:
+  # on each line whose title attribute starts with a bbox, divide the four
+  # coordinates by RESIZEDIV (truncating), and copy all other lines as-is.
+  # One awk run per file replaces a dozen processes per hocr line, and
+  # unlike a plain 'read' loop it keeps backslashes and indentation intact.
+  awk -v div="$RESIZEDIV" '
+    match($0, /title=[\047"]bbox [0-9]+ [0-9]+ [0-9]+ [0-9]+/) {
+      # 12 is the length of the prefix: title= + quote + "bbox" + space
+      split(substr($0, RSTART + 12, RLENGTH - 12), c, " ")
+      $0 = substr($0, 1, RSTART + 11) \
+        int(c[1] / div) " " int(c[2] / div) " " \
+        int(c[3] / div) " " int(c[4] / div) substr($0, RSTART + RLENGTH)
+    }
+    { print }
+  ' "$1/$i" > "$tmpdir/$n.hocr" || exit 1
+done < "$1/best"
+
+echo "Creating PDF"
+# Remove the temporary directory even when hocr-pdf fails
+hocr-pdf --savefile "$outfile" "$tmpdir" || { rm -rf -- "$tmpdir"; exit 1; }
+echo "Created a PDF at $outfile"
+rm -rf -- "$tmpdir"