#!/bin/sh
# dir-to-pdfv2.sh - build a searchable PDF from page images and hOCR files.
#
# Provenance: commit 439a0b73e12657f1241692505770e6161f811c74
# (Nick White, Mon, 3 Jun 2019 09:48:33 +0100, "Add dir-to-pdfv2 script").
#
# Input layout, as read by this script:
#   indir/best/NAME.hocr   hOCR for one page
#   indir/NAME.png         the matching page image
# Every page is written to a temporary directory as a downscaled NAME.jpg
# plus a NAME.hocr whose bounding boxes are scaled by the same ratio, and
# hocr-pdf is then run on that directory.
#
# Exit status: 0 on success, 1 on usage errors, missing tools or input,
# or when any conversion step fails.
usage="Usage: $0 indir [pdf]

Creates a PDF from image and hocr files in indir, saving it to pdf, or
indir.pdf if not specified.

The necessary files are first copied to a temporary directory where
they are renamed and reformatted for the hocr-pdf tool from hocr-tools.

The PDF is then created with hocr-pdf, and the temporary directory is
removed."

# All possible training files to check for
# NOTE(review): nothing else in this script reads TRAININGS.
TRAININGS="rescribealphav4 rescribealphav5"

# Set image compression and dpi
QUALITY=0
DPI=600

# Set resize ratio. RESIZEPERC is handed to the image converter while
# RESIZEDIV divides the hOCR coordinates, so both must describe the same
# ratio (25% == 1/4).
RESIZEPERC=25%
RESIZEDIV=4
# The output density is reduced by the same factor as the pixel size.
DPI=$((DPI / RESIZEDIV))

# Print the arguments as one message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Copy hOCR file $1 to $2, dividing the four coordinates of every
# title='bbox x1 y1 x2 y2' (or title="bbox ...") attribute by RESIZEDIV,
# truncating to whole pixels. Everything else is copied through unchanged,
# including title attributes that do not begin with "bbox".
scale_hocr() {
  # The single quote is passed in as a variable so the awk program can
  # stay inside one single-quoted shell word.
  awk -v div="$RESIZEDIV" -v sq="'" '
    BEGIN { re = "title=[" sq "\"]bbox [0-9]+ [0-9]+ [0-9]+ [0-9]+" }
    {
      rest = $0
      out = ""
      while (match(rest, re)) {
        # f[1] is the title=<quote>bbox prefix, f[2..5] the coordinates.
        split(substr(rest, RSTART, RLENGTH), f, " ")
        out = out substr(rest, 1, RSTART - 1) f[1]
        out = out " " int(f[2] / div) " " int(f[3] / div)
        out = out " " int(f[4] / div) " " int(f[5] / div)
        rest = substr(rest, RSTART + RLENGTH)
      }
      print out rest
    }
  ' < "$1" > "$2"
}

if [ $# -lt 1 ] || [ $# -gt 2 ]; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

# Wrap whichever converter is installed in a function so that the
# two-word "gm convert" never depends on unquoted word splitting.
if command -v gm > /dev/null 2>&1; then
  run_convert() { gm convert "$@"; }
elif command -v convert > /dev/null 2>&1; then
  run_convert() { convert "$@"; }
else
  die "Error: no graphicksmagick or imagemagick found"
fi

command -v hocr-pdf > /dev/null 2>&1 || die "Error: no hocr-pdf tool found"

indir=$1
test -d "$indir" || die "Error: $indir does not exist"
test -d "$indir/best" || die "Error: $indir/best does not exist"

if [ $# -eq 2 ]; then
  outfile=$2
else
  # Drop one trailing slash so "dir/" becomes "dir.pdf", not "dir/.pdf".
  outfile="${indir%/}.pdf"
fi

tmpdir=$(mktemp -d) || die "Error: Failed to create temporary directory"
# Remove the working directory on every exit path, not only on success.
trap 'rm -rf -- "$tmpdir"' EXIT
# Turn common signals into a normal exit so the EXIT trap also runs then.
trap 'exit 1' HUP INT TERM

printf '%s\n' "Copying hocrs and converting pngs from $indir to $tmpdir"
# The loop may run in a pipeline subshell, where "exit 1" only ends the
# loop; the status of the whole pipeline is therefore checked after "done".
find "$indir/best" -maxdepth 1 -type f -name '*.hocr' | sort |
while IFS= read -r hocr; do
  base=$(basename "$hocr" .hocr)
  img="$indir/$base.png"

  if ! test -f "$img"; then
    printf '%s\n' "Warning: no image found for hocr file $hocr, skipping" >&2
    continue
  fi

  # stdin is redirected so the converter cannot consume the file list
  # that feeds this loop.
  run_convert -quality "$QUALITY" \
    -geometry "${RESIZEPERC}x${RESIZEPERC}" \
    -density "${DPI}x${DPI}" \
    "$img" "$tmpdir/$base.jpg" < /dev/null || exit 1

  # Adjust the bounding boxes to match the new geometry after resizing
  scale_hocr "$hocr" "$tmpdir/$base.hocr" || exit 1
done || die "Error: failed to prepare pages in $tmpdir"

echo "Creating PDF"
hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1

printf '%s\n' "Created a PDF at $outfile"