#!/bin/sh
# Build a PDF from the page images and hocr files of a book directory.
#
# For every hocr file named in "indir/best" the matching page image is
# downscaled and recompressed into a temporary directory, and a copy of the
# hocr file is written next to it (same base name) with its bounding boxes
# scaled by the same factor. hocr-pdf is then run on that directory, which is
# removed again on exit, whether the script succeeds or fails.
#
# Exit status: 0 on success, 1 on usage errors, missing tools/inputs or a
# failed conversion step.

usage="Usage: $0 [-c] indir [pdf]

Creates a PDF from image and hocr files in indir, saving it to
pdf, or indir.pdf if not specified.

The necessary files are first copied to a temporary directory
where they are renamed and reformatted for the hocr-pdf tool
from hocr-tools. The PDF is then created with hocr-pdf, and the
temporary directory is removed.

This is designed to work with files which have gone through
the rescribe.xyz/bookpipeline process, with a 'best' file which
lists the best hocr files for each page.

-c: colour output"

# JPEG quality passed to convert, and the dpi the source images are assumed
# to be at before downscaling.
QUALITY=20
DPI=600
# Images and hocr coordinates are both shrunk by RESIZEDIV. The percentage
# handed to convert is derived from it so the two can never disagree.
RESIZEDIV=4
RESIZEPERC="$((100 / RESIZEDIV))%"
# Density recorded in the output images after downscaling.
DPI=$((DPI / RESIZEDIV))

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# scale_hocr FILE
# Copy FILE to stdout, dividing the four coordinates of every
# title='bbox x1 y1 x2 y2 ...' (single or double quoted) by RESIZEDIV,
# truncating towards zero. Everything else on the line, including other
# title properties and indentation, is passed through untouched.
scale_hocr() {
  awk -v div="$RESIZEDIV" -v q="'" '
    BEGIN {
      re = "title=[" q "\"]bbox -?[0-9]+ -?[0-9]+ -?[0-9]+ -?[0-9]+"
    }
    {
      out = ""
      rest = $0
      while (match(rest, re)) {
        # f[1] is the literal "title=<quote>bbox", f[2..5] the coordinates.
        split(substr(rest, RSTART, RLENGTH), f, " ")
        out = out substr(rest, 1, RSTART - 1) f[1]
        for (k = 2; k <= 5; k++)
          out = out " " int(f[k] / div)
        rest = substr(rest, RSTART + RLENGTH)
      }
      print out rest
    }
  ' < "$1"
}

colour=0
# -c is only recognised as an option when another argument follows it.
if [ $# -gt 1 ] && [ "$1" = "-c" ]; then
  colour=1
  shift
fi

if [ $# -lt 1 ] || [ $# -gt 2 ]; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

indir=$1

# Prefer GraphicsMagick, fall back to ImageMagick.
if command -v gm > /dev/null; then
  convert="gm convert"
elif command -v convert > /dev/null; then
  convert="convert"
else
  die "Error: no graphicsmagick or imagemagick found"
fi

command -v hocr-pdf > /dev/null || die "Error: no hocr-pdf tool found"

[ -d "$indir" ] || die "Error: $indir does not exist"
# Without this check a missing list would silently yield an empty input
# directory for hocr-pdf.
[ -f "$indir/best" ] || die "Error: $indir/best does not exist"

if [ $# -eq 2 ]; then
  outfile=$2
else
  # Drop one trailing slash so "book/" becomes "book.pdf".
  outfile="${indir%/}.pdf"
fi

tmpdir=$(mktemp -d) || die "Error: Failed to create temporary directory"

# Remove the temporary directory on every exit path. Signals are turned into
# a normal exit so the EXIT trap also runs in shells that skip it on signals.
cleanup() {
  rm -rf -- "$tmpdir"
}
trap cleanup EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

echo "Copying hocrs and converting jpgs from $indir to $tmpdir"

# IFS is left at its default on purpose: surrounding whitespace in a list
# entry is trimmed. The "|| [ -n ... ]" keeps a final unterminated line.
while read -r i || [ -n "$i" ]; do
  [ -n "$i" ] || continue

  if ! [ -f "$indir/$i" ]; then
    echo "Warning: no hocr file found for $i, skipping" >&2
    continue
  fi

  # Page name: everything before the first "_bin" in the hocr file name.
  n=${i%%_bin*}
  b=$(basename "$i" .hocr)
  if [ "$colour" -eq 1 ]; then
    img="$indir/$n.jpg" # colour
  else
    img="$indir/$b.png" # binarised
  fi

  if ! [ -f "$img" ]; then
    echo "Warning: no image found for hocr file $i, skipping" >&2
    continue
  fi

  # $convert is deliberately unquoted: it may hold the two words "gm convert".
  # stdin is detached so the tool can never consume the list being read.
  # shellcheck disable=SC2086
  $convert -quality "$QUALITY" -geometry "${RESIZEPERC}x${RESIZEPERC}" \
    -density "${DPI}x${DPI}" "$img" "$tmpdir/$n.jpg" < /dev/null || exit 1

  # Adjust the bounding boxes to match the new geometry after resizing.
  scale_hocr "$indir/$i" > "$tmpdir/$n.hocr" || exit 1
done < "$indir/best"

echo "Creating PDF"
hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1

echo "Created a PDF at $outfile"