diff options
| -rwxr-xr-x | dir-to-pdfv3.sh | 118 | 
1 files changed, 118 insertions, 0 deletions
#!/bin/sh
# dir-to-pdfv3.sh
#
# Builds a PDF from the page images and hOCR files of a directory by
# staging downscaled JPEGs and matching, rescaled hOCR files in a
# temporary directory and handing that directory to hocr-pdf.
#
# Arguments: [-c] indir [pdf]   (see $usage below)
# Exit status: 0 on success, 1 on usage errors, missing tools or input,
#              or when image conversion / PDF creation fails.
usage="Usage: $0 [-c] indir [pdf]

Creates a PDF from image and hocr files in indir, saving it to pdf, or
indir.pdf if not specified.

The necessary files are first copied to a temporary directory where
they are renamed and reformatted for the hocr-pdf tool from hocr-tools.

The PDF is then created with hocr-pdf, and the temporary directory is
removed.

This is designed to work with files which have gone through the
rescribe.xyz/bookpipeline process, with a 'best' file which lists the
best hocr files for each page.

-c: colour output"

# Set image compression and dpi
QUALITY=20
DPI=600

# Set resize ratio. RESIZEPERC is what the image converter is given and
# RESIZEDIV is the matching divisor applied to the DPI and to every hOCR
# bounding box, so the two must describe the same ratio (25% <-> 4).
RESIZEPERC=25%
RESIZEDIV=4
DPI=$((DPI / RESIZEDIV))

# scale_bboxes DIVISOR FILE
# Writes FILE to stdout with the four coordinates of every
# title='bbox x1 y1 x2 y2' attribute divided by DIVISOR (truncated to an
# integer). Lines without such an attribute are passed through untouched.
# A single awk process handles the whole file.
scale_bboxes() {
	awk -v div="$1" -v q="'" '
	BEGIN {
		# title= followed by either quote character, then four integers
		pat = "title=[" q "\"]bbox [0-9]+ [0-9]+ [0-9]+ [0-9]+"
	}
	{
		out = ""
		rest = $0
		while (match(rest, pat)) {
			hit = substr(rest, RSTART, RLENGTH)
			# the prefix "title=", one quote and "bbox " is 12 characters
			split(substr(hit, 13), c, " ")
			scaled = int(c[1] / div) " " int(c[2] / div) " " int(c[3] / div) " " int(c[4] / div)
			out = out substr(rest, 1, RSTART - 1) substr(hit, 1, 12) scaled
			rest = substr(rest, RSTART + RLENGTH)
		}
		print out rest
	}' "$2"
}

# -c is only treated as the colour flag when another argument follows it
colour=0
if test $# -gt 1 && test "$1" = "-c"; then
	colour=1
	shift
fi

if test $# -lt 1 || test $# -gt 2; then
	echo "$usage" >&2
	exit 1
fi

# Prefer GraphicsMagick, fall back to ImageMagick. $convert is expanded
# unquoted further down on purpose so that "gm convert" splits in two words.
if command -v gm > /dev/null; then
	convert="gm convert"
elif command -v convert > /dev/null; then
	convert="convert"
else
	echo "Error: no graphicksmagick or imagemagick found" >&2
	exit 1
fi

if ! command -v hocr-pdf > /dev/null; then
	echo "Error: no hocr-pdf tool found" >&2
	exit 1
fi

if ! test -d "$1"; then
	echo "Error: $1 does not exist" >&2
	exit 1
fi

# Without the page list there is nothing to build; fail here rather than
# running hocr-pdf on an empty directory.
if ! test -f "$1/best"; then
	echo "Error: no 'best' file found in $1" >&2
	exit 1
fi

if test $# -eq 2; then
	outfile="$2"
else
	# indir.pdf, ignoring one trailing slash on indir
	outfile="${1%/}.pdf"
fi

if ! tmpdir=$(mktemp -d); then
	echo "Error: Failed to create temporary directory" >&2
	exit 1
fi
# Remove the staging directory on every exit path, including failures;
# the signal trap turns HUP/INT/TERM into a normal exit so EXIT fires.
trap 'rm -rf -- "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

echo "Copying hocrs and converting jpgs from $1 to $tmpdir"
# Each line of 'best' names one hocr file relative to indir. -r keeps
# backslashes literal; the extra test handles a final line with no newline.
while read -r i || test -n "$i"; do
	if ! test -f "$1/$i"; then
		echo "Warning: no hocr file found for $i, skipping" >&2
		continue
	fi

	# page name: everything before the first "_bin"
	n=${i%%_bin*}

	b=$(basename "$i" .hocr)
	if test "$colour" -eq 1; then
		img="$1/$n.jpg" # colour
	else
		img="$1/$b.png" # binarised
	fi
	if ! test -f "$img"; then
		echo "Warning: no image found for hocr file $i, skipping" >&2
		continue
	fi

	$convert -quality "$QUALITY" -geometry "${RESIZEPERC}x${RESIZEPERC}" -density "${DPI}x${DPI}" "$img" "$tmpdir/$n.jpg" || exit 1

	# Adjust the bounding boxes to match the new geometry after resizing
	scale_bboxes "$RESIZEDIV" "$1/$i" > "$tmpdir/$n.hocr" || exit 1
done < "$1/best"

echo "Creating PDF"
hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1

echo "Created a PDF at $outfile"
