diff options
| -rwxr-xr-x | dir-to-pdf.sh | 51 | ||||
| -rwxr-xr-x[-rw-r--r--] | scrape-erara.sh | 0 | 
2 files changed, 43 insertions, 8 deletions
| diff --git a/dir-to-pdf.sh b/dir-to-pdf.sh index 399bc16..345d908 100755 --- a/dir-to-pdf.sh +++ b/dir-to-pdf.sh @@ -1,7 +1,8 @@  #!/bin/sh -usage="Usage: $0 indir +usage="Usage: $0 indir [pdf] -Creates a PDF from image and hocr files in indir, saving it to indir.pdf. +Creates a PDF from image and hocr files in indir, saving it to pdf, or +indir.pdf if not specified.  The necessary files are first copied to a temporary directory where  they are renamed and reformatted for the hocr-pdf tool from hocr-tools. @@ -9,8 +10,6 @@ they are renamed and reformatted for the hocr-pdf tool from hocr-tools.  The PDF is then created with hocr-pdf, and the temporary directory is  removed." -test $# -ne 1 && echo "$usage" && exit 1 -  # All possible training files to check for  TRAININGS="rescribealphav4 rescribealphav5" @@ -18,6 +17,13 @@ TRAININGS="rescribealphav4 rescribealphav5"  QUALITY=0  DPI=600 +# Set resize ratio +RESIZEPERC=25% +RESIZEDIV=4 +DPI=`expr $DPI / $RESIZEDIV` + +test $# -lt 1 -o $# -gt 2 && echo "$usage" && exit 1 +  if command -v gm > /dev/null ; then  	convert="gm convert"  elif command -v convert > /dev/null ; then @@ -37,6 +43,12 @@ if ! test -d "$1"; then  	exit 1  fi +if test $# -eq 2 ; then +	outfile="$2" +else +	outfile="$1.pdf" +fi +  tmpdir=`mktemp -d`  if test $? -ne 0 ; then  	echo "Error: Failed to create temporary directory" @@ -59,12 +71,35 @@ find "$1" -maxdepth 1 -type f -name '*.unpapered.png' | while read i; do  		continue  	fi -	$convert -quality $QUALITY -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1 -	cp "$hocr" "$tmpdir/$b.hocr" || exit 1 +	$convert -quality $QUALITY -geometry ${RESIZEPERC}x${RESIZEPERC} -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1 + +	# Adjust the bounding boxes to match the new geometry after resizing +	cat "$hocr" | while read line; do +		if `echo "$line" | grep -q -v "title=['\"]bbox"`; then +	                printf "%s\n" "$line" +	                continue +	        fi +		# get original values +	        bbox=`echo "$line" | sed "s/.*title=['\"]bbox \(.[^;']*\)[;'\"].*/\1/"` +	        x1=`echo "$bbox"|awk '{print $1}'` +	        y1=`echo "$bbox"|awk '{print $2}'` +	        x2=`echo "$bbox"|awk '{print $3}'` +	        y2=`echo "$bbox"|awk '{print $4}'` + +	        # first halve all values +	        x1=`echo "$x1 / $RESIZEDIV" | bc` +	        y1=`echo "$y1 / $RESIZEDIV" | bc` +	        x2=`echo "$x2 / $RESIZEDIV" | bc` +	        y2=`echo "$y2 / $RESIZEDIV" | bc` + +		newbbox="$x1 $y1 $x2 $y2" +	        newline=`echo "$line" | sed "s/$bbox/$newbbox/"` +	        printf "%s\n" "$newline" +	done > "$tmpdir/$b.hocr"  done  echo "Creating PDF" -hocr-pdf --savefile "$1.pdf" "$tmpdir" || exit 1 +hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1 -echo "Created a PDF at $1.pdf" +echo "Created a PDF at $outfile"  rm -rf "$tmpdir" diff --git a/scrape-erara.sh b/scrape-erara.sh index c2da6f2..c2da6f2 100644..100755 --- a/scrape-erara.sh +++ b/scrape-erara.sh | 
