diff options
author | Nick White <git@njw.name> | 2019-05-08 16:39:25 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-05-08 16:39:25 +0100 |
commit | 44dfb6c33649efb0b45c72a3ab9ae7d629aa3172 (patch) | |
tree | 887383b6d4925b9b3cea1cbed1e5016794e12e23 | |
parent | 7bef4bc5eee200960ee0870974d922e763c16882 (diff) |
Allow an argument to set pdf savefile, and resize pdf images to be way smaller
-rwxr-xr-x | dir-to-pdf.sh | 51 | ||||
-rwxr-xr-x[-rw-r--r--] | scrape-erara.sh | 0 |
2 files changed, 43 insertions, 8 deletions
```diff
diff --git a/dir-to-pdf.sh b/dir-to-pdf.sh
index 399bc16..345d908 100755
--- a/dir-to-pdf.sh
+++ b/dir-to-pdf.sh
@@ -1,7 +1,8 @@
 #!/bin/sh
-usage="Usage: $0 indir
+usage="Usage: $0 indir [pdf]
 
-Creates a PDF from image and hocr files in indir, saving it to indir.pdf.
+Creates a PDF from image and hocr files in indir, saving it to pdf, or
+indir.pdf if not specified.
 
 The necessary files are first copied to a temporary directory where
 they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
@@ -9,8 +10,6 @@ they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
 The PDF is then created with hocr-pdf, and the temporary directory
 is removed."
 
-test $# -ne 1 && echo "$usage" && exit 1
-
 # All possible training files to check for
 TRAININGS="rescribealphav4 rescribealphav5"
 
@@ -18,6 +17,13 @@ TRAININGS="rescribealphav4 rescribealphav5"
 QUALITY=0
 DPI=600
 
+# Set resize ratio
+RESIZEPERC=25%
+RESIZEDIV=4
+DPI=`expr $DPI / $RESIZEDIV`
+
+test $# -lt 1 -o $# -gt 2 && echo "$usage" && exit 1
+
 if command -v gm > /dev/null ; then
 	convert="gm convert"
 elif command -v convert > /dev/null ; then
@@ -37,6 +43,12 @@ if ! test -d "$1"; then
 	exit 1
 fi
 
+if test $# -eq 2 ; then
+	outfile="$2"
+else
+	outfile="$1.pdf"
+fi
+
 tmpdir=`mktemp -d`
 if test $? -ne 0 ; then
 	echo "Error: Failed to create temporary directory"
@@ -59,12 +71,35 @@ find "$1" -maxdepth 1 -type f -name '*.unpapered.png' | while read i; do
 		continue
 	fi
 
-	$convert -quality $QUALITY -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1
-	cp "$hocr" "$tmpdir/$b.hocr" || exit 1
+	$convert -quality $QUALITY -geometry ${RESIZEPERC}x${RESIZEPERC} -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1
+
+	# Adjust the bounding boxes to match the new geometry after resizing
+	cat "$hocr" | while read line; do
+		if `echo "$line" | grep -q -v "title=['\"]bbox"`; then
+			printf "%s\n" "$line"
+			continue
+		fi
+		# get original values
+		bbox=`echo "$line" | sed "s/.*title=['\"]bbox \(.[^;']*\)[;'\"].*/\1/"`
+		x1=`echo "$bbox"|awk '{print $1}'`
+		y1=`echo "$bbox"|awk '{print $2}'`
+		x2=`echo "$bbox"|awk '{print $3}'`
+		y2=`echo "$bbox"|awk '{print $4}'`
+
+		# first halve all values
+		x1=`echo "$x1 / $RESIZEDIV" | bc`
+		y1=`echo "$y1 / $RESIZEDIV" | bc`
+		x2=`echo "$x2 / $RESIZEDIV" | bc`
+		y2=`echo "$y2 / $RESIZEDIV" | bc`
+
+		newbbox="$x1 $y1 $x2 $y2"
+		newline=`echo "$line" | sed "s/$bbox/$newbbox/"`
+		printf "%s\n" "$newline"
+	done > "$tmpdir/$b.hocr"
 done
 
 echo "Creating PDF"
-hocr-pdf --savefile "$1.pdf" "$tmpdir" || exit 1
+hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1
 
-echo "Created a PDF at $1.pdf"
+echo "Created a PDF at $outfile"
 rm -rf "$tmpdir"
diff --git a/scrape-erara.sh b/scrape-erara.sh
index c2da6f2..c2da6f2 100644..100755
--- a/scrape-erara.sh
+++ b/scrape-erara.sh
```