summary refs log tree commit diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-05-08 16:39:25 +0100
committer Nick White <git@njw.name> 2019-05-08 16:39:25 +0100
commit 44dfb6c33649efb0b45c72a3ab9ae7d629aa3172 (patch)
tree 887383b6d4925b9b3cea1cbed1e5016794e12e23
parent 7bef4bc5eee200960ee0870974d922e763c16882 (diff)
Allow an argument to set pdf savefile, and resize pdf images to be way smaller
-rwxr-xr-x dir-to-pdf.sh 51
-rwxr-xr-x[-rw-r--r--] scrape-erara.sh 0
2 files changed, 43 insertions, 8 deletions
diff --git a/dir-to-pdf.sh b/dir-to-pdf.sh
index 399bc16..345d908 100755
--- a/dir-to-pdf.sh
+++ b/dir-to-pdf.sh
@@ -1,7 +1,8 @@
#!/bin/sh
-usage="Usage: $0 indir
+usage="Usage: $0 indir [pdf]
-Creates a PDF from image and hocr files in indir, saving it to indir.pdf.
+Creates a PDF from image and hocr files in indir, saving it to pdf, or
+indir.pdf if not specified.
The necessary files are first copied to a temporary directory where
they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
@@ -9,8 +10,6 @@ they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
The PDF is then created with hocr-pdf, and the temporary directory is
removed."
-test $# -ne 1 && echo "$usage" && exit 1
-
# All possible training files to check for
TRAININGS="rescribealphav4 rescribealphav5"
@@ -18,6 +17,13 @@ TRAININGS="rescribealphav4 rescribealphav5"
QUALITY=0
DPI=600
+# Set resize ratio
+RESIZEPERC=25%
+RESIZEDIV=4
+DPI=`expr $DPI / $RESIZEDIV`
+
+test $# -lt 1 -o $# -gt 2 && echo "$usage" && exit 1
+
if command -v gm > /dev/null ; then
convert="gm convert"
elif command -v convert > /dev/null ; then
@@ -37,6 +43,12 @@ if ! test -d "$1"; then
exit 1
fi
+if test $# -eq 2 ; then
+ outfile="$2"
+else
+ outfile="$1.pdf"
+fi
+
tmpdir=`mktemp -d`
if test $? -ne 0 ; then
echo "Error: Failed to create temporary directory"
@@ -59,12 +71,35 @@ find "$1" -maxdepth 1 -type f -name '*.unpapered.png' | while read i; do
continue
fi
- $convert -quality $QUALITY -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1
- cp "$hocr" "$tmpdir/$b.hocr" || exit 1
+ $convert -quality $QUALITY -geometry ${RESIZEPERC}x${RESIZEPERC} -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1
+
+ # Adjust the bounding boxes to match the new geometry after resizing
+ cat "$hocr" | while read line; do
+ if `echo "$line" | grep -q -v "title=['\"]bbox"`; then
+ printf "%s\n" "$line"
+ continue
+ fi
+ # get original values
+ bbox=`echo "$line" | sed "s/.*title=['\"]bbox \(.[^;']*\)[;'\"].*/\1/"`
+ x1=`echo "$bbox"|awk '{print $1}'`
+ y1=`echo "$bbox"|awk '{print $2}'`
+ x2=`echo "$bbox"|awk '{print $3}'`
+ y2=`echo "$bbox"|awk '{print $4}'`
+
+ # divide all values by the resize ratio (RESIZEDIV)
+ x1=`echo "$x1 / $RESIZEDIV" | bc`
+ y1=`echo "$y1 / $RESIZEDIV" | bc`
+ x2=`echo "$x2 / $RESIZEDIV" | bc`
+ y2=`echo "$y2 / $RESIZEDIV" | bc`
+
+ newbbox="$x1 $y1 $x2 $y2"
+ newline=`echo "$line" | sed "s/$bbox/$newbbox/"`
+ printf "%s\n" "$newline"
+ done > "$tmpdir/$b.hocr"
done
echo "Creating PDF"
-hocr-pdf --savefile "$1.pdf" "$tmpdir" || exit 1
+hocr-pdf --savefile "$outfile" "$tmpdir" || exit 1
-echo "Created a PDF at $1.pdf"
+echo "Created a PDF at $outfile"
rm -rf "$tmpdir"
diff --git a/scrape-erara.sh b/scrape-erara.sh
index c2da6f2..c2da6f2 100644..100755
--- a/scrape-erara.sh
+++ b/scrape-erara.sh