diff options
author | Nick White <git@njw.name> | 2019-05-08 15:23:30 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-05-08 15:23:30 +0100 |
commit | 7bef4bc5eee200960ee0870974d922e763c16882 (patch) | |
tree | 244bd93d952f71c4d424a831105b986a7de75853 | |
parent | 9673976e3a563ba3ebf183c7f18df2ae5c64b141 (diff) |
Rename pdf prep tool as it creates the pdf too now
-rwxr-xr-x | dir-to-pdf.sh | 70 | ||||
-rw-r--r-- | format-for-hocr-pdf.sh | 43 |
2 files changed, 70 insertions, 43 deletions
diff --git a/dir-to-pdf.sh b/dir-to-pdf.sh
new file mode 100755
index 0000000..399bc16
--- /dev/null
+++ b/dir-to-pdf.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+usage="Usage: $0 indir
+
+Creates a PDF from image and hocr files in indir, saving it to indir.pdf.
+
+The necessary files are first copied to a temporary directory where
+they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
+
+The PDF is then created with hocr-pdf, and the temporary directory is
+removed."
+
+test $# -ne 1 && echo "$usage" && exit 1
+
+# All possible training files to check for
+TRAININGS="rescribealphav4 rescribealphav5"
+
+# Set image compression and dpi
+QUALITY=0
+DPI=600
+
+if command -v gm > /dev/null ; then
+	convert="gm convert"
+elif command -v convert > /dev/null ; then
+	convert="convert"
+else
+	echo "Error: no graphicksmagick or imagemagick found"
+	exit 1
+fi
+
+if ! command -v hocr-pdf > /dev/null ; then
+	echo "Error: no hocr-pdf tool found"
+	exit 1
+fi
+
+if ! test -d "$1"; then
+	echo "Error: $1 does not exist"
+	exit 1
+fi
+
+tmpdir=`mktemp -d`
+if test $? -ne 0 ; then
+	echo "Error: Failed to create temporary directory"
+	exit 1
+fi
+
+mkdir -p "$tmpdir" || exit 1
+
+echo "Copying hocrs and converting pngs from $1 to $tmpdir"
+find "$1" -maxdepth 1 -type f -name '*.unpapered.png' | while read i; do
+	b=`basename "$i" .unpapered.png`
+
+	hocr=""
+	for t in $TRAININGS; do
+		n=`echo "$i" | sed "s/.unpapered.png/_unpapered_$t.hocr/"`
+		test -f "$n" && hocr="$n"
+	done
+	if test -z "$hocr"; then
+		echo "Warning: no corresponding hocr file found for $i, skipping."
+		continue
+	fi
+
+	$convert -quality $QUALITY -density ${DPI}x${DPI} "$i" "$tmpdir/$b.jpg" || exit 1
+	cp "$hocr" "$tmpdir/$b.hocr" || exit 1
+done
+
+echo "Creating PDF"
+hocr-pdf --savefile "$1.pdf" "$tmpdir" || exit 1
+
+echo "Created a PDF at $1.pdf"
+rm -rf "$tmpdir"
diff --git a/format-for-hocr-pdf.sh b/format-for-hocr-pdf.sh
deleted file mode 100644
index 0059f81..0000000
--- a/format-for-hocr-pdf.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/sh
-usage="Usage: $0 indir outdir
-
-Creates a new directory with image and hocr files appropriately named
-and formatted for use with the the hocr-pdf tool from hocr-tools."
-
-test $# -ne 2 && echo "$usage" && exit 1
-
-# All possible training files to check for
-TRAININGS="rescribealphav4 rescribealphav5"
-
-# Set image compression and dpi
-QUALITY=0
-DPI=600
-
-if ! test -d "$1"; then
-	echo "Error: $1 does not exist"
-	exit 1
-fi
-mkdir -p "$2" || exit 1
-
-echo "Copying hocrs and converting pngs from $1 to $2"
-find "$1" -maxdepth 1 -type f -name '*.binarized.png' | while read i; do
-	b=`basename "$i" .binarized.png`
-
-	hocr=""
-	for t in $TRAININGS; do
-		n=`echo "$i" | sed "s/.binarized.png/_$t.hocr/"`
-		test -f "$n" && hocr="$n"
-	done
-	if test -z "$hocr"; then
-		echo "Warning: no corresponding hocr file found for $i, skipping."
-		continue
-	fi
-
-	gm convert -quality $QUALITY -density ${DPI}x${DPI} "$i" "$2/$b.jpg" || exit 1
-	cp "$hocr" "$2/$b.hocr" || exit 1
-done
-
-nhocr=`find "$2" -type f -name '*hocr'|wc -l`
-njpg=`find "$2" -type f -name '*jpg'|wc -l`
-echo "Done. There are $nhocr hocr files and $njpg jpg files in $2"
-