diff options
author | Nick White <git@njw.name> | 2019-06-11 17:14:52 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-06-11 17:14:52 +0100 |
commit | 1efa16179c8d405903dc56fca7245d3205ebb07e (patch) | |
tree | 67d5e68e8e5a88d9d534e6c2e41f4eccd174a2ea /eebotopdf.sh | |
parent | 24f05bf6afb368ef5c88f510f6d3f98f2a42759f (diff) |
Add eebotopdf script
Diffstat (limited to 'eebotopdf.sh')
-rw-r--r-- | eebotopdf.sh | 46 |
1 files changed, 46 insertions, 0 deletions
#!/bin/sh
# eebotopdf.sh - combine the page images of a PDF with the text of an XML
# transcription into a new PDF (images plus hOCR text, via hocr-pdf).
#
# Usage: eebotopdf.sh in.pdf in.xml out.pdf
#
#   in.pdf   PDF whose page images are extracted with pdfimages
#   in.xml   XML transcription, converted to hOCR with eeboxmltohocr
#   out.pdf  PDF to create; as before, a missing ".pdf" suffix is appended
#
# All intermediate files are created in a private temporary directory that
# is removed on exit, so files in the caller's directory are never touched
# and nothing is left behind when a step fails.
#
# Exit status: 0 on success, 1 on any error.

usage="Usage: $0 in.pdf in.xml out.pdf"

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print $1 as an absolute path (relative paths are taken from $PWD).
abspath() {
  case $1 in
    /*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "$PWD/$1" ;;
  esac
}

test $# -eq 3 || die "$usage"

for tool in eeboxmltohocr hocr-pdf pdfimages; do
  command -v "$tool" > /dev/null || die "Error: no $tool tool found"
done

# Prefer GraphicsMagick, fall back to ImageMagick. Wrapping the choice in a
# function avoids relying on word-splitting of a "gm convert" string.
if command -v gm > /dev/null; then
  convert_image() { gm convert "$@"; }
elif command -v convert > /dev/null; then
  convert_image() { convert "$@"; }
else
  die "Error: no graphicksmagick or imagemagick found"
fi

# Common prefix of every intermediate file, and base name of the output.
root=$(basename "$3" .pdf)

# Keep the directory part of out.pdf; the output name stays "$root.pdf".
case $3 in
  */*) out_dir=${3%/*} ;;
  *) out_dir=. ;;
esac

# Resolve everything before changing directory below.
in_pdf=$(abspath "$1")
in_xml=$(abspath "$2")
out_pdf=$(abspath "$out_dir/$root.pdf")

workdir=$(mktemp -d "${TMPDIR:-/tmp}/eebotopdf.XXXXXX") \
  || die "Error: could not create temporary directory"
trap 'rm -rf -- "$workdir"' EXIT
# Turn fatal signals into a normal exit so the EXIT trap also runs for them.
trap 'exit 1' HUP INT TERM

cd "$workdir" || die "Error: could not enter $workdir"

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$in_pdf" "$root" || exit 1
for img in "$root"*png; do
  # An unmatched pattern is left as a literal string: nothing was extracted.
  test -e "$img" || die "Error: no images found in $1"
  convert_image "$img" "${img%.png}.jpg" || exit 1
  rm -- "$img"
done

echo "Extracting text from XML"
# Presumably writes one "$root"*.hocr file per transcribed page, named to
# match the extracted images - the pruning loop below relies on that.
eeboxmltohocr "$in_xml" "$root" || exit 1

# remove any images that don't have a corresponding hocr
# (the directory is private, so only files created above can match)
for img in *jpg; do
  test -e "$img" || continue
  test -f "${img%.jpg}.hocr" || rm -- "$img"
done

echo "Combining images and text into PDF"
# Build the PDF in the work directory first so a failed run never leaves a
# truncated file at the destination.
hocr-pdf . > combined.pdf || exit 1
mv -- combined.pdf "$out_pdf" || die "Error: could not write $out_pdf"