#!/bin/sh
# Combine the page images of an existing PDF with text taken from an XML
# file into a new PDF.
#
# Arguments:
#   $1 - input PDF whose images are extracted with pdfimages
#   $2 - input XML handed to eeboxmltohocr
#   $3 - output PDF written by hocr-pdf
#
# Requires eeboxmltohocr, hocr-pdf, pdfimages and either GraphicsMagick
# (gm) or ImageMagick (convert). Exits 1 on any failure.

usage="Usage: $0 in.pdf in.xml out.pdf"

if [ $# -ne 3 ]; then
  echo "$usage" >&2
  exit 1
fi

for tool in eeboxmltohocr hocr-pdf pdfimages; do
  if ! command -v "$tool" > /dev/null; then
    echo "Error: no $tool tool found" >&2
    exit 1
  fi
done

# Wrap the image converter in a function rather than storing "gm convert"
# in a variable: a quoted "$convert" would be looked up as one command
# named "gm convert" and never run.
if command -v gm > /dev/null; then
  convert_image() { gm convert "$@"; }
elif command -v convert > /dev/null; then
  convert_image() { convert "$@"; }
else
  echo "Error: no graphicksmagick or imagemagick found" >&2
  exit 1
fi

root=$(basename "$3" .pdf)

t=$(mktemp -d) || exit 1

# Remove the working directory on every exit path, including failures
# and interruption by a signal.
cleanup() { rm -rf -- "${t:?}"; }
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$1" "$t/$root" || exit 1
for img in "$t/$root"*png; do
  # An unmatched glob stays literal; report that instead of handing a
  # non-existent path to the converter.
  if [ ! -e "$img" ]; then
    echo "Error: no images extracted from $1" >&2
    exit 1
  fi
  base=$(basename "$img" .png)
  convert_image "$img" "$t/$base.jpg" || exit 1
  rm -- "$img"
done

echo "Extracting text from XML"
# Presumably writes .hocr files named after the "$t/$root" prefix; the
# loop below relies on them sharing a basename with the images.
eeboxmltohocr "$2" "$t/$root" || exit 1

# remove any images that don't have a corresponding hocr
for img in "$t/"*jpg; do
  [ -e "$img" ] || continue
  base=$(basename "$img" .jpg)
  test -f "$t/$base.hocr" || rm -- "$img"
done

echo "Combining images and text into PDF"
hocr-pdf "$t" > "$3" || exit 1

# The temporary directory (images and hocr files) is removed by the EXIT trap.
exit 0