#!/bin/sh
# Build a PDF that combines the page images of an existing PDF with text
# taken from an XML file.
#
# Usage: <this script> in.pdf in.xml out.pdf
#
# Steps:
#   1. pdfimages extracts the images of in.pdf as PNG files.
#   2. GraphicsMagick (preferred) or ImageMagick converts each PNG to JPEG,
#      because the originals aren't JPEG but hocr-pdf requires them.
#   3. eeboxmltohocr is run on in.xml; it is expected to write .hocr files
#      whose base names match the extracted images.
#   4. Images without a matching .hocr file are dropped and hocr-pdf merges
#      the remaining images and text into out.pdf.
#
# All intermediate files are created in a private temporary directory that
# is removed on exit, so nothing in the caller's working directory is
# touched apart from out.pdf itself.
#
# Exit status: 0 on success, 1 on any error. Diagnostics go to stderr.

usage="Usage: $0 in.pdf in.xml out.pdf"

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print $1 as an absolute path (relative paths are resolved against the
# current directory). Needed because the work happens after a cd.
abspath() {
  case $1 in
    /*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "$PWD/$1" ;;
  esac
}

[ "$#" -eq 3 ] || die "$usage"

for tool in eeboxmltohocr hocr-pdf pdfimages; do
  command -v "$tool" > /dev/null || die "Error: no $tool tool found"
done

# Wrap the available image converter in a function instead of relying on
# word-splitting of a "gm convert" string.
if command -v gm > /dev/null; then
  to_jpg() { gm convert "$@"; }
elif command -v convert > /dev/null; then
  to_jpg() { convert "$@"; }
else
  die "Error: no graphicsmagick or imagemagick found"
fi

[ -r "$1" ] || die "Error: cannot read $1"
[ -r "$2" ] || die "Error: cannot read $2"

in_pdf=$(abspath "$1")
in_xml=$(abspath "$2")
out_pdf=$(abspath "$3")
# Prefix shared by all intermediate files (out.pdf -> out).
root=$(basename -- "$3" .pdf)

workdir=$(mktemp -d) || die "Error: could not create temporary directory"
cleanup() {
  # Leave the directory before deleting it.
  cd / 2> /dev/null
  rm -rf -- "$workdir"
}
trap cleanup EXIT
# Turn fatal signals into a normal exit so the EXIT trap always runs.
trap 'exit 1' HUP INT TERM

cd "$workdir" || die "Error: cannot enter $workdir"

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$in_pdf" "$root" || exit 1
for img in "$root"*.png; do
  # An unmatched glob stays literal; that means nothing was extracted.
  [ -e "$img" ] || die "Error: no images found in $1"
  to_jpg "$img" "${img%.png}.jpg" || exit 1
  rm -- "$img"
done

echo "Extracting text from XML"
eeboxmltohocr "$in_xml" "$root" || exit 1

# remove any images that don't have a corresponding hocr
# (only files inside the private work directory are considered)
for img in *.jpg; do
  [ -e "$img" ] || continue
  [ -f "${img%.jpg}.hocr" ] || rm -- "$img"
done

echo "Combining images and text into PDF"
if ! hocr-pdf . > "$out_pdf"; then
  # Do not leave a truncated PDF behind.
  rm -f -- "$out_pdf"
  die "Error: hocr-pdf failed"
fi