#!/bin/sh
# eebotopdf.sh - combine the page images of a PDF with the text of an XML
# file into a new PDF.
#
# Provenance: commit 1efa16179c8d405903dc56fca7245d3205ebb07e,
# "Add eebotopdf script", Nick White, Tue, 11 Jun 2019 17:14:52 +0100.
#
# Usage: eebotopdf.sh in.pdf in.xml out.pdf
#
# Steps:
#   1. pdfimages extracts the page images from in.pdf as PNG
#   2. GraphicsMagick or ImageMagick converts each PNG to JPEG
#   3. eeboxmltohocr writes hOCR files from in.xml
#   4. JPEGs with no matching .hocr file are dropped
#   5. hocr-pdf combines the remaining images and hOCR into the output PDF
#
# All intermediate files live in a private temporary directory (created by
# mktemp, so it honours TMPDIR) which is removed on exit. Nothing in the
# caller's working directory is read, matched by a glob, or deleted.
#
# The output is written to <dir of out.pdf>/<basename of out.pdf, minus any
# .pdf suffix>.pdf, so the result always carries a .pdf extension.
#
# Exit status: 0 on success, 1 on usage error, missing tool, or failure of
# any step.

usage="Usage: $0 in.pdf in.xml out.pdf"

if test $# -ne 3; then
	echo "$usage" >&2
	exit 1
fi

prereqs="eeboxmltohocr hocr-pdf pdfimages"
for tool in $prereqs; do
	if ! command -v "$tool" > /dev/null; then
		echo "Error: no $tool tool found" >&2
		exit 1
	fi
done

# Prefer GraphicsMagick, fall back to ImageMagick.
if command -v gm > /dev/null; then
	convert="gm convert"
elif command -v convert > /dev/null; then
	convert="convert"
else
	echo "Error: no graphicsmagick or imagemagick found" >&2
	exit 1
fi

# The work happens after a cd into the temporary directory, so make every
# user-supplied path absolute first.
case $1 in
	/*) inpdf=$1 ;;
	*) inpdf=$PWD/$1 ;;
esac
case $2 in
	/*) inxml=$2 ;;
	*) inxml=$PWD/$2 ;;
esac

# root is the filename prefix shared by all intermediate files.
root=$(basename -- "$3" .pdf)
outdir=$(dirname -- "$3")
case $outdir in
	/*) ;;
	*) outdir=$PWD/$outdir ;;
esac
out=$outdir/$root.pdf

tmpdir=$(mktemp -d) || {
	echo "Error: could not create temporary directory" >&2
	exit 1
}
# Clean up on every exit path; turn common signals into a normal exit so
# the EXIT trap also runs in shells that do not fire it on signals.
trap 'rm -rf -- "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

cd "$tmpdir" || exit 1

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$inpdf" "$root" || exit 1
for png in "$root"*png; do
	# an unmatched glob stays literal; skip it
	test -e "$png" || continue
	base=$(basename -- "$png" .png)
	# $convert is deliberately unquoted: it may be the two words "gm convert"
	$convert "$png" "$base.jpg" || exit 1
	rm -- "$png"
done

echo "Extracting text from XML"
eeboxmltohocr "$inxml" "$root" || exit 1

# remove any images that don't have a corresponding hocr
for jpg in "$root"*jpg; do
	test -e "$jpg" || continue
	base=$(basename -- "$jpg" .jpg)
	test -f "$base.hocr" || rm -- "$jpg"
done

echo "Combining images and text into PDF"
hocr-pdf . > "$out" || exit 1