summary refs log tree commit diff
path: root/eebotopdf.sh
diff options
context:
space:
mode:
Diffstat (limited to 'eebotopdf.sh')
-rw-r--r-- eebotopdf.sh 46
1 files changed, 46 insertions, 0 deletions
diff --git a/eebotopdf.sh b/eebotopdf.sh
new file mode 100644
index 0000000..e82fe54
--- /dev/null
+++ b/eebotopdf.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+usage="Usage: $0 in.pdf in.xml out.pdf"
+
+test $# -ne 3 && echo "$usage" && exit 1
+
+prereqs="eeboxmltohocr hocr-pdf pdfimages"
+for i in $prereqs; do
+ if ! command -v $i > /dev/null ; then
+ echo "Error: no $i tool found"
+ exit 1
+ fi
+done
+
+if command -v gm > /dev/null ; then
+ convert="gm convert"
+elif command -v convert > /dev/null ; then
+ convert="convert"
+else
+ echo "Error: no graphicksmagick or imagemagick found"
+ exit 1
+fi
+
+root=`basename "$3" .pdf`
+
+# extract images to png, then convert to jpg, as originals aren't jpg
+# but hocr-pdf requires them
+echo "Extracting images from original PDF"
+pdfimages -png "$1" "$root" || exit 1
+for i in "$root"*png; do
+ b=`basename "$i" .png`
+ $convert "$i" "$b.jpg" || exit 1
+ rm "$i"
+done
+
+echo "Extracting text from XML"
+eeboxmltohocr "$2" "$root" || exit 1
+
+# remove any images that don't have a corresponding hocr
+for i in *jpg; do
+ b=`basename "$i" .jpg`
+ test -f "$b.hocr" || rm "$i"
+done
+
+echo "Combining images and text into PDF"
+hocr-pdf . > "$root.pdf" || exit 1
+rm "$root"*jpg "$root"*hocr