summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-06-11 17:14:52 +0100
committer Nick White <git@njw.name> 2019-06-11 17:14:52 +0100
commit 1efa16179c8d405903dc56fca7245d3205ebb07e (patch)
tree 67d5e68e8e5a88d9d534e6c2e41f4eccd174a2ea
parent 24f05bf6afb368ef5c88f510f6d3f98f2a42759f (diff)
Add eebotopdf script
-rw-r--r-- eebotopdf.sh 46
1 files changed, 46 insertions, 0 deletions
diff --git a/eebotopdf.sh b/eebotopdf.sh
new file mode 100644
index 0000000..e82fe54
--- /dev/null
+++ b/eebotopdf.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+usage="Usage: $0 in.pdf in.xml out.pdf"
+
+test $# -ne 3 && echo "$usage" && exit 1
+
+prereqs="eeboxmltohocr hocr-pdf pdfimages"
+for i in $prereqs; do
+ if ! command -v $i > /dev/null ; then
+ echo "Error: no $i tool found"
+ exit 1
+ fi
+done
+
+if command -v gm > /dev/null ; then
+ convert="gm convert"
+elif command -v convert > /dev/null ; then
+ convert="convert"
+else
+ echo "Error: no graphicksmagick or imagemagick found"
+ exit 1
+fi
+
+root=`basename "$3" .pdf`
+
+# extract images to png, then convert to jpg, as originals aren't jpg
+# but hocr-pdf requires them
+echo "Extracting images from original PDF"
+pdfimages -png "$1" "$root" || exit 1
+for i in "$root"*png; do
+ b=`basename "$i" .png`
+ $convert "$i" "$b.jpg" || exit 1
+ rm "$i"
+done
+
+echo "Extracting text from XML"
+eeboxmltohocr "$2" "$root" || exit 1
+
+# remove any images that don't have a corresponding hocr
+for i in *jpg; do
+ b=`basename "$i" .jpg`
+ test -f "$b.hocr" || rm "$i"
+done
+
+echo "Combining images and text into PDF"
+hocr-pdf . > "$root.pdf" || exit 1
+rm "$root"*jpg "$root"*hocr