summaryrefslogtreecommitdiff
path: root/eebotopdf.sh
blob: e82fe54a77e0ec2d1fd88ebabb1e7e388c17779c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/sh
# Combine the page images of a PDF with text from an XML file into a new PDF.
#
# Arguments:
#   $1  in.pdf   PDF the page images are extracted from
#   $2  in.xml   XML file the text is extracted from
#   $3  out.pdf  name of the PDF to produce
#
# Exits 1 on a wrong argument count or when a required tool is missing.
usage="Usage: $0 in.pdf in.xml out.pdf"

# Use an explicit if rather than an && chain, so that a failing echo can never
# let the script carry on with the wrong number of arguments.
if [ $# -ne 3 ]; then
	echo "$usage" >&2
	exit 1
fi

# Check for the external tools up front, so a missing one is reported before
# any work is done. Diagnostics go to stderr.
prereqs="eeboxmltohocr hocr-pdf pdfimages"
for i in $prereqs; do
	if ! command -v "$i" > /dev/null ; then
		echo "Error: no $i tool found" >&2
		exit 1
	fi
done

# Pick the image conversion command and store it in $convert. The value may be
# two words ("gm convert"), so it has to be expanded unquoted where it is used.
# Preference order: GraphicsMagick, then ImageMagick's "convert", then
# ImageMagick 7's "magick" for installs that no longer ship "convert".
if command -v gm > /dev/null ; then
	convert="gm convert"
elif command -v convert > /dev/null ; then
	convert="convert"
elif command -v magick > /dev/null ; then
	convert="magick"
else
	echo "Error: no GraphicsMagick or ImageMagick found" >&2
	exit 1
fi

# Build the output PDF from the page images in "$1" and the text in "$2".
#
# All intermediate files live in a private temporary directory that is removed
# when the script exits, whether it succeeds or fails. hocr-pdf is pointed at a
# whole directory, and the pruning step below deletes every jpg that has no
# hocr file, so running these steps in the caller's working directory would
# delete unrelated jpg files there.

# Print "$1" as an absolute path, so it stays valid after changing directory.
abspath() {
	case $1 in
		/*) printf '%s\n' "$1" ;;
		*) printf '%s\n' "$PWD/$1" ;;
	esac
}

root=$(basename "$3" .pdf)
inpdf=$(abspath "$1")
inxml=$(abspath "$2")
# Honour the directory part of the requested output path. The file name is
# always "$root.pdf", so an output name given without .pdf still gets it.
outdir=$(dirname "$3")
outpdf=$(abspath "$outdir/$root.pdf")

workdir=$(mktemp -d "${TMPDIR:-/tmp}/eebotopdf.XXXXXX") || exit 1
trap 'rm -rf "$workdir"' EXIT
# Turn signals into a normal exit so that the EXIT trap still cleans up.
trap 'exit 1' HUP INT TERM
cd "$workdir" || exit 1

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$inpdf" "$root" || exit 1
for img in "$root"*png; do
	# An unmatched glob is left as the literal pattern; report that clearly
	# instead of handing a nonexistent file to the converter.
	if [ ! -e "$img" ]; then
		echo "Error: no images extracted from $1" >&2
		exit 1
	fi
	# $convert is deliberately unquoted: it may hold two words ("gm convert").
	$convert "$img" "${img%.png}.jpg" || exit 1
	rm "$img"
done

echo "Extracting text from XML"
eeboxmltohocr "$inxml" "$root" || exit 1

# remove any images that don't have a corresponding hocr
for img in *jpg; do
	[ -e "$img" ] || continue
	test -f "${img%.jpg}.hocr" || rm "$img"
done

echo "Combining images and text into PDF"
if ! hocr-pdf . > "$outpdf"; then
	# Do not leave a truncated PDF behind.
	rm -f "$outpdf"
	exit 1
fi