summaryrefslogtreecommitdiff
path: root/eebotopdf.sh
blob: 7fac4c09fa2609a0179be783a2e8f4b478a596c2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/bin/sh
# Takes three arguments: in.pdf in.xml out.pdf
# This first part validates the argument count and makes sure every external
# tool the script relies on is installed before any work starts.
usage="Usage: $0 in.pdf in.xml out.pdf"

# Exactly three arguments are required; anything else prints the usage line.
case $# in
	3) ;;
	*) echo "$usage" && exit 1 ;;
esac

# Bail out on the first required tool that is not on PATH.
for tool in eeboxmltohocr hocr-pdf pdfimages; do
	command -v $tool > /dev/null && continue
	echo "Error: no $tool tool found"
	exit 1
done

# Choose the image converter. The GraphicsMagick assignment comes last so
# that it wins when both it and ImageMagick are installed.
convert=""
command -v convert > /dev/null && convert="convert"
command -v gm > /dev/null && convert="gm convert"
if test -z "$convert"; then
        echo "Error: no graphicksmagick or imagemagick found"
        exit 1
fi

# Base name of the output PDF (without .pdf); used as the filename prefix for
# every intermediate file written to the scratch directory.
root=$(basename "$3" .pdf)

# Private scratch directory for the extracted images and generated hocr files.
t=$(mktemp -d) || exit 1

# Remove the scratch directory on every exit path, including the early
# "exit 1" calls below. The signal trap turns HUP/INT/TERM into a normal
# exit so that the EXIT trap also runs when the script is interrupted.
trap 'rm -rf -- "$t"' EXIT
trap 'exit 1' HUP INT TERM

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$1" "$t/$root" || exit 1
for i in "$t/$root"*png; do
	# An unmatched glob is left as the literal pattern; skip it instead of
	# handing a non-existent path to the converter.
	test -e "$i" || continue
	b=$(basename "$i" .png)
	# $convert is intentionally unquoted: it may hold two words
	# ("gm convert") which must be split into command and subcommand.
	$convert "$i" "$t/$b.jpg" || exit 1
	rm "$i"
done

echo "Extracting text from XML"
eeboxmltohocr "$2" "$t/$root" || exit 1

# remove any images that don't have a corresponding hocr
for i in "$t/"*jpg; do
	# Nothing to prune if the glob matched no files.
	test -e "$i" || continue
	b=$(basename "$i" .jpg)
	test -f "$t/$b.hocr" || rm "$i"
done

echo "Combining images and text into PDF"
hocr-pdf "$t" > "$3" || exit 1
# Intermediate jpg/hocr files live in $t and are removed by the EXIT trap,
# so the script finishes with the exit status of hocr-pdf (0 on success).