blob: 7fac4c09fa2609a0179be783a2e8f4b478a596c2 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
#!/bin/sh
# Combine the page images of a PDF with text taken from an XML file into a
# new PDF.
#
# Arguments:
#   $1  input PDF whose page images are extracted with pdfimages
#   $2  input XML, converted to hOCR with eeboxmltohocr
#   $3  output PDF, written by hocr-pdf
#
# Requires eeboxmltohocr, hocr-pdf, pdfimages and either GraphicsMagick (gm)
# or ImageMagick (convert) on PATH.
# Exits 0 on success and 1 on a usage error, a missing tool or a failed step.
usage="Usage: $0 in.pdf in.xml out.pdf"
if test $# -ne 3; then
  echo "$usage" >&2
  exit 1
fi

for tool in eeboxmltohocr hocr-pdf pdfimages; do
  if ! command -v "$tool" > /dev/null; then
    echo "Error: no $tool tool found" >&2
    exit 1
  fi
done

# Wrap the image converter in a function rather than a string variable: a
# quoted "$convert" holding "gm convert" would be looked up as one command
# name (and fail), and POSIX sh has no arrays to hold the two words.
if command -v gm > /dev/null; then
  convert_image() { gm convert "$@"; }
elif command -v convert > /dev/null; then
  convert_image() { convert "$@"; }
else
  echo "Error: no graphicksmagick or imagemagick found" >&2
  exit 1
fi

root=$(basename "$3" .pdf)
t=$(mktemp -d) || exit 1

# Remove the working directory on every exit path, including the early
# "exit 1" branches below. The signal trap turns HUP/INT/TERM into a normal
# exit so that the EXIT trap also runs in shells that skip it on signals.
trap 'rm -rf -- "$t"' EXIT
trap 'exit 1' HUP INT TERM

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$1" "$t/$root" || exit 1
for png in "$t/$root"*png; do
  # An unmatched glob is left as the literal pattern; report that clearly
  # instead of handing a nonexistent path to the converter.
  if ! test -f "$png"; then
    echo "Error: no images extracted from $1" >&2
    exit 1
  fi
  b=${png##*/}
  b=${b%.png}
  convert_image "$png" "$t/$b.jpg" || exit 1
  rm "$png"
done

echo "Extracting text from XML"
eeboxmltohocr "$2" "$t/$root" || exit 1

# remove any images that don't have a corresponding hocr
for jpg in "$t/"*jpg; do
  test -f "$jpg" || continue
  b=${jpg##*/}
  b=${b%.jpg}
  test -f "$t/$b.hocr" || rm "$jpg"
done

echo "Combining images and text into PDF"
hocr-pdf "$t" > "$3" || exit 1
# No explicit cleanup here: the EXIT trap deletes "$t" with all remaining
# jpg and hocr files, and the script's exit status stays that of hocr-pdf.
|