blob: e82fe54a77e0ec2d1fd88ebabb1e7e388c17779c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
#!/bin/sh
# Build a PDF with a text layer by combining the page images of an existing
# PDF with text taken from an accompanying XML file.
#
# Usage: <script> in.pdf in.xml out.pdf
#
# Steps:
#   1. pdfimages extracts the images of in.pdf as PNG files.
#   2. GraphicsMagick (preferred) or ImageMagick converts each PNG to JPEG,
#      because the originals aren't JPEG but hocr-pdf requires them.
#   3. eeboxmltohocr produces .hocr files from in.xml.
#   4. hocr-pdf combines the JPEG/.hocr pairs into the output PDF.
#
# The result is written to "<basename of out.pdf without .pdf>.pdf" inside
# the directory part of the third argument.
#
# All intermediate files are kept in a private temporary directory that is
# removed on exit, so files already present in the caller's directory are
# never deleted and never end up in the generated PDF.
#
# Exit status: 0 on success, 1 on any error.

usage="Usage: $0 in.pdf in.xml out.pdf"
if [ "$#" -ne 3 ]; then
  echo "$usage" >&2
  exit 1
fi

for tool in eeboxmltohocr hocr-pdf pdfimages; do
  if ! command -v "$tool" > /dev/null; then
    echo "Error: no $tool tool found" >&2
    exit 1
  fi
done

# Prefer GraphicsMagick, fall back to ImageMagick.
if command -v gm > /dev/null; then
  convert="gm convert"
elif command -v convert > /dev/null; then
  convert="convert"
else
  echo "Error: no GraphicsMagick or ImageMagick found" >&2
  exit 1
fi

# Print $1 as an absolute path (relative paths are resolved against $PWD).
abspath() {
  case $1 in
    /*) printf '%s\n' "$1" ;;
    *) printf '%s\n' "$PWD/$1" ;;
  esac
}

root=$(basename "$3" .pdf)
# Resolve every user-supplied path before changing directory below.
in_pdf=$(abspath "$1")
in_xml=$(abspath "$2")
out_pdf=$(abspath "$(dirname "$3")/$root.pdf")

workdir=$(mktemp -d "${TMPDIR:-/tmp}/hocrpdf.XXXXXX") || {
  echo "Error: could not create temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "$workdir"' EXIT
# Turn signals into a normal exit so the EXIT trap also runs for them.
trap 'exit 1' HUP INT TERM
cd "$workdir" || exit 1

# extract images to png, then convert to jpg, as originals aren't jpg
# but hocr-pdf requires them
echo "Extracting images from original PDF"
pdfimages -png "$in_pdf" "$root" || exit 1
for img in ./*.png; do
  # An unmatched glob stays literal: the PDF contained no images.
  if [ ! -e "$img" ]; then
    echo "Error: no images extracted from $1" >&2
    exit 1
  fi
  # $convert is deliberately unquoted: it may hold "gm convert".
  $convert "$img" "${img%.png}.jpg" || exit 1
  rm -- "$img"
done

echo "Extracting text from XML"
eeboxmltohocr "$in_xml" "$root" || exit 1

# remove any images that don't have a corresponding hocr
for img in ./*.jpg; do
  [ -e "$img" ] || continue
  test -f "${img%.jpg}.hocr" || rm -- "$img"
done

echo "Combining images and text into PDF"
if ! hocr-pdf . > "$out_pdf"; then
  # Do not leave a truncated or partial PDF behind.
  rm -f -- "$out_pdf"
  exit 1
fi
# Intermediate jpg/hocr files are removed together with $workdir by the
# EXIT trap.
|