blob: 83d2ab34b66c41d9bc38be95ba1c293a1da62322 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
#!/bin/sh
usage="Usage: $0 indir [pdf]
Creates a PDF from image and hocr files in indir, saving it to pdf, or
indir.pdf if not specified.
The necessary files are first copied to a temporary directory where
they are renamed and reformatted for the hocr-pdf tool from hocr-tools.
The PDF is then created with hocr-pdf, and the temporary directory is
removed."

# Training names to probe for when looking up a page's hocr file
TRAININGS="rescribealphav4 rescribealphav5"

# Resize settings: the percentage given to the image converter and the
# matching integer divisor (25% corresponds to dividing by 4)
RESIZEPERC=25%
RESIZEDIV=4

# Image compression quality
QUALITY=0

# DPI before resizing, then scaled down by the same divisor as the images
DPI=600
DPI=$((DPI / RESIZEDIV))
# Exactly one or two arguments are accepted: the input directory and an
# optional output path. Anything else prints the usage text and fails.
if test $# -lt 1 || test $# -gt 2 ; then
    printf '%s\n' "$usage" >&2
    exit 1
fi

# Pick the image converter: "gm convert" when gm is installed, otherwise
# plain "convert". $convert is expanded unquoted later so that the
# two-word form splits into command and subcommand.
if command -v gm > /dev/null ; then
    convert="gm convert"
elif command -v convert > /dev/null ; then
    convert="convert"
else
    echo "Error: no graphicksmagick or imagemagick found" >&2
    exit 1
fi

# hocr-pdf performs the final PDF assembly, so it is mandatory.
if ! command -v hocr-pdf > /dev/null ; then
    echo "Error: no hocr-pdf tool found" >&2
    exit 1
fi

if ! test -d "$1"; then
    printf 'Error: %s does not exist\n' "$1" >&2
    exit 1
fi

# Output path: the second argument when given, otherwise the input
# directory name (minus one trailing slash) with ".pdf" appended.
# Parameter expansion is used instead of echo | sed so that backslashes
# in the directory name are never reinterpreted.
if test $# -eq 2 ; then
    outfile="$2"
else
    outfile="${1%/}.pdf"
fi
# Create the scratch directory that will hold the converted images and
# the rescaled hocr files. mktemp -d creates the directory itself, so no
# separate mkdir is needed.
if ! tmpdir=$(mktemp -d) ; then
    echo "Error: Failed to create temporary directory" >&2
    exit 1
fi

# Remove the scratch directory on every exit path of the main shell, not
# only after a successful run.
cleanup_tmpdir() {
    rm -rf -- "$tmpdir"
}
trap cleanup_tmpdir EXIT
# Turn HUP/INT/TERM into a normal exit so the EXIT handler also runs when
# the script is interrupted.
trap 'exit 1' HUP INT TERM
echo "Copying hocrs and converting pngs from $1 to $tmpdir"

# Succeeds when directory $1 directly contains at least one regular file
# whose name ends in $2.
has_pages() {
    find "$1" -maxdepth 1 -type f -name "*$2" | grep -q .
}

# Decide which set of page images to use. Unpapered images take
# precedence over binarized ones. hocrsuffix is the text that sits
# between a page's base name and the training name in its hocr filename.
if has_pages "$1" '.unpapered.png' ; then
    imgsuffix=".unpapered.png"
    hocrsuffix="_unpapered_"
elif has_pages "$1" '.binarized.png' ; then
    imgsuffix=".binarized.png"
    hocrsuffix="_"
else
    echo "Error: no pages found" >&2
    # The temporary directory already exists at this point; do not leave
    # it behind when giving up.
    rm -rf -- "$tmpdir"
    exit 1
fi
# Read hocr text on stdin and write it to stdout with every coordinate
# group of the form  title='bbox x1 y1 x2 y2  (single or double quote)
# integer-divided by $1, truncating toward zero. All other text, including
# leading whitespace and backslashes, is passed through unchanged. A bbox
# that does not directly follow title= is left untouched.
scale_hocr_bboxes() {
    awk -v divisor="$1" -v sq="'" '
        BEGIN {
            num = " +-?[0-9]+"
            pat = "title=[\"" sq "]bbox" num num num num
        }
        {
            rest = $0
            out = ""
            while (match(rest, pat)) {
                hit = substr(rest, RSTART, RLENGTH)
                # f[1] is the title=...bbox prefix, f[2..5] the coordinates
                split(hit, f, " ")
                out = out substr(rest, 1, RSTART - 1) f[1]
                for (k = 2; k <= 5; k++)
                    out = out " " int(f[k] / divisor)
                rest = substr(rest, RSTART + RLENGTH)
            }
            print out rest
        }
    '
}

# For every page image: find its hocr file, write a resized jpg and a
# hocr with matching bounding boxes into $tmpdir under the same base name.
find "$1" -maxdepth 1 -type f -name '*'"$imgsuffix" | while IFS= read -r page; do
    b=$(basename "$page" "$imgsuffix")
    # Path of the page without its image suffix; stripped literally rather
    # than through a regex so dots in the suffix cannot match other text.
    stem=${page%"$imgsuffix"}

    # Probe each training name in order; the last existing file wins.
    hocr=""
    for t in $TRAININGS; do
        candidate="$stem$hocrsuffix$t.hocr"
        if test -f "$candidate"; then
            hocr="$candidate"
        fi
    done
    if test -z "$hocr"; then
        echo "Warning: no corresponding hocr file found for $page, skipping." >&2
        continue
    fi

    # $convert is deliberately unquoted: it may hold "gm convert".
    $convert -quality "$QUALITY" -geometry "${RESIZEPERC}x${RESIZEPERC}" \
        -density "${DPI}x${DPI}" "$page" "$tmpdir/$b.jpg" || exit 1

    # Adjust the bounding boxes to match the new geometry after resizing
    scale_hocr_bboxes "$RESIZEDIV" < "$hocr" > "$tmpdir/$b.hocr" || exit 1
done || {
    # The loop runs in a pipeline subshell, so its "exit 1" only ends that
    # subshell; propagate the failure here instead of building a PDF from
    # an incomplete set of pages.
    rm -rf -- "$tmpdir"
    exit 1
}
echo "Creating PDF"
# Assemble the PDF from the files prepared in $tmpdir. If hocr-pdf fails,
# still remove the temporary directory before exiting with an error.
if ! hocr-pdf --savefile "$outfile" "$tmpdir" ; then
    rm -rf -- "$tmpdir"
    exit 1
fi
echo "Created a PDF at $outfile"
rm -rf -- "$tmpdir"
|