summaryrefslogtreecommitdiff
path: root/dir-to-pdf.sh
blob: 83d2ab34b66c41d9bc38be95ba1c293a1da62322 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/sh
# dir-to-pdf.sh: build a PDF from the page images and hocr files in a
# directory, using hocr-pdf from hocr-tools.
#
# Help text, printed when the script is called with the wrong number of
# arguments. $0 is expanded here, so it shows the name the script was run as.
usage="Usage: $0 indir [pdf]

Creates a PDF from image and hocr files in indir, saving it to pdf, or
indir.pdf if not specified.

The necessary files are first copied to a temporary directory where
they are renamed and reformatted for the hocr-pdf tool from hocr-tools.

The PDF is then created with hocr-pdf, and the temporary directory is
removed."

# Names of every training whose hocr output may exist for a page
TRAININGS="rescribealphav4 rescribealphav5"

# Quality setting handed to the image converter for the JPEG output
QUALITY=0

# Pages are shrunk to RESIZEPERC of their size, i.e. by a factor of
# RESIZEDIV; the two values must describe the same ratio
RESIZEDIV=4
RESIZEPERC=25%

# Resolution before resizing, then scaled down by the same factor
DPI=600
DPI=$((DPI / RESIZEDIV))

# Require one or two arguments (indir and an optional output path);
# otherwise show the usage text on stderr and stop.
# Two separate tests replace the obsolescent "-o" operator, and the if-block
# guarantees the exit happens even when writing the message fails.
if test $# -lt 1 || test $# -gt 2; then
	printf '%s\n' "$usage" >&2
	exit 1
fi

# Pick the image converter: GraphicsMagick when installed, otherwise
# ImageMagick. The value may be two words ("gm convert"), so it has to be
# expanded unquoted where it is run.
if command -v gm > /dev/null 2>&1 ; then
	convert="gm convert"
elif command -v convert > /dev/null 2>&1 ; then
	convert="convert"
else
	echo "Error: no graphicsmagick or imagemagick found" >&2
	exit 1
fi

# hocr-pdf (from hocr-tools) assembles the final PDF; give up early without it.
if ! command -v hocr-pdf > /dev/null 2>&1 ; then
	echo "Error: no hocr-pdf tool found" >&2
	exit 1
fi

# The input directory must exist; report the problem on stderr.
if ! test -d "$1"; then
	printf 'Error: %s does not exist\n' "$1" >&2
	exit 1
fi

# Output path: the second argument when given, otherwise the input directory
# name with one trailing slash removed and ".pdf" appended. Parameter
# expansion is used so the path is never passed through echo or sed, which
# could alter names containing backslashes or starting with a dash.
if test $# -eq 2 ; then
	outfile="$2"
else
	outfile="${1%/}.pdf"
fi

# Create a private scratch directory. mktemp -d both picks the name and
# creates the directory, so no separate mkdir is needed.
tmpdir=$(mktemp -d) || {
	echo "Error: Failed to create temporary directory" >&2
	exit 1
}

# Remove the scratch directory on every exit path, including error exits.
# The signal trap turns HUP/INT/TERM into a normal exit so that the EXIT
# trap also runs when the script is interrupted.
trap 'rm -rf "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

# Announce the work, then work out which kind of page image is present.
# Unpapered pages are preferred; binarized pages are the fallback. The first
# kind with at least one file directly inside the input directory wins.
echo "Copying hocrs and converting pngs from $1 to $tmpdir"
imgsuffix=""
for candidate in ".unpapered.png" ".binarized.png"; do
	n=`find "$1" -maxdepth 1 -type f -name "*$candidate" | wc -l`
	if test $n -gt 0 ; then
		imgsuffix="$candidate"
		break
	fi
done

# Each image kind has its own infix in the matching hocr file names.
case "$imgsuffix" in
.unpapered.png)
	hocrsuffix="_unpapered_"
	;;
.binarized.png)
	hocrsuffix="_"
	;;
*)
	echo "Error: no pages found"
	exit 1
	;;
esac

# Rescale the hOCR bounding boxes of a document read from stdin.
# Arguments: $1 - divisor the page images were shrunk by
# Outputs:   the document on stdout; in every "title=<quote>bbox x1 y1 x2 y2"
#            the four coordinates are divided by $1 and truncated to whole
#            numbers. All other text, including lines without such a bbox,
#            is passed through unchanged.
# Returns:   the exit status of awk
# One awk pass replaces a shell read loop so that backslashes, leading
# whitespace and a final unterminated line in the OCR text survive, and so
# that no helper processes are started per line.
scale_hocr_bboxes() {
	awk -v div="$1" -v sq="'" '
	BEGIN {
		# sq holds a literal single quote, which cannot be typed
		# inside this single-quoted awk program.
		num = " +[0-9.]+"
		pat = "title=[" sq "\"]bbox" num num num num
	}
	{
		out = ""
		rest = $0
		while (match(rest, pat)) {
			# f[1] is the title=<quote>bbox prefix, f[2..5] the coordinates
			split(substr(rest, RSTART, RLENGTH), f)
			out = out substr(rest, 1, RSTART - 1) f[1]
			for (k = 2; k <= 5; k++)
				out = out " " int(f[k] / div)
			rest = substr(rest, RSTART + RLENGTH)
		}
		print out rest
	}'
}

# Convert each page image to a downscaled JPEG and write a matching hocr with
# rescaled boxes into $tmpdir. The loop is the last stage of a pipeline and
# may therefore run in a subshell, so a failure inside it is handed on to the
# main script through the loop exit status checked after "done".
find "$1" -maxdepth 1 -type f -name '*'"$imgsuffix" | while IFS= read -r i; do
	b=$(basename "$i" "$imgsuffix")
	# Strip the image suffix from the end of the path only.
	stem=${i%"$imgsuffix"}

	# Use the hocr of the last training in $TRAININGS that exists for this page.
	hocr=""
	for t in $TRAININGS; do
		n="$stem$hocrsuffix$t.hocr"
		test -f "$n" && hocr="$n"
	done
	if test -z "$hocr"; then
		printf '%s\n' "Warning: no corresponding hocr file found for $i, skipping." >&2
		continue
	fi

	# $convert is deliberately unquoted: it may hold two words ("gm convert").
	$convert -quality "$QUALITY" -geometry "${RESIZEPERC}x${RESIZEPERC}" -density "${DPI}x${DPI}" "$i" "$tmpdir/$b.jpg" || exit 1

	# Adjust the bounding boxes to match the new geometry after resizing
	scale_hocr_bboxes "$RESIZEDIV" < "$hocr" > "$tmpdir/$b.hocr" || exit 1
done || exit 1

# Build the PDF from the prepared image/hocr pairs. The scratch directory is
# removed whether or not hocr-pdf succeeds, so a failed run leaves nothing
# behind; the success message is only printed when the PDF was written.
echo "Creating PDF"
hocr-pdf --savefile "$outfile" "$tmpdir"
status=$?
rm -rf "$tmpdir"
test "$status" -eq 0 || exit 1

echo "Created a PDF at $outfile"