summaryrefslogtreecommitdiff
path: root/dir-to-pdfv3.sh
blob: 4248be94d3b1d7e9f9649a04d6ecdd28b8fe9df3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/sh
# Builds a PDF from a directory of page images and hocr files, using
# GraphicsMagick or ImageMagick to shrink the images and hocr-pdf to
# assemble the result.
#
# $usage is the help text; it is printed when the script is called with
# the wrong number of arguments. $0 is expanded at assignment time, so
# the text shows the name the script was invoked as.
usage="Usage: $0 [-c] indir [pdf]

Creates a PDF from image and hocr files in indir, saving it to pdf, or
indir.pdf if not specified.

The necessary files are first copied to a temporary directory where
they are renamed and reformatted for the hocr-pdf tool from hocr-tools.

The PDF is then created with hocr-pdf, and the temporary directory is
removed.

This is designed to work with files which have gone through the
rescribe.xyz/bookpipeline process, with a 'best' file which lists the
best hocr files for each page.

-c: colour output"

# Resize settings. RESIZEPERC is the scale passed to the image converter
# and RESIZEDIV is the matching divisor (25% == 1/4) applied to hocr
# coordinates and to the resolution; the two must be kept in step.
RESIZEPERC=25%
RESIZEDIV=4

# Compression quality for the converted page images
QUALITY=20

# Density written to the converted images: 600 (presumably the dpi of
# the source scans - confirm) reduced by the resize divisor.
DPI=$((600 / RESIZEDIV))

# Argument handling.
# -c is only recognised as the first argument and only when something
# follows it; it selects colour output and is then shifted away so that
# $1 is the input directory and $2 (optional) the output PDF.
colour=0
if test $# -gt 1 && test "$1" = "-c"; then
	colour=1
	shift
fi

# Exactly one or two arguments must remain. The exit is unconditional so
# that a failed write of the usage text cannot let the script carry on
# with a bad argument count.
if test $# -lt 1 || test $# -gt 2; then
	echo "$usage"
	exit 1
fi

# Choose the image converter. GraphicsMagick wins when both are present
# because its assignment comes last. The value is later expanded
# unquoted so that "gm convert" splits into a command and its argument.
convert=""
command -v convert > /dev/null && convert="convert"
command -v gm > /dev/null && convert="gm convert"
if test -z "$convert"; then
	echo "Error: no graphicksmagick or imagemagick found"
	exit 1
fi

# hocr-pdf is needed to assemble the PDF; stop early if it is missing.
command -v hocr-pdf > /dev/null || {
	echo "Error: no hocr-pdf tool found"
	exit 1
}

# The input directory has to exist before any work is done.
test -d "$1" || {
	echo "Error: $1 does not exist"
	exit 1
}

# Output path: the second argument when one was given, otherwise the
# input directory name with one trailing slash removed and .pdf appended.
case $# in
2)
	outfile="$2"
	;;
*)
	o=$(echo "$1" | sed 's/\/$//')
	outfile="$o.pdf"
	;;
esac

# Scratch directory that receives the resized images and rescaled hocr
# files. mktemp -d creates the directory itself, so no mkdir is needed.
tmpdir=$(mktemp -d) || {
	echo "Error: Failed to create temporary directory"
	exit 1
}

# Remove the scratch directory however the script ends, including the
# early "exit 1" paths taken when a conversion or the PDF build fails.
cleanup_tmpdir() {
	rm -rf "$tmpdir"
}
trap cleanup_tmpdir EXIT
# Not every sh runs the EXIT trap when killed by a signal; turning the
# common termination signals into a plain exit makes sure it does.
trap 'exit 1' HUP INT TERM

# scale_hocr_bboxes DIV
# Filter: copies hocr from stdin to stdout, dividing the four coordinates
# of every title=<quote>bbox x1 y1 x2 y2 attribute by DIV (integer
# division, fractions discarded) so the boxes match an image that was
# shrunk by the same factor. Titles that do not start with bbox, and all
# other text, pass through unchanged (indentation and backslashes too).
scale_hocr_bboxes() {
	awk -v div="$1" -v sq="'" '
	BEGIN {
		# title= then either quote style, then bbox and four integers
		pat = "title=[" sq "\"]bbox +[0-9]+ +[0-9]+ +[0-9]+ +[0-9]+"
	}
	{
		rest = $0
		out = ""
		while (match(rest, pat)) {
			hit = substr(rest, RSTART, RLENGTH)
			# f[1] is the title=...bbox prefix, f[2..5] the coordinates
			split(hit, f, " ")
			out = out substr(rest, 1, RSTART - 1) f[1]
			for (k = 2; k <= 5; k++)
				out = out " " int(f[k] / div)
			rest = substr(rest, RSTART + RLENGTH)
		}
		print out rest
	}'
}

echo "Copying hocrs and converting jpgs from $1 to $tmpdir"
# Each line of the best file names one hocr file inside the input
# directory. The extra test keeps a final line that lacks a newline.
while read -r i || test -n "$i"; do
	if ! test -f "$1/$i"; then
		echo "Warning: no hocr file found for $i, skipping"
		continue
	fi

	# Page name: the hocr name up to (not including) the first _bin
	n=${i%%_bin*}

	b=$(basename "$i" .hocr)
	if test $colour -eq 1; then
		img="$1/$n.jpg"  # colour
	else
		img="$1/$b.png" # binarised
	fi
	if ! test -f "$img"; then
		echo "Warning: no image found for hocr file $i, skipping"
		continue
	fi

	# $convert is deliberately unquoted: "gm convert" must split in two.
	$convert -quality $QUALITY -geometry ${RESIZEPERC}x${RESIZEPERC} -density ${DPI}x${DPI} "$img" "$tmpdir/$n.jpg" || exit 1

	# Adjust the bounding boxes to match the new geometry after resizing
	scale_hocr_bboxes "$RESIZEDIV" < "$1/$i" > "$tmpdir/$n.hocr" || exit 1
done < "$1/best"

# Hand the prepared directory to hocr-pdf; give up if it fails.
echo "Creating PDF"
if ! hocr-pdf --savefile "$outfile" "$tmpdir"; then
	exit 1
fi
echo "Created a PDF at $outfile"

# The scratch directory is no longer needed once the PDF exists.
rm -rf "$tmpdir"