summary refs log tree commit diff
path: root/bookgraph
blob: 8010bdc0ba287fb5b2bcb5eab76720bcf205031a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/bin/sh
# bookgraph - plot per-page confidence and word count for a book.
#
# For every *hocr file found directly inside <bookdir>/best this script
# records three tab-separated columns in a temporary data file:
#   1. the page identifier (file name up to the first underscore),
#   2. the value printed by pgconf for that file,
#   3. the number of lines in the file that contain "ocrx_word".
# It then asks gnuplot to render <bookdir>/<bookname>_bookgraph.png, where
# <bookname> is the last component of the resolved bookdir path.
#
# Exit status: 0 on success, 1 on usage errors, missing tools, a missing
# best directory, temporary file failures or a gnuplot failure.
usage="$0 bookdir

Creates a graph showing the average confidence and a (scaled) word
count of each page in a book. The word count is scaled (divided by
10 and plus 50) to make it easy to compare to the confidence, by
generally occupying a similar scale."

if test $# -ne 1; then
	echo "$usage" >&2
	exit 1
fi

# Both external tools are required; fail early rather than half way through.
for tool in pgconf gnuplot; do
	if ! command -v "$tool" > /dev/null; then
		echo "Error: no $tool tool found" >&2
		exit 1
	fi
done

if ! test -d "$1/best"; then
	echo "Error: no best directory found in $1" >&2
	exit 1
fi

# t holds the plot data, gp holds the generated gnuplot script.
t=$(mktemp) || exit 1
gp=$(mktemp) || { rm -f "$t"; exit 1; }

# Remove the temporary files on every exit path; turning signals into a
# normal exit makes sure the EXIT trap runs for them too.
trap 'rm -f "$t" "$gp"' EXIT
trap 'exit 1' HUP INT TERM

# find returns files in arbitrary order, but gnuplot joins the points of a
# "lines" plot in file order, so the rows are sorted numerically by page.
find "$1/best" -maxdepth 1 -type f -name '*hocr' | while IFS= read -r i; do
	c=$(pgconf "$i")
	n=$(basename "$i" .hocr | sed 's/_.*//')
	w=$(grep -c ocrx_word "$i")
	printf '%s\t%d\t%d\n' "$n" "$c" "$w"
done | sort -n > "$t"

r=$(readlink -f "$1")
b=$(basename "$r")

# The format string is single quoted, so $1 and $3 below are gnuplot column
# references, not shell parameters. %% yields a literal % and \\ a literal \.
printf '

set style data lines
set title "Book Confidence for %s"
set xlabel "Page Number"
set ylabel "Page Confidence %%"
set mxtics
set terminal png truecolor size 4600,1700
set output "%s"
plot "%s" using 1:2 with lines title "Confidence",\\
     "" using 1:2:(sprintf("%%d", $1)) with labels point pt 2 notitle,\\
     "" using 1:($3/10+50) with lines title "Number of words (scaled)"
quit
' "$b" "$1/${b}_bookgraph.png" "$t" > "$gp"

if ! gnuplot "$gp"; then
	echo "Error: gnuplot failed to create $1/${b}_bookgraph.png" >&2
	exit 1
fi