summary | refs | log | tree | commit | diff
path: root/bookgraph
diff options
context:
space:
mode:
Diffstat (limited to 'bookgraph')
-rwxr-xr-x bookgraph 45
1 files changed, 45 insertions, 0 deletions
diff --git a/bookgraph b/bookgraph
new file mode 100755
index 0000000..8010bdc
--- /dev/null
+++ b/bookgraph
@@ -0,0 +1,45 @@
+#!/bin/sh
+usage="$0 bookdir
+
+Creates a graph showing the average confidence and a (scaled) word
+count of each page in a book. The word count is scaled (divided by
+10 and plus 50) to make it easy to compare to the confidence, by
+generally occupying a similar scale."
+
+test $# -ne 1 && echo "$usage" && exit 1
+
+if ! command -v pgconf > /dev/null ; then
+ echo "Error: no pgconf tool found"
+ exit 1
+fi
+
+t=`mktemp`
+
+find "$1/best" -maxdepth 1 -type f -name '*hocr' | while read i; do
+ c=`pgconf "$i"`
+ n=`basename "$i" .hocr | sed 's/_.*//'`
+ w=`grep ocrx_word "$i" | wc -l`
+ printf '%s\t%d\t%d\n' "$n" "$c" "$w" >> "$t"
+done
+
+r=`readlink -f "$1"`
+b=`basename "$r"`
+
+gp=`mktemp`
+printf '
+
+set style data lines
+set title "Book Confidence for %s"
+set xlabel "Page Number"
+set ylabel "Page Confidence %%"
+set mxtics
+set terminal png truecolor size 4600,1700
+set output "%s"
+plot "%s" using 1:2 with lines title "Confidence",\\
+ "" using 1:2:(sprintf("%%d", $1)) with labels point pt 2 notitle,\\
+ "" using 1:($3/10+50) with lines title "Number of words (scaled)"
+quit
+' "$b" "$1/${b}_bookgraph.png" "$t" >> "$gp"
+gnuplot "$gp"
+
+rm -f "$t" "$gp"