summaryrefslogtreecommitdiff
path: root/checkoverwiping.sh
blob: a8bb3b5dcb899afc6f988f73cf4591f64ec7603a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/sh
# checkoverwiping.sh - report pages whose chosen ("best") hOCR has a much
# shorter longest line than another binarisation of the same page.
#
# Takes exactly one argument: the book directory to inspect.
usage="Usage: $0 bookdir

Detects possibly overwiped pages by looking for pages where the best
HOCR chosen has significantly shorter maximum line length than an
alternative candidate with a different binarisation level."

# Wrong argument count: print the usage text and stop.  The text goes to
# stderr so it can never be mistaken for the report written to stdout,
# and the exit is unconditional rather than depending on the print
# succeeding.
if test $# -ne 1; then
	printf '%s\n' "$usage" >&2
	exit 1
fi

# max_line_width FILE...
#
# Prints the largest ocr_line width found in the given hOCR file(s), or 0
# when no ocr_line is found.  The width is x1 - x0 of the line's bbox.
# The bbox is taken from the first double-quoted string on every line
# that mentions ocr_line, which is expected to look like
# "bbox x0 y0 x1 y1; ...".
max_line_width() {
	awk -F '"' '/ocr_line/ {print $2}' "$@" \
		| sed 's/;.*//g' \
		| awk '{w=$4 - $2; if(w > max){max=w}} END {printf("%d\n", max)}'
}

# binarisation_name FILE
#
# Prints the part of FILE's base name that follows the last underscore,
# with a trailing ".hocr" removed (e.g. dir/0001_bin0.4.hocr -> bin0.4).
binarisation_name() {
	_bn=${1##*/}
	_bn=${_bn##*_}
	printf '%s\n' "${_bn%.hocr}"
}

# check_overwiping BOOKDIR
#
# For every BOOKDIR/<page>_bin0.2.hocr, finds the widest line among all
# BOOKDIR/<page>_*.hocr candidates and compares it with the widest line
# of the page's file in BOOKDIR/best/.  A page is reported on stdout when
# the best file's widest line is more than $margin narrower than the
# widest candidate.  Pages with no file in best/ are skipped silently.
check_overwiping() {
	bookdir=$1
	margin=100

	# IFS= and -r keep backslashes and surrounding blanks in paths intact.
	find "$bookdir" -maxdepth 1 -type f -name '*_bin0.2.hocr' | sort | while IFS= read -r i; do
		# Page identifier: base name up to (not including) the first "_".
		n=${i##*/}
		n=${n%%_*}

		maxline=0
		maxlinename=""
		# The "_" after the page id stops page "1" from also matching
		# "10_...", "11_..." and so on.
		for f in "$bookdir/${n}_"*.hocr; do
			test -f "$f" || continue
			line=$(max_line_width "$f")
			if test "$line" -gt "$maxline"; then
				maxline=$line
				maxlinename=$(binarisation_name "$f")
			fi
		done

		# skip any missing files in best/
		# A loop is used instead of "test -f <glob>" because the latter is
		# a usage error as soon as the glob matches more than one file.
		best=""
		for f in "$bookdir/best/${n}_"*.hocr; do
			if test -f "$f"; then
				best=$f
				break
			fi
		done
		test -n "$best" || continue

		bestline=$(max_line_width "$best")
		bestlinename=$(binarisation_name "$best")

		if test "$bestline" -lt "$((maxline - margin))"; then
			printf '%s\n' "$n looks suspicious; we chose an hocr ($bestlinename) max line length of $bestline whereas another version ($maxlinename) had max line length of $maxline"
		fi
	done
}

check_overwiping "$1"