blob: a8bb3b5dcb899afc6f988f73cf4591f64ec7603a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
|
#!/bin/sh
# Help text, printed whenever the script is not given exactly one argument.
usage="Usage: $0 bookdir
Detects possibly overwiped pages by looking for pages where the best
HOCR chosen has significantly shorter maximum line length than an
alternative candidate with a different binarisation level."

# The book directory is the one and only accepted argument.
case $# in
  1) ;;
  *) echo "$usage" && exit 1 ;;
esac
# max_line_width FILE
# Prints the width in pixels of the widest ocr_line element in an hOCR file.
# The width is taken from the first double-quoted attribute on each line that
# mentions ocr_line, which is expected to look like "bbox x0 y0 x1 y1; ...":
# everything from the first ';' on is dropped and x1 - x0 is computed.
# Prints 0 when the file contains no such line.
max_line_width() {
  awk -F '"' '
    /ocr_line/ {
      bbox = $2
      sub(/;.*/, "", bbox)
      split(bbox, part, " ")
      width = part[4] - part[2]
      if (width > max) max = width
    }
    END { printf("%d\n", max) }
  ' "$1"
}

# check_overwiped BOOKDIR [MARGIN]
# For every page that has a BOOKDIR/<page>_bin0.2.hocr file, compares the
# widest line of the hOCR chosen in BOOKDIR/best/ against the widest line of
# all candidate hOCRs for that page in BOOKDIR. A page is reported on stdout
# when the chosen hOCR is more than MARGIN pixels (default 100) narrower than
# the widest candidate. Pages with no file in best/ are skipped.
check_overwiped() {
  dir=$1
  margin=${2:-100}

  find "$dir" -maxdepth 1 -type f -name '*_bin0.2.hocr' | sort |
  while IFS= read -r first; do
    # Page id: the file name up to (not including) the first underscore.
    n=${first##*/}
    n=${n%.hocr}
    n=${n%%_*}

    maxline=0
    maxlinename=""
    for f in "$dir/$n"*hocr; do
      test -f "$f" || continue
      stem=${f##*/}
      stem=${stem%.hocr}
      # The glob matches on a bare prefix, so page "1" would also pick up
      # "10_...": only keep files whose page id is exactly $n.
      test "${stem%%_*}" = "$n" || continue
      line=$(max_line_width "$f")
      if test "${line:-0}" -gt "$maxline"; then
        maxline=$line
        # Candidate label: the part of the name after the last underscore.
        maxlinename=${stem##*_}
      fi
    done

    # Locate the chosen hOCR for this page, again requiring an exact page id
    # so that several prefix matches cannot confuse the existence check.
    best=""
    for f in "$dir/best/$n"*hocr; do
      test -f "$f" || continue
      stem=${f##*/}
      stem=${stem%.hocr}
      test "${stem%%_*}" = "$n" || continue
      best=$f
      break
    done
    # skip any missing files in best/
    test -n "$best" || continue

    bestline=$(max_line_width "$best")
    bestline=${bestline:-0}
    stem=${best##*/}
    stem=${stem%.hocr}
    bestlinename=${stem##*_}

    if test "$bestline" -lt "$((maxline - margin))"; then
      printf '%s\n' "$n looks suspicious; we chose an hocr ($bestlinename) max line length of $bestline whereas another version ($maxlinename) had max line length of $maxline"
    fi
  done
}

check_overwiped "$1"
|