diff options
author | Nick White <git@njw.name> | 2019-06-11 18:21:02 +0100 |
---|---|---|
committer | Nick White <git@njw.name> | 2019-06-11 18:21:02 +0100 |
commit | 79f89cfc67b4bf07885d89403a6905d903394d9b (patch) | |
tree | a551c9ea7ac4b0de4cd294074461631e4a925a0b | |
parent | 1efa16179c8d405903dc56fca7245d3205ebb07e (diff) |
Add checkoverwiping script
-rw-r--r-- | checkoverwiping.sh | 36 |
1 files changed, 36 insertions, 0 deletions
#!/bin/sh
# checkoverwiping.sh - report pages whose chosen hOCR may have been overwiped.
#
# For every page that has a "<page>_bin0.2.hocr" file directly inside bookdir,
# the widest ocr_line of each "<page>_*hocr" candidate in bookdir is compared
# with the widest ocr_line of the page's hOCR in bookdir/best/. If the best
# file's widest line is more than 100 units narrower than the widest candidate
# line, the page is reported on stdout.
#
# Pages with no matching file in bookdir/best/ are silently skipped.
usage="Usage: $0 bookdir

Detects possibly overwiped pages by looking for pages where the best
HOCR chosen has significantly shorter maximum line length than an
alternative candidate with a different binarisation level."

if test $# -ne 1; then
  echo "$usage"
  exit 1
fi

bookdir=$1

# Print the largest ocr_line width (x1 - x0 of its bbox) found in hOCR file $1.
# Prints 0 when the file contains no ocr_line entries.
maxlinewidth() {
  # Field 2 when splitting on '"' is the first double-quoted attribute value
  # on the line, expected to look like "bbox x0 y0 x1 y1; ...". Everything
  # from the first ';' on is dropped, leaving x0 in $2 and x1 in $4.
  awk -F '"' '/ocr_line/ {print $2}' "$1" \
    | sed 's/;.*//g' \
    | awk '{w=$4 - $2; if(w > max){max=w}} END {printf("%d\n", max)}'
}

# Print the part of hOCR file name $1 after its last underscore, with the
# ".hocr" suffix removed (e.g. "bin0.2" for "0001_bin0.2.hocr").
hocrlabel() {
  basename "$1" | sed 's/.*_//g;s/\.hocr$//'
}

find "$bookdir" -maxdepth 1 -type f -name '*_bin0.2.hocr' | sort \
  | while IFS= read -r i; do
  # Page identifier: the file name up to (not including) its first underscore.
  n=$(basename "$i" .hocr | sed 's/_.*//')

  maxline=0
  maxlinename=""
  # Look for candidates inside bookdir (not the current directory), and
  # require the underscore after the page id so that page "1" does not also
  # pick up the files of page "10".
  for f in "$bookdir/$n"_*hocr; do
    # Skips the literal pattern left behind when the glob matches nothing.
    test -f "$f" || continue
    line=$(maxlinewidth "$f")
    if test "${line:-0}" -gt "$maxline"; then
      maxline=$line
      maxlinename=$(hocrlabel "$f")
    fi
  done

  # skip any missing files in best/
  best=""
  for f in "$bookdir/best/$n"_*hocr; do
    test -f "$f" || continue
    # Use the first match; a multi-file glob would make a bare "test -f" fail.
    best=$f
    break
  done
  test -n "$best" || continue

  bestline=$(maxlinewidth "$best")
  bestlinename=$(hocrlabel "$best")

  # A page is suspicious when its chosen hOCR's widest line is more than
  # 100 units shorter than the widest line among all candidates.
  maxlinelimit=$((maxline - 100))

  if test "${bestline:-0}" -lt "$maxlinelimit"; then
    echo "$n looks suspicious; we chose an hocr ($bestlinename) max line length of $bestline whereas another version ($maxlinename) had max line length of $maxline"
  fi
done