#!/bin/sh
# checkoverwiping.sh - report pages whose chosen hOCR may have been overwiped.
#
# Provenance (from the original patch header):
#   Commit:  79f89cfc67b4bf07885d89403a6905d903394d9b
#   Author:  Nick White
#   Date:    Tue, 11 Jun 2019 18:21:02 +0100
#   Subject: Add checkoverwiping script
#
# Usage: checkoverwiping.sh bookdir
#
# For every page that has a "<page>_bin0.2.hocr" file directly inside
# bookdir, the widest ocr_line of each hOCR candidate for that page is
# compared with the widest ocr_line of the page's file in bookdir/best/.
# A page is reported on stdout when the best/ file's widest line is more
# than $margin narrower than the widest line of any candidate.
#
# Exit status: 1 on wrong usage or when bookdir is not a directory.

usage="Usage: $0 bookdir

Detects possibly overwiped pages by looking for pages where the best
HOCR chosen has significantly shorter maximum line length than an
alternative candidate with a different binarisation level."

# How much narrower (in bbox units) the chosen hOCR's widest line may be
# than the widest candidate before the page is reported.
margin=100

# page_id FILE
# Print the page identifier of FILE: its basename without ".hocr", cut at
# the first underscore ("dir/0001_bin0.2.hocr" -> "0001").
page_id() {
  pi_name=${1##*/}
  pi_name=${pi_name%.hocr}
  printf '%s\n' "${pi_name%%_*}"
}

# variant_name FILE
# Print the label after the last underscore of FILE's basename, without
# the ".hocr" suffix ("dir/0001_bin0.2.hocr" -> "bin0.2").
variant_name() {
  vn_name=${1##*/}
  vn_name=${vn_name%.hocr}
  printf '%s\n' "${vn_name##*_}"
}

# max_line_width FILE...
# Print the largest ocr_line width found in the given hOCR file(s), or 0
# when there is none. On each line mentioning ocr_line, the first
# double-quoted string is taken, everything from its first ";" is dropped,
# and the width is field 4 minus field 2 of what remains.
# NOTE(review): this assumes the first double-quoted attribute on such a
# line is title="bbox x0 y0 x1 y1; ..." - confirm against the hOCR producer.
max_line_width() {
  awk -F '"' '
    /ocr_line/ {
      split($2, props, ";")
      split(props[1], box, " ")
      width = box[4] - box[2]
      if (width > max) max = width
    }
    END { printf("%d\n", max) }
  ' "$@"
}

if test $# -ne 1; then
  printf '%s\n' "$usage" >&2
  exit 1
fi

bookdir=$1
# Keep find(1) from parsing a directory name that starts with "-" as an option.
case $bookdir in
  -*) bookdir=./$bookdir ;;
esac

if ! test -d "$bookdir"; then
  printf '%s: not a directory: %s\n' "$0" "$bookdir" >&2
  exit 1
fi

find "$bookdir" -maxdepth 1 -type f -name '*_bin0.2.hocr' | sort |
while IFS= read -r seed; do
  page=$(page_id "$seed")

  # Widest line over every candidate hOCR of this page inside bookdir.
  maxline=0
  maxlinename=""
  for candidate in "$bookdir/$page"*.hocr; do
    # Skip the literal pattern left behind by an unmatched glob.
    test -f "$candidate" || continue
    # The glob is a plain prefix match; drop files of other pages that
    # merely start with the same characters (e.g. 0001 vs 00010).
    test "$(page_id "$candidate")" = "$page" || continue

    line=$(max_line_width "$candidate")
    if test "$line" -gt "$maxline"; then
      maxline=$line
      maxlinename=$(variant_name "$candidate")
    fi
  done

  # Locate this page's file in best/; skip the page when there is none.
  best=""
  for candidate in "$bookdir/best/$page"*.hocr; do
    test -f "$candidate" || continue
    test "$(page_id "$candidate")" = "$page" || continue
    best=$candidate
    break
  done
  test -n "$best" || continue

  bestline=$(max_line_width "$best")
  bestlinename=$(variant_name "$best")

  maxlinelimit=$((maxline - margin))

  if test "$bestline" -lt "$maxlinelimit"; then
    printf '%s\n' "$page looks suspicious; we chose an hocr ($bestlinename) max line length of $bestline whereas another version ($maxlinename) had max line length of $maxline"
  fi
done