#!/bin/sh
# Detect pages whose chosen ("best") hOCR may have been over-wiped.
#
# For every page that has a '<page>_bin0.2.hocr' file directly inside
# bookdir, the widest ocr_line bounding box is computed for each candidate
# hOCR of that page ("bookdir/<page>*hocr") and for the chosen hOCR in
# "bookdir/best/". A page is reported when the chosen hOCR's widest line
# is more than $margin units narrower than the widest line found in any
# candidate.
#
# Usage:  <script> bookdir
# Output: one line per suspicious page on stdout.
# Exit:   1 (with the usage text on stderr) unless exactly one argument
#         is given.

usage="Usage: $0 bookdir

Detects possibly overwiped pages by looking for pages where the best HOCR
chosen has significantly shorter maximum line length than an alternative
candidate with a different binarisation level."

if test $# -ne 1; then
	printf '%s\n' "$usage" >&2
	exit 1
fi

bookdir=$1

# How much narrower (in bbox units; presumably pixels - confirm against the
# OCR engine's output) the chosen hOCR's widest line must be before the
# page is reported.
margin=100

# Print the width of the widest ocr_line bounding box in hOCR file $1.
# The second double-quote-delimited field of an ocr_line element is taken
# to be its title attribute ("bbox x0 y0 x1 y1; ..."); the width is
# x1 - x0. Prints 0 when the file has no ocr_line elements.
max_line_width() {
	# Read via redirection so that a path containing '=' is never
	# interpreted by awk as a variable assignment operand.
	awk -F '"' '
		/ocr_line/ {
			split($2, props, ";")
			split(props[1], bbox, " ")
			w = bbox[4] - bbox[2]
			if (w > max) max = w
		}
		END { printf("%d\n", max) }
	' < "$1"
}

# Print the variant label of hOCR path $1: the part of the file name after
# the last underscore, without the '.hocr' suffix (e.g. 'bin0.2').
variant_name() {
	label=${1##*/}
	label=${label##*_}
	printf '%s\n' "${label%.hocr}"
}

find "$bookdir" -maxdepth 1 -type f -name '*_bin0.2.hocr' | sort |
while IFS= read -r page; do
	# Page identifier: the file name up to its first underscore.
	n=${page##*/}
	n=${n%%_*}

	# Find the candidate with the widest line.
	maxline=0
	maxlinename=""
	for candidate in "$bookdir/$n"*hocr; do
		test -f "$candidate" || continue
		width=$(max_line_width "$candidate")
		if test "${width:-0}" -gt "$maxline"; then
			maxline=$width
			maxlinename=$(variant_name "$candidate")
		fi
	done

	# skip any missing files in best/
	# (if several files match, the first regular file is used)
	best=""
	for candidate in "$bookdir/best/$n"*hocr; do
		if test -f "$candidate"; then
			best=$candidate
			break
		fi
	done
	test -n "$best" || continue

	bestline=$(max_line_width "$best")
	bestline=${bestline:-0}
	bestlinename=$(variant_name "$best")

	maxlinelimit=$((maxline - margin))
	if test "$bestline" -lt "$maxlinelimit"; then
		printf '%s looks suspicious; we chose an hocr (%s) max line length of %s whereas another version (%s) had max line length of %s\n' \
			"$n" "$bestlinename" "$bestline" "$maxlinename" "$maxline"
	fi
done