summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-06-11 18:21:02 +0100
committer Nick White <git@njw.name> 2019-06-11 18:21:02 +0100
commit 79f89cfc67b4bf07885d89403a6905d903394d9b (patch)
tree a551c9ea7ac4b0de4cd294074461631e4a925a0b
parent 1efa16179c8d405903dc56fca7245d3205ebb07e (diff)
Add checkoverwiping script
-rw-r--r-- checkoverwiping.sh 36
1 files changed, 36 insertions, 0 deletions
diff --git a/checkoverwiping.sh b/checkoverwiping.sh
new file mode 100644
index 0000000..a1c7d9c
--- /dev/null
+++ b/checkoverwiping.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+# checkoverwiping.sh - report pages whose chosen ("best") hOCR looks overwiped.
+#
+# For every page with a <page>_bin0.2.hocr file directly inside bookdir, the
+# widest ocr_line of each <page>*hocr candidate in bookdir is compared with
+# the widest ocr_line of that page's file in bookdir/best/. A page is
+# reported on stdout when the best file's widest line is more than 100 bbox
+# units (presumably pixels) narrower than the widest candidate line.
+usage="Usage: $0 bookdir
+
+Detects possibly overwiped pages by looking for pages where the best
+HOCR chosen has significantly shorter maximum line length than an
+alternative candidate with a different binarisation level."
+# Exactly one argument is required. The usage text is a diagnostic, so it
+# goes to stderr and never mixes with the report on stdout.
+if test $# -ne 1; then
+  echo "$usage" >&2
+  exit 1
+fi
+
+# maxlinewidth FILE
+# Prints the largest ocr_line width in FILE as an integer (0 if none found).
+# For each line containing "ocr_line", the second double-quote-delimited
+# field is cut at its first ";" and the width is its 4th word minus its 2nd,
+# i.e. x1 - x0 when that field looks like "bbox x0 y0 x1 y1; ..." (assumed
+# hOCR title layout - confirm against the OCR engine's output).
+maxlinewidth() {
+  awk -F '"' '/ocr_line/ {print $2}' "$1" | sed 's/;.*//' |
+    awk '{w = $4 - $2; if (w > max) {max = w}} END {printf("%d\n", max)}'
+}
+
+# levelname FILE
+# Prints the part of FILE's name after its last "_", minus a trailing ".hocr"
+# (for example "bin0.2" for "0001_bin0.2.hocr").
+levelname() {
+  basename "$1" .hocr | sed 's/.*_//'
+}
+
+find "$1" -maxdepth 1 -type f -name '*_bin0.2.hocr' | sort | while IFS= read -r i; do
+  # The page name is the file name up to its first "_".
+  n=$(basename "$i" .hocr | sed 's/_.*//')
+
+  # Find the widest line over all of this page's candidates. The glob is
+  # anchored to bookdir so the script works from any working directory.
+  maxline=0
+  maxlinename=""
+  for f in "$1/$n"*hocr; do
+    test -f "$f" || continue # an unmatched glob stays as the literal pattern
+    line=$(maxlinewidth "$f")
+    if test "$line" -gt "$maxline"; then
+      maxline=$line
+      maxlinename=$(levelname "$f")
+    fi
+  done
+
+  # skip any missing files in best/; if several match, use the first
+  best=""
+  for f in "$1/best/$n"*hocr; do
+    if test -f "$f"; then
+      best=$f
+      break
+    fi
+  done
+  test -n "$best" || continue
+
+  bestline=$(maxlinewidth "$best")
+  bestlinename=$(levelname "$best")
+
+  # Only a shortfall of more than 100 counts as suspicious.
+  if test "$bestline" -lt $((maxline - 100)); then
+    echo "$n looks suspicious; we chose an hocr ($bestlinename) max line length of $bestline whereas another version ($maxlinename) had max line length of $maxline"
+  fi
+done