diff options
-rw-r--r-- | allpgsconf.sh | 40 | ||||
-rw-r--r-- | bookgraph.sh | 34 | ||||
-rw-r--r-- | hocrtotxtdir.sh | 5 | ||||
-rw-r--r-- | ocropuspgtxt.sh | 6 | ||||
-rw-r--r-- | pggraph.sh | 24 | ||||
-rw-r--r-- | scrape-bnf.sh | 17 | ||||
-rw-r--r-- | snippets/tessfortesting | 22 | ||||
-rw-r--r-- | sortbucketed.sh | 3 |
8 files changed, 151 insertions, 0 deletions
diff --git a/allpgsconf.sh b/allpgsconf.sh
new file mode 100644
index 0000000..60ea4d6
--- /dev/null
+++ b/allpgsconf.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+usage="usage: $0 dir
+
+Calculate the average confidence of each page, saving them in a file
+called hocr-avgpgconf in the book directory."
+test $# -ne 1 && echo "$usage" && exit 1
+
+### calculate averages for hocr (tesseract)
+o="$1/hocr-avgpgconf"
+
+printf '' > "$o"
+
+for i in "$1"/*hocr
+do
+ b=`basename "$i" .hocr`
+
+ pgavg=`avg-lines "$i" 2>/dev/null \
+ | awk -F ':' '{print $2}' \
+ | sed 's/%//g;s/ //g' \
+ | awk '{total += $1; n++} END{if(n > 0) {printf("%.2f\n", total/n)}}'`
+
+ printf '%s\t%.2f\n' "$b" "$pgavg" >> "$o"
+done
+
+### calculate averages for prob (ocropus)
+o="$1/prob-avgpgconf"
+
+printf '' > "$o"
+
+for i in "$1"/????
+do
+ b=`basename "$i"`
+
+ pgavg=`avg-lines "$i"/*prob 2>/dev/null \
+ | awk -F ':' '{print $2}' \
+ | sed 's/%//g;s/ //g' \
+ | awk '{total += $1; n++} END{if(n > 0) {printf("%.2f\n", total/n)}}'`
+
+ printf '%s\t%.2f\n' "$b" "$pgavg" >> "$o"
+done
diff --git a/bookgraph.sh b/bookgraph.sh
new file mode 100644
index 0000000..8113a5e
--- /dev/null
+++ b/bookgraph.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+usage="$0 bookdir
+
+Creates a graph showing the average confidence of each page in a book.
+This relies on the hocr-avgpgconf file being present, which is generated
+by the allpgsconf.sh script"
+
+test $# -ne 1 && echo "$usage" && exit 1
+
+! test -f "$1/hocr-avgpgconf" && echo "No $1/hocr-avgpgconf file found; run allpgsconf.sh before running this" && exit 1
+! test -f "$1/prob-avgpgconf" && echo "No $1/prob-avgpgconf file found; run allpgsconf.sh before running this" && exit 1
+
+t=`mktemp`
+# NOTE: this expects book file naming to be in the format nnnn_sometext
+sed 's/_[^\t]*//g' < "$1/hocr-avgpgconf" > "$t"
+
+b=`basename "$1"`
+
+gp=`mktemp`
+printf 'set style data lines\n' >> "$gp"
+printf 'set title "Book Confidence for %s"\n' "$b" >> "$gp"
+printf 'set xlabel "Page Number"\n' >> "$gp"
+printf 'set ylabel "Page confidence"\n' >> "$gp"
+printf 'set mxtics\n' >> "$gp"
+printf 'set terminal png truecolor size 4600,1700\n' >> "$gp"
+printf 'set output "%s"\n' "$1/bookgraph.png" >> "$gp"
+printf 'plot "%s" using 1:2 with lines title "Tesseract",\\\n' "$t" >> "$gp"
+printf ' "" using 1:2:(sprintf("%%d", $1)) with labels point pt 2 notitle,\\\n' >> "$gp"
+printf ' "%s" using 1:2 with lines title "Ocropus",\\\n' "$1/prob-avgpgconf" >> "$gp"
+printf ' "" using 1:2:(sprintf("%%d", $1)) with labels point pt 2 notitle\n' >> "$gp"
+printf 'quit\n' >> "$gp"
+gnuplot "$gp"
+
+rm -f "$t" "$gp"
diff --git a/hocrtotxtdir.sh b/hocrtotxtdir.sh
new file mode 100644
index 0000000..20c45a5
--- /dev/null
+++ b/hocrtotxtdir.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+usage="usage: $0 dirname"
+test $# -ne 1 && echo "$usage" && exit 1
+
+for i in "$1"/*hocr; do b=`basename "$i" .hocr`; d=`dirname "$i"`; ~/rescribe/2018-natphil/src/go/bin/hocrtotxt "$i" > "$d/$b.txt"; done
diff --git a/ocropuspgtxt.sh b/ocropuspgtxt.sh
new file mode 100644
index 0000000..ed4b9f3
--- /dev/null
+++ b/ocropuspgtxt.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+usage="usage: $0 ocrdir
+Adds pgname_ocropus.txt files for each page to an ocropus directory"
+test $# -ne 1 && echo "$usage" && exit 1
+
+for i in "$1"/????; do n=`basename "$i"`; cat "$i"/*txt > "$1/${n}_ocropus.txt"; done
diff --git a/pggraph.sh b/pggraph.sh
new file mode 100644
index 0000000..11eb143
--- /dev/null
+++ b/pggraph.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+usage="$0 hocr|prob graph.png"
+
+test $# -ne 2 && echo "$usage" && exit 1
+
+t=`mktemp`
+
+# TODO: the line_1_ part is changes formatting from hocr line naming to
+# line numbers. make it work for .prob stuff too.
+avg-lines -nosort "$1" | awk -F ':' '{printf("%s\t%s\n", $1, $2)}' | sed 's/%//g' | sed 's/[^ ]* line_1_//g' > "$t"
+
+gp=`mktemp`
+printf 'set style data lines\n' >> "$gp"
+printf 'set title "Line confidences for %s"\n' "$1" >> "$gp"
+printf 'set xlabel "Line Number"\n' >> "$gp"
+printf 'set ylabel "Average confidence"\n' >> "$gp"
+printf 'unset key\n' >> "$gp"
+printf 'set terminal png truecolor size 1920,1080\n' >> "$gp"
+printf 'set output "%s"\n' "$2" >> "$gp"
+printf 'plot "%s"\n' "$t" >> "$gp"
+printf 'quit\n' >> "$gp"
+gnuplot "$gp"
+
+rm -f "$t" "$gp"
diff --git a/scrape-bnf.sh b/scrape-bnf.sh
new file mode 100644
index 0000000..677e4d4
--- /dev/null
+++ b/scrape-bnf.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+usage="Usage: $0 bnfurl"
+
+test $# -ne 1 && echo "$usage" && exit 1
+
+bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'`
+bookid_name=`echo "$bookid" | sed 's/\//_/'`
+
+html=`curl -s "https://gallica.bnf.fr/ark:/${bookid}"`
+
+pagenum=`echo "$html" | sed 's/.*nbTotalVues\\\"://g' | sed 's/,.*//'`
+
+for i in `seq "$pagenum"`; do
+ pgname=`printf "%s_%03d" "${bookid_name}" "${i}"`
+ echo "Downloading page $i of $pagenum to ${pgname}.jpg"
+ curl -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+done
diff --git a/snippets/tessfortesting b/snippets/tessfortesting
new file mode 100644
index 0000000..daec106
--- /dev/null
+++ b/snippets/tessfortesting
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# rename .gt.txt files to .txt
+for i in *gt.txt; do n=`echo "$i"|sed 's/\.gt\.txt/.txt/g'`; mv "$i" "$n"; done
+
+# create boxes for all lines
+for i in *png; do b=`basename "$i" .png`; python ../../generate_line_box.py -i "$b.png" -t "$b.txt" > "$b.box"; done
+
+# back up ground truth .txt (the lstm.train step wipes it)
+for i in *txt; do cp "$i" "$i.bak"; done
+
+# create lstmf counterparts to all files
+for i in *png; do b=`basename "$i" .png`; tesseract "$i" "$b" --psm 6 lstm.train; done
+
+# restore ground truth .txt
+for i in *.bak; do b=`basename "$i" .bak`; mv "$i" "$b"; done
+
+# create list of lstmf files
+ls *lstmf > all-files.txt
+
+# do the evaluation (note: verbosity 2 means that correct lines are printed as well as incorrect ones)
+lstmeval --model ../rescribealphav4.traineddata --eval_listfile all-files.txt --verbosity 2 2>&1 | tee evaluation
diff --git a/sortbucketed.sh b/sortbucketed.sh
new file mode 100644
index 0000000..ba3fbb4
--- /dev/null
+++ b/sortbucketed.sh
@@ -0,0 +1,3 @@
+# as filenames aren't necessarily the same length, pad them all to be so (bearing in mind it's right padding, as it's implied that it's 0.x)
+
+ls *png|sed 's/.png//g'|awk -F _ '{p = $3; while(length(p) < 18) {p = p "0"} ; printf("%s %s.png\n", p, $0)}'|sort -n|less