summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.name>2019-02-25 17:19:48 +0000
committerNick White <git@njw.name>2019-02-25 17:19:48 +0000
commit0aec35a060a9f9e1e33c18bf0e2af2aafd6a6257 (patch)
tree06681e52bf515814728e87c3971b50b178d9b480
Add various helper scripts
-rw-r--r--allpgsconf.sh40
-rw-r--r--bookgraph.sh34
-rw-r--r--hocrtotxtdir.sh5
-rw-r--r--ocropuspgtxt.sh6
-rw-r--r--pggraph.sh24
-rw-r--r--scrape-bnf.sh17
-rw-r--r--snippets/tessfortesting22
-rw-r--r--sortbucketed.sh3
8 files changed, 151 insertions, 0 deletions
diff --git a/allpgsconf.sh b/allpgsconf.sh
new file mode 100644
index 0000000..60ea4d6
--- /dev/null
+++ b/allpgsconf.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+usage="usage: $0 dir
+
+Calculate the average confidence of each page, saving them in a file
+called hocr-avgpgconf in the book directory."
+test $# -ne 1 && echo "$usage" && exit 1
+
+### calculate averages for hocr (tesseract)
+o="$1/hocr-avgpgconf"
+
+printf '' > "$o"
+
+for i in "$1"/*hocr
+do
+ b=`basename "$i" .hocr`
+
+ pgavg=`avg-lines "$i" 2>/dev/null \
+ | awk -F ':' '{print $2}' \
+ | sed 's/%//g;s/ //g' \
+ | awk '{total += $1; n++} END{if(n > 0) {printf("%.2f\n", total/n)}}'`
+
+ printf '%s\t%.2f\n' "$b" "$pgavg" >> "$o"
+done
+
+### calculate averages for prob (ocropus)
+o="$1/prob-avgpgconf"
+
+printf '' > "$o"
+
+for i in "$1"/????
+do
+ b=`basename "$i"`
+
+ pgavg=`avg-lines "$i"/*prob 2>/dev/null \
+ | awk -F ':' '{print $2}' \
+ | sed 's/%//g;s/ //g' \
+ | awk '{total += $1; n++} END{if(n > 0) {printf("%.2f\n", total/n)}}'`
+
+ printf '%s\t%.2f\n' "$b" "$pgavg" >> "$o"
+done
diff --git a/bookgraph.sh b/bookgraph.sh
new file mode 100644
index 0000000..8113a5e
--- /dev/null
+++ b/bookgraph.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+usage="$0 bookdir
+
+Creates a graph showing the average confidence of each page in a book.
+This relies on the hocr-avgpgconf file being present, which is generated
+by the allpgsconf.sh script"
+
+test $# -ne 1 && echo "$usage" && exit 1
+
+! test -f "$1/hocr-avgpgconf" && echo "No $1/hocr-avgpgconf file found; run allpgsconf.sh before running this" && exit 1
+! test -f "$1/prob-avgpgconf" && echo "No $1/prob-avgpgconf file found; run allpgsconf.sh before running this" && exit 1
+
+t=`mktemp`
+# NOTE: this expects book file naming to be in the format nnnn_sometext
+sed 's/_[^\t]*//g' < "$1/hocr-avgpgconf" > "$t"
+
+b=`basename "$1"`
+
+gp=`mktemp`
+printf 'set style data lines\n' >> "$gp"
+printf 'set title "Book Confidence for %s"\n' "$b" >> "$gp"
+printf 'set xlabel "Page Number"\n' >> "$gp"
+printf 'set ylabel "Page confidence"\n' >> "$gp"
+printf 'set mxtics\n' >> "$gp"
+printf 'set terminal png truecolor size 4600,1700\n' >> "$gp"
+printf 'set output "%s"\n' "$1/bookgraph.png" >> "$gp"
+printf 'plot "%s" using 1:2 with lines title "Tesseract",\\\n' "$t" >> "$gp"
+printf ' "" using 1:2:(sprintf("%%d", $1)) with labels point pt 2 notitle,\\\n' >> "$gp"
+printf ' "%s" using 1:2 with lines title "Ocropus",\\\n' "$1/prob-avgpgconf" >> "$gp"
+printf ' "" using 1:2:(sprintf("%%d", $1)) with labels point pt 2 notitle\n' >> "$gp"
+printf 'quit\n' >> "$gp"
+gnuplot "$gp"
+
+rm -f "$t" "$gp"
diff --git a/hocrtotxtdir.sh b/hocrtotxtdir.sh
new file mode 100644
index 0000000..20c45a5
--- /dev/null
+++ b/hocrtotxtdir.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Convert every hocr file in a directory to plain text, writing page.txt
+# next to each page.hocr
+#
+# The converter defaults to a hocrtotxt build under ~/rescribe; set the
+# HOCRTOTXT environment variable to use a different binary.
+usage="usage: $0 dirname"
+test $# -ne 1 && echo "$usage" >&2 && exit 1
+
+hocrtotxt="${HOCRTOTXT:-$HOME/rescribe/2018-natphil/src/go/bin/hocrtotxt}"
+
+for i in "$1"/*hocr
+do
+	# a glob that matches nothing is left as the literal pattern; skip it
+	test -e "$i" || continue
+
+	b=$(basename "$i" .hocr)
+	d=$(dirname "$i")
+	"$hocrtotxt" "$i" > "$d/$b.txt"
+done
diff --git a/ocropuspgtxt.sh b/ocropuspgtxt.sh
new file mode 100644
index 0000000..ed4b9f3
--- /dev/null
+++ b/ocropuspgtxt.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Join the *txt files inside each four character page directory of an
+# ocropus directory into a single ocrdir/pgname_ocropus.txt, in the order
+# the shell expands the glob.
+usage="usage: $0 ocrdir
+Adds pgname_ocropus.txt files for each page to an ocropus directory"
+test $# -ne 1 && echo "$usage" >&2 && exit 1
+
+for i in "$1"/????
+do
+	# only page directories are of interest; this also skips the literal
+	# pattern left behind when nothing matches, which would otherwise
+	# create a file called ????_ocropus.txt
+	test -d "$i" || continue
+
+	n=$(basename "$i")
+	cat "$i"/*txt > "$1/${n}_ocropus.txt"
+done
diff --git a/pggraph.sh b/pggraph.sh
new file mode 100644
index 0000000..11eb143
--- /dev/null
+++ b/pggraph.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+# Plot the confidence of each line of a page with gnuplot.
+#
+# $1 is the file handed to avg-lines, $2 is the png file to write.
+# Exits non-zero if the temporary files cannot be created or gnuplot fails.
+usage="$0 hocr|prob graph.png"
+
+test $# -ne 2 && echo "$usage" >&2 && exit 1
+
+t=$(mktemp) || exit 1
+gp=
+# remove the temporary files however the script ends
+trap 'rm -f "$t" "$gp"' EXIT
+trap 'exit 1' HUP INT TERM
+gp=$(mktemp) || exit 1
+
+# Split the avg-lines output on ':' into two tab separated columns and
+# drop any '%' signs.
+# TODO: the line_1_ substitution converts hocr line names into line
+# numbers; make it work for .prob files too.
+avg-lines -nosort "$1" \
+	| awk -F ':' '{printf("%s\t%s\n", $1, $2)}' \
+	| sed 's/%//g;s/[^ ]* line_1_//g' > "$t"
+
+printf 'set style data lines\n' >> "$gp"
+printf 'set title "Line confidences for %s"\n' "$1" >> "$gp"
+printf 'set xlabel "Line Number"\n' >> "$gp"
+printf 'set ylabel "Average confidence"\n' >> "$gp"
+printf 'unset key\n' >> "$gp"
+printf 'set terminal png truecolor size 1920,1080\n' >> "$gp"
+printf 'set output "%s"\n' "$2" >> "$gp"
+printf 'plot "%s"\n' "$t" >> "$gp"
+printf 'quit\n' >> "$gp"
+
+# last command, so that its exit status becomes that of the script
+gnuplot "$gp"
diff --git a/scrape-bnf.sh b/scrape-bnf.sh
new file mode 100644
index 0000000..677e4d4
--- /dev/null
+++ b/scrape-bnf.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+usage="Usage: $0 bnfurl"
+
+test $# -ne 1 && echo "$usage" && exit 1
+
+bookid=`echo "$1" |awk -F / '{printf("%s/%s\n", $5, $6)}'`
+bookid_name=`echo "$bookid" | sed 's/\//_/'`
+
+html=`curl -s "https://gallica.bnf.fr/ark:/${bookid}"`
+
+pagenum=`echo "$html" | sed 's/.*nbTotalVues\\\"://g' | sed 's/,.*//'`
+
+for i in `seq "$pagenum"`; do
+ pgname=`printf "%s_%03d" "${bookid_name}" "${i}"`
+ echo "Downloading page $i of $pagenum to ${pgname}.jpg"
+ curl -s "https://gallica.bnf.fr/iiif/ark:/${bookid}/f${i}/full/full/0/native.jpg" > "${pgname}.jpg"
+done
diff --git a/snippets/tessfortesting b/snippets/tessfortesting
new file mode 100644
index 0000000..daec106
--- /dev/null
+++ b/snippets/tessfortesting
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# rename .gt.txt files to .txt
+for i in *gt.txt; do n=`echo "$i"|sed 's/\.gt\.txt/.txt/g'`; mv "$i" "$n"; done
+
+# create boxes for all lines
+for i in *png; do b=`basename "$i" .png`; python ../../generate_line_box.py -i "$b.png" -t "$b.txt" > "$b.box"; done
+
+# back up ground truth .txt (the lstm.train step wipes it)
+for i in *txt; do cp "$i" "$i.bak"; done
+
+# create lstmf counterparts to all files
+for i in *png; do b=`basename "$i" .png`; tesseract "$i" "$b" --psm 6 lstm.train; done
+
+# restore ground truth .txt
+for i in *.bak; do b=`basename "$i" .bak`; mv "$i" "$b"; done
+
+# create list of lstmf files
+ls *lstmf > all-files.txt
+
+# do the evaluation (note: verbosity 2 means that correct lines are printed as well as incorrect ones)
+lstmeval --model ../rescribealphav4.traineddata --eval_listfile all-files.txt --verbosity 2 2>&1 | tee evaluation
diff --git a/sortbucketed.sh b/sortbucketed.sh
new file mode 100644
index 0000000..ba3fbb4
--- /dev/null
+++ b/sortbucketed.sh
@@ -0,0 +1,3 @@
+# Page through the png files of the current directory, sorted by the
+# number in the third underscore separated part of their names. Each
+# output line is the padded number followed by the file name.
+#
+# as filenames aren't necessarily the same length, pad them all to be so (bearing in mind it's right padding, as it's implied that it's 0.x)
+#
+# Only the trailing .png is stripped before splitting: an unescaped,
+# unanchored '.png' with the g flag would also delete any character
+# followed by "png" from the middle of a name.
+
+ls *png|sed 's/\.png$//'|awk -F _ '{p = $3; while(length(p) < 18) {p = p "0"} ; printf("%s %s.png\n", p, $0)}'|sort -n|less