summary refs log tree commit diff
path: root/traintessv4.sh
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-04-02 15:07:24 +0100
committer Nick White <git@njw.name> 2019-04-02 15:07:24 +0100
commit b1731f0f286dbbfc12b7f152f6554dcdbff85061 (patch)
tree 4e85e3343715d9781641842020b80e6ada0e9ce3 /traintessv4.sh
parent a3ce33af0b4e7ba9b8463443c3ebf4a7797d160a (diff)
Add tesseractv4 training script
Diffstat (limited to 'traintessv4.sh')
-rwxr-xr-x traintessv4.sh 89
1 files changed, 89 insertions, 0 deletions
diff --git a/traintessv4.sh b/traintessv4.sh
new file mode 100755
index 0000000..948397b
--- /dev/null
+++ b/traintessv4.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+usage="Usage: $0 gtdir gtevaldir oldtraineddata trainingname"
+
+test $# -ne 4 && echo "$usage" && exit 1
+
+## Settings ##
+# Positional arguments that the rest of the script refers to by name.
+name=$4
+oldtraining=$3
+
+# Passed to lstmtraining as --max_iterations.
+iterations=10000
+
+# Two alternative training modes; keep exactly one extra/netspec pair active.
+#
+# Mode 1 - retrain only the top layers:
+#extra="--append_index 5"
+#netspec="[Lfx512 O1c1]"
+#
+# Mode 2 (active) - fine-tune the existing layers, reusing the netspec of the
+# existing best/eng model:
+netspec='[1,36,0,1 Ct3,3,16 Mp3,3 Lfys64 Lfx96 Lrx96 Lfx512 O1c1]'
+extra=''
+
+# Create the output directory and record the parameters of this run in
+# "$name/settings". Stop if either step fails: every later step writes
+# beneath "$name".
+mkdir -p "$name" || exit 1
+
+printf 'gtdir: %s\ngtevaldir: %s\noldtraineddata: %s\ntrainingname: %s\nnetspec: %s\niterations: %s\nextra_args: %s\n' \
+ "$1" "$2" "$3" "$4" "$netspec" "$iterations" "$extra" > "$name/settings" || exit 1
+
+
+echo "Copying training ground truth"
+mkdir -p "$name/gt"
+find "$1" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do
+ n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.gt\.txt$/.txt/g'`
+ cp "$i" "$name/gt/$n"
+done
+
+echo "Copying eval ground truth"
+mkdir -p "$name/eval"
+find "$2" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do
+ n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.gt\.txt$/.txt/g'`
+ cp "$i" "$name/eval/$n"
+done
+
+echo "Making box files"
+find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do
+ b=`basename "$i"`
+ d=`dirname "$i"`
+ test -f "$b.tif" && n="$b.tif"
+ test -f "$b.png" && n="$b.png"
+ python generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1
+done
+
+echo "Making unicharset"
+# Build "$name/gt/unicharset" from every box file in "$name/gt".
+if ! unicharset_extractor \
+ --output_unicharset "$name/gt/unicharset" \
+ --norm_mode 2 \
+ "$name/gt/"*box
+then
+ exit 1
+fi
+
+echo "Making lstmf files"
+find "$name/gt" "$name/eval" -type f -name '*box' | while read i; do
+ b=`basename "$i" .box`
+ d=`dirname "$i"`
+ test -f "$d/$b.tif" && n="$b.tif"
+ test -f "$d/$b.png" && n="$b.png"
+ echo "making lstm for $d/$n"
+ tesseract "$d/$n" "$d/$b" --psm 6 lstm.train || exit 1
+done
+
+echo "Listing lstmf files"
+find "$name/gt" -type f -name '*lstmf' > "$name/gt/list"
+find "$name/eval" -type f -name '*lstmf' > "$name/eval/list"
+
+echo "Unpacking old training"
+# Work on a private copy of the old model. The mkdir and cp are checked so
+# that a failed copy cannot leave combine_tessdata unpacking a stale
+# orig.traineddata left behind by an earlier run with the same name.
+mkdir -p "$name/orig" || exit 1
+cp "$oldtraining" "$name/orig/orig.traineddata" || exit 1
+# Unpack the components with the prefix "$name/orig/orig"; later steps read
+# orig.lstm-unicharset and orig.lstm from there.
+combine_tessdata -u "$name/orig/orig.traineddata" "$name/orig/orig" || exit 1
+
+echo "Making complete unicharset"
+# Combine the ground-truth unicharset with the one unpacked from the old
+# model into "$name/unicharset", which the starter step reads.
+merge_unicharsets \
+ "$name/gt/unicharset" \
+ "$name/orig/orig.lstm-unicharset" \
+ "$name/unicharset" \
+ || exit 1
+
+echo "Making starter training"
+# Everything for the starter traineddata lives in one directory, which serves
+# as both script dir and output dir for combine_lang_model.
+starter="$name/starter"
+mkdir -p "$starter"
+
+# radical-stroke.txt is downloaded from the langdata_lstm master branch on
+# every run, so this step needs network access.
+curl -L -f 'https://github.com/tesseract-ocr/langdata_lstm/raw/master/radical-stroke.txt' \
+ > "$starter/radical-stroke.txt" || exit 1
+
+# The training name doubles as the language code of the new model.
+combine_lang_model \
+ --input_unicharset "$name/unicharset" \
+ --script_dir "$starter" \
+ --output_dir "$starter" \
+ --lang "$name" \
+ || exit 1
+
+# Checkpoints are written with the prefix "$name/checkpoint/$name".
+mkdir -p "$name/checkpoint"
+
+# Run the training itself, continuing from the LSTM unpacked from the old
+# model. $extra is deliberately left unquoted: it holds zero or more
+# whitespace-separated options and must expand to nothing when empty.
+if ! lstmtraining \
+ --traineddata "$name/starter/$name/$name.traineddata" \
+ --old_traineddata "$name/orig/orig.traineddata" \
+ --continue_from "$name/orig/orig.lstm" \
+ --net_spec "$netspec" \
+ --model_output "$name/checkpoint/$name" \
+ --learning_rate 20e-4 \
+ --train_listfile "$name/gt/list" \
+ --eval_listfile "$name/eval/list" \
+ --max_iterations "$iterations" \
+ $extra
+then
+ exit 1
+fi