From eb20477b97f441abfa2fcbb146240a6799b59d0a Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 11 Feb 2020 12:14:23 +0000 Subject: Rename traintessv5.sh to traintess.sh --- traintess.sh | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ traintessv5.sh | 116 --------------------------------------------------------- 2 files changed, 116 insertions(+), 116 deletions(-) create mode 100755 traintess.sh delete mode 100755 traintessv5.sh diff --git a/traintess.sh b/traintess.sh new file mode 100755 index 0000000..24c83d3 --- /dev/null +++ b/traintess.sh @@ -0,0 +1,116 @@ +#!/bin/sh +usage="Usage: $0 gtdir gtevaldir oldtraineddata trainingname" + +test $# -ne 4 && echo "$usage" && exit 1 + +## Settings ## +# This retrains the top layers +#extra="--append_index 5" +#netspec="[Lfx512 O1c1]" +# This fine-tunes the existing layers (copying the existing best/eng netspec) +netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys64 Lfx96 Lrx96 Lfx512 O1c1]" +extra="" +# (copied from the fast/fra netspec [the best/fra one is absent from the version string]) +#netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx128 O1c1]" +iterations=10000 + +oldtraining="$3" +name="$4" + +mkdir -p "$name" + +if test ! -f "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata"; then + echo "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata not found, needed for lstmf generation, bailing" + exit 1 +fi + +commit=`cd "$1" && git log -n 1|awk '/^commit/ {print $2}'` +printf 'gtdir: %s\ngtdir commit: %s\ngtevaldir: %s\noldtraineddata: %s\ntrainingname: %s\nnetspec: %s\niterations: %s\nextra_args: %s\n' \ + "$1" "$commit" "$2" "$3" "$4" "$netspec" $iterations "$extra" > "$name/settings" + + +echo "Copying training ground truth" +mkdir -p "$name/gt" +find "$1" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do + n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'` + cp "$i" "$name/gt/$n" +done + +echo "Copying eval ground truth" +mkdir -p "$name/eval" +find "$2" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do + n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'` + cp "$i" "$name/eval/$n" +done + +echo "Making box files" +find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do + b=`basename "$i" .txt` + d=`dirname "$i"` + n="" + test -f "$d/$b.tif" && n="$b.tif" + test -f "$d/$b.png" && n="$b.png" + test -z "$n" && echo "Skipping $i as no corresponding image found" && continue + test -f "$d/$b.box" && echo "Skipping $i as box file already present" && continue + python ~/bigboy/othertools/generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1 +done + +echo "Making unicharset" +unicharset_extractor --output_unicharset "$name/gt/unicharset" --norm_mode 2 "$name/gt/"*box || exit 1 + +echo "Making lstmf files" +find "$name/gt" "$name/eval" -type f -name '*box' | while read i; do + b=`basename "$i" .box` + d=`dirname "$i"` + test -f "$d/$b.tif" && n="$b.tif" + test -f "$d/$b.png" && n="$b.png" + echo "making lstm for $d/$n" + tesseract "$d/$n" "$d/$b" --psm 6 lstm.train || exit 1 +done + +echo "Listing lstmf files" +find "$name/gt" -type f -name '*lstmf' > "$name/gt/list" +find "$name/eval" -type f -name '*lstmf' > "$name/eval/list" + +echo "Unpacking old training" +mkdir -p "$name/orig" +cp "$oldtraining" "$name/orig/orig.traineddata" +combine_tessdata -u "$name/orig/orig.traineddata" "$name/orig/orig" || exit 1 + +echo "Making complete unicharset" +merge_unicharsets "$name/gt/unicharset" "$name/orig/orig.lstm-unicharset" "$name/unicharset" || exit 1 + +echo "Making starter training" +mkdir -p "$name/starter" +curl -L -f 'https://github.com/tesseract-ocr/langdata_lstm/raw/master/radical-stroke.txt' > "$name/starter/radical-stroke.txt" || exit 1 +combine_lang_model --input_unicharset "$name/unicharset" --script_dir "$name/starter" --output_dir "$name/starter" --lang "$name" || exit 1 + +mkdir -p "$name/checkpoint" + +echo "Starting training" +lstmtraining \ + --traineddata "$name/starter/$name/$name.traineddata" \ + --old_traineddata "$name/orig/orig.traineddata" \ + --continue_from "$name/orig/orig.lstm" \ + --net_spec "$netspec" \ + --model_output "$name/checkpoint/$name" \ + --learning_rate 20e-4 \ + --train_listfile "$name/gt/list" \ + --eval_listfile "$name/eval/list" \ + --max_iterations $iterations \ + $extra || exit 1 + +echo "Saving training" +lstmtraining \ + --stop_training \ + --continue_from "$name/checkpoint/${name}_checkpoint" \ + --traineddata "$name/starter/$name/$name.traineddata" \ + --model_output "$name/$name.traineddata" + +echo "Saving fast version of training" +lstmtraining \ + --stop_training \ + --model_output "$name/${name}_fast.traineddata" \ + --continue_from "$name/checkpoint/${name}_checkpoint" \ + --traineddata "$name/starter/$name/$name.traineddata" \ + --convert_to_int true diff --git a/traintessv5.sh b/traintessv5.sh deleted file mode 100755 index 24c83d3..0000000 --- a/traintessv5.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/sh -usage="Usage: $0 gtdir gtevaldir oldtraineddata trainingname" - -test $# -ne 4 && echo "$usage" && exit 1 - -## Settings ## -# This retrains the top layers -#extra="--append_index 5" -#netspec="[Lfx512 O1c1]" -# This fine-tunes the existing layers (copying the existing best/eng netspec) -netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys64 Lfx96 Lrx96 Lfx512 O1c1]" -extra="" -# (copied from the fast/fra netspec [the best/fra one is absent from the version string]) -#netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx128 O1c1]" -iterations=10000 - -oldtraining="$3" -name="$4" - -mkdir -p "$name" - -if test ! -f "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata"; then - echo "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata not found, needed for lstmf generation, bailing" - exit 1 -fi - -commit=`cd "$1" && git log -n 1|awk '/^commit/ {print $2}'` -printf 'gtdir: %s\ngtdir commit: %s\ngtevaldir: %s\noldtraineddata: %s\ntrainingname: %s\nnetspec: %s\niterations: %s\nextra_args: %s\n' \ - "$1" "$commit" "$2" "$3" "$4" "$netspec" $iterations "$extra" > "$name/settings" - - -echo "Copying training ground truth" -mkdir -p "$name/gt" -find "$1" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do - n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'` - cp "$i" "$name/gt/$n" -done - -echo "Copying eval ground truth" -mkdir -p "$name/eval" -find "$2" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do - n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'` - cp "$i" "$name/eval/$n" -done - -echo "Making box files" -find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do - b=`basename "$i" .txt` - d=`dirname "$i"` - n="" - test -f "$d/$b.tif" && n="$b.tif" - test -f "$d/$b.png" && n="$b.png" - test -z "$n" && echo "Skipping $i as no corresponding image found" && continue - test -f "$d/$b.box" && echo "Skipping $i as box file already present" && continue - python ~/bigboy/othertools/generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1 -done - -echo "Making unicharset" -unicharset_extractor --output_unicharset "$name/gt/unicharset" --norm_mode 2 "$name/gt/"*box || exit 1 - -echo "Making lstmf files" -find "$name/gt" "$name/eval" -type f -name '*box' | while read i; do - b=`basename "$i" .box` - d=`dirname "$i"` - test -f "$d/$b.tif" && n="$b.tif" - test -f "$d/$b.png" && n="$b.png" - echo "making lstm for $d/$n" - tesseract "$d/$n" "$d/$b" --psm 6 lstm.train || exit 1 -done - -echo "Listing lstmf files" -find "$name/gt" -type f -name '*lstmf' > "$name/gt/list" -find "$name/eval" -type f -name '*lstmf' > "$name/eval/list" - -echo "Unpacking old training" -mkdir -p "$name/orig" -cp "$oldtraining" "$name/orig/orig.traineddata" -combine_tessdata -u "$name/orig/orig.traineddata" "$name/orig/orig" || exit 1 - -echo "Making complete unicharset" -merge_unicharsets "$name/gt/unicharset" "$name/orig/orig.lstm-unicharset" "$name/unicharset" || exit 1 - -echo "Making starter training" -mkdir -p "$name/starter" -curl -L -f 'https://github.com/tesseract-ocr/langdata_lstm/raw/master/radical-stroke.txt' > "$name/starter/radical-stroke.txt" || exit 1 -combine_lang_model --input_unicharset "$name/unicharset" --script_dir "$name/starter" --output_dir "$name/starter" --lang "$name" || exit 1 - -mkdir -p "$name/checkpoint" - -echo "Starting training" -lstmtraining \ - --traineddata "$name/starter/$name/$name.traineddata" \ - --old_traineddata "$name/orig/orig.traineddata" \ - --continue_from "$name/orig/orig.lstm" \ - --net_spec "$netspec" \ - --model_output "$name/checkpoint/$name" \ - --learning_rate 20e-4 \ - --train_listfile "$name/gt/list" \ - --eval_listfile "$name/eval/list" \ - --max_iterations $iterations \ - $extra || exit 1 - -echo "Saving training" -lstmtraining \ - --stop_training \ - --continue_from "$name/checkpoint/${name}_checkpoint" \ - --traineddata "$name/starter/$name/$name.traineddata" \ - --model_output "$name/$name.traineddata" - -echo "Saving fast version of training" -lstmtraining \ - --stop_training \ - --model_output "$name/${name}_fast.traineddata" \ - --continue_from "$name/checkpoint/${name}_checkpoint" \ - --traineddata "$name/starter/$name/$name.traineddata" \ - --convert_to_int true -- cgit v1.2.1-24-ge1ad