#!/bin/sh
usage="Usage: $0 gtdir gtevaldir oldtraineddata trainingname"
# TODO: add timestamps for log
# TODO: ensure exit 1 is gone where it caused failures
test $# -ne 4 && echo "$usage" && exit 1

## Settings ##

# This retrains the top layers
#extra="--append_index 5"
#netspec="[Lfx512 O1c1]"

# This fine-tunes the existing layers (copying the existing best/eng netspec)
netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys64 Lfx96 Lrx96 Lfx512 O1c1]"
extra=""

# (copied from the fast/fra netspec [the best/fra one is absent from the version string])
#netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx128 O1c1]"

iterations=100000

oldtraining="$3"
name="$4"
here=`dirname "$0"`

mkdir -p "$name"

if test ! -f "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata"; then
  echo "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata not found, needed for lstmf generation, bailing"
  exit 1
fi

# Record the settings used for this run, including the ground truth commit
commit=`cd "$1" && git log -n 1 | awk '/^commit/ {print $2}'`

printf 'gtdir: %s\ngtdir commit: %s\ngtevaldir: %s\noldtraineddata: %s\ntrainingname: %s\nnetspec: %s\niterations: %s\nextra_args: %s\n' \
  "$1" "$commit" "$2" "$3" "$4" "$netspec" "$iterations" "$extra" > "$name/settings"

# Copy ground truth, normalising .bin.png/.bin.txt/.gt.txt names to plain .png/.txt
echo "Copying training ground truth"
mkdir -p "$name/gt"
find "$1" -type f \( -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' \) | while read i; do
  n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'`
  cp "$i" "$name/gt/$n"
done

echo "Copying eval ground truth"
mkdir -p "$name/eval"
find "$2" -type f \( -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' \) | while read i; do
  n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'`
  cp "$i" "$name/eval/$n"
done

# Generate a box file for every transcription that has a matching image
echo "Making box files"
find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do
  b=`basename "$i" .txt`
  d=`dirname "$i"`
  n=""
  test -f "$d/$b.tif" && n="$b.tif"
  test -f "$d/$b.png" && n="$b.png"
  test -z "$n" && echo "Skipping $i as no corresponding image found" && continue
  test -f "$d/$b.box" && echo "Skipping $i as box file already present" && continue
  python3 "${here}/generate_line_box.py" -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1
done

echo "Making unicharset"
unicharset_extractor --output_unicharset "$name/gt/unicharset" --norm_mode 2 "$name/gt/"*box || exit 1

# tesseract in lstm.train mode writes a .lstmf file next to each image
echo "Making lstmf files"
find "$name/gt" "$name/eval" -type f -name '*box' | while read i; do
  b=`basename "$i" .box`
  d=`dirname "$i"`
  n=""
  test -f "$d/$b.tif" && n="$b.tif"
  test -f "$d/$b.png" && n="$b.png"
  test -z "$n" && echo "Skipping $i as no corresponding image found" && continue

  echo "making lstm for $d/$n"
  tesseract "$d/$n" "$d/$b" --psm 6 lstm.train
done

echo "Listing lstmf files"
find "$name/gt" -type f -name '*lstmf' > "$name/gt/list"
find "$name/eval" -type f -name '*lstmf' > "$name/eval/list"

echo "Unpacking old training"
mkdir -p "$name/orig"
cp "$oldtraining" "$name/orig/orig.traineddata"
combine_tessdata -u "$name/orig/orig.traineddata" "$name/orig/orig" || exit 1

# Merge the ground truth unicharset with the one from the old model
echo "Making complete unicharset"
merge_unicharsets "$name/gt/unicharset" "$name/orig/orig.lstm-unicharset" "$name/unicharset" || exit 1

# combine_lang_model needs radical-stroke.txt in its script_dir
echo "Making starter training"
mkdir -p "$name/starter"
curl -L -f 'https://github.com/tesseract-ocr/langdata_lstm/raw/main/radical-stroke.txt' > "$name/starter/radical-stroke.txt" || exit 1
combine_lang_model --input_unicharset "$name/unicharset" --script_dir "$name/starter" --output_dir "$name/starter" --lang "$name" || exit 1

mkdir -p "$name/checkpoint"

echo "Starting training"
lstmtraining \
  --traineddata "$name/starter/$name/$name.traineddata" \
  --old_traineddata "$name/orig/orig.traineddata" \
  --continue_from "$name/orig/orig.lstm" \
  --net_spec "$netspec" \
  --model_output "$name/checkpoint/$name" \
  --learning_rate 20e-4 \
  --train_listfile "$name/gt/list" \
  --eval_listfile "$name/eval/list" \
  --max_iterations $iterations \
  $extra || exit 1

echo "Saving training"
lstmtraining \
  --stop_training \
  --continue_from "$name/checkpoint/${name}_checkpoint" \
  --traineddata "$name/starter/$name/$name.traineddata" \
  --model_output "$name/$name.traineddata"

echo "Saving fast version of training"
lstmtraining \
  --stop_training \
  --model_output "$name/${name}_fast.traineddata" \
  --continue_from "$name/checkpoint/${name}_checkpoint" \
  --traineddata "$name/starter/$name/$name.traineddata" \
  --convert_to_int true