summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick White <git@njw.name> 2020-01-22 14:18:08 +0000
committer: Nick White <git@njw.name> 2020-01-22 14:18:08 +0000
commit: 8212994501dfff2bd42281ec796ab64e8a22ffb1 (patch)
tree: 18ace647e576f39e055689d632c725381ea6024f
parent: 7d151bbb85338bdf2022f815c73873f9eace7d38 (diff)
Replace traintessv4 with traintessv5 script, which was used for fra-engbase training (very minor edits)
-rwxr-xr-x traintessv5.sh (renamed from traintessv4.sh) 15
1 files changed, 11 insertions, 4 deletions
diff --git a/traintessv4.sh b/traintessv5.sh
index 92da4be..8b95443 100755
--- a/traintessv4.sh
+++ b/traintessv5.sh
@@ -8,8 +8,10 @@ test $# -ne 4 && echo "$usage" && exit 1
#extra="--append_index 5"
#netspec="[Lfx512 O1c1]"
# This fine-tunes the existing layers (copying the existing best/eng netspec)
-extra=""
netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys64 Lfx96 Lrx96 Lfx512 O1c1]"
+extra=""
+# (copied from the fast/fra netspec [the best/fra one is absent from the version string])
+#netspec="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx128 O1c1]"
iterations=10000
oldtraining="$3"
@@ -17,6 +19,11 @@ name="$4"
mkdir -p "$name"
+if test ! -f "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata"; then
+ echo "/usr/local/share/tesseract-ocr/tessdata/eng.traineddata not found, needed for lstmf generation, bailing"
+ exit 1
+fi
+
printf 'gtdir: %s\ngtevaldir: %s\noldtraineddata: %s\ntrainingname: %s\nnetspec: %s\niterations: %s\nextra_args: %s\n' \
"$1" "$2" "$3" "$4" "$netspec" $iterations "$extra" > "$name/settings"
@@ -24,14 +31,14 @@ printf 'gtdir: %s\ngtevaldir: %s\noldtraineddata: %s\ntrainingname: %s\nnetspec:
echo "Copying training ground truth"
mkdir -p "$name/gt"
find "$1" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do
- n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.gt\.txt$/.txt/g'`
+ n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'`
cp "$i" "$name/gt/$n"
done
echo "Copying eval ground truth"
mkdir -p "$name/eval"
find "$2" -type f -name '*tif' -o -name '*png' -o -name '*txt' -o -name '*box' | while read i; do
- n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.gt\.txt$/.txt/g'`
+ n=`basename "$i" | sed 's/\.bin\.png$/.png/g; s/\.bin\.txt$/.txt/g; s/\.gt\.txt$/.txt/g'`
cp "$i" "$name/eval/$n"
done
@@ -44,7 +51,7 @@ find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do
test -f "$d/$b.png" && n="$b.png"
test -z "$n" && echo "Skipping $i as no corresponding image found" && continue
test -f "$d/$b.box" && echo "Skipping $i as box file already present" && continue
- python ~/training/generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1
+ python ~/bigboy/othertools/generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1
done
echo "Making unicharset"