summary refs log tree commit diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2020-04-20 13:40:17 +0100
committer Nick White <git@njw.name> 2020-04-20 13:40:17 +0100
commit 31889aebd8ef29b9fa8ba4f685365dee0d93abbf (patch)
tree df0e7b90e0d895bcfdd8b4667baec3aa45e36422
parent 6d43d44449380aa25987906d2a3d9dd97c906700 (diff)
Add generate_line_box.py to this repo and reference it nicely in scripts
-rwxr-xr-x generate_line_box.py 49
-rwxr-xr-x testtraining.sh 3
-rwxr-xr-x traintess.sh 3
3 files changed, 53 insertions, 2 deletions
diff --git a/generate_line_box.py b/generate_line_box.py
new file mode 100755
index 0000000..8b7ec74
--- /dev/null
+++ b/generate_line_box.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""Print a tesseract box file for one line image and its ground truth text.
+
+Every character cluster gets a box covering the whole image (0 0 width height).
+"""
+import io
+import argparse
+import unicodedata
+from PIL import Image
+
+import sys
+import codecs
+# A codecs writer emits bytes, which Python 3 text streams reject, so wrap the
+# binary buffer when there is one (Python 2 streams have none and take bytes).
+sys.stdout = codecs.getwriter('utf8')(getattr(sys.stdout, 'buffer', sys.stdout))
+sys.stderr = codecs.getwriter('utf8')(getattr(sys.stderr, 'buffer', sys.stderr))
+
+# command line arguments
+arg_parser = argparse.ArgumentParser('''Creates tesseract box files for given (line) image text pairs''')
+
+# Text ground truth
+arg_parser.add_argument('-t', '--txt', nargs='?', metavar='TXT', help='Line text (GT)', required=True)
+
+# Image file
+arg_parser.add_argument('-i', '--image', nargs='?', metavar='IMAGE', help='Image file', required=True)
+
+args = arg_parser.parse_args()
+
+# load image (only its size is used)
+with open(args.image, "rb") as f:
+    width, height = Image.open(f).size
+
+# load gt
+with io.open(args.txt, "r", encoding='utf-8') as f:
+    lines = f.read().strip().split('\n')
+
+for line in lines:
+    if not line.strip():
+        continue
+    # Keep a base character and ALL combining marks that follow it in one box.
+    cluster = u""
+    for char in line:
+        if cluster and not unicodedata.combining(char):
+            print(u"%s %d %d %d %d 0" % (cluster, 0, 0, width, height))
+            cluster = u""
+        cluster += char
+    print(u"%s %d %d %d %d 0" % (cluster, 0, 0, width, height))
+    # Tab entry closes the line; its box is the 1x1 square at (width, height).
+    print(u"%s %d %d %d %d 0" % ("\t", width, height, width+1, height+1))
diff --git a/testtraining.sh b/testtraining.sh
index 64d39bd..686dae8 100755
--- a/testtraining.sh
+++ b/testtraining.sh
@@ -4,6 +4,7 @@ usage="Usage: $0 traineddata evaldir"
test $# -ne 2 && echo "$usage" && exit 1
evaldir="$2"
+here=`dirname "$0"`
# Make box files
find "$evaldir" -type f -name '*txt' | while read i; do
@@ -14,7 +15,7 @@ find "$evaldir" -type f -name '*txt' | while read i; do
test -f "$d/$b.box" && continue
test -f "$d/${b}_lstmf.lstmf" && continue
- python ~/bigboy/othertools/generate_line_box.py -i "$d/$b.png" -t "$i" > "$d/$b.box" || exit 1
+ python "${here}/generate_line_box.py" -i "$d/$b.png" -t "$i" > "$d/$b.box" || exit 1
done
diff --git a/traintess.sh b/traintess.sh
index e94ca9d..0841899 100755
--- a/traintess.sh
+++ b/traintess.sh
@@ -16,6 +16,7 @@ iterations=100000
oldtraining="$3"
name="$4"
+here=`dirname "$0"`
mkdir -p "$name"
@@ -52,7 +53,7 @@ find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do
test -f "$d/$b.png" && n="$b.png"
test -z "$n" && echo "Skipping $i as no corresponding image found" && continue
test -f "$d/$b.box" && echo "Skipping $i as box file already present" && continue
- python ~/bigboy/othertools/generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1
+ python "${here}/generate_line_box.py" -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1
done
echo "Making unicharset"