From 31889aebd8ef29b9fa8ba4f685365dee0d93abbf Mon Sep 17 00:00:00 2001
From: Nick White
Date: Mon, 20 Apr 2020 13:40:17 +0100
Subject: Add generate_line_box.py to this repo and reference it nicely in scripts

---
Review note: the generate_line_box.py payload below was amended in review
(stream wrapping that works on Python 3, argparse description, comments).
The blob id on its "index" line was not regenerated.

 generate_line_box.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 testtraining.sh      |  3 ++-
 traintess.sh         |  3 ++-
 3 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100755 generate_line_box.py

diff --git a/generate_line_box.py b/generate_line_box.py
new file mode 100755
index 0000000..8b7ec74
--- /dev/null
+++ b/generate_line_box.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""Create a tesseract box file for one (line image, ground-truth text) pair.
+
+Each character of the text gets a box covering the whole image; a combining
+mark is printed together with the character before it.  Every non-blank text
+line ends with a tab row placed just past (width, height).  Output is UTF-8.
+"""
+
+import io
+import argparse
+import unicodedata
+from PIL import Image
+
+import sys
+import codecs
+# A codecs writer passes encoded bytes to the stream it wraps, so on Python 3
+# it has to wrap the underlying byte buffer; wrapping the text stream itself
+# makes every print() raise TypeError.  Python 2 streams have no .buffer.
+sys.stdout = codecs.getwriter('utf8')(getattr(sys.stdout, 'buffer', sys.stdout))
+sys.stderr = codecs.getwriter('utf8')(getattr(sys.stderr, 'buffer', sys.stderr))
+
+#
+# command line arguments
+#
+# The text is passed as description=; the first positional parameter of
+# ArgumentParser is the program name shown in the usage line.
+arg_parser = argparse.ArgumentParser(description='''Creates tesseract box files for given (line) image text pairs''')
+
+# Text ground truth
+arg_parser.add_argument('-t', '--txt', nargs='?', metavar='TXT', help='Line text (GT)', required=True)
+
+# Image file
+arg_parser.add_argument('-i', '--image', nargs='?', metavar='IMAGE', help='Image file', required=True)
+
+args = arg_parser.parse_args()
+
+#
+# main
+#
+
+# load image (only its width and height are used)
+with open(args.image, "rb") as f:
+    width, height = Image.open(f).size
+
+# load gt
+with io.open(args.txt, "r", encoding='utf-8') as f:
+    lines = f.read().strip().split('\n')
+
+for line in lines:
+    if line.strip():
+        # Walk adjacent character pairs; every row is the text followed by "0 0 width height 0".
+        for i in range(1, len(line)):
+            char = line[i]
+            prev_char = line[i-1]
+            if unicodedata.combining(char):
+                print(u"%s %d %d %d %d 0" % ((prev_char + char), 0, 0, width, height))
+            elif not unicodedata.combining(prev_char):
+                print(u"%s %d %d %d %d 0" % (prev_char, 0, 0, width, height))
+        # The loop only ever prints prev_char, so the final character is handled here.
+        if not unicodedata.combining(line[-1]):
+            print(u"%s %d %d %d %d 0" % (line[-1], 0, 0, width, height))
+        # End-of-line row: a tab with coordinates just past (width, height).
+        print(u"%s %d %d %d %d 0" % ("\t", width, height, width+1, height+1))
diff --git a/testtraining.sh b/testtraining.sh
index 64d39bd..686dae8 100755 --- a/testtraining.sh +++ b/testtraining.sh @@ -4,6 +4,7 @@ usage="Usage: $0 traineddata evaldir" test $# -ne 2 && echo "$usage" && exit 1 evaldir="$2" +here=`dirname "$0"` # Make box files find "$evaldir" -type f -name '*txt' | while read i; do @@ -14,7 +15,7 @@ find "$evaldir" -type f -name '*txt' | while read i; do test -f "$d/$b.box" && continue test -f "$d/${b}_lstmf.lstmf" && continue - python ~/bigboy/othertools/generate_line_box.py -i "$d/$b.png" -t "$i" > "$d/$b.box" || exit 1 + python "${here}/generate_line_box.py" -i "$d/$b.png" -t "$i" > "$d/$b.box" || exit 1 done diff --git a/traintess.sh b/traintess.sh index e94ca9d..0841899 100755 --- a/traintess.sh +++ b/traintess.sh @@ -16,6 +16,7 @@ iterations=100000 oldtraining="$3" name="$4" +here=`dirname "$0"` mkdir -p "$name" @@ -52,7 +53,7 @@ find "$name/gt" "$name/eval" -type f -name '*txt' | while read i; do test -f "$d/$b.png" && n="$b.png" test -z "$n" && echo "Skipping $i as no corresponding image found" && continue test -f "$d/$b.box" && echo "Skipping $i as box file already present" && continue - python ~/bigboy/othertools/generate_line_box.py -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1 + python "${here}/generate_line_box.py" -i "$d/$n" -t "$i" > "$d/$b.box" || exit 1 done echo "Making unicharset" -- cgit v1.2.1-24-ge1ad