#!/usr/bin/env python3
"""Create a tesseract box file for a (line) image / ground-truth text pair.

For every non-blank line of the ground-truth text, one box row is written
to standard output per character cluster (a character plus any Unicode
combining marks that directly follow it).  Every such box spans the whole
image.  A final tab row positioned just outside the image closes each
text line.
"""

import io
import argparse
import unicodedata

import sys
import codecs

# Row layout: "<text> <left> <bottom> <right> <top> 0"
BOX_FORMAT = u"%s %d %d %d %d 0"


def utf8_stream(stream):
    """Return *stream* set up so that text written to it is UTF-8 encoded.

    A codecs writer must not be wrapped around a text stream: it would hand
    encoded bytes to a ``write()`` that only accepts ``str`` and fail with
    ``TypeError`` on the first ``print``.  The text stream is therefore
    reconfigured in place when it supports that; otherwise the writer is
    wrapped around the underlying binary buffer.  Streams offering neither
    are returned unchanged.
    """
    reconfigure = getattr(stream, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8")
        return stream
    raw = getattr(stream, "buffer", None)
    if raw is not None:
        return codecs.getwriter("utf8")(raw)
    return stream


def parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    # The text is passed as ``description``; the first positional parameter
    # of ArgumentParser is ``prog`` and would replace the program name in
    # the usage line.
    arg_parser = argparse.ArgumentParser(
        description='''Creates tesseract box files for given (line) image text pairs''')

    # Text ground truth
    arg_parser.add_argument('-t', '--txt', nargs='?', metavar='TXT',
                            help='Line text (GT)', required=True)

    # Image file
    arg_parser.add_argument('-i', '--image', nargs='?', metavar='IMAGE',
                            help='Image file', required=True)

    return arg_parser.parse_args(argv)


def image_size(path):
    """Return ``(width, height)`` of the image stored at *path*."""
    # Imported here so the text helpers stay usable without Pillow installed.
    from PIL import Image

    with Image.open(path) as img:
        return img.size


def read_ground_truth(path):
    """Return the lines of the UTF-8 text file at *path*.

    Leading and trailing whitespace of the file as a whole is removed
    before it is split on newlines.
    """
    with io.open(path, "r", encoding='utf-8') as f:
        return f.read().strip().split('\n')


def iter_clusters(text):
    """Yield *text* split into a base character plus its combining marks.

    Every combining character (``unicodedata.combining(c) != 0``) is
    attached to the cluster in front of it, so a base followed by several
    marks stays one unit and no mark is emitted twice.  Marks at the very
    start of *text* have no base and form a cluster of their own, so that
    no character of the input is dropped.
    """
    cluster = u""
    for char in text:
        if cluster and unicodedata.combining(char):
            cluster += char
            continue
        if cluster:
            yield cluster
        cluster = char
    if cluster:
        yield cluster


def box_lines(line, width, height):
    """Return the box rows for one ground-truth *line*.

    Each cluster gets a box covering the full ``width`` x ``height`` image;
    the last row is the tab marker placed one pixel outside the image.
    """
    rows = [BOX_FORMAT % (cluster, 0, 0, width, height)
            for cluster in iter_clusters(line)]
    rows.append(BOX_FORMAT % ("\t", width, height, width + 1, height + 1))
    return rows


def main():
    """Script entry point: read the inputs and print the box rows."""
    sys.stdout = utf8_stream(sys.stdout)
    sys.stderr = utf8_stream(sys.stderr)

    args = parse_args()

    # load image
    width, height = image_size(args.image)

    # load gt
    for line in read_ground_truth(args.txt):
        if not line.strip():
            continue  # blank lines produce no rows
        for row in box_lines(line, width, height):
            print(row)


if __name__ == "__main__":
    main()