diff options
| author | Nick White <git@njw.name> | 2019-02-25 13:01:28 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-02-25 13:01:28 +0000 | 
| commit | cd1fb1c9f6e1384ac0add8904425e6f92b17a704 (patch) | |
| tree | 4b634aca131fa95ecb761904d312322386a38420 /lib | |
| parent | 3c4c5f7c202b7c54ca8d23e7bd7bff4a4bb696cc (diff) | |
Generalise get text from hocr lines
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/hocr/hocr.go | 29 | ||||
| -rw-r--r-- | lib/hocr/lines.go | 63 | 
2 files changed, 36 insertions, 56 deletions
```diff
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go
index fbf1523..f6316d8 100644
--- a/lib/hocr/hocr.go
+++ b/lib/hocr/hocr.go
@@ -92,34 +92,9 @@ func GetText(hocrfn string) (string, error) {
 		return s, err
 	}
 
+
 	for _, l := range h.Lines {
-		linetext := l.Text
-		if noText(linetext) {
-			linetext = ""
-			for _, w := range l.Words {
-				if w.Class != "ocrx_word" {
-					continue
-				}
-				linetext += w.Text + " "
-			}
-		}
-		if noText(linetext) {
-			linetext = ""
-			for _, w := range l.Words {
-				if w.Class != "ocrx_word" {
-					continue
-				}
-				for _, c := range w.Chars {
-					if c.Class != "ocrx_cinfo" {
-						continue
-					}
-					linetext += c.Text
-				}
-				linetext += " "
-			}
-		}
-		linetext = strings.TrimRight(linetext, " ") + "\n"
-		s += linetext
+		s += getLineText(l)
 	}
 	return s, nil
 }
diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go
index 00acd1f..c60a619 100644
--- a/lib/hocr/lines.go
+++ b/lib/hocr/lines.go
@@ -14,6 +14,39 @@ import (
 	"rescribe.xyz/go.git/lib/line"
 )
 
+func getLineText(l OcrLine) (string) {
+	linetext := ""
+
+	linetext = l.Text
+	if noText(linetext) {
+		linetext = ""
+		for _, w := range l.Words {
+			if w.Class != "ocrx_word" {
+				continue
+			}
+			linetext += w.Text + " "
+		}
+	}
+	if noText(linetext) {
+		linetext = ""
+		for _, w := range l.Words {
+			if w.Class != "ocrx_word" {
+				continue
+			}
+			for _, c := range w.Chars {
+				if c.Class != "ocrx_cinfo" {
+					continue
+				}
+				linetext += c.Text
+			}
+			linetext += " "
+		}
+	}
+	linetext = strings.TrimRight(linetext, " ")
+	linetext += "\n"
+	return linetext
+}
+
 func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) {
 	lines := make(line.Details, 0)
 
@@ -37,35 +70,7 @@ func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error)
 		var ln line.Detail
 		ln.Name = l.Id
 		ln.Avgconf = (totalconf / float64(num)) / 100
-		linetext := ""
-
-		linetext = l.Text
-		if noText(linetext) {
-			linetext = ""
-			for _, w := range l.Words {
-				if w.Class != "ocrx_word" {
-					continue
-				}
-				linetext += w.Text + " "
-			}
-		}
-		if noText(linetext) {
-			linetext = ""
-			for _, w := range l.Words {
-				if w.Class != "ocrx_word" {
-					continue
-				}
-				for _, c := range w.Chars {
-					if c.Class != "ocrx_cinfo" {
-						continue
-					}
-					linetext += c.Text
-				}
-				linetext += " "
-			}
-		}
-		ln.Text = strings.TrimRight(linetext, " ")
-		ln.Text += "\n"
+		ln.Text = getLineText(l)
 		ln.OcrName = name
 		var imgd line.ImgDirect
 		imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
```
