diff options
author | Nick White <git@njw.name> | 2021-02-09 17:45:35 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-02-09 17:45:35 +0000 |
commit | 45943f847b3db8db5142c79a806f251659264ca0 (patch) | |
tree | 10b1c484f6d858f0dedc267c10dfe97f80342799 | |
parent | 13ce8fc4b45073e1f81a39c4923e44420509be73 (diff) |
hocr: Use extracted page name for line naming
This means that even in multi-page hocrs containing lines with the
same id (like line_1_1), the page name will be different, so
extracthocrlines will no longer mistakenly give different lines the
same name and therefore overwrite them.
-rw-r--r-- | pkg/hocr/lines.go | 14 |
1 file changed, 8 insertions, 6 deletions
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go index 393f0ec..1c759e0 100644 --- a/pkg/hocr/lines.go +++ b/pkg/hocr/lines.go @@ -54,7 +54,7 @@ func LineText(l OcrLine) string { return linetext } -func parseLineDetails(h Hocr, dir string, name string) (line.Details, error) { +func parseLineDetails(h Hocr, dir string) (line.Details, error) { lines := make(line.Details, 0) for _, p := range h.Pages { @@ -99,7 +99,11 @@ func parseLineDetails(h Hocr, dir string, name string) (line.Details, error) { ln.Name = l.Id ln.Avgconf = (totalconf / float64(num)) / 100 ln.Text = LineText(l) - ln.OcrName = name + imgpath, err := imagePath(p.Title) + if err != nil { + return lines, err + } + ln.OcrName = strings.TrimSuffix(filepath.Base(imgpath), ".png") if gray != nil { var imgd line.ImgDirect imgd.Img = gray.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) @@ -127,8 +131,7 @@ func GetLineDetails(hocrfn string) (line.Details, error) { return newlines, err } - n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) - return parseLineDetails(h, filepath.Dir(hocrfn), n) + return parseLineDetails(h, filepath.Dir(hocrfn)) } // GetLineBasics parses a hocr file and returns a corresponding @@ -146,6 +149,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) { return newlines, err } - n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) - return parseLineDetails(h, filepath.Dir(hocrfn), n) + return parseLineDetails(h, filepath.Dir(hocrfn)) } |