From fa4f48ad54ec94c222269d335d40b21becff92a4 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 23 Mar 2021 11:14:35 +0000 Subject: hocr: Add ability to specify a custom image path for hocr line extraction, and use it in extracthocrlines --- cmd/extracthocrlines/main.go | 13 ++++++++++-- pkg/hocr/hocr.go | 10 --------- pkg/hocr/lines.go | 48 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/cmd/extracthocrlines/main.go b/cmd/extracthocrlines/main.go index d765875..fa5ef28 100644 --- a/cmd/extracthocrlines/main.go +++ b/cmd/extracthocrlines/main.go @@ -14,12 +14,13 @@ import ( "log" "os" "path/filepath" + "strings" "rescribe.xyz/utils/pkg/hocr" "rescribe.xyz/utils/pkg/line" ) -const usage = `Usage: extracthocrlines file.hocr [file.hocr] +const usage = `Usage: extracthocrlines [-d] [-e] file.hocr [file.hocr] Copies the text and corresponding image section for each line of a HOCR file into separate files, which is useful for OCR @@ -64,6 +65,7 @@ func main() { flag.PrintDefaults() } dir := flag.String("d", ".", "Directory to save lines in") + embeddedimgpath := flag.Bool("e", false, "Use image path embedded in hOCR (rather than the path of the .hocr file with a .png suffix)") flag.Parse() if flag.NArg() < 1 { flag.Usage() @@ -71,7 +73,14 @@ func main() { } for _, f := range flag.Args() { - newlines, err := hocr.GetLineDetails(f) + var err error + var newlines line.Details + if *embeddedimgpath { + newlines, err = hocr.GetLineDetails(f) + } else { + imgName := strings.TrimSuffix(f, ".hocr") + ".png" + newlines, err = hocr.GetLineDetailsCustomImg(f, imgName) + } if err != nil { log.Fatal(err) } diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go index 6cd1df1..e84c2b1 100644 --- a/pkg/hocr/hocr.go +++ b/pkg/hocr/hocr.go @@ -58,16 +58,6 @@ func wordConf(s string) (float64, error) { return strconv.ParseFloat(conf[1], 64) } -// Returns the image path for a page from a ocr_page title -func imagePath(s string) (string, error) { - re, err := regexp.Compile(`image ["']([^"']+)["']`) - if err != nil { - return "", err - } - m := re.FindStringSubmatch(s) - return m[1], nil -} - // BoxCoords parses bbox coordinate strings func BoxCoords(s string) ([4]int, error) { var coords [4]int diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go index 1c759e0..5794243 100644 --- a/pkg/hocr/lines.go +++ b/pkg/hocr/lines.go @@ -16,11 +16,22 @@ import ( "io/ioutil" "os" "path/filepath" + "regexp" "strings" "rescribe.xyz/utils/pkg/line" ) +// Returns the image path for a page from a ocr_page title +func imagePathFromTitle(s string) (string, error) { + re, err := regexp.Compile(`image ["']([^"']+)["']`) + if err != nil { + return "", err + } + m := re.FindStringSubmatch(s) + return m[1], nil +} + // LineText extracts the text from an OcrLine func LineText(l OcrLine) string { linetext := "" @@ -54,11 +65,17 @@ func LineText(l OcrLine) string { return linetext } -func parseLineDetails(h Hocr, dir string) (line.Details, error) { +// parseLineDetails parses a Hocr struct into a line.Details +// struct, including extracted image segments for each line. +// The image location is taken from imgPath, which can either +// be imagePathFromTitle (see above) which loads the image +// path embedded in the title attribute of a hocr page, or +// a custom handler. +func parseLineDetails(h Hocr, dir string, imgPath func(string) (string, error)) (line.Details, error) { lines := make(line.Details, 0) for _, p := range h.Pages { - imgpath, err := imagePath(p.Title) + imgpath, err := imgPath(p.Title) if err != nil { return lines, err } @@ -68,7 +85,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) { var gray *image.Gray pngf, err := os.Open(imgpath) if err != nil { - fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v", imgpath, err) + fmt.Fprintf(os.Stderr, "Warning: error opening image %s: %v\n", imgpath, err) } defer pngf.Close() img, _, err = image.Decode(pngf) @@ -99,7 +116,7 @@ func parseLineDetails(h Hocr, dir string) (line.Details, error) { ln.Name = l.Id ln.Avgconf = (totalconf / float64(num)) / 100 ln.Text = LineText(l) - imgpath, err := imagePath(p.Title) + imgpath, err := imgPath(p.Title) if err != nil { return lines, err } @@ -131,7 +148,26 @@ func GetLineDetails(hocrfn string) (line.Details, error) { return newlines, err } - return parseLineDetails(h, filepath.Dir(hocrfn)) + return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle) +} + +// GetLineDetailsCustomImg is a variant of GetLineDetails that +// uses a provided image path for line image extracts, rather +// than the image name embedded in the .hocr +func GetLineDetailsCustomImg(hocrfn string, imgfn string) (line.Details, error) { + var newlines line.Details + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return newlines, err + } + + h, err := Parse(file) + if err != nil { + return newlines, err + } + + return parseLineDetails(h, filepath.Dir(hocrfn), func(s string) (string, error) {return imgfn, nil}) } // GetLineBasics parses a hocr file and returns a corresponding @@ -149,5 +185,5 @@ func GetLineBasics(hocrfn string) (line.Details, error) { return newlines, err } - return parseLineDetails(h, filepath.Dir(hocrfn)) + return parseLineDetails(h, filepath.Dir(hocrfn), imagePathFromTitle) } -- cgit v1.2.1-24-ge1ad