diff options
author | Nick White <git@njw.name> | 2021-02-09 17:55:18 +0000 |
---|---|---|
committer | Nick White <git@njw.name> | 2021-02-09 17:55:18 +0000 |
commit | d134ae8a3be1ddc79122937609fa441a5076fd03 (patch) | |
tree | e0986995d5f6bbcc4a4e13d618dfddff2815fe72 /cmd/extracthocrlines | |
parent | 45943f847b3db8db5142c79a806f251659264ca0 (diff) |
Add extracthocrlines tool
Diffstat (limited to 'cmd/extracthocrlines')
-rw-r--r-- | cmd/extracthocrlines/main.go | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/cmd/extracthocrlines/main.go b/cmd/extracthocrlines/main.go new file mode 100644 index 0000000..d765875 --- /dev/null +++ b/cmd/extracthocrlines/main.go @@ -0,0 +1,89 @@ +// Copyright 2021 Nick White. +// Use of this source code is governed by the GPLv3 +// license that can be found in the LICENSE file. + +// extracthocrlines copies the text and corresponding image section +// for each line of a HOCR file into separate files, which is +// useful for OCR training +package main + +import ( + "flag" + "fmt" + "io" + "log" + "os" + "path/filepath" + + "rescribe.xyz/utils/pkg/hocr" + "rescribe.xyz/utils/pkg/line" +) + +const usage = `Usage: extracthocrlines file.hocr [file.hocr] + +Copies the text and corresponding image section for each line +of a HOCR file into separate files, which is useful for OCR +training. +` + +// saveline saves the text and image for a line in a directory +func saveline(l line.Detail, dir string) error { + err := os.MkdirAll(dir, 0700) + if err != nil { + return err + } + + base := filepath.Join(dir, l.OcrName+"_"+l.Name) + + f, err := os.Create(base + ".png") + if err != nil { + return fmt.Errorf("Error creating file %s: %v", base+".png", err) + } + + err = l.Img.CopyLineTo(f) + if err != nil { + return fmt.Errorf("Error writing line image for %s: %v", base+".png", err) + } + + f, err = os.Create(base + ".txt") + if err != nil { + return fmt.Errorf("Error creating file %s: %v", base+".txt", err) + } + + _, err = io.WriteString(f, l.Text) + if err != nil { + return fmt.Errorf("Error writing line text for %s: %v", base+".txt", err) + } + + return nil +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), usage) + flag.PrintDefaults() + } + dir := flag.String("d", ".", "Directory to save lines in") + flag.Parse() + if flag.NArg() < 1 { + flag.Usage() + os.Exit(1) + } + + for _, f := range flag.Args() { + newlines, err := hocr.GetLineDetails(f) + if err != nil { + log.Fatal(err) + } + + for _, l := range newlines { + if l.Img == nil { + continue + } + err = saveline(l, *dir) + if err != nil { + log.Fatal(err) + } + } + } +} |