From fb77852acbdbbcedcdb9771770cb6771da002851 Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 23 Jan 2019 21:54:09 +0000 Subject: Update line-conf-buckets to mostly use package functions too. Working now, but needs more consolidation to be worth it. --- line-conf-buckets-tess/line-conf-buckets-tess.go | 4 +- line-conf-buckets/line-conf-buckets.go | 116 ++++++----------------- parse/hocr/hocr.go | 5 +- parse/line.go | 18 +++- parse/prob/prob.go | 72 ++++++++++++++ 5 files changed, 125 insertions(+), 90 deletions(-) create mode 100644 parse/prob/prob.go diff --git a/line-conf-buckets-tess/line-conf-buckets-tess.go b/line-conf-buckets-tess/line-conf-buckets-tess.go index b24bdec..8abdff3 100644 --- a/line-conf-buckets-tess/line-conf-buckets-tess.go +++ b/line-conf-buckets-tess/line-conf-buckets-tess.go @@ -92,7 +92,7 @@ func main() { avgstr := strconv.FormatFloat(l.Avgconf, 'f', 5, 64) avgstr = avgstr[2:] - outname := filepath.Join(outdir, todir, l.Hocrname + "_" + l.Name + "_" + avgstr + ".png") + outname := filepath.Join(outdir, todir, l.OcrName + "_" + l.Name + "_" + avgstr + ".png") err := os.MkdirAll(filepath.Join(outdir, todir), 0700) if err != nil { @@ -111,7 +111,7 @@ func main() { log.Fatal(err) } - outname = filepath.Join(outdir, todir, l.Hocrname + "_" + l.Name + "_" + avgstr + ".txt") + outname = filepath.Join(outdir, todir, l.OcrName + "_" + l.Name + "_" + avgstr + ".txt") outfile, err = os.Create(outname) if err != nil { fmt.Fprintf(os.Stderr, "Failed to create %s\n", outname) diff --git a/line-conf-buckets/line-conf-buckets.go b/line-conf-buckets/line-conf-buckets.go index c2df074..1c33ba4 100644 --- a/line-conf-buckets/line-conf-buckets.go +++ b/line-conf-buckets/line-conf-buckets.go @@ -5,42 +5,23 @@ import ( "flag" "fmt" "io" - "io/ioutil" "log" "os" "path/filepath" "sort" "strconv" "strings" -) - -type LineDetail struct { - Filename string - Avgconf float64 - Filebase string - Basename string - Dirname string - Fulltext string -} - -type LineDetails []LineDetail - -// Used by sort.Sort. -func (l LineDetails) Len() int { return len(l) } -// Used by sort.Sort. -func (l LineDetails) Less(i, j int) bool { - return l[i].Avgconf < l[j].Avgconf -} - -// Used by sort.Sort. -func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } + "git.rescribe.xyz/testingtools/parse" + "git.rescribe.xyz/testingtools/parse/prob" +) -func copyline(filebase string, dirname string, basename string, avgconf string, outdir string, todir string) (err error) { +// TODO: this is just a placeholder, do this more sensibly, as -tess does (hint: full txt should already be in the LineDetail) +func copyline(filebase string, dirname string, basename string, avgconf string, outdir string, todir string, l parse.LineDetail) (err error) { outname := filepath.Join(outdir, todir, filepath.Base(dirname) + "_" + basename + "_" + avgconf) //log.Fatalf("I'd use '%s' as outname, and '%s' as filebase\n", outname, filebase) - for _, extn := range []string{".bin.png", ".txt"} { + for _, extn := range []string{".txt"} { infile, err := os.Open(filebase + extn) if err != nil { fmt.Fprintf(os.Stderr, "Failed to open %s\n", filebase + extn) @@ -66,6 +47,16 @@ func copyline(filebase string, dirname string, basename string, avgconf string, } } + f, err := os.Create(outname + ".bin.png") + if err != nil { + return err + } + defer f.Close() + err = l.Img.CopyLineTo(f) + if err != nil { + return err + } + return err } @@ -82,77 +73,28 @@ func main() { os.Exit(1) } - lines := make(LineDetails, 0) + lines := make(parse.LineDetails, 0) for _, f := range flag.Args() { file, err := os.Open(f) if err != nil { - fmt.Fprintf(os.Stderr, "Error opening %s\n", f) log.Fatal(err) } defer file.Close() reader := bufio.NewReader(file) - totalconf := float64(0) - num := 0 - - err = nil - for err == nil { - var line string - line, err = reader.ReadString('\n') - fields := strings.Fields(line) - - if len(fields) == 2 { - conf, converr := strconv.ParseFloat(fields[1], 64) - if converr != nil { - fmt.Fprintf(os.Stderr, "Error: can't convert '%s' to float (full line: %s)\n", fields[1], line) - continue - } - totalconf += conf - num += 1 - } + newlines, err := prob.GetLineDetails(f, reader) + if err != nil { + log.Fatal(err) } - avg := totalconf / float64(num) - // Explicitly close file immediately after use, rather than relying on defer, - // as too many files could be opened before any of the files are closed, leading - // to a 'too many open files' error - // TODO: rewrite this loop so it uses a function or two, so we can rely - // on defer sensibly again. + for _, l := range newlines { + lines = append(lines, l) + } + // explicitly close the file, so we can be sure we won't run out of + // handles before defer runs file.Close() - - if num == 0 || avg == 0 { - continue - } - - var linedetail LineDetail - linedetail.Filename = f - linedetail.Avgconf = avg - linedetail.Filebase = strings.Replace(f, ".prob", "", 1) - linedetail.Basename = filepath.Base(linedetail.Filebase) - linedetail.Dirname = filepath.Dir(linedetail.Filebase) - - txtfile, ferr := os.Open(linedetail.Filebase + ".txt") - if ferr != nil { - fmt.Fprintf(os.Stderr, "Error opening %s\n", linedetail.Filebase + ".txt") - log.Fatal(ferr) - } - defer txtfile.Close() - ft, ferr := ioutil.ReadAll(txtfile) - if ferr != nil { - fmt.Fprintf(os.Stderr, "Error reading %s\n", linedetail.Filebase + ".txt") - log.Fatal(ferr) - } - linedetail.Fulltext = string(ft) - // Explicitly close file immediately after use, rather than relying on defer, - // as too many files could be opened before any of the files are closed, leading - // to a 'too many open files' error - // TODO: rewrite this loop so it uses a function or two, so we can rely - // on defer sensibly again. - txtfile.Close() - - lines = append(lines, linedetail) } sort.Sort(lines) @@ -178,8 +120,12 @@ func main() { } avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) - avgstr = avgstr[2:] - err := copyline(l.Filebase, l.Dirname, l.Basename, avgstr, outdir, todir) + if len(avgstr) > 2 { + avgstr = avgstr[2:] + } + filebase := strings.Replace(l.Name, ".prob", "", 1) + basename := filepath.Base(filebase) + err := copyline(filebase, l.OcrName, basename, avgstr, outdir, todir, l) if err != nil { log.Fatal(err) } diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go index a281a7a..f7cac05 100644 --- a/parse/hocr/hocr.go +++ b/parse/hocr/hocr.go @@ -1,7 +1,8 @@ package hocr // TODO: consider making GetLineDetails() a function of Hocr, so could do a -// similar thing with prob format files too. +// similar thing with prob format files too, and then fire them both +// off a generic interface, potentially. // TODO: Parse line name to zero pad line numbers, so they come out in the correct order import ( @@ -137,7 +138,7 @@ func GetLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, erro } line.Text = strings.TrimRight(linetext, " ") line.Text += "\n" - line.Hocrname = name + line.OcrName = name var imgd parse.ImgDirect imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) line.Img = imgd diff --git a/parse/line.go b/parse/line.go index 3ddde76..9a2be8e 100644 --- a/parse/line.go +++ b/parse/line.go @@ -9,6 +9,7 @@ import ( "image" "image/png" "io" + "os" ) type LineDetail struct { @@ -16,7 +17,7 @@ type LineDetail struct { Avgconf float64 Img CopyableLine Text string - Hocrname string + OcrName string } type CopyableLine interface { @@ -37,6 +38,21 @@ func (i ImgDirect) CopyLineTo(w io.Writer) (error) { return nil } +type ImgPath struct { + Path string +} + +func (i ImgPath) CopyLineTo(w io.Writer) (error) { + f, err := os.Open(i.Path) + if err != nil { + return err + } + defer f.Close() + + _, err = io.Copy(w, f) + return err +} + type LineDetails []LineDetail // Used by sort.Sort. diff --git a/parse/prob/prob.go b/parse/prob/prob.go new file mode 100644 index 0000000..5a84567 --- /dev/null +++ b/parse/prob/prob.go @@ -0,0 +1,72 @@ +package prob + +import ( + "bufio" + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "git.rescribe.xyz/testingtools/parse" +) + +// TODO: probably switch to just relying on io.Reader +func getLineAvg(r *bufio.Reader) (float64, error) { + var err error + + totalconf := float64(0) + num := 0 + + err = nil + for err == nil { + var line string + line, err = r.ReadString('\n') + fields := strings.Fields(line) + + if len(fields) == 2 { + conf, converr := strconv.ParseFloat(fields[1], 64) + if converr != nil { + continue + } + totalconf += conf + num += 1 + } + } + if num <= 0 { + return 0, nil + } + avg := totalconf / float64(num) + return avg, nil +} + +// TODO: probably switch to just relying on io.Reader +// Note this only processes one line at a time +func GetLineDetails(name string, r *bufio.Reader) (parse.LineDetails, error) { + var line parse.LineDetail + lines := make(parse.LineDetails, 0) + + avg, err := getLineAvg(r) + if err != nil { + return lines, err + } + + filebase := strings.Replace(name, ".prob", "", 1) + + txt, err := ioutil.ReadFile(filebase + ".txt") + if err != nil { + return lines, err + } + + line.Name = name + line.Avgconf = avg + line.Text = string(txt) + line.OcrName = filepath.Dir(filebase) + + var imgfn parse.ImgPath + imgfn.Path = filebase + ".bin.png" + line.Img = imgfn + + lines = append(lines, line) + + return lines, nil +} -- cgit v1.2.1-24-ge1ad