summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- line-conf-buckets-tess/line-conf-buckets-tess.go 4
-rw-r--r-- line-conf-buckets/line-conf-buckets.go 116
-rw-r--r-- parse/hocr/hocr.go 5
-rw-r--r-- parse/line.go 18
-rw-r--r-- parse/prob/prob.go 72
5 files changed, 125 insertions, 90 deletions
diff --git a/line-conf-buckets-tess/line-conf-buckets-tess.go b/line-conf-buckets-tess/line-conf-buckets-tess.go
index b24bdec..8abdff3 100644
--- a/line-conf-buckets-tess/line-conf-buckets-tess.go
+++ b/line-conf-buckets-tess/line-conf-buckets-tess.go
@@ -92,7 +92,7 @@ func main() {
avgstr := strconv.FormatFloat(l.Avgconf, 'f', 5, 64)
avgstr = avgstr[2:]
- outname := filepath.Join(outdir, todir, l.Hocrname + "_" + l.Name + "_" + avgstr + ".png")
+ outname := filepath.Join(outdir, todir, l.OcrName + "_" + l.Name + "_" + avgstr + ".png")
err := os.MkdirAll(filepath.Join(outdir, todir), 0700)
if err != nil {
@@ -111,7 +111,7 @@ func main() {
log.Fatal(err)
}
- outname = filepath.Join(outdir, todir, l.Hocrname + "_" + l.Name + "_" + avgstr + ".txt")
+ outname = filepath.Join(outdir, todir, l.OcrName + "_" + l.Name + "_" + avgstr + ".txt")
outfile, err = os.Create(outname)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to create %s\n", outname)
diff --git a/line-conf-buckets/line-conf-buckets.go b/line-conf-buckets/line-conf-buckets.go
index c2df074..1c33ba4 100644
--- a/line-conf-buckets/line-conf-buckets.go
+++ b/line-conf-buckets/line-conf-buckets.go
@@ -5,42 +5,23 @@ import (
"flag"
"fmt"
"io"
- "io/ioutil"
"log"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
-)
-
-type LineDetail struct {
- Filename string
- Avgconf float64
- Filebase string
- Basename string
- Dirname string
- Fulltext string
-}
-
-type LineDetails []LineDetail
-
-// Used by sort.Sort.
-func (l LineDetails) Len() int { return len(l) }
-// Used by sort.Sort.
-func (l LineDetails) Less(i, j int) bool {
- return l[i].Avgconf < l[j].Avgconf
-}
-
-// Used by sort.Sort.
-func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
+ "git.rescribe.xyz/testingtools/parse"
+ "git.rescribe.xyz/testingtools/parse/prob"
+)
-func copyline(filebase string, dirname string, basename string, avgconf string, outdir string, todir string) (err error) {
+// TODO: this is just a placeholder, do this more sensibly, as -tess does (hint: full txt should already be in the LineDetail)
+func copyline(filebase string, dirname string, basename string, avgconf string, outdir string, todir string, l parse.LineDetail) (err error) {
outname := filepath.Join(outdir, todir, filepath.Base(dirname) + "_" + basename + "_" + avgconf)
//log.Fatalf("I'd use '%s' as outname, and '%s' as filebase\n", outname, filebase)
- for _, extn := range []string{".bin.png", ".txt"} {
+ for _, extn := range []string{".txt"} {
infile, err := os.Open(filebase + extn)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to open %s\n", filebase + extn)
@@ -66,6 +47,16 @@ func copyline(filebase string, dirname string, basename string, avgconf string,
}
}
+ f, err := os.Create(outname + ".bin.png")
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+ err = l.Img.CopyLineTo(f)
+ if err != nil {
+ return err
+ }
+
return err
}
@@ -82,77 +73,28 @@ func main() {
os.Exit(1)
}
- lines := make(LineDetails, 0)
+ lines := make(parse.LineDetails, 0)
for _, f := range flag.Args() {
file, err := os.Open(f)
if err != nil {
- fmt.Fprintf(os.Stderr, "Error opening %s\n", f)
log.Fatal(err)
}
defer file.Close()
reader := bufio.NewReader(file)
- totalconf := float64(0)
- num := 0
-
- err = nil
- for err == nil {
- var line string
- line, err = reader.ReadString('\n')
- fields := strings.Fields(line)
-
- if len(fields) == 2 {
- conf, converr := strconv.ParseFloat(fields[1], 64)
- if converr != nil {
- fmt.Fprintf(os.Stderr, "Error: can't convert '%s' to float (full line: %s)\n", fields[1], line)
- continue
- }
- totalconf += conf
- num += 1
- }
+ newlines, err := prob.GetLineDetails(f, reader)
+ if err != nil {
+ log.Fatal(err)
}
- avg := totalconf / float64(num)
- // Explicitly close file immediately after use, rather than relying on defer,
- // as too many files could be opened before any of the files are closed, leading
- // to a 'too many open files' error
- // TODO: rewrite this loop so it uses a function or two, so we can rely
- // on defer sensibly again.
+ for _, l := range newlines {
+ lines = append(lines, l)
+ }
+ // explicitly close the file, so we can be sure we won't run out of
+ // handles before defer runs
file.Close()
-
- if num == 0 || avg == 0 {
- continue
- }
-
- var linedetail LineDetail
- linedetail.Filename = f
- linedetail.Avgconf = avg
- linedetail.Filebase = strings.Replace(f, ".prob", "", 1)
- linedetail.Basename = filepath.Base(linedetail.Filebase)
- linedetail.Dirname = filepath.Dir(linedetail.Filebase)
-
- txtfile, ferr := os.Open(linedetail.Filebase + ".txt")
- if ferr != nil {
- fmt.Fprintf(os.Stderr, "Error opening %s\n", linedetail.Filebase + ".txt")
- log.Fatal(ferr)
- }
- defer txtfile.Close()
- ft, ferr := ioutil.ReadAll(txtfile)
- if ferr != nil {
- fmt.Fprintf(os.Stderr, "Error reading %s\n", linedetail.Filebase + ".txt")
- log.Fatal(ferr)
- }
- linedetail.Fulltext = string(ft)
- // Explicitly close file immediately after use, rather than relying on defer,
- // as too many files could be opened before any of the files are closed, leading
- // to a 'too many open files' error
- // TODO: rewrite this loop so it uses a function or two, so we can rely
- // on defer sensibly again.
- txtfile.Close()
-
- lines = append(lines, linedetail)
}
sort.Sort(lines)
@@ -178,8 +120,12 @@ func main() {
}
avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64)
- avgstr = avgstr[2:]
- err := copyline(l.Filebase, l.Dirname, l.Basename, avgstr, outdir, todir)
+ if len(avgstr) > 2 {
+ avgstr = avgstr[2:]
+ }
+ filebase := strings.Replace(l.Name, ".prob", "", 1)
+ basename := filepath.Base(filebase)
+ err := copyline(filebase, l.OcrName, basename, avgstr, outdir, todir, l)
if err != nil {
log.Fatal(err)
}
diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go
index a281a7a..f7cac05 100644
--- a/parse/hocr/hocr.go
+++ b/parse/hocr/hocr.go
@@ -1,7 +1,8 @@
package hocr
// TODO: consider making GetLineDetails() a function of Hocr, so could do a
-// similar thing with prob format files too.
+// similar thing with prob format files too, and then fire them both
+// off a generic interface, potentially.
// TODO: Parse line name to zero pad line numbers, so they come out in the correct order
import (
@@ -137,7 +138,7 @@ func GetLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, erro
}
line.Text = strings.TrimRight(linetext, " ")
line.Text += "\n"
- line.Hocrname = name
+ line.OcrName = name
var imgd parse.ImgDirect
imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
line.Img = imgd
diff --git a/parse/line.go b/parse/line.go
index 3ddde76..9a2be8e 100644
--- a/parse/line.go
+++ b/parse/line.go
@@ -9,6 +9,7 @@ import (
"image"
"image/png"
"io"
+ "os"
)
type LineDetail struct {
@@ -16,7 +17,7 @@ type LineDetail struct {
Avgconf float64
Img CopyableLine
Text string
- Hocrname string
+ OcrName string
}
type CopyableLine interface {
@@ -37,6 +38,21 @@ func (i ImgDirect) CopyLineTo(w io.Writer) (error) {
return nil
}
+type ImgPath struct { // ImgPath identifies a line image by file path; its CopyLineTo method copies that file's bytes to a writer.
+ Path string // path handed to os.Open by CopyLineTo
+}
+
+func (i ImgPath) CopyLineTo(w io.Writer) (error) { // CopyLineTo opens the file at i.Path and streams its contents unchanged to w, returning the open or copy error, if any.
+ f, err := os.Open(i.Path)
+ if err != nil {
+ return err
+ }
+ defer f.Close() // Close error is ignored; the file was opened for reading only by os.Open
+
+ _, err = io.Copy(w, f) // the byte count is discarded; only the error is reported
+ return err
+}
+
type LineDetails []LineDetail
// Used by sort.Sort.
diff --git a/parse/prob/prob.go b/parse/prob/prob.go
new file mode 100644
index 0000000..5a84567
--- /dev/null
+++ b/parse/prob/prob.go
@@ -0,0 +1,72 @@
+package prob
+
+import (
+ "bufio"
+ "io/ioutil"
+ "path/filepath"
+ "strconv"
+ "strings"
+
+ "git.rescribe.xyz/testingtools/parse"
+)
+
+// getLineAvg reads r line by line and returns the mean of the second whitespace-separated field over every two-field line whose second field parses as a float, or 0 when there is no such line. TODO: probably switch to just relying on io.Reader
+func getLineAvg(r *bufio.Reader) (float64, error) {
+ var err error
+
+ totalconf := float64(0) // running sum of parsed confidence values
+ num := 0 // how many values were added to totalconf
+
+ err = nil
+ for err == nil {
+ var line string
+ line, err = r.ReadString('\n') // data read before an error (e.g. a last line with no newline) is still returned, so it is processed below before the loop ends
+ fields := strings.Fields(line)
+
+ if len(fields) == 2 { // lines with any other number of fields are skipped
+ conf, converr := strconv.ParseFloat(fields[1], 64)
+ if converr != nil {
+ continue // unparseable value is skipped silently; err is untouched, so the loop condition still sees the last read result
+ }
+ totalconf += conf
+ num += 1
+ }
+ }
+ if num <= 0 { // nothing usable was read; return 0 rather than dividing by zero
+ return 0, nil
+ }
+ avg := totalconf / float64(num)
+ return avg, nil // NOTE(review): the error that ended the loop (io.EOF or a genuine read failure) is dropped, so this function never returns a non-nil error - confirm that is intended
+}
+
+// GetLineDetails builds the parse.LineDetails for the prob data in r: Avgconf comes from getLineAvg, Text from the sibling .txt file, and Img refers to the sibling .bin.png file. TODO: probably switch to just relying on io.Reader
+// Note that each call handles a single line, so the result holds one entry on success; on failure the empty slice is returned together with the error.
+func GetLineDetails(name string, r *bufio.Reader) (parse.LineDetails, error) {
+ var line parse.LineDetail
+ lines := make(parse.LineDetails, 0)
+
+ avg, err := getLineAvg(r)
+ if err != nil {
+ return lines, err
+ }
+
+ filebase := strings.Replace(name, ".prob", "", 1) // removes the first ".prob" found anywhere in name, not only a trailing extension
+
+ txt, err := ioutil.ReadFile(filebase + ".txt") // the line's text is read from the file next to the prob file
+ if err != nil {
+ return lines, err
+ }
+
+ line.Name = name // the name is stored as given, still including ".prob"
+ line.Avgconf = avg
+ line.Text = string(txt)
+ line.OcrName = filepath.Dir(filebase) // the directory part of the path serves as the OCR name
+
+ var imgfn parse.ImgPath
+ imgfn.Path = filebase + ".bin.png" // only the path is recorded here; the image file is not opened in this function
+ line.Img = imgfn
+
+ lines = append(lines, line)
+
+ return lines, nil
+}