summary refs log tree commit diff
path: root/parse
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2019-01-25 09:55:55 +0000
committer Nick White <git@njw.name> 2019-01-25 09:55:55 +0000
commit 30c088b90e7b6a25d93cbdad7564ff063e62afd3 (patch)
tree c47d7bc086a076cfe5e702628c4e5e3b1eab1aa5 /parse
parent 1c17766952bdcd6f7d31d0fa1a2e504b1aa4f14a (diff)
Reorganisation and cleanup
Diffstat (limited to 'parse')
-rw-r--r-- parse/bucket.go 123
-rw-r--r-- parse/hocr/hocr.go 181
-rw-r--r-- parse/line.go 67
-rw-r--r-- parse/prob/prob.go 69
4 files changed, 0 insertions, 440 deletions
diff --git a/parse/bucket.go b/parse/bucket.go
deleted file mode 100644
index 44b1d24..0000000
--- a/parse/bucket.go
+++ /dev/null
@@ -1,123 +0,0 @@
-package parse
-
-import (
- "fmt"
- "io"
- "path/filepath"
- "os"
- "sort"
- "strconv"
-)
-
// BucketSpec describes one confidence bucket: Name is the bucket's name and
// Min is the value that bucketLine compares a line's Avgconf against.
type BucketSpec struct {
	Min  float64
	Name string
}

// BucketSpecs is a list of BucketSpec values. Its Len, Swap and Less methods
// let it be passed to sort.Sort, which orders it by ascending Min.
type BucketSpecs []BucketSpec

// Len reports how many specs the list holds.
func (b BucketSpecs) Len() int {
	return len(b)
}

// Swap exchanges the specs at positions i and j.
func (b BucketSpecs) Swap(i, j int) {
	tmp := b[i]
	b[i] = b[j]
	b[j] = tmp
}

// Less reports whether the spec at i has a smaller Min than the spec at j.
func (b BucketSpecs) Less(i, j int) bool {
	return b[j].Min > b[i].Min
}
-
-func bucketLine(l LineDetail, buckets BucketSpecs, dirname string) (string, error) {
- var bucket string
-
- todir := ""
- for _, b := range buckets {
- if l.Avgconf >= b.Min {
- todir = b.Name
- bucket = b.Name
- }
- }
-
- if todir == "" {
- return bucket, nil
- }
-
- avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64)
- if len(avgstr) > 2 {
- avgstr = avgstr[2:]
- }
-
- base := filepath.Join(dirname, todir, filepath.Base(l.OcrName) + "_" + l.Name + "_" + avgstr)
-
- err := os.MkdirAll(filepath.Join(dirname, todir), 0700)
- if err != nil {
- return bucket, err
- }
-
- f, err := os.Create(base + ".png")
- if err != nil {
- return bucket, err
- }
- defer f.Close()
-
- err = l.Img.CopyLineTo(f)
- if err != nil {
- return bucket, err
- }
-
- f, err = os.Create(base + ".txt")
- if err != nil {
- return bucket, err
- }
- defer f.Close()
-
- _, err = io.WriteString(f, l.Text)
- if err != nil {
- return bucket, err
- }
-
- return bucket, err
-}
-
// BucketStat records how many lines were counted for the bucket called name.
type BucketStat struct {
	name string
	num  int
}

// BucketStats is a list of BucketStat values. Its Len, Swap and Less methods
// let it be passed to sort.Sort, which orders it by ascending line count.
type BucketStats []BucketStat

// Len reports how many stats the list holds.
func (b BucketStats) Len() int {
	return len(b)
}

// Swap exchanges the stats at positions i and j.
func (b BucketStats) Swap(i, j int) {
	tmp := b[i]
	b[i] = b[j]
	b[j] = tmp
}

// Less reports whether the stat at i counts fewer lines than the stat at j.
func (b BucketStats) Less(i, j int) bool {
	return b[j].num > b[i].num
}
-
-// Copies line images and text into directories based on their
-// confidence, as defined by the buckets struct
-func BucketUp(lines LineDetails, buckets BucketSpecs, dirname string) (BucketStats, error) {
- var all []string
- var stats BucketStats
-
- sort.Sort(lines)
- sort.Sort(buckets)
- for _, l := range lines {
- bname, err := bucketLine(l, buckets, dirname)
- if err != nil {
- return stats, err
- }
- all = append(all, bname)
- }
-
- for _, b := range all {
- i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b })
- if i == len(stats) {
- newstat := BucketStat { b, 0 }
- stats = append(stats, newstat)
- i = len(stats) - 1
- }
- stats[i].num++
- }
-
- return stats, nil
-}
-
-func PrintBucketStats(w io.Writer, stats BucketStats) {
- var total int
- for _, s := range stats {
- total += s.num
- }
-
- fmt.Fprintf(w, "Copied %d lines\n", total)
- fmt.Fprintf(w, "---------------------------------\n")
- sort.Sort(stats)
- for _, s := range stats {
- fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100 * s.num / total)
- }
-}
diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go
deleted file mode 100644
index 81250a9..0000000
--- a/parse/hocr/hocr.go
+++ /dev/null
@@ -1,181 +0,0 @@
-package hocr
-
-// TODO: Parse line name to zero pad line numbers, so they can
-// be sorted easily
-// TODO: have same filename format as .prob uses, so include base
-// dirname, and don't include line numbers if there's only
-// one line in the hocr
-
-import (
- "encoding/xml"
- "image"
- "image/png"
- "io/ioutil"
- "os"
- "path/filepath"
- "regexp"
- "strconv"
- "strings"
-
- "git.rescribe.xyz/testingtools/parse"
-)
-
// Types that encoding/xml decodes an hOCR document into. Each level keeps the
// class, id and title attributes of its element, the character data found
// directly inside it, and the span elements nested one level below.
type (
	// Hocr holds every span element found at the path
	// body>div>div>p>span of the document.
	Hocr struct {
		Lines []OcrLine `xml:"body>div>div>p>span"`
	}

	// OcrLine is one span collected by Hocr; its child spans are decoded
	// as Words.
	OcrLine struct {
		Class string    `xml:"class,attr"`
		Id    string    `xml:"id,attr"`
		Title string    `xml:"title,attr"`
		Words []OcrWord `xml:"span"`
		Text  string    `xml:",chardata"`
	}

	// OcrWord is a span nested inside an OcrLine; its child spans are
	// decoded as Chars.
	OcrWord struct {
		Class string    `xml:"class,attr"`
		Id    string    `xml:"id,attr"`
		Title string    `xml:"title,attr"`
		Chars []OcrChar `xml:"span"`
		Text  string    `xml:",chardata"`
	}

	// OcrChar is a span nested inside an OcrWord. Any spans nested
	// further are decoded recursively into Chars.
	OcrChar struct {
		Class string    `xml:"class,attr"`
		Id    string    `xml:"id,attr"`
		Title string    `xml:"title,attr"`
		Chars []OcrChar `xml:"span"`
		Text  string    `xml:",chardata"`
	}
)
-
// Patterns for the properties read out of hOCR title attributes. They are
// compiled once at package initialisation instead of on every call.
var (
	wconfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)
	bboxRe  = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
)

// titleError reports that a title attribute does not contain a property that
// was looked for.
type titleError struct {
	prop  string // name of the missing property, e.g. "x_wconf"
	title string // the title attribute that was searched
}

// Error implements the error interface.
func (e *titleError) Error() string {
	return "hocr: no " + e.prop + " found in title " + strconv.Quote(e.title)
}

// wordConf returns the confidence for a word based on the x_wconf value in
// its title attribute s. It returns an error if s contains no x_wconf
// property or if the value is not a valid floating-point number.
func wordConf(s string) (float64, error) {
	m := wconfRe.FindStringSubmatch(s)
	if m == nil {
		// FindStringSubmatch returns nil when nothing matches; indexing
		// that result would panic.
		return 0, &titleError{prop: "x_wconf", title: s}
	}
	return strconv.ParseFloat(m[1], 64)
}

// boxCoords returns the four integers of the bbox property in the title
// attribute s, in the order they appear. It returns an error, with all
// coordinates zero, if s contains no bbox property or if one of the numbers
// does not fit in an int.
func boxCoords(s string) ([4]int, error) {
	var coords [4]int
	m := bboxRe.FindStringSubmatch(s)
	if m == nil {
		return coords, &titleError{prop: "bbox", title: s}
	}
	for i := range coords {
		c, err := strconv.Atoi(m[i+1])
		if err != nil {
			return [4]int{}, err
		}
		coords[i] = c
	}
	return coords, nil
}
-
// noText reports whether s consists of nothing but spaces and newlines
// (which includes the empty string).
func noText(s string) bool {
	for _, r := range s {
		if r != ' ' && r != '\n' {
			return false
		}
	}
	return true
}
-
-func Parse(b []byte) (Hocr, error) {
- var hocr Hocr
-
- err := xml.Unmarshal(b, &hocr)
- if err != nil {
- return hocr, err
- }
-
- return hocr, nil
-}
-
-func parseLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) {
- lines := make(parse.LineDetails, 0)
-
- for _, l := range h.Lines {
- totalconf := float64(0)
- num := 0
- for _, w := range l.Words {
- c, err := wordConf(w.Title)
- if err != nil {
- return lines, err
- }
- num++
- totalconf += c
- }
-
- coords, err := boxCoords(l.Title)
- if err != nil {
- return lines, err
- }
-
- var line parse.LineDetail
- line.Name = l.Id
- line.Avgconf = (totalconf/float64(num)) / 100
- linetext := ""
-
- linetext = l.Text
- if(noText(linetext)) {
- linetext = ""
- for _, w := range l.Words {
- if(w.Class != "ocrx_word") {
- continue
- }
- linetext += w.Text + " "
- }
- }
- if(noText(linetext)) {
- linetext = ""
- for _, w := range l.Words {
- if(w.Class != "ocrx_word") {
- continue
- }
- for _, c := range w.Chars {
- if(c.Class != "ocrx_cinfo") {
- continue
- }
- linetext += c.Text
- }
- linetext += " "
- }
- }
- line.Text = strings.TrimRight(linetext, " ")
- line.Text += "\n"
- line.OcrName = name
- var imgd parse.ImgDirect
- imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
- line.Img = imgd
- lines = append(lines, line)
- }
- return lines, nil
-}
-
-func GetLineDetails(hocrfn string) (parse.LineDetails, error) {
- var newlines parse.LineDetails
-
- file, err := ioutil.ReadFile(hocrfn)
- if err != nil {
- return newlines, err
- }
-
- h, err := Parse(file)
- if err != nil {
- return newlines, err
- }
-
- pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1)
- pngf, err := os.Open(pngfn)
- if err != nil {
- return newlines, err
- }
- defer pngf.Close()
- img, err := png.Decode(pngf)
- if err != nil {
- return newlines, err
- }
-
- n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1)
- return parseLineDetails(h, img, n)
-}
diff --git a/parse/line.go b/parse/line.go
deleted file mode 100644
index 9a2be8e..0000000
--- a/parse/line.go
+++ /dev/null
@@ -1,67 +0,0 @@
-package parse
-
-// TODO: integrate in line-conf-buckets linedetail
-// TODO: add BucketUp() function here that does what both line-conf-buckets-tess.go
-// and line-conf-buckets.go do
-// TODO: consider naming this package line, and separating it from hocr and prob
-
-import (
- "image"
- "image/png"
- "io"
- "os"
-)
-
type (
	// LineDetail holds what is known about a single line: its name, its
	// average confidence, a handle on its image, its text, and the name
	// of the OCR output it belongs to.
	LineDetail struct {
		Name    string
		Avgconf float64
		Img     CopyableLine
		Text    string
		OcrName string
	}

	// CopyableLine is anything that can write a line's image to an
	// io.Writer.
	CopyableLine interface {
		CopyLineTo(io.Writer) error
	}
)
-
// ImgDirect implements the CopyableLine interface by keeping the line image
// in memory as an image.Image.
type ImgDirect struct {
	Img image.Image
}

// CopyLineTo PNG-encodes the stored image to w and returns any error the
// encoder reports.
func (i ImgDirect) CopyLineTo(w io.Writer) error {
	return png.Encode(w, i.Img)
}
-
// ImgPath implements the CopyableLine interface by remembering only the path
// of the line's image file.
type ImgPath struct {
	Path string
}

// CopyLineTo opens the file at Path and copies its bytes, unchanged, to w.
// It returns the error from opening the file or from the copy.
func (i ImgPath) CopyLineTo(w io.Writer) error {
	src, err := os.Open(i.Path)
	if err != nil {
		return err
	}
	defer src.Close()

	if _, err := io.Copy(w, src); err != nil {
		return err
	}
	return nil
}
-
-type LineDetails []LineDetail
-
-// Used by sort.Sort.
-func (l LineDetails) Len() int { return len(l) }
-
-// Used by sort.Sort.
-func (l LineDetails) Less(i, j int) bool {
- return l[i].Avgconf < l[j].Avgconf
-}
-
-// Used by sort.Sort.
-func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
diff --git a/parse/prob/prob.go b/parse/prob/prob.go
deleted file mode 100644
index 8d01cab..0000000
--- a/parse/prob/prob.go
+++ /dev/null
@@ -1,69 +0,0 @@
-package prob
-
-import (
- "io/ioutil"
- "path/filepath"
- "strconv"
- "strings"
-
- "git.rescribe.xyz/testingtools/parse"
-)
-
// getLineAvg reads the file f and returns the mean of the confidence values
// it contains.
//
// The file is split on newlines. A row contributes only if it has exactly two
// whitespace-separated fields and the second one parses as a float64; every
// other row is skipped without error. If no row contributes the result is 0.
// The only error returned is a failure to read f.
func getLineAvg(f string) (float64, error) {
	data, err := ioutil.ReadFile(f)
	if err != nil {
		return 0, err
	}

	var sum float64
	count := 0
	for _, row := range strings.Split(string(data), "\n") {
		cols := strings.Fields(row)
		if len(cols) != 2 {
			continue
		}
		v, perr := strconv.ParseFloat(cols[1], 64)
		if perr != nil {
			continue
		}
		sum += v
		count++
	}

	if count == 0 {
		return 0, nil
	}
	return sum / float64(count), nil
}
-
-// Note this only processes one line at a time
-func GetLineDetails(probfn string) (parse.LineDetails, error) {
- var line parse.LineDetail
- lines := make(parse.LineDetails, 0)
-
- avg, err := getLineAvg(probfn)
- if err != nil {
- return lines, err
- }
-
- filebase := strings.Replace(probfn, ".prob", "", 1)
-
- txt, err := ioutil.ReadFile(filebase + ".txt")
- if err != nil {
- return lines, err
- }
-
- line.Name = filepath.Base(filebase)
- line.Avgconf = avg
- line.Text = string(txt)
- line.OcrName = filepath.Dir(filebase)
-
- var imgfn parse.ImgPath
- imgfn.Path = filebase + ".bin.png"
- line.Img = imgfn
-
- lines = append(lines, line)
-
- return lines, nil
-}