From 30c088b90e7b6a25d93cbdad7564ff063e62afd3 Mon Sep 17 00:00:00 2001 From: Nick White Date: Fri, 25 Jan 2019 09:55:55 +0000 Subject: Reorganisation and cleanup --- README | 1 - bucket-lines/bucket-lines.go | 87 --------------------- bucket-lines/bucket.go | 129 ++++++++++++++++++++++++++++++ bucket-lines/main.go | 87 +++++++++++++++++++++ lib/hocr/hocr.go | 79 +++++++++++++++++++ lib/hocr/lines.go | 107 +++++++++++++++++++++++++ lib/line/line.go | 64 +++++++++++++++ lib/prob/prob.go | 69 +++++++++++++++++ parse/bucket.go | 123 ----------------------------- parse/hocr/hocr.go | 181 ------------------------------------------- parse/line.go | 67 ---------------- parse/prob/prob.go | 69 ----------------- 12 files changed, 535 insertions(+), 528 deletions(-) delete mode 100644 README delete mode 100644 bucket-lines/bucket-lines.go create mode 100644 bucket-lines/bucket.go create mode 100644 bucket-lines/main.go create mode 100644 lib/hocr/hocr.go create mode 100644 lib/hocr/lines.go create mode 100644 lib/line/line.go create mode 100644 lib/prob/prob.go delete mode 100644 parse/bucket.go delete mode 100644 parse/hocr/hocr.go delete mode 100644 parse/line.go delete mode 100644 parse/prob/prob.go diff --git a/README b/README deleted file mode 100644 index 697a1e1..0000000 --- a/README +++ /dev/null @@ -1 +0,0 @@ -There will be testing tools here. 
diff --git a/bucket-lines/bucket-lines.go b/bucket-lines/bucket-lines.go deleted file mode 100644 index ee81721..0000000 --- a/bucket-lines/bucket-lines.go +++ /dev/null @@ -1,87 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - "path/filepath" - - "git.rescribe.xyz/testingtools/parse" - "git.rescribe.xyz/testingtools/parse/hocr" - "git.rescribe.xyz/testingtools/parse/prob" -) - -func main() { - b := parse.BucketSpecs{ - // minimum confidence, name - { 0, "bad" }, - { 0.95, "95to98" }, - { 0.98, "98plus" }, - } - - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") - fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") - fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n\n") - fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n\n") - fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n\n") - fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") - fmt.Fprintf(os.Stderr, "option.\n\n") - fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") - fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") - flag.PrintDefaults() - fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") - fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") - } - dir := flag.String("d", "buckets", "Directory to store the buckets") - specs := flag.String("s", "", "JSON file describing specs to bucket into") - flag.Parse() - if flag.NArg() < 1 { - flag.Usage() - os.Exit(1) - } - - if *specs != "" { - js, err := ioutil.ReadFile(*specs) - if err != nil { - log.Fatal(err) - } - err = json.Unmarshal(js, &b) - if err != nil { - log.Fatal(err) - } - } - - 
var err error - lines := make(parse.LineDetails, 0) - - for _, f := range flag.Args() { - var newlines parse.LineDetails - switch ext := filepath.Ext(f); ext { - case ".prob": - newlines, err = prob.GetLineDetails(f) - case ".hocr": - newlines, err = hocr.GetLineDetails(f) - default: - log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) - continue - } - if err != nil { - log.Fatal(err) - } - - for _, l := range newlines { - lines = append(lines, l) - } - } - - stats, err := parse.BucketUp(lines, b, *dir) - if err != nil { - log.Fatal(err) - } - - parse.PrintBucketStats(os.Stdout, stats) -} diff --git a/bucket-lines/bucket.go b/bucket-lines/bucket.go new file mode 100644 index 0000000..5d9271a --- /dev/null +++ b/bucket-lines/bucket.go @@ -0,0 +1,129 @@ +package main + +import ( + "fmt" + "io" + "path/filepath" + "os" + "sort" + "strconv" + + "git.rescribe.xyz/testingtools/lib/line" +) + +type BucketSpec struct { + Min float64 + Name string +} +type BucketSpecs []BucketSpec +func (b BucketSpecs) Len() int { return len(b) } +func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } + +type BucketStat struct { + name string + num int +} +type BucketStats []BucketStat +func (b BucketStats) Len() int { return len(b) } +func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } +func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } + +// Copies the image and text for a line into a directory based on +// the line confidence, as defined by the buckets struct +func bucketLine(l line.Detail, buckets BucketSpecs, dirname string) (string, error) { + var bucket string + + todir := "" + for _, b := range buckets { + if l.Avgconf >= b.Min { + todir = b.Name + bucket = b.Name + } + } + + if todir == "" { + return bucket, nil + } + + avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) + if len(avgstr) > 2 { + avgstr = avgstr[2:] + } + + base := 
filepath.Join(dirname, todir, filepath.Base(l.OcrName) + "_" + l.Name + "_" + avgstr) + + err := os.MkdirAll(filepath.Join(dirname, todir), 0700) + if err != nil { + return bucket, err + } + + f, err := os.Create(base + ".png") + if err != nil { + return bucket, err + } + defer f.Close() + + err = l.Img.CopyLineTo(f) + if err != nil { + return bucket, err + } + + f, err = os.Create(base + ".txt") + if err != nil { + return bucket, err + } + defer f.Close() + + _, err = io.WriteString(f, l.Text) + if err != nil { + return bucket, err + } + + return bucket, err +} + +// Copies line images and text into directories based on their +// confidence, as defined by the buckets struct, and returns +// statistics of where lines went in the process. +func BucketUp(lines line.Details, buckets BucketSpecs, dirname string) (BucketStats, error) { + var all []string + var stats BucketStats + + sort.Sort(lines) + sort.Sort(buckets) + for _, l := range lines { + bname, err := bucketLine(l, buckets, dirname) + if err != nil { + return stats, err + } + all = append(all, bname) + } + + for _, b := range all { + i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) + if i == len(stats) { + newstat := BucketStat { b, 0 } + stats = append(stats, newstat) + i = len(stats) - 1 + } + stats[i].num++ + } + + return stats, nil +} + +// Prints statistics of where lines went when bucketing +func PrintBucketStats(w io.Writer, stats BucketStats) { + var total int + for _, s := range stats { + total += s.num + } + + fmt.Fprintf(w, "Copied %d lines\n", total) + fmt.Fprintf(w, "---------------------------------\n") + sort.Sort(stats) + for _, s := range stats { + fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100 * s.num / total) + } +} diff --git a/bucket-lines/main.go b/bucket-lines/main.go new file mode 100644 index 0000000..b70e358 --- /dev/null +++ b/bucket-lines/main.go @@ -0,0 +1,87 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "log" + 
"os" + "path/filepath" + + "git.rescribe.xyz/testingtools/lib/line" + "git.rescribe.xyz/testingtools/lib/hocr" + "git.rescribe.xyz/testingtools/lib/prob" +) + +func main() { + b := BucketSpecs{ + // minimum confidence, name + { 0, "bad" }, + { 0.95, "95to98" }, + { 0.98, "98plus" }, + } + + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [-s specs.json] [hocr1] [prob1] [hocr2] [...]\n") + fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") + fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n\n") + fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n\n") + fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n\n") + fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") + fmt.Fprintf(os.Stderr, "option.\n\n") + fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") + fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n") + flag.PrintDefaults() + fmt.Fprintf(os.Stderr, "\nAn example specs.json file would be the following:\n") + fmt.Fprintf(os.Stderr, "[{\"min\": 0, \"name\": \"terrible\"}, {\"min\": 0.80, \"name\": \"ok\"}, {\"min\": 0.98, \"name\": \"great\"}]\n") + } + dir := flag.String("d", "buckets", "Directory to store the buckets") + specs := flag.String("s", "", "JSON file describing specs to bucket into") + flag.Parse() + if flag.NArg() < 1 { + flag.Usage() + os.Exit(1) + } + + if *specs != "" { + js, err := ioutil.ReadFile(*specs) + if err != nil { + log.Fatal(err) + } + err = json.Unmarshal(js, &b) + if err != nil { + log.Fatal(err) + } + } + + var err error + lines := make(line.Details, 0) + + for _, f := range flag.Args() { + var newlines line.Details + switch ext := filepath.Ext(f); ext { + case ".prob": + newlines, err = prob.GetLineDetails(f) + case ".hocr": + newlines, err = hocr.GetLineDetails(f) + default: + 
log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) + continue + } + if err != nil { + log.Fatal(err) + } + + for _, l := range newlines { + lines = append(lines, l) + } + } + + stats, err := BucketUp(lines, b, *dir) + if err != nil { + log.Fatal(err) + } + + PrintBucketStats(os.Stdout, stats) +} diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go new file mode 100644 index 0000000..0d10819 --- /dev/null +++ b/lib/hocr/hocr.go @@ -0,0 +1,79 @@ +package hocr + +import ( + "encoding/xml" + "regexp" + "strconv" + "strings" +) + +type Hocr struct { + Lines []OcrLine `xml:"body>div>div>p>span"` +} + +type OcrLine struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Words []OcrWord `xml:"span"` + Text string `xml:",chardata"` +} + +type OcrWord struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Chars []OcrChar `xml:"span"` + Text string `xml:",chardata"` +} + +type OcrChar struct { + Class string `xml:"class,attr"` + Id string `xml:"id,attr"` + Title string `xml:"title,attr"` + Chars []OcrChar `xml:"span"` + Text string `xml:",chardata"` +} + +// Returns the confidence for a word based on its x_wconf value +func wordConf(s string) (float64, error) { + re, err := regexp.Compile(`x_wconf ([0-9.]+)`) + if err != nil { + return 0.0, err + } + conf := re.FindStringSubmatch(s) + return strconv.ParseFloat(conf[1], 64) +} + +func boxCoords(s string) ([4]int, error) { + var coords [4]int + re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) + if err != nil { + return coords, err + } + coordstr := re.FindStringSubmatch(s) + for i := range coords { + c, err := strconv.Atoi(coordstr[i+1]) + if err != nil { + return coords, err + } + coords[i] = c + } + return coords, nil +} + +func noText(s string) bool { + t := strings.Trim(s, " \n") + return len(t) == 0 +} + +func Parse(b []byte) (Hocr, error) { + var hocr Hocr + + err := xml.Unmarshal(b, 
&hocr) + if err != nil { + return hocr, err + } + + return hocr, nil +} diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go new file mode 100644 index 0000000..4902b40 --- /dev/null +++ b/lib/hocr/lines.go @@ -0,0 +1,107 @@ +package hocr + +// TODO: Parse line name to zero pad line numbers, so they can +// be sorted easily +// TODO: have same filename format as .prob uses, so include base +// dirname, and don't include line numbers if there's only +// one line in the hocr + +import ( + "image" + "image/png" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "git.rescribe.xyz/testingtools/lib/line" +) + +func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) { + lines := make(line.Details, 0) + + for _, l := range h.Lines { + totalconf := float64(0) + num := 0 + for _, w := range l.Words { + c, err := wordConf(w.Title) + if err != nil { + return lines, err + } + num++ + totalconf += c + } + + coords, err := boxCoords(l.Title) + if err != nil { + return lines, err + } + + var ln line.Detail + ln.Name = l.Id + ln.Avgconf = (totalconf/float64(num)) / 100 + linetext := "" + + linetext = l.Text + if(noText(linetext)) { + linetext = "" + for _, w := range l.Words { + if(w.Class != "ocrx_word") { + continue + } + linetext += w.Text + " " + } + } + if(noText(linetext)) { + linetext = "" + for _, w := range l.Words { + if(w.Class != "ocrx_word") { + continue + } + for _, c := range w.Chars { + if(c.Class != "ocrx_cinfo") { + continue + } + linetext += c.Text + } + linetext += " " + } + } + ln.Text = strings.TrimRight(linetext, " ") + ln.Text += "\n" + ln.OcrName = name + var imgd line.ImgDirect + imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) + ln.Img = imgd + lines = append(lines, ln) + } + return lines, nil +} + +func GetLineDetails(hocrfn string) (line.Details, error) { + var newlines line.Details + + file, err := ioutil.ReadFile(hocrfn) + if err != nil { + return newlines, err + } + + h, err := 
Parse(file) + if err != nil { + return newlines, err + } + + pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) + pngf, err := os.Open(pngfn) + if err != nil { + return newlines, err + } + defer pngf.Close() + img, err := png.Decode(pngf) + if err != nil { + return newlines, err + } + + n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) + return parseLineDetails(h, img, n) +} diff --git a/lib/line/line.go b/lib/line/line.go new file mode 100644 index 0000000..3adac0a --- /dev/null +++ b/lib/line/line.go @@ -0,0 +1,64 @@ +package line + +import ( + "image" + "image/png" + "io" + "os" +) + +type Detail struct { + Name string + Avgconf float64 + Img CopyableImg + Text string + OcrName string +} + +type CopyableImg interface { + CopyLineTo(io.Writer) (error) +} + +type Details []Detail + +// Used by sort.Sort. +func (l Details) Len() int { return len(l) } + +// Used by sort.Sort. +func (l Details) Less(i, j int) bool { + return l[i].Avgconf < l[j].Avgconf +} + +// Used by sort.Sort. 
+func (l Details) Swap(i, j int) { l[i], l[j] = l[j], l[i] } + +// This is an implementation of the CopyableImg interface that +// stores the image directly as an image.Image +type ImgDirect struct { + Img image.Image +} + +func (i ImgDirect) CopyLineTo(w io.Writer) (error) { + err := png.Encode(w, i.Img) + if err != nil { + return err + } + return nil +} + +// This is an implementation of the CopyableImg interface that +// stores the path of an image +type ImgPath struct { + Path string +} + +func (i ImgPath) CopyLineTo(w io.Writer) (error) { + f, err := os.Open(i.Path) + if err != nil { + return err + } + defer f.Close() + + _, err = io.Copy(w, f) + return err +} diff --git a/lib/prob/prob.go b/lib/prob/prob.go new file mode 100644 index 0000000..0299a96 --- /dev/null +++ b/lib/prob/prob.go @@ -0,0 +1,69 @@ +package prob + +import ( + "io/ioutil" + "path/filepath" + "strconv" + "strings" + + "git.rescribe.xyz/testingtools/lib/line" +) + +func getLineAvg(f string) (float64, error) { + totalconf := float64(0) + num := 0 + + prob, err := ioutil.ReadFile(f) + if err != nil { + return 0, err + } + + for _, l := range strings.Split(string(prob), "\n") { + fields := strings.Fields(l) + + if len(fields) == 2 { + conf, err := strconv.ParseFloat(fields[1], 64) + if err != nil { + continue + } + totalconf += conf + num += 1 + } + } + if num <= 0 { + return 0, nil + } + avg := totalconf / float64(num) + return avg, nil +} + +// Note this only processes one line at a time +func GetLineDetails(probfn string) (line.Details, error) { + var l line.Detail + lines := make(line.Details, 0) + + avg, err := getLineAvg(probfn) + if err != nil { + return lines, err + } + + filebase := strings.Replace(probfn, ".prob", "", 1) + + txt, err := ioutil.ReadFile(filebase + ".txt") + if err != nil { + return lines, err + } + + l.Name = filepath.Base(filebase) + l.Avgconf = avg + l.Text = string(txt) + l.OcrName = filepath.Dir(filebase) + + var imgfn line.ImgPath + imgfn.Path = filebase + 
".bin.png" + l.Img = imgfn + + lines = append(lines, l) + + return lines, nil +} diff --git a/parse/bucket.go b/parse/bucket.go deleted file mode 100644 index 44b1d24..0000000 --- a/parse/bucket.go +++ /dev/null @@ -1,123 +0,0 @@ -package parse - -import ( - "fmt" - "io" - "path/filepath" - "os" - "sort" - "strconv" -) - -type BucketSpec struct { - Min float64 - Name string -} -type BucketSpecs []BucketSpec -func (b BucketSpecs) Len() int { return len(b) } -func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } - -func bucketLine(l LineDetail, buckets BucketSpecs, dirname string) (string, error) { - var bucket string - - todir := "" - for _, b := range buckets { - if l.Avgconf >= b.Min { - todir = b.Name - bucket = b.Name - } - } - - if todir == "" { - return bucket, nil - } - - avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) - if len(avgstr) > 2 { - avgstr = avgstr[2:] - } - - base := filepath.Join(dirname, todir, filepath.Base(l.OcrName) + "_" + l.Name + "_" + avgstr) - - err := os.MkdirAll(filepath.Join(dirname, todir), 0700) - if err != nil { - return bucket, err - } - - f, err := os.Create(base + ".png") - if err != nil { - return bucket, err - } - defer f.Close() - - err = l.Img.CopyLineTo(f) - if err != nil { - return bucket, err - } - - f, err = os.Create(base + ".txt") - if err != nil { - return bucket, err - } - defer f.Close() - - _, err = io.WriteString(f, l.Text) - if err != nil { - return bucket, err - } - - return bucket, err -} - -type BucketStat struct { - name string - num int -} -type BucketStats []BucketStat -func (b BucketStats) Len() int { return len(b) } -func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } - -// Copies line images and text into directories based on their -// confidence, as defined by the buckets struct -func BucketUp(lines LineDetails, buckets BucketSpecs, 
dirname string) (BucketStats, error) { - var all []string - var stats BucketStats - - sort.Sort(lines) - sort.Sort(buckets) - for _, l := range lines { - bname, err := bucketLine(l, buckets, dirname) - if err != nil { - return stats, err - } - all = append(all, bname) - } - - for _, b := range all { - i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) - if i == len(stats) { - newstat := BucketStat { b, 0 } - stats = append(stats, newstat) - i = len(stats) - 1 - } - stats[i].num++ - } - - return stats, nil -} - -func PrintBucketStats(w io.Writer, stats BucketStats) { - var total int - for _, s := range stats { - total += s.num - } - - fmt.Fprintf(w, "Copied %d lines\n", total) - fmt.Fprintf(w, "---------------------------------\n") - sort.Sort(stats) - for _, s := range stats { - fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100 * s.num / total) - } -} diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go deleted file mode 100644 index 81250a9..0000000 --- a/parse/hocr/hocr.go +++ /dev/null @@ -1,181 +0,0 @@ -package hocr - -// TODO: Parse line name to zero pad line numbers, so they can -// be sorted easily -// TODO: have same filename format as .prob uses, so include base -// dirname, and don't include line numbers if there's only -// one line in the hocr - -import ( - "encoding/xml" - "image" - "image/png" - "io/ioutil" - "os" - "path/filepath" - "regexp" - "strconv" - "strings" - - "git.rescribe.xyz/testingtools/parse" -) - -type Hocr struct { - Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Words []OcrWord `xml:"span"` - Text string `xml:",chardata"` -} - -type OcrWord struct { - Class string `xml:"class,attr"` - Id string `xml:"id,attr"` - Title string `xml:"title,attr"` - Chars []OcrChar `xml:"span"` - Text string `xml:",chardata"` -} - -type OcrChar struct { - Class string `xml:"class,attr"` - Id string 
`xml:"id,attr"` - Title string `xml:"title,attr"` - Chars []OcrChar `xml:"span"` - Text string `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { - re, err := regexp.Compile(`x_wconf ([0-9.]+)`) - if err != nil { - return 0.0, err - } - conf := re.FindStringSubmatch(s) - return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { - var coords [4]int - re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) - if err != nil { - return coords, err - } - coordstr := re.FindStringSubmatch(s) - for i := range coords { - c, err := strconv.Atoi(coordstr[i+1]) - if err != nil { - return coords, err - } - coords[i] = c - } - return coords, nil -} - -func noText(s string) bool { - t := strings.Trim(s, " \n") - return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { - var hocr Hocr - - err := xml.Unmarshal(b, &hocr) - if err != nil { - return hocr, err - } - - return hocr, nil -} - -func parseLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) { - lines := make(parse.LineDetails, 0) - - for _, l := range h.Lines { - totalconf := float64(0) - num := 0 - for _, w := range l.Words { - c, err := wordConf(w.Title) - if err != nil { - return lines, err - } - num++ - totalconf += c - } - - coords, err := boxCoords(l.Title) - if err != nil { - return lines, err - } - - var line parse.LineDetail - line.Name = l.Id - line.Avgconf = (totalconf/float64(num)) / 100 - linetext := "" - - linetext = l.Text - if(noText(linetext)) { - linetext = "" - for _, w := range l.Words { - if(w.Class != "ocrx_word") { - continue - } - linetext += w.Text + " " - } - } - if(noText(linetext)) { - linetext = "" - for _, w := range l.Words { - if(w.Class != "ocrx_word") { - continue - } - for _, c := range w.Chars { - if(c.Class != "ocrx_cinfo") { - continue - } - linetext += c.Text - } - linetext += " " - } - } - line.Text = strings.TrimRight(linetext, " ") - 
line.Text += "\n" - line.OcrName = name - var imgd parse.ImgDirect - imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) - line.Img = imgd - lines = append(lines, line) - } - return lines, nil -} - -func GetLineDetails(hocrfn string) (parse.LineDetails, error) { - var newlines parse.LineDetails - - file, err := ioutil.ReadFile(hocrfn) - if err != nil { - return newlines, err - } - - h, err := Parse(file) - if err != nil { - return newlines, err - } - - pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) - pngf, err := os.Open(pngfn) - if err != nil { - return newlines, err - } - defer pngf.Close() - img, err := png.Decode(pngf) - if err != nil { - return newlines, err - } - - n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) - return parseLineDetails(h, img, n) -} diff --git a/parse/line.go b/parse/line.go deleted file mode 100644 index 9a2be8e..0000000 --- a/parse/line.go +++ /dev/null @@ -1,67 +0,0 @@ -package parse - -// TODO: integrate in line-conf-buckets linedetail -// TODO: add BucketUp() function here that does what both line-conf-buckets-tess.go -// and line-conf-buckets.go do -// TODO: consider naming this package line, and separating it from hocr and prob - -import ( - "image" - "image/png" - "io" - "os" -) - -type LineDetail struct { - Name string - Avgconf float64 - Img CopyableLine - Text string - OcrName string -} - -type CopyableLine interface { - CopyLineTo(io.Writer) (error) -} - -// This is an implementation of the CopyableLine interface that -// stores the image directly as an image.Image -type ImgDirect struct { - Img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) (error) { - err := png.Encode(w, i.Img) - if err != nil { - return err - } - return nil -} - -type ImgPath struct { - Path string -} - -func (i ImgPath) CopyLineTo(w io.Writer) (error) { - f, err := os.Open(i.Path) - if err != nil { - return err - } - defer f.Close() - - _, err = io.Copy(w, f) - return err -} - -type 
LineDetails []LineDetail - -// Used by sort.Sort. -func (l LineDetails) Len() int { return len(l) } - -// Used by sort.Sort. -func (l LineDetails) Less(i, j int) bool { - return l[i].Avgconf < l[j].Avgconf -} - -// Used by sort.Sort. -func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } diff --git a/parse/prob/prob.go b/parse/prob/prob.go deleted file mode 100644 index 8d01cab..0000000 --- a/parse/prob/prob.go +++ /dev/null @@ -1,69 +0,0 @@ -package prob - -import ( - "io/ioutil" - "path/filepath" - "strconv" - "strings" - - "git.rescribe.xyz/testingtools/parse" -) - -func getLineAvg(f string) (float64, error) { - totalconf := float64(0) - num := 0 - - prob, err := ioutil.ReadFile(f) - if err != nil { - return 0, err - } - - for _, line := range strings.Split(string(prob), "\n") { - fields := strings.Fields(line) - - if len(fields) == 2 { - conf, err := strconv.ParseFloat(fields[1], 64) - if err != nil { - continue - } - totalconf += conf - num += 1 - } - } - if num <= 0 { - return 0, nil - } - avg := totalconf / float64(num) - return avg, nil -} - -// Note this only processes one line at a time -func GetLineDetails(probfn string) (parse.LineDetails, error) { - var line parse.LineDetail - lines := make(parse.LineDetails, 0) - - avg, err := getLineAvg(probfn) - if err != nil { - return lines, err - } - - filebase := strings.Replace(probfn, ".prob", "", 1) - - txt, err := ioutil.ReadFile(filebase + ".txt") - if err != nil { - return lines, err - } - - line.Name = filepath.Base(filebase) - line.Avgconf = avg - line.Text = string(txt) - line.OcrName = filepath.Dir(filebase) - - var imgfn parse.ImgPath - imgfn.Path = filebase + ".bin.png" - line.Img = imgfn - - lines = append(lines, line) - - return lines, nil -} -- cgit v1.2.1-24-ge1ad