diff options
Diffstat (limited to 'pkg')
-rw-r--r-- | pkg/hocr/hocr.go | 145 | ||||
-rw-r--r-- | pkg/hocr/lines.go | 163 | ||||
-rw-r--r-- | pkg/line/line.go | 62 | ||||
-rw-r--r-- | pkg/prob/prob.go | 72 |
4 files changed, 442 insertions, 0 deletions
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
new file mode 100644
--- /dev/null
+++ b/pkg/hocr/hocr.go
@@ -0,0 +1,145 @@
+// Package hocr parses hOCR files and extracts line text, images and
+// confidence values from them.
+package hocr
+
+import (
+	"encoding/xml"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// Compiled once at package scope rather than on every call.
+var (
+	wconfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)
+	bboxRe  = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
+)
+
+// Hocr holds the line-level spans unmarshalled from an hOCR document.
+type Hocr struct {
+	Lines []OcrLine `xml:"body>div>div>p>span"`
+}
+
+// OcrLine is a line span with its attributes, child word spans and any
+// character data found directly inside it.
+type OcrLine struct {
+	Class string    `xml:"class,attr"`
+	Id    string    `xml:"id,attr"`
+	Title string    `xml:"title,attr"`
+	Words []OcrWord `xml:"span"`
+	Text  string    `xml:",chardata"`
+}
+
+// OcrWord is a word span with its attributes, child character spans and
+// any character data found directly inside it.
+type OcrWord struct {
+	Class string    `xml:"class,attr"`
+	Id    string    `xml:"id,attr"`
+	Title string    `xml:"title,attr"`
+	Chars []OcrChar `xml:"span"`
+	Text  string    `xml:",chardata"`
+}
+
+// OcrChar is a character-level span; it may itself contain nested spans.
+type OcrChar struct {
+	Class string    `xml:"class,attr"`
+	Id    string    `xml:"id,attr"`
+	Title string    `xml:"title,attr"`
+	Chars []OcrChar `xml:"span"`
+	Text  string    `xml:",chardata"`
+}
+
+// wordConf returns the confidence for a word based on the x_wconf value
+// in its title attribute s. It returns an error, rather than panicking,
+// if s has no x_wconf value.
+func wordConf(s string) (float64, error) {
+	m := wconfRe.FindStringSubmatch(s)
+	if m == nil {
+		return 0, fmt.Errorf("no x_wconf value in title %q", s)
+	}
+	return strconv.ParseFloat(m[1], 64)
+}
+
+// boxCoords returns the four bbox values from the title attribute s, in
+// the order they appear. It returns an error, rather than panicking, if
+// s has no bbox.
+func boxCoords(s string) ([4]int, error) {
+	var coords [4]int
+	m := bboxRe.FindStringSubmatch(s)
+	if m == nil {
+		return coords, fmt.Errorf("no bbox in title %q", s)
+	}
+	for i := range coords {
+		c, err := strconv.Atoi(m[i+1])
+		if err != nil {
+			return coords, err
+		}
+		coords[i] = c
+	}
+	return coords, nil
+}
+
+// noText reports whether s contains nothing but spaces and newlines.
+func noText(s string) bool {
+	return strings.Trim(s, " \n") == ""
+}
+
+// Parse unmarshals the hOCR document in b.
+func Parse(b []byte) (Hocr, error) {
+	var hocr Hocr
+	err := xml.Unmarshal(b, &hocr)
+	return hocr, err
+}
+
+// GetText returns the text of every line in the hOCR file hocrfn, each
+// terminated by a newline.
+func GetText(hocrfn string) (string, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return "", err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return "", err
+	}
+
+	var b strings.Builder
+	for _, l := range h.Lines {
+		b.WriteString(getLineText(l))
+	}
+	return b.String(), nil
+}
+
+// GetAvgConf returns the mean x_wconf value over every word in the hOCR
+// file hocrfn. It returns an error if the file contains no words.
+func GetAvgConf(hocrfn string) (float64, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return 0, err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return 0, err
+	}
+
+	var total, num float64
+	for _, l := range h.Lines {
+		for _, w := range l.Words {
+			c, err := wordConf(w.Title)
+			if err != nil {
+				return 0, err
+			}
+			total += c
+			num++
+		}
+	}
+	if num == 0 {
+		return 0, errors.New("No words found")
+	}
+	return total / num, nil
+}
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go
new file mode 100644
--- /dev/null
+++ b/pkg/hocr/lines.go
@@ -0,0 +1,163 @@
+package hocr
+
+// TODO: Parse line name to zero pad line numbers, so they can
+// be sorted easily
+
+import (
+	"fmt"
+	"image"
+	"image/png"
+	"io/ioutil"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"rescribe.xyz/utils/pkg/line"
+)
+
+// subImager is implemented by image types that can return a cropped
+// view of themselves.
+type subImager interface {
+	SubImage(r image.Rectangle) image.Image
+}
+
+// getLineText returns the text of line l followed by a newline. It
+// uses the line's own character data if there is any, then falls back
+// to the text of its ocrx_word spans, and finally to the text of the
+// ocrx_cinfo spans inside those words.
+func getLineText(l OcrLine) string {
+	linetext := l.Text
+	if noText(linetext) {
+		var b strings.Builder
+		for _, w := range l.Words {
+			if w.Class != "ocrx_word" {
+				continue
+			}
+			b.WriteString(w.Text)
+			b.WriteByte(' ')
+		}
+		linetext = b.String()
+	}
+	if noText(linetext) {
+		var b strings.Builder
+		for _, w := range l.Words {
+			if w.Class != "ocrx_word" {
+				continue
+			}
+			for _, c := range w.Chars {
+				if c.Class != "ocrx_cinfo" {
+					continue
+				}
+				b.WriteString(c.Text)
+			}
+			b.WriteByte(' ')
+		}
+		linetext = b.String()
+	}
+	return strings.TrimRight(linetext, " ") + "\n"
+}
+
+// parseLineDetails builds a line.Detail for every line in h, using name
+// as each OcrName. Avgconf is the mean x_wconf of the line's words
+// divided by 100, or 0 for a line with no words. If i is not nil each
+// Detail's Img is the part of i inside the line's bbox; i must then
+// provide a SubImage method, otherwise an error is returned.
+func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) {
+	lines := make(line.Details, 0, len(h.Lines))
+
+	var img subImager
+	if i != nil {
+		var ok bool
+		img, ok = i.(subImager)
+		if !ok {
+			return nil, fmt.Errorf("image type %T does not support SubImage", i)
+		}
+	}
+
+	for _, l := range h.Lines {
+		var totalconf float64
+		num := 0
+		for _, w := range l.Words {
+			c, err := wordConf(w.Title)
+			if err != nil {
+				return nil, err
+			}
+			num++
+			totalconf += c
+		}
+
+		coords, err := boxCoords(l.Title)
+		if err != nil {
+			return nil, err
+		}
+
+		var ln line.Detail
+		ln.Name = l.Id
+		// A line with no words keeps a confidence of 0 rather than the
+		// NaN that dividing by zero would give.
+		if num > 0 {
+			ln.Avgconf = (totalconf / float64(num)) / 100
+		}
+		ln.Text = getLineText(l)
+		ln.OcrName = name
+		if img != nil {
+			r := image.Rect(coords[0], coords[1], coords[2], coords[3])
+			ln.Img = line.ImgDirect{Img: img.SubImage(r)}
+		}
+		lines = append(lines, ln)
+	}
+	return lines, nil
+}
+
+// GetLineDetails parses the hOCR file hocrfn and returns the details of
+// each line in it. Line images are cut from the PNG named like hocrfn
+// but with a trailing .hocr replaced by .png; if that cannot be opened
+// or decoded a warning is logged and the lines are returned without images.
+func GetLineDetails(hocrfn string) (line.Details, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return nil, err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return nil, err
+	}
+
+	// Only the trailing extension is swapped, so a ".hocr" earlier in
+	// the path (such as in a directory name) is left alone.
+	pngfn := strings.TrimSuffix(hocrfn, ".hocr") + ".png"
+
+	var img image.Image
+	pngf, err := os.Open(pngfn)
+	if err != nil {
+		log.Printf("Warning: can't open image %s\n", pngfn)
+	} else {
+		defer pngf.Close()
+		img, err = png.Decode(pngf)
+		if err != nil {
+			log.Printf("Warning: can't load image %s\n", pngfn)
+		}
+	}
+
+	n := strings.TrimSuffix(filepath.Base(hocrfn), ".hocr")
+	return parseLineDetails(h, img, n)
+}
+
+// GetLineBasics parses the hOCR file hocrfn and returns the details of
+// each line in it, without loading any line images.
+func GetLineBasics(hocrfn string) (line.Details, error) {
+	file, err := ioutil.ReadFile(hocrfn)
+	if err != nil {
+		return nil, err
+	}
+
+	h, err := Parse(file)
+	if err != nil {
+		return nil, err
+	}
+
+	n := strings.TrimSuffix(filepath.Base(hocrfn), ".hocr")
+	return parseLineDetails(h, nil, n)
+}
diff --git a/pkg/line/line.go b/pkg/line/line.go
new file mode 100644
--- /dev/null
+++ b/pkg/line/line.go
@@ -0,0 +1,62 @@
+// Package line provides the Detail type describing a single line of
+// OCR output, along with ways of copying out its image.
+package line
+
+import (
+	"image"
+	"image/png"
+	"io"
+	"os"
+)
+
+// Detail holds the name, average confidence, image and text of a line,
+// plus the name of the OCR output it came from.
+type Detail struct {
+	Name    string
+	Avgconf float64
+	Img     CopyableImg
+	Text    string
+	OcrName string
+}
+
+// CopyableImg is an image that can write itself to an io.Writer.
+type CopyableImg interface {
+	CopyLineTo(io.Writer) error
+}
+
+// Details is a list of Detail. Its Len, Less and Swap methods order it
+// by ascending Avgconf.
+type Details []Detail
+
+func (l Details) Len() int           { return len(l) }
+func (l Details) Less(i, j int) bool { return l[i].Avgconf < l[j].Avgconf }
+func (l Details) Swap(i, j int)      { l[i], l[j] = l[j], l[i] }
+
+// ImgDirect is an implementation of the CopyableImg interface that
+// stores the image directly as an image.Image
+type ImgDirect struct {
+	Img image.Image
+}
+
+// CopyLineTo writes the image to w encoded as a PNG.
+func (i ImgDirect) CopyLineTo(w io.Writer) error {
+	return png.Encode(w, i.Img)
+}
+
+// ImgPath is an implementation of the CopyableImg interface that
+// stores the path of an image
+type ImgPath struct {
+	Path string
+}
+
+// CopyLineTo copies the contents of the file at Path to w unchanged.
+func (i ImgPath) CopyLineTo(w io.Writer) error {
+	f, err := os.Open(i.Path)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	_, err = io.Copy(w, f)
+	return err
+}
diff --git a/pkg/prob/prob.go b/pkg/prob/prob.go
new file mode 100644
--- /dev/null
+++ b/pkg/prob/prob.go
@@ -0,0 +1,72 @@
+// Package prob reads line details from .prob files and the .txt and
+// .bin.png files alongside them.
+package prob
+
+import (
+	"io/ioutil"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"rescribe.xyz/utils/pkg/line"
+)
+
+// getLineAvg returns the mean of the second field of every two-field
+// line in the file f. Lines with a different number of fields, or
+// whose second field is not a number, are skipped. It returns 0 if no
+// values were found.
+func getLineAvg(f string) (float64, error) {
+	prob, err := ioutil.ReadFile(f)
+	if err != nil {
+		return 0, err
+	}
+
+	var totalconf float64
+	num := 0
+	for _, l := range strings.Split(string(prob), "\n") {
+		fields := strings.Fields(l)
+		if len(fields) != 2 {
+			continue
+		}
+		conf, err := strconv.ParseFloat(fields[1], 64)
+		if err != nil {
+			continue
+		}
+		totalconf += conf
+		num++
+	}
+	if num == 0 {
+		return 0, nil
+	}
+	return totalconf / float64(num), nil
+}
+
+// GetLineDetails returns the details of the single line described by
+// the file probfn. The text is read from the file named like probfn
+// but with a trailing .prob replaced by .txt, and the image path is
+// set to the same name with .bin.png instead.
+//
+// Note this only processes one line at a time
+func GetLineDetails(probfn string) (line.Details, error) {
+	avg, err := getLineAvg(probfn)
+	if err != nil {
+		return nil, err
+	}
+
+	// Only the trailing extension is removed, not a ".prob" earlier in the path.
+	filebase := strings.TrimSuffix(probfn, ".prob")
+
+	txt, err := ioutil.ReadFile(filebase + ".txt")
+	if err != nil {
+		return nil, err
+	}
+
+	l := line.Detail{
+		Name:    filepath.Base(filebase),
+		Avgconf: avg,
+		Img:     line.ImgPath{Path: filebase + ".bin.png"},
+		Text:    string(txt),
+		OcrName: filepath.Base(filepath.Dir(filebase)),
+	}
+	return line.Details{l}, nil
+}