diff options
-rw-r--r-- | line-conf-buckets-tess/line-conf-buckets-tess.go | 261 |
1 files changed, 261 insertions, 0 deletions
package main

// line-conf-buckets-tess sorts the text lines of Tesseract hOCR output into
// quality buckets according to the mean word confidence of each line, and
// saves a cropped image of every line into the directory for its bucket.
//
// For each FILE.hocr given on the command line a page image FILE.png is
// expected alongside it; line images are cut out of that page image using
// the bbox recorded for the line in the hOCR.
//
// TODO: combine this with line-conf-buckets, separating the parsing
// out to a separate library, probably
// see https://github.com/OCR-D/ocrd-train/issues/7 and https://github.com/OCR-D/ocrd-train/
// for tips on creating lines of tif/txt.
// TODO: save the text of each line next to its image, so the pairs can be
// used as training data.

import (
	"encoding/xml"
	"flag"
	"fmt"
	"image"
	"image/png"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
)

// Patterns for the hOCR title properties used here. They are compiled once
// rather than on every call.
var (
	wconfRe = regexp.MustCompile(`x_wconf ([0-9.]+)`)
	bboxRe  = regexp.MustCompile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
)

// LineDetail holds what is known about a single recognised line.
type LineDetail struct {
	name     string      // id of the line element in the hOCR
	avgconf  float64     // mean word confidence, scaled to 0-1
	img      image.Image // the line cropped out of the page image
	text     string      // character data found directly inside the line element
	hocrname string      // base name of the hOCR file, without ".hocr"
}

// LineDetails is a list of lines that sorts by ascending average confidence.
type LineDetails []LineDetail

// Used by sort.Sort.
func (l LineDetails) Len() int { return len(l) }

// Used by sort.Sort.
func (l LineDetails) Less(i, j int) bool {
	return l[i].avgconf < l[j].avgconf
}

// Used by sort.Sort.
func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }

// copyline copies filebase+".bin.png" and filebase+".txt" into
// outdir/todir, naming the copies <base of dirname>_<basename>_<avgconf>
// plus the same extension. It stops at the first failure and returns it.
func copyline(filebase string, dirname string, basename string, avgconf string, outdir string, todir string) (err error) {
	outname := filepath.Join(outdir, todir, filepath.Base(dirname)+"_"+basename+"_"+avgconf)

	for _, extn := range []string{".bin.png", ".txt"} {
		if err = copyFile(filebase+extn, outname+extn); err != nil {
			return fmt.Errorf("copying %s to %s: %w", filebase+extn, outname+extn, err)
		}
	}

	return nil
}

// copyFile copies src to dst, creating the directory of dst if needed. Both
// files are closed before it returns, and an error from closing dst is
// reported because it can mean the data never reached the disk.
func copyFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	if err = os.MkdirAll(filepath.Dir(dst), 0700); err != nil {
		return err
	}

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}

// Hocr is the part of an hOCR document this program reads: the line
// elements found at body>div>div>p>span.
type Hocr struct {
	Lines []OcrLine `xml:"body>div>div>p>span"`
}

// OcrLine is one line element, with the word elements nested inside it.
type OcrLine struct {
	Class string    `xml:"class,attr"`
	Id    string    `xml:"id,attr"`
	Title string    `xml:"title,attr"`
	Words []OcrWord `xml:"span"`
	Text  string    `xml:",chardata"`
}

// OcrWord is one word element; its Title carries the x_wconf confidence.
type OcrWord struct {
	Class string `xml:"class,attr"`
	Id    string `xml:"id,attr"`
	Title string `xml:"title,attr"`
	// TODO: also capture OcrChar where it exists, to grab text from it
	// TODO: grab text from these elements, to save for the line
}

// wordConf returns the confidence for a word based on the x_wconf value in
// its title string s, exactly as written in the hOCR. It returns an error
// if s has no x_wconf property or the value is not a number.
func wordConf(s string) (float64, error) {
	m := wconfRe.FindStringSubmatch(s)
	if m == nil {
		return 0, fmt.Errorf("no x_wconf found in %q", s)
	}
	return strconv.ParseFloat(m[1], 64)
}

// boxCoords returns the four numbers of the bbox property in the title
// string s, in the order they are written. It returns an error if s has no
// bbox property or a number does not fit in an int.
func boxCoords(s string) ([4]int, error) {
	var coords [4]int
	m := bboxRe.FindStringSubmatch(s)
	if m == nil {
		return coords, fmt.Errorf("no bbox found in %q", s)
	}
	for i := range coords {
		c, err := strconv.Atoi(m[i+1])
		if err != nil {
			return [4]int{}, err
		}
		coords[i] = c
	}
	return coords, nil
}

// subImager is implemented by every concrete image type in the standard
// image package, so a line can be cropped whatever kind of PNG was decoded.
type subImager interface {
	SubImage(r image.Rectangle) image.Image
}

// decodePNG reads the PNG at path, closing the file before returning.
func decodePNG(path string) (image.Image, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	img, err := png.Decode(f)
	if err != nil {
		return nil, fmt.Errorf("decoding %s: %w", path, err)
	}
	return img, nil
}

// writePNG encodes img to a new file at path. An error from closing the
// file is reported, as it can mean the image was not fully written.
func writePNG(path string, img image.Image) (err error) {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()
	return png.Encode(f, img)
}

// loadLines parses the hOCR file hocrfn and the page image beside it (same
// name with ".png" in place of ".hocr") and returns one LineDetail per line.
func loadLines(hocrfn string) (LineDetails, error) {
	data, err := ioutil.ReadFile(hocrfn)
	if err != nil {
		return nil, err
	}

	// A fresh value per file: xml.Unmarshal appends to an existing slice,
	// so reusing one Hocr would carry earlier files' lines into this one.
	var hocr Hocr
	if err := xml.Unmarshal(data, &hocr); err != nil {
		return nil, fmt.Errorf("parsing %s: %w", hocrfn, err)
	}

	pngfn := strings.TrimSuffix(hocrfn, ".hocr") + ".png"
	img, err := decodePNG(pngfn)
	if err != nil {
		return nil, err
	}
	page, ok := img.(subImager)
	if !ok {
		return nil, fmt.Errorf("%s: image type %T cannot be cropped", pngfn, img)
	}

	hocrname := strings.TrimSuffix(filepath.Base(hocrfn), ".hocr")
	lines := make(LineDetails, 0, len(hocr.Lines))

	for _, l := range hocr.Lines {
		totalconf := float64(0)
		for _, w := range l.Words {
			c, err := wordConf(w.Title)
			if err != nil {
				return nil, fmt.Errorf("%s: line %s: %w", hocrfn, l.Id, err)
			}
			totalconf += c
		}

		coords, err := boxCoords(l.Title)
		if err != nil {
			return nil, fmt.Errorf("%s: line %s: %w", hocrfn, l.Id, err)
		}

		var line LineDetail
		line.name = l.Id
		// Tesseract writes x_wconf as a percentage, while the bucket
		// thresholds are fractions, hence the division by 100. A line
		// with no words keeps confidence 0 rather than becoming NaN,
		// which would compare false against every threshold.
		if num := len(l.Words); num > 0 {
			line.avgconf = totalconf / float64(num) / 100
		}
		line.text = l.Text // TODO: get text from OcrWord and OcrChar (if available)
		line.hocrname = hocrname
		line.img = page.SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3]))
		lines = append(lines, line)
	}

	return lines, nil
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: line-conf-buckets hocr1 [hocr2] [...]\n")
		fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n")
		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n")
		flag.PrintDefaults()
	}
	flag.Parse()
	if flag.NArg() < 1 {
		flag.Usage()
		os.Exit(1)
	}

	lines := make(LineDetails, 0)

	for _, f := range flag.Args() {
		l, err := loadLines(f)
		if err != nil {
			log.Fatal(err)
		}
		lines = append(lines, l...)
	}

	sort.Sort(lines)

	worstnum := 0
	mediumnum := 0
	bestnum := 0

	outdir := "buckets" // TODO: set this from cmdline

	for _, l := range lines {
		var todir string
		switch {
		case l.avgconf < 0.95:
			todir = "bad"
			worstnum++
		case l.avgconf < 0.98:
			todir = "95to98"
			mediumnum++
		default:
			todir = "98plus"
			bestnum++
		}

		avgstr := strconv.FormatFloat(l.avgconf, 'G', -1, 64)
		avgstr = strings.Replace(avgstr, ".", "", 1)
		fmt.Printf("Line: %s, avg: %f, avgstr: %s\n", l.name, l.avgconf, avgstr)

		destdir := filepath.Join(outdir, todir)
		outname := filepath.Join(destdir, l.hocrname+"_"+l.name+"_"+avgstr+".png")

		if err := os.MkdirAll(destdir, 0700); err != nil {
			log.Fatal(err)
		}

		// writePNG closes the file before returning, so handles do not
		// pile up over a long run of lines.
		if err := writePNG(outname, l.img); err != nil {
			log.Fatal(err)
		}
		// TODO: save line.text alongside the image, using hocrname/name.ext
	}

	total := worstnum + mediumnum + bestnum
	if total == 0 {
		// Nothing to summarise, and the percentages below would divide by zero.
		fmt.Printf("No lines found\n")
		return
	}

	fmt.Printf("Copied lines to %s\n", outdir)
	fmt.Printf("---------------------------------\n")
	fmt.Printf("Lines 98%%+ quality: %d%%\n", 100*bestnum/total)
	fmt.Printf("Lines 95-98%% quality: %d%%\n", 100*mediumnum/total)
	fmt.Printf("Lines <95%% quality: %d%%\n", 100*worstnum/total)
}