diff options
| author | Nick White <git@njw.name> | 2019-01-25 09:55:55 +0000 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-01-25 09:55:55 +0000 | 
| commit | 30c088b90e7b6a25d93cbdad7564ff063e62afd3 (patch) | |
| tree | c47d7bc086a076cfe5e702628c4e5e3b1eab1aa5 /parse | |
| parent | 1c17766952bdcd6f7d31d0fa1a2e504b1aa4f14a (diff) | |
Reorganisation and cleanup
Diffstat (limited to 'parse')
| -rw-r--r-- | parse/bucket.go | 123 | ||||
| -rw-r--r-- | parse/hocr/hocr.go | 181 | ||||
| -rw-r--r-- | parse/line.go | 67 | ||||
| -rw-r--r-- | parse/prob/prob.go | 69 | 
4 files changed, 0 insertions, 440 deletions
| diff --git a/parse/bucket.go b/parse/bucket.go deleted file mode 100644 index 44b1d24..0000000 --- a/parse/bucket.go +++ /dev/null @@ -1,123 +0,0 @@ -package parse - -import ( -	"fmt" -	"io" -	"path/filepath" -	"os" -	"sort" -	"strconv" -) - -type BucketSpec struct { -	Min float64 -	Name string -} -type BucketSpecs []BucketSpec -func (b BucketSpecs) Len() int { return len(b) } -func (b BucketSpecs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketSpecs) Less(i, j int) bool { return b[i].Min < b[j].Min } - -func bucketLine(l LineDetail, buckets BucketSpecs, dirname string) (string, error) { -	var bucket string - -	todir := "" -	for _, b := range buckets { -		if l.Avgconf >= b.Min { -			todir = b.Name -			bucket = b.Name -		} -	} - -	if todir == "" { -		return bucket, nil -	} - -	avgstr := strconv.FormatFloat(l.Avgconf, 'G', -1, 64) -	if len(avgstr) > 2 { -		avgstr = avgstr[2:] -	} - -	base := filepath.Join(dirname, todir, filepath.Base(l.OcrName) + "_" + l.Name + "_" + avgstr) - -	err := os.MkdirAll(filepath.Join(dirname, todir), 0700) -	if err != nil { -		return bucket, err -	} - -	f, err := os.Create(base + ".png") -	if err != nil { -		return bucket, err -	} -	defer f.Close() - -	err = l.Img.CopyLineTo(f) -	if err != nil { -		return bucket, err -	} - -	f, err = os.Create(base + ".txt") -	if err != nil { -		return bucket, err -	} -	defer f.Close() - -	_, err = io.WriteString(f, l.Text) -	if err != nil { -		return bucket, err -	} - -	return bucket, err -} - -type BucketStat struct { -	name string -	num int -} -type BucketStats []BucketStat -func (b BucketStats) Len() int { return len(b) } -func (b BucketStats) Swap(i, j int) { b[i], b[j] = b[j], b[i] } -func (b BucketStats) Less(i, j int) bool { return b[i].num < b[j].num } - -// Copies line images and text into directories based on their -// confidence, as defined by the buckets struct -func BucketUp(lines LineDetails, buckets BucketSpecs, dirname string) (BucketStats, error) { -	var all []string -	var stats BucketStats - -	sort.Sort(lines) -	sort.Sort(buckets) -	for _, l := range lines { -		bname, err := bucketLine(l, buckets, dirname) -		if err != nil { -			return stats, err -		} -		all = append(all, bname) -	} - -	for _, b := range all { -		i := sort.Search(len(stats), func(i int) bool { return stats[i].name == b }) -		if i == len(stats) { -			newstat := BucketStat { b, 0 } -			stats = append(stats, newstat) -			i = len(stats) - 1 -		} -		stats[i].num++ -	} - -	return stats, nil -} - -func PrintBucketStats(w io.Writer, stats BucketStats) { -	var total int -	for _, s := range stats { -		total += s.num -	} - -	fmt.Fprintf(w, "Copied %d lines\n", total) -	fmt.Fprintf(w, "---------------------------------\n") -	sort.Sort(stats) -	for _, s := range stats { -		fmt.Fprintf(w, "Lines in %7s: %2d%%\n", s.name, 100 * s.num / total) -	} -} diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go deleted file mode 100644 index 81250a9..0000000 --- a/parse/hocr/hocr.go +++ /dev/null @@ -1,181 +0,0 @@ -package hocr - -// TODO: Parse line name to zero pad line numbers, so they can -//       be sorted easily -// TODO: have same filename format as .prob uses, so include base -//       dirname, and don't include line numbers if there's only -//       one line in the hocr - -import ( -	"encoding/xml" -	"image" -	"image/png" -	"io/ioutil" -	"os" -	"path/filepath" -	"regexp" -	"strconv" -	"strings" - -	"git.rescribe.xyz/testingtools/parse" -) - -type Hocr struct { -	Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Words []OcrWord `xml:"span"` -	Text string `xml:",chardata"` -} - -type OcrWord struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text string `xml:",chardata"` -} - -type OcrChar struct { -	Class string `xml:"class,attr"` -	Id string `xml:"id,attr"` -	Title string `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text string `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { -	re, err := regexp.Compile(`x_wconf ([0-9.]+)`) -	if err != nil { -		return 0.0, err -	} -	conf := re.FindStringSubmatch(s) -	return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { -	var coords [4]int -	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) -	if err != nil { -		return coords, err -	} -	coordstr := re.FindStringSubmatch(s) -	for i := range coords { -		c, err := strconv.Atoi(coordstr[i+1]) -		if err != nil { -			return coords, err -		} -		coords[i] = c -	} -	return coords, nil -} - -func noText(s string) bool { -	t := strings.Trim(s, " \n") -	return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { -	var hocr Hocr - -	err := xml.Unmarshal(b, &hocr) -	if err != nil { -		return hocr, err -	} - -	return hocr, nil -} - -func parseLineDetails(h Hocr, i image.Image, name string) (parse.LineDetails, error) { -	lines := make(parse.LineDetails, 0) - -	for _, l := range h.Lines { -		totalconf := float64(0) -		num := 0 -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return lines, err -			} -			num++ -			totalconf += c -		} - -		coords, err := boxCoords(l.Title) -		if err != nil { -			return lines, err -		} - -		var line parse.LineDetail -		line.Name = l.Id -		line.Avgconf = (totalconf/float64(num)) / 100 -		linetext := "" - -		linetext = l.Text -		if(noText(linetext)) { -			linetext = "" -			for _, w := range l.Words { -				if(w.Class != "ocrx_word") { -					continue -				} -				linetext += w.Text + " " -			} -		} -		if(noText(linetext)) { -			linetext = "" -			for _, w := range l.Words { -				if(w.Class != "ocrx_word") { -					continue -				} -				for _, c := range w.Chars { -					if(c.Class != "ocrx_cinfo") { -						continue -					} -					linetext += c.Text -				} -				linetext += " " -			} -		} -		line.Text = strings.TrimRight(linetext, " ") -		line.Text += "\n" -		line.OcrName = name -		var imgd parse.ImgDirect -		imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) -		line.Img = imgd -		lines = append(lines, line) -	} -	return lines, nil -} - -func GetLineDetails(hocrfn string) (parse.LineDetails, error) { -	var newlines parse.LineDetails - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return newlines, err -	} - -	h, err := Parse(file) -	if err != nil { -		return newlines, err -	} - -	pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) -	pngf, err := os.Open(pngfn) -	if err != nil { -		return newlines, err -	} -	defer pngf.Close() -	img, err := png.Decode(pngf) -	if err != nil { -		return newlines, err -	} - -	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) -	return parseLineDetails(h, img, n) -} diff --git a/parse/line.go b/parse/line.go deleted file mode 100644 index 9a2be8e..0000000 --- a/parse/line.go +++ /dev/null @@ -1,67 +0,0 @@ -package parse - -// TODO: integrate in line-conf-buckets linedetail -// TODO: add BucketUp() function here that does what both line-conf-buckets-tess.go -//       and line-conf-buckets.go do -// TODO: consider naming this package line, and separating it from hocr and prob - -import ( -	"image" -	"image/png" -	"io" -	"os" -) - -type LineDetail struct { -	Name string -	Avgconf float64 -	Img CopyableLine -	Text string -	OcrName string -} - -type CopyableLine interface { -	CopyLineTo(io.Writer) (error) -} - -// This is an implementation of the CopyableLine interface that -// stores the image directly as an image.Image -type ImgDirect struct { -	Img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) (error) { -	err := png.Encode(w, i.Img) -	if err != nil { -		return err -	} -	return nil -} - -type ImgPath struct { -	Path string -} - -func (i ImgPath) CopyLineTo(w io.Writer) (error) { -	f, err := os.Open(i.Path) -	if err != nil { -		return err -	} -	defer f.Close() - -	_, err = io.Copy(w, f) -	return err -} - -type LineDetails []LineDetail - -// Used by sort.Sort. -func (l LineDetails) Len() int { return len(l) } - -// Used by sort.Sort. -func (l LineDetails) Less(i, j int) bool { -	return l[i].Avgconf < l[j].Avgconf -} - -// Used by sort.Sort. -func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } diff --git a/parse/prob/prob.go b/parse/prob/prob.go deleted file mode 100644 index 8d01cab..0000000 --- a/parse/prob/prob.go +++ /dev/null @@ -1,69 +0,0 @@ -package prob - -import ( -	"io/ioutil" -	"path/filepath" -	"strconv" -	"strings" - -	"git.rescribe.xyz/testingtools/parse" -) - -func getLineAvg(f string) (float64, error) { -	totalconf := float64(0) -	num := 0 - -	prob, err := ioutil.ReadFile(f) -        if err != nil { -		return 0, err -	} - -	for _, line := range strings.Split(string(prob), "\n") { -		fields := strings.Fields(line) - -		if len(fields) == 2 { -			conf, err := strconv.ParseFloat(fields[1], 64) -			if err != nil { -				continue -			} -			totalconf += conf -			num += 1 -		} -	} -	if num <= 0 { -		return 0, nil -	} -	avg := totalconf / float64(num) -	return avg, nil -} - -// Note this only processes one line at a time -func GetLineDetails(probfn string) (parse.LineDetails, error) { -	var line parse.LineDetail -	lines := make(parse.LineDetails, 0) - -	avg, err := getLineAvg(probfn) -	if err != nil { -		return lines, err -	} - -	filebase := strings.Replace(probfn, ".prob", "", 1) - -	txt, err := ioutil.ReadFile(filebase + ".txt") -	if err != nil { -		return lines, err -	} - -	line.Name = filepath.Base(filebase) -	line.Avgconf = avg -	line.Text = string(txt) -	line.OcrName = filepath.Dir(filebase) - -	var imgfn parse.ImgPath -	imgfn.Path = filebase + ".bin.png" -	line.Img = imgfn - -	lines = append(lines, line) - -	return lines, nil -} | 
