diff options
| author | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 | 
|---|---|---|
| committer | Nick White <git@njw.name> | 2019-10-08 12:52:33 +0100 | 
| commit | 7482157a03ed3e9d7f45e54a126b391001f34948 (patch) | |
| tree | 52f87b9ca159fe4c04a0349de95ea9de82692b3c /lib | |
| parent | d43c11bf653bfe3c1ad1ed277f1ec08bf155cf98 (diff) | |
Separate out bookpipeline from catch-all go.git repo, and rename to rescribe.xyz/bookpipeline
The dependencies from the go.git repo will follow in due course.
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/hocr/hocr.go | 129 | ||||
| -rw-r--r-- | lib/hocr/lines.go | 131 | ||||
| -rw-r--r-- | lib/line/line.go | 57 | ||||
| -rw-r--r-- | lib/prob/prob.go | 69 | 
4 files changed, 0 insertions, 386 deletions
diff --git a/lib/hocr/hocr.go b/lib/hocr/hocr.go deleted file mode 100644 index dcd0494..0000000 --- a/lib/hocr/hocr.go +++ /dev/null @@ -1,129 +0,0 @@ -package hocr - -import ( -	"encoding/xml" -	"errors" -	"io/ioutil" -	"regexp" -	"strconv" -	"strings" -) - -type Hocr struct { -	Lines []OcrLine `xml:"body>div>div>p>span"` -} - -type OcrLine struct { -	Class string    `xml:"class,attr"` -	Id    string    `xml:"id,attr"` -	Title string    `xml:"title,attr"` -	Words []OcrWord `xml:"span"` -	Text  string    `xml:",chardata"` -} - -type OcrWord struct { -	Class string    `xml:"class,attr"` -	Id    string    `xml:"id,attr"` -	Title string    `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text  string    `xml:",chardata"` -} - -type OcrChar struct { -	Class string    `xml:"class,attr"` -	Id    string    `xml:"id,attr"` -	Title string    `xml:"title,attr"` -	Chars []OcrChar `xml:"span"` -	Text  string    `xml:",chardata"` -} - -// Returns the confidence for a word based on its x_wconf value -func wordConf(s string) (float64, error) { -	re, err := regexp.Compile(`x_wconf ([0-9.]+)`) -	if err != nil { -		return 0.0, err -	} -	conf := re.FindStringSubmatch(s) -	return strconv.ParseFloat(conf[1], 64) -} - -func boxCoords(s string) ([4]int, error) { -	var coords [4]int -	re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) -	if err != nil { -		return coords, err -	} -	coordstr := re.FindStringSubmatch(s) -	for i := range coords { -		c, err := strconv.Atoi(coordstr[i+1]) -		if err != nil { -			return coords, err -		} -		coords[i] = c -	} -	return coords, nil -} - -func noText(s string) bool { -	t := strings.Trim(s, " \n") -	return len(t) == 0 -} - -func Parse(b []byte) (Hocr, error) { -	var hocr Hocr - -	err := xml.Unmarshal(b, &hocr) -	if err != nil { -		return hocr, err -	} - -	return hocr, nil -} - -func GetText(hocrfn string) (string, error) { -	var s string - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return s, err -	} - -	h, err := Parse(file) -	if err != nil { -		return s, err -	} - - -	for _, l := range h.Lines { -		s += getLineText(l) -	} -	return s, nil -} - -func GetAvgConf(hocrfn string) (float64, error) { -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return 0, err -	} - -	h, err := Parse(file) -	if err != nil { -		return 0, err -	} - -	var total, num float64 -	for _, l := range h.Lines { -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return 0, err -			} -			total += c -			num++ -		} -	} -	if num == 0 { -		return 0, errors.New("No words found") -	} -	return total / num, nil -} diff --git a/lib/hocr/lines.go b/lib/hocr/lines.go deleted file mode 100644 index 74e8f9a..0000000 --- a/lib/hocr/lines.go +++ /dev/null @@ -1,131 +0,0 @@ -package hocr - -// TODO: Parse line name to zero pad line numbers, so they can -//       be sorted easily - -import ( -	"image" -	"image/png" -	"io/ioutil" -	"log" -	"os" -	"path/filepath" -	"strings" - -	"rescribe.xyz/go.git/lib/line" -) - -func getLineText(l OcrLine) (string) { -	linetext := "" - -	linetext = l.Text -	if noText(linetext) { -		linetext = "" -		for _, w := range l.Words { -			if w.Class != "ocrx_word" { -				continue -			} -			linetext += w.Text + " " -		} -	} -	if noText(linetext) { -		linetext = "" -		for _, w := range l.Words { -			if w.Class != "ocrx_word" { -				continue -			} -			for _, c := range w.Chars { -				if c.Class != "ocrx_cinfo" { -					continue -				} -				linetext += c.Text -			} -			linetext += " " -		} -	} -	linetext = strings.TrimRight(linetext, " ") -	linetext += "\n" -	return linetext -} - -func parseLineDetails(h Hocr, i image.Image, name string) (line.Details, error) { -	lines := make(line.Details, 0) - -	for _, l := range h.Lines { -		totalconf := float64(0) -		num := 0 -		for _, w := range l.Words { -			c, err := wordConf(w.Title) -			if err != nil { -				return lines, err -			} -			num++ -			totalconf += c -		} - -		coords, err := boxCoords(l.Title) -		if err != nil { -			return lines, err -		} - -		var ln line.Detail -		ln.Name = l.Id -		ln.Avgconf = (totalconf / float64(num)) / 100 -		ln.Text = getLineText(l) -		ln.OcrName = name -		if i != nil { -			var imgd line.ImgDirect -			imgd.Img = i.(*image.Gray).SubImage(image.Rect(coords[0], coords[1], coords[2], coords[3])) -			ln.Img = imgd -		} -		lines = append(lines, ln) -	} -	return lines, nil -} - -func GetLineDetails(hocrfn string) (line.Details, error) { -	var newlines line.Details - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return newlines, err -	} - -	h, err := Parse(file) -	if err != nil { -		return newlines, err -	} - -	var img image.Image -	pngfn := strings.Replace(hocrfn, ".hocr", ".png", 1) -	pngf, err := os.Open(pngfn) -	if err != nil { -		log.Println("Warning: can't open image %s\n", pngfn) -	} else { -		defer pngf.Close() -		img, err = png.Decode(pngf) -		if err != nil { -			log.Println("Warning: can't load image %s\n", pngfn) -		} -	} - -	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) -	return parseLineDetails(h, img, n) -} - -func GetLineBasics(hocrfn string) (line.Details, error) { -	var newlines line.Details - -	file, err := ioutil.ReadFile(hocrfn) -	if err != nil { -		return newlines, err -	} - -	h, err := Parse(file) -	if err != nil { -		return newlines, err -	} - -	n := strings.Replace(filepath.Base(hocrfn), ".hocr", "", 1) -	return parseLineDetails(h, nil, n) -} diff --git a/lib/line/line.go b/lib/line/line.go deleted file mode 100644 index d4e3e44..0000000 --- a/lib/line/line.go +++ /dev/null @@ -1,57 +0,0 @@ -package line - -import ( -	"image" -	"image/png" -	"io" -	"os" -) - -type Detail struct { -	Name    string -	Avgconf float64 -	Img     CopyableImg -	Text    string -	OcrName string -} - -type CopyableImg interface { -	CopyLineTo(io.Writer) error -} - -type Details []Detail - -func (l Details) Len() int           { return len(l) } -func (l Details) Less(i, j int) bool { return l[i].Avgconf < l[j].Avgconf } -func (l Details) Swap(i, j int)      { l[i], l[j] = l[j], l[i] } - -// This is an implementation of the CopyableImg interface that -// stores the image directly as an image.Image -type ImgDirect struct { -	Img image.Image -} - -func (i ImgDirect) CopyLineTo(w io.Writer) error { -	err := png.Encode(w, i.Img) -	if err != nil { -		return err -	} -	return nil -} - -// This is an implementation of the CopyableImg interface that -// stores the path of an image -type ImgPath struct { -	Path string -} - -func (i ImgPath) CopyLineTo(w io.Writer) error { -	f, err := os.Open(i.Path) -	if err != nil { -		return err -	} -	defer f.Close() - -	_, err = io.Copy(w, f) -	return err -} diff --git a/lib/prob/prob.go b/lib/prob/prob.go deleted file mode 100644 index 31a484d..0000000 --- a/lib/prob/prob.go +++ /dev/null @@ -1,69 +0,0 @@ -package prob - -import ( -	"io/ioutil" -	"path/filepath" -	"strconv" -	"strings" - -	"rescribe.xyz/go.git/lib/line" -) - -func getLineAvg(f string) (float64, error) { -	totalconf := float64(0) -	num := 0 - -	prob, err := ioutil.ReadFile(f) -	if err != nil { -		return 0, err -	} - -	for _, l := range strings.Split(string(prob), "\n") { -		fields := strings.Fields(l) - -		if len(fields) == 2 { -			conf, err := strconv.ParseFloat(fields[1], 64) -			if err != nil { -				continue -			} -			totalconf += conf -			num += 1 -		} -	} -	if num <= 0 { -		return 0, nil -	} -	avg := totalconf / float64(num) -	return avg, nil -} - -// Note this only processes one line at a time -func GetLineDetails(probfn string) (line.Details, error) { -	var l line.Detail -	lines := make(line.Details, 0) - -	avg, err := getLineAvg(probfn) -	if err != nil { -		return lines, err -	} - -	filebase := strings.Replace(probfn, ".prob", "", 1) - -	txt, err := ioutil.ReadFile(filebase + ".txt") -	if err != nil { -		return lines, err -	} - -	l.Name = filepath.Base(filebase) -	l.Avgconf = avg -	l.Text = string(txt) -	l.OcrName = filepath.Base(filepath.Dir(filebase)) - -	var imgfn line.ImgPath -	imgfn.Path = filebase + ".bin.png" -	l.Img = imgfn - -	lines = append(lines, l) - -	return lines, nil -}  | 
