summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.name> 2020-04-14 11:56:56 +0100
committer Nick White <git@njw.name> 2020-04-14 11:56:56 +0100
commit 848f961c277525ebba3ab08fadc116970bcfed24 (patch)
tree 00a6883568c29ac34d56f958f7d3479e372d8c31
parent d9612a98ab66409858e07a9a430d40beac1b17b8 (diff)
Add godoc documentation
-rw-r--r-- cmd/avg-lines/main.go 2
-rw-r--r-- cmd/boxtotxt/main.go 1
-rw-r--r-- cmd/bucket-lines/main.go 2
-rw-r--r-- cmd/dehyphenate/main.go 1
-rw-r--r-- cmd/eeboxmltohocr/main.go 2
-rw-r--r-- cmd/fonttobytes/main.go 2
-rw-r--r-- cmd/getbests/main.go 2
-rw-r--r-- cmd/hocrtotxt/main.go 1
-rw-r--r-- cmd/pare-gt/main.go 3
-rw-r--r-- cmd/pgconf/main.go 1
-rw-r--r-- pkg/hocr/hocr.go 9
-rw-r--r-- pkg/hocr/lines.go 5
-rw-r--r-- pkg/line/line.go 1
-rw-r--r-- pkg/prob/prob.go 3
14 files changed, 31 insertions, 4 deletions
diff --git a/cmd/avg-lines/main.go b/cmd/avg-lines/main.go
index a32f5ce..c1fd901 100644
--- a/cmd/avg-lines/main.go
+++ b/cmd/avg-lines/main.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// avg-lines prints a report of the average confidence for each line,
+// sorted from worst to best
package main
import (
diff --git a/cmd/boxtotxt/main.go b/cmd/boxtotxt/main.go
index c8e4c02..b3b18b0 100644
--- a/cmd/boxtotxt/main.go
+++ b/cmd/boxtotxt/main.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// boxtotxt converts a Tesseract .box file to plain text
package main
import (
diff --git a/cmd/bucket-lines/main.go b/cmd/bucket-lines/main.go
index 64ebffc..fddff21 100644
--- a/cmd/bucket-lines/main.go
+++ b/cmd/bucket-lines/main.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// bucket-lines copies image-text line pairs into different directories
+// according to the average character probability for the line
package main
import (
diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go
index 58b735e..90a6cda 100644
--- a/cmd/dehyphenate/main.go
+++ b/cmd/dehyphenate/main.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// dehyphenate does basic dehyphenation on a hocr file
package main
import (
diff --git a/cmd/eeboxmltohocr/main.go b/cmd/eeboxmltohocr/main.go
index aaad3a5..867717f 100644
--- a/cmd/eeboxmltohocr/main.go
+++ b/cmd/eeboxmltohocr/main.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// eeboxmltohocr converts the XML from an EEBO download to hOCR, which
+// can be easily incorporated into a searchable PDF
package main
import (
diff --git a/cmd/fonttobytes/main.go b/cmd/fonttobytes/main.go
index 52883cb..085003b 100644
--- a/cmd/fonttobytes/main.go
+++ b/cmd/fonttobytes/main.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// fonttobytes outputs a font file as a series of bytes in go format,
+// allowing a font to be easily embedded into a go binary
package main
import (
diff --git a/cmd/getbests/main.go b/cmd/getbests/main.go
index 2b0c40a..9eca0d8 100644
--- a/cmd/getbests/main.go
+++ b/cmd/getbests/main.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// getbests downloads every 'best' file from a set of OCRed books
+// stored on cloud infrastructure
package main
import (
diff --git a/cmd/hocrtotxt/main.go b/cmd/hocrtotxt/main.go
index c3eb0f4..f8447e2 100644
--- a/cmd/hocrtotxt/main.go
+++ b/cmd/hocrtotxt/main.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// hocrtotxt prints the text from a hocr file
package main
import (
diff --git a/cmd/pare-gt/main.go b/cmd/pare-gt/main.go
index 1180607..f5da496 100644
--- a/cmd/pare-gt/main.go
+++ b/cmd/pare-gt/main.go
@@ -2,6 +2,9 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// pare-gt moves some ground truth, ensuring that the same
+// proportions of each ground truth source are represented in the
+// moved section
package main
import (
diff --git a/cmd/pgconf/main.go b/cmd/pgconf/main.go
index 41b00f0..846a3d8 100644
--- a/cmd/pgconf/main.go
+++ b/cmd/pgconf/main.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// pgconf prints the total confidence for a page of hOCR
package main
import (
diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go
index 9dea49c..6b43558 100644
--- a/pkg/hocr/hocr.go
+++ b/pkg/hocr/hocr.go
@@ -2,6 +2,8 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// hocr contains structures and functions for parsing and analysing
+// hocr files
package hocr
import (
@@ -51,6 +53,7 @@ func wordConf(s string) (float64, error) {
return strconv.ParseFloat(conf[1], 64)
}
+// BoxCoords parses bbox coordinate strings
func BoxCoords(s string) ([4]int, error) {
var coords [4]int
re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`)
@@ -73,6 +76,7 @@ func noText(s string) bool {
return len(t) == 0
}
+// Parse parses a hOCR file
func Parse(b []byte) (Hocr, error) {
var hocr Hocr
@@ -84,6 +88,7 @@ func Parse(b []byte) (Hocr, error) {
return hocr, nil
}
+// GetText parses a hOCR file and extracts the text from it
func GetText(hocrfn string) (string, error) {
var s string
@@ -104,6 +109,8 @@ func GetText(hocrfn string) (string, error) {
return s, nil
}
+// GetAvgConf calculates the average confidence of a hOCR file from
+// confidences embedded in each word
func GetAvgConf(hocrfn string) (float64, error) {
file, err := ioutil.ReadFile(hocrfn)
if err != nil {
@@ -134,7 +141,7 @@ func GetAvgConf(hocrfn string) (float64, error) {
// GetWordConfs is a utility function that parses a hocr
// file and returns an array containing the confidences
-// of each word therein.
+// of each word therein
func GetWordConfs(hocrfn string) ([]float64, error) {
var confs []float64
diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go
index 1387574..942bd01 100644
--- a/pkg/hocr/lines.go
+++ b/pkg/hocr/lines.go
@@ -20,6 +20,7 @@ import (
"rescribe.xyz/utils/pkg/line"
)
+// LineText extracts the text from an OcrLine
func LineText(l OcrLine) (string) {
linetext := ""
@@ -88,7 +89,7 @@ func parseLineDetails(h Hocr, i *image.Gray, name string) (line.Details, error)
}
// GetLineDetails parses a hocr file and returns a corresponding
-// line.Details, including image extracts for each line.
+// line.Details, including image extracts for each line
func GetLineDetails(hocrfn string) (line.Details, error) {
var newlines line.Details
@@ -121,7 +122,7 @@ func GetLineDetails(hocrfn string) (line.Details, error) {
}
// GetLineBasics parses a hocr file and returns a corresponding
-// line.Details, without any image extracts.
+// line.Details, without any image extracts
func GetLineBasics(hocrfn string) (line.Details, error) {
var newlines line.Details
diff --git a/pkg/line/line.go b/pkg/line/line.go
index b1e9021..bac7fa4 100644
--- a/pkg/line/line.go
+++ b/pkg/line/line.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// line contains various functions to manipulate ocr lines
package line
import (
diff --git a/pkg/prob/prob.go b/pkg/prob/prob.go
index 7a6979a..67baa03 100644
--- a/pkg/prob/prob.go
+++ b/pkg/prob/prob.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by the GPLv3
// license that can be found in the LICENSE file.
+// prob processes .prob files generated by ocropus
package prob
import (
@@ -41,7 +42,7 @@ func getLineAvg(f string) (float64, error) {
return avg, nil
}
-// Note this only processes one line at a time
+// GetLineDetails parses a .prob and corresponding .txt file
func GetLineDetails(probfn string) (line.Details, error) {
var l line.Detail
lines := make(line.Details, 0)