From 848f961c277525ebba3ab08fadc116970bcfed24 Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 14 Apr 2020 11:56:56 +0100 Subject: Add godoc documentation --- cmd/avg-lines/main.go | 2 ++ cmd/boxtotxt/main.go | 1 + cmd/bucket-lines/main.go | 2 ++ cmd/dehyphenate/main.go | 1 + cmd/eeboxmltohocr/main.go | 2 ++ cmd/fonttobytes/main.go | 2 ++ cmd/getbests/main.go | 2 ++ cmd/hocrtotxt/main.go | 1 + cmd/pare-gt/main.go | 3 +++ cmd/pgconf/main.go | 1 + pkg/hocr/hocr.go | 9 ++++++++- pkg/hocr/lines.go | 5 +++-- pkg/line/line.go | 1 + pkg/prob/prob.go | 3 ++- 14 files changed, 31 insertions(+), 4 deletions(-) diff --git a/cmd/avg-lines/main.go b/cmd/avg-lines/main.go index a32f5ce..c1fd901 100644 --- a/cmd/avg-lines/main.go +++ b/cmd/avg-lines/main.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// avg-lines prints a report of the average confidence for each line, +// sorted from worst to best package main import ( diff --git a/cmd/boxtotxt/main.go b/cmd/boxtotxt/main.go index c8e4c02..b3b18b0 100644 --- a/cmd/boxtotxt/main.go +++ b/cmd/boxtotxt/main.go @@ -2,6 +2,7 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// boxtotxt converts a Tesseract .box file to plain text package main import ( diff --git a/cmd/bucket-lines/main.go b/cmd/bucket-lines/main.go index 64ebffc..fddff21 100644 --- a/cmd/bucket-lines/main.go +++ b/cmd/bucket-lines/main.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. 
+// bucket-lines copies image-text line pairs into different directories +// according to the average character probability for the line package main import ( diff --git a/cmd/dehyphenate/main.go b/cmd/dehyphenate/main.go index 58b735e..90a6cda 100644 --- a/cmd/dehyphenate/main.go +++ b/cmd/dehyphenate/main.go @@ -2,6 +2,7 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// dehyphenate does basic dehyphenation on a hocr file package main import ( diff --git a/cmd/eeboxmltohocr/main.go b/cmd/eeboxmltohocr/main.go index aaad3a5..867717f 100644 --- a/cmd/eeboxmltohocr/main.go +++ b/cmd/eeboxmltohocr/main.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// eeboxmltohocr converts the XML from an EEBO download to hOCR, which +// can be easily incorporated into a searchable PDF package main import ( diff --git a/cmd/fonttobytes/main.go b/cmd/fonttobytes/main.go index 52883cb..085003b 100644 --- a/cmd/fonttobytes/main.go +++ b/cmd/fonttobytes/main.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// fonttobytes outputs a font file as a series of bytes in go format, +// allowing a font to be easily embedded into a go binary package main import ( diff --git a/cmd/getbests/main.go b/cmd/getbests/main.go index 2b0c40a..9eca0d8 100644 --- a/cmd/getbests/main.go +++ b/cmd/getbests/main.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. 
+// getbests downloads every 'best' file from a set of OCRed books +// stored on cloud infrastructure package main import ( diff --git a/cmd/hocrtotxt/main.go b/cmd/hocrtotxt/main.go index c3eb0f4..f8447e2 100644 --- a/cmd/hocrtotxt/main.go +++ b/cmd/hocrtotxt/main.go @@ -2,6 +2,7 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// hocrtotxt prints the text from a hocr file package main import ( diff --git a/cmd/pare-gt/main.go b/cmd/pare-gt/main.go index 1180607..f5da496 100644 --- a/cmd/pare-gt/main.go +++ b/cmd/pare-gt/main.go @@ -2,6 +2,9 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// pare-gt moves some ground truth, ensuring that the same +// proportions of each ground truth source are represented in the +// moved section package main import ( diff --git a/cmd/pgconf/main.go b/cmd/pgconf/main.go index 41b00f0..846a3d8 100644 --- a/cmd/pgconf/main.go +++ b/cmd/pgconf/main.go @@ -2,6 +2,7 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// pgconf prints the total confidence for a page of hOCR package main import ( diff --git a/pkg/hocr/hocr.go b/pkg/hocr/hocr.go index 9dea49c..6b43558 100644 --- a/pkg/hocr/hocr.go +++ b/pkg/hocr/hocr.go @@ -2,6 +2,8 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. 
+// Package hocr contains structures and functions for parsing and analysing +// hocr files package hocr import ( @@ -51,6 +53,7 @@ func wordConf(s string) (float64, error) { return strconv.ParseFloat(conf[1], 64) } +// BoxCoords parses bbox coordinate strings func BoxCoords(s string) ([4]int, error) { var coords [4]int re, err := regexp.Compile(`bbox ([0-9]+) ([0-9]+) ([0-9]+) ([0-9]+)`) @@ -73,6 +76,7 @@ func noText(s string) bool { return len(t) == 0 } +// Parse parses a hOCR file func Parse(b []byte) (Hocr, error) { var hocr Hocr @@ -84,6 +88,7 @@ func Parse(b []byte) (Hocr, error) { return hocr, nil } +// GetText parses a hOCR file and extracts the text from it func GetText(hocrfn string) (string, error) { var s string @@ -104,6 +109,8 @@ func GetText(hocrfn string) (string, error) { return s, nil } +// GetAvgConf calculates the average confidence of a hOCR file from +// confidences embedded in each word func GetAvgConf(hocrfn string) (float64, error) { file, err := ioutil.ReadFile(hocrfn) if err != nil { @@ -134,7 +141,7 @@ func GetAvgConf(hocrfn string) (float64, error) { // GetWordConfs is a utility function that parses a hocr // file and returns an array containing the confidences -// of each word therein. +// of each word therein func GetWordConfs(hocrfn string) ([]float64, error) { var confs []float64 diff --git a/pkg/hocr/lines.go b/pkg/hocr/lines.go index 1387574..942bd01 100644 --- a/pkg/hocr/lines.go +++ b/pkg/hocr/lines.go @@ -20,6 +20,7 @@ import ( "rescribe.xyz/utils/pkg/line" ) +// LineText extracts the text from an OcrLine func LineText(l OcrLine) (string) { linetext := "" @@ -88,7 +89,7 @@ func parseLineDetails(h Hocr, i *image.Gray, name string) (line.Details, error) } // GetLineDetails parses a hocr file and returns a corresponding -// line.Details, including image extracts for each line. 
+// line.Details, including image extracts for each line func GetLineDetails(hocrfn string) (line.Details, error) { var newlines line.Details @@ -121,7 +122,7 @@ func GetLineDetails(hocrfn string) (line.Details, error) { } // GetLineBasics parses a hocr file and returns a corresponding -// line.Details, without any image extracts. +// line.Details, without any image extracts func GetLineBasics(hocrfn string) (line.Details, error) { var newlines line.Details diff --git a/pkg/line/line.go b/pkg/line/line.go index b1e9021..bac7fa4 100644 --- a/pkg/line/line.go +++ b/pkg/line/line.go @@ -2,6 +2,7 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// Package line contains various functions to manipulate ocr lines package line import ( diff --git a/pkg/prob/prob.go b/pkg/prob/prob.go index 7a6979a..67baa03 100644 --- a/pkg/prob/prob.go +++ b/pkg/prob/prob.go @@ -2,6 +2,7 @@ // Use of this source code is governed by the GPLv3 // license that can be found in the LICENSE file. +// Package prob processes .prob files generated by ocropus package prob import ( @@ -41,7 +42,7 @@ func getLineAvg(f string) (float64, error) { return avg, nil } -// Note this only processes one line at a time +// GetLineDetails parses a .prob file and its corresponding .txt file func GetLineDetails(probfn string) (line.Details, error) { var l line.Detail lines := make(line.Details, 0) -- cgit v1.2.1-24-ge1ad