From a5d9f36716b1bb7b8590869175543829836e90f3 Mon Sep 17 00:00:00 2001
From: Nick White <git@njw.name>
Date: Fri, 25 Jan 2019 16:00:32 +0000
Subject: Rewrite line-conf-avg to use libraries, and support hocr

---
 line-conf-avg/line-conf-avg.go | 104 ++++++++++++-----------------------------
 1 file changed, 31 insertions(+), 73 deletions(-)

diff --git a/line-conf-avg/line-conf-avg.go b/line-conf-avg/line-conf-avg.go
index a25b25d..aba184b 100644
--- a/line-conf-avg/line-conf-avg.go
+++ b/line-conf-avg/line-conf-avg.go
@@ -1,46 +1,29 @@
 package main
 
-// TODO: rewrite this to use the parse/ packages
+// TODO: rename to avglines
 
 import (
-	"bufio"
 	"flag"
 	"fmt"
-	"io/ioutil"
 	"log"
 	"os"
 	"path/filepath"
 	"sort"
-	"strconv"
-	"strings"
-)
-
-type LineDetail struct {
-	Filename string
-	Avgconf float64
-	Filebase string
-	Basename string
-	Dirname string
-	Fulltext string
-}
-
-type LineDetails []LineDetail
-
-// Used by sort.Sort.
-func (l LineDetails) Len() int { return len(l) }
 
-// Used by sort.Sort.
-func (l LineDetails) Less(i, j int) bool {
-	return l[i].Avgconf < l[j].Avgconf
-}
-
-// Used by sort.Sort.
-func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
+	"rescribe.xyz/go.git/lib/line"
+	"rescribe.xyz/go.git/lib/hocr"
+	"rescribe.xyz/go.git/lib/prob"
+)
 
 func main() {
 	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage: line-conf-avg [-html] [-nosort] prob1 [prob2] [...]\n")
-		fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line\n")
+		fmt.Fprintf(os.Stderr, "Usage: line-conf-avg [-html] [-nosort] [prob1] [hocr1] [prob2] [...]\n")
+		fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n")
+		fmt.Fprintf(os.Stderr, "from worst to best.\n")
+		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n")
+		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n")
+		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n")
+		fmt.Fprintf(os.Stderr, "option.\n\n")
 		flag.PrintDefaults()
 	}
 	var usehtml = flag.Bool("html", false, "output html page")
@@ -51,54 +34,27 @@ func main() {
 		os.Exit(1)
 	}
 
-	lines := make(LineDetails, 0)
+	var err error
+	lines := make(line.Details, 0)
 
 	for _, f := range flag.Args() {
-		file, err := os.Open(f)
+		var newlines line.Details
+		switch ext := filepath.Ext(f); ext {
+			case ".prob":
+				newlines, err = prob.GetLineDetails(f)
+			case ".hocr":
+				newlines, err = hocr.GetLineDetails(f)
+			default:
+				log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f)
+				continue
+		}
 		if err != nil {
 			log.Fatal(err)
 		}
-		defer file.Close()
 
-		reader := bufio.NewReader(file)
-
-		totalconf := float64(0)
-		num := 0
-
-		err = nil
-		for err == nil {
-			var line string
-                        line, err = reader.ReadString('\n')
-			fields := strings.Fields(line)
-
-			if len(fields) == 2 {
-				conf, converr := strconv.ParseFloat(fields[1], 64)
-				if converr != nil {
-					fmt.Fprintf(os.Stderr, "Error: can't convert '%s' to float (full line: %s)\n", fields[1], line)
-					continue
-				}
-				totalconf += conf
-				num += 1
-			}
-		}
-		avg := totalconf / float64(num)
-
-		if num == 0 || avg == 0 {
-			continue
-		}
-
-		var linedetail LineDetail
-		linedetail.Filename = f
-		linedetail.Avgconf = avg
-		linedetail.Filebase = strings.Replace(f, ".prob", "", 1)
-		linedetail.Basename = filepath.Base(linedetail.Filebase)
-		linedetail.Dirname = filepath.Dir(linedetail.Filebase)
-		ft, ferr := ioutil.ReadFile(linedetail.Filebase + ".txt")
-		if ferr != nil {
-			log.Fatal(err)
+		for _, l := range newlines {
+			lines = append(lines, l)
 		}
-		linedetail.Fulltext = string(ft)
-		lines = append(lines, linedetail)
 	}
 
 	if *nosort == false {
@@ -107,7 +63,7 @@ func main() {
 
 	if *usehtml == false {
 		for _, l := range lines {
-			fmt.Printf("%s: %.2f%%\n", l.Filename, l.Avgconf)
+			fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf)
 		}
 	} else {
 		fmt.Printf("<!DOCTYPE html><html><head><meta charset='UTF-8'><title></title><style>td {border: 1px solid #444}</style></head><body>\n")
@@ -115,8 +71,10 @@ func main() {
 		for _, l := range lines {
 			fmt.Printf("<tr>\n")
 			fmt.Printf("<td><h1>%.4f%%</h1></td>\n", l.Avgconf)
-			fmt.Printf("<td>%s</td>\n", l.Filebase)
-			fmt.Printf("<td><img src='%s' /><br />%s</td>\n", l.Filebase + ".bin.png", l.Fulltext)
+			fmt.Printf("<td>%s %s</td>\n", l.OcrName, l.Name)
+			// TODO: decide what to output here. If showing line images is important,
+			//       they will need to be copied somewhere, so that this works with hocr too.
+			//fmt.Printf("<td><img src='%s' /><br />%s</td>\n", l.Filebase + ".bin.png", l.Fulltext)
 			fmt.Printf("</tr>\n")
 		}
 		fmt.Printf("</table>\n")
-- 
cgit v1.2.1-24-ge1ad