diff options
Diffstat (limited to 'line-conf-avg')
-rw-r--r-- | line-conf-avg/line-conf-avg.go | 104 |
1 files changed, 31 insertions, 73 deletions
diff --git a/line-conf-avg/line-conf-avg.go b/line-conf-avg/line-conf-avg.go index a25b25d..aba184b 100644 --- a/line-conf-avg/line-conf-avg.go +++ b/line-conf-avg/line-conf-avg.go @@ -1,46 +1,29 @@ package main -// TODO: rewrite this to use the parse/ packages +// TODO: rename to avglines import ( - "bufio" "flag" "fmt" - "io/ioutil" "log" "os" "path/filepath" "sort" - "strconv" - "strings" -) - -type LineDetail struct { - Filename string - Avgconf float64 - Filebase string - Basename string - Dirname string - Fulltext string -} - -type LineDetails []LineDetail - -// Used by sort.Sort. -func (l LineDetails) Len() int { return len(l) } -// Used by sort.Sort. -func (l LineDetails) Less(i, j int) bool { - return l[i].Avgconf < l[j].Avgconf -} - -// Used by sort.Sort. -func (l LineDetails) Swap(i, j int) { l[i], l[j] = l[j], l[i] } + "rescribe.xyz/go.git/lib/line" + "rescribe.xyz/go.git/lib/hocr" + "rescribe.xyz/go.git/lib/prob" +) func main() { flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: line-conf-avg [-html] [-nosort] prob1 [prob2] [...]\n") - fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line\n") + fmt.Fprintf(os.Stderr, "Usage: line-conf-avg [-html] [-nosort] [prob1] [hocr1] [prob2] [...]\n") + fmt.Fprintf(os.Stderr, "Prints a report of the average confidence for each line, sorted\n") + fmt.Fprintf(os.Stderr, "from worst to best.\n") + fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n") + fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n") + fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") + fmt.Fprintf(os.Stderr, "option.\n\n") flag.PrintDefaults() } var usehtml = flag.Bool("html", false, "output html page") @@ -51,54 +34,27 @@ func main() { os.Exit(1) } - lines := make(LineDetails, 0) + var err error + lines := make(line.Details, 0) for _, f := range flag.Args() { - file, err := os.Open(f) + var newlines line.Details + switch ext := filepath.Ext(f); ext { + case ".prob": + newlines, err = prob.GetLineDetails(f) + case ".hocr": + newlines, err = hocr.GetLineDetails(f) + default: + log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) + continue + } if err != nil { log.Fatal(err) } - defer file.Close() - reader := bufio.NewReader(file) - - totalconf := float64(0) - num := 0 - - err = nil - for err == nil { - var line string - line, err = reader.ReadString('\n') - fields := strings.Fields(line) - - if len(fields) == 2 { - conf, converr := strconv.ParseFloat(fields[1], 64) - if converr != nil { - fmt.Fprintf(os.Stderr, "Error: can't convert '%s' to float (full line: %s)\n", fields[1], line) - continue - } - totalconf += conf - num += 1 - } - } - avg := totalconf / float64(num) - - if num == 0 || avg == 0 { - continue - } - - var linedetail LineDetail - linedetail.Filename = f - linedetail.Avgconf = avg - linedetail.Filebase = strings.Replace(f, ".prob", "", 1) - linedetail.Basename = filepath.Base(linedetail.Filebase) - linedetail.Dirname = filepath.Dir(linedetail.Filebase) - ft, ferr := ioutil.ReadFile(linedetail.Filebase + ".txt") - if ferr != nil { - log.Fatal(err) + for _, l := range newlines { + lines = append(lines, l) } - linedetail.Fulltext = string(ft) - lines = append(lines, linedetail) } if *nosort == false { @@ -107,7 +63,7 @@ func main() { if *usehtml == false { for _, l := range lines { - fmt.Printf("%s: %.2f%%\n", l.Filename, l.Avgconf) + fmt.Printf("%s %s: %.2f%%\n", l.OcrName, l.Name, l.Avgconf) } } else { fmt.Printf("<!DOCTYPE html><html><head><meta charset='UTF-8'><title></title><style>td {border: 1px solid #444}</style></head><body>\n") @@ -115,8 +71,10 @@ func main() { for _, l := range lines { fmt.Printf("<tr>\n") fmt.Printf("<td><h1>%.4f%%</h1></td>\n", l.Avgconf) - fmt.Printf("<td>%s</td>\n", l.Filebase) - fmt.Printf("<td><img src='%s' /><br />%s</td>\n", l.Filebase + ".bin.png", l.Fulltext) + fmt.Printf("<td>%s %s</td>\n", l.OcrName, l.Name) + // TODO: think about this, what do we want to do here? if showing imgs is important, + // will need to copy them somewhere, so works with hocr too + //fmt.Printf("<td><img src='%s' /><br />%s</td>\n", l.Filebase + ".bin.png", l.Fulltext) fmt.Printf("</tr>\n") } fmt.Printf("</table>\n") |