From 85a955c689f2c64bd5fb98635052684db5ca9e6f Mon Sep 17 00:00:00 2001 From: Nick White Date: Tue, 20 Oct 2020 16:12:31 +0100 Subject: Add postprocess-bythresh cmd --- cmd/postprocess-bythresh/main.go | 270 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 cmd/postprocess-bythresh/main.go diff --git a/cmd/postprocess-bythresh/main.go b/cmd/postprocess-bythresh/main.go new file mode 100644 index 0000000..37b77e7 --- /dev/null +++ b/cmd/postprocess-bythresh/main.go @@ -0,0 +1,270 @@ +package main + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + + "rescribe.xyz/utils/pkg/hocr" +) + +//TO DO: make writetofile return an error and handle that accordingly +// potential TO DO: add text versions where footer is cropped on odd/even pages only + + +// the trimblanks function trims the blank lines from a text input +func trimblanks(hocrfile string) string { + + filein := bytes.NewBufferString(hocrfile) + + var noblanks string + + scanner := bufio.NewScanner(filein) + + for scanner.Scan() { + + eachline := scanner.Text() + trimmed := strings.TrimSpace(eachline) + if len(trimmed) != 0 { + noblanks = noblanks + eachline + "\n" + } + } + return noblanks + +} + +// the dehyphenateString function is copy-pasted from Nick's code (see rescribe.xyz/utils/cmd/dehyphenator/main.go), written to dehyphenate +// a string or hocr file. Only one small change from the original: the string is dehyphenated and concatenated WITHOUT line breaks +func dehyphenateString(in string) string { + var newlines []string + lines := strings.Split(in, "\n") + for i, line := range lines { + words := strings.Split(line, " ") + last := words[len(words)-1] + // the - 2 here is to account for a trailing newline and counting from zero + if len(last) > 0 && last[len(last) - 1] == '-' && i < len(lines) - 2 { + nextwords := strings.Split(lines[i+1], " ") + if len(nextwords) > 0 { + line = line[0:len(line)-1] + nextwords[0] + } + if len(nextwords) > 1 { + lines[i+1] = strings.Join(nextwords[1:], " ") + } else { + lines[i+1] = "" + } + } + newlines = append(newlines, line) + } + return strings.Join(newlines, " ") +} + + +// the fullcrop function takes a text input and crops the first and the last line (if text is at least 2 lines long) +func fullcrop(noblanks string) string { + + + alllines := strings.Split(noblanks, "\n") + + if len(alllines) <= 2 { + return "" + } else { + return strings.Join(alllines[1:len(alllines)-2], "\n") + } + +} + +// the headcrop function takes a text input and crops the first line provided text is longer than 1 line +func headcrop(noblanks string) string { + + alllines := strings.Split(noblanks, "\n") + + switch { + + case len(alllines) == 2: + return strings.Join(alllines[1:], "\n") + + case len(alllines) < 2: + return "" + + default: + return strings.Join(alllines[1:], "\n") + + } + +} + +// the footcrop function takes a text input and crops the last line provided text is longer than 1 line +func footcrop(noblanks string) string { + + alllines := strings.Split(noblanks, "\n") + + switch { + + case len(alllines) == 2: + return strings.Join(alllines[0:len(alllines)-2], "\n") + + case len(alllines) < 2: + return "" + + default: + return strings.Join(alllines[0:len(alllines)-2], "\n") + + } + +} + +// the convertselect function selects the hocr from the bookdirectory above a given confidence threshold and +// converts it to text, trims each text and appends all into one textbase and saves it as a text file. +// the function returns one full version, one with headers and footers cropped, one with only +//headers cropped +func convertselect(bookdirectory, hocrfilename string, confthresh int) (string, string, string, string) { + + var alltxt string + var croptxt string + var killheadtxt string + var footkilltxt string + + + hocrfilepath := filepath.Join(bookdirectory, hocrfilename) + + confpath := filepath.Join(bookdirectory, "conf") + + readConf, err := os.Open(confpath) + if err != nil { + log.Fatalf("failed to open file: %s", err) + } + defer readConf.Close() + + scanner := bufio.NewScanner(readConf) + var confline string + var confvalue int + + for scanner.Scan() { + confline = scanner.Text() + if strings.Contains(confline, hocrfilename) { + substring := strings.Split(confline, " ") + if len(substring) != 2 { + log.Fatalf("Bailing as conf file %s doesn't seem to be formatted correctly (wants 2 fields separated by ' ')\n", confpath) + } + confvalue, _ = strconv.Atoi(substring[1]) + } + + } + readConf.Close() + + if confvalue > confthresh { + hocrfiletext, err := hocr.GetText(hocrfilepath) + if err != nil { + log.Fatal(err) + } + + + trimbest := trimblanks(hocrfiletext) + + alltxt = dehyphenateString(trimbest) + + croptxt = dehyphenateString(fullcrop(trimbest)) + + killheadtxt = dehyphenateString(headcrop(trimbest)) + + footkilltxt = dehyphenateString(footcrop(trimbest)) + + + } + return alltxt, croptxt, killheadtxt, footkilltxt +} + +// the writetofile function takes a directory, filename and text input and creates a text file within the bookdirectory from them. +func writetofile(bookdirectory, textfilebase, txt string) error { + alltxtfile := filepath.Join(bookdirectory, textfilebase) + + file, err := os.Create(alltxtfile) + if err != nil { + return fmt.Errorf("Error opening file %s: %v", alltxtfile, err) + } + defer file.Close() + if _, err := file.WriteString(txt); err != nil { + log.Println(err) + } +return err + +} + +func main() { + + confthresh := flag.Int("c", 30, "Chosen confidence threshold. Default:30") + + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: command -c confidence-threshold bookdirectory \n") + fmt.Fprintf(os.Stderr, "Creates different text versions from the hocr files of a bookdirectory.\n") + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() != 1 { + flag.Usage() + os.Exit(1) + } + + bookdirectory := flag.Arg(0) + confthreshstring := strconv.Itoa(*confthresh) + + fmt.Println("Postprocessing", bookdirectory, "with threshold", *confthresh) + + bestpath := filepath.Join(bookdirectory, "best") + + readBest, err := ioutil.ReadFile(bestpath) + if err != nil { + log.Fatalf("failed to read file: %s", err) + } + + Bestin := string([]byte(readBest)) + bestslice := strings.Split(Bestin, "\n") + sort.Strings(bestslice) + + var all, crop, killhead, killfoot string + + for _, v := range bestslice { + + if v != "" { + alltxt, croptxt, killheadtxt, footkilltxt := convertselect(bookdirectory, v, *confthresh) + all = all + " " + alltxt + crop = crop + " " + croptxt + killhead = killhead + " " + killheadtxt + killfoot = killfoot + " " + footkilltxt + + } + } + + + bookname:= filepath.Base(bookdirectory) + b := bookname + "_" + confthreshstring + + err1 := writetofile(bookdirectory, b + "_all.txt", all) + if err1 != nil { + log.Fatalf("Ah shit, we're going down, Nick says ABORT! %v", err1) + } + + err2 := writetofile(bookdirectory, b + "_crop.txt", crop) + if err2 != nil { + log.Fatalf("Ah shit, we're going down, Nick says ABORT! %v", err2) + } + + err3 := writetofile(bookdirectory, b + "_nohead.txt", killhead) + if err3 != nil { + log.Fatalf("Ah shit, we're going down, Nick says ABORT! %v", err3) + } + + err4 := writetofile(bookdirectory, b + "_nofoot.txt", killfoot) + if err4 != nil { + log.Fatalf("Ah shit, we're going down, Nick says ABORT! %v", err4) + } + +} -- cgit v1.2.1-24-ge1ad