diff options
| -rw-r--r-- | bucket-lines-prob/bucket-lines-prob.go | 56 | ||||
| -rw-r--r-- | bucket-lines/bucket-lines.go (renamed from bucket-lines-hocr/bucket-lines-hocr.go) | 34 | ||||
| -rw-r--r-- | parse/hocr/hocr.go | 9 | 
3 files changed, 29 insertions, 70 deletions
| diff --git a/bucket-lines-prob/bucket-lines-prob.go b/bucket-lines-prob/bucket-lines-prob.go deleted file mode 100644 index 728268d..0000000 --- a/bucket-lines-prob/bucket-lines-prob.go +++ /dev/null @@ -1,56 +0,0 @@ -package main - -import ( -	"flag" -	"fmt" -	"log" -	"os" - -	"git.rescribe.xyz/testingtools/parse" -	"git.rescribe.xyz/testingtools/parse/prob" -) - -func main() { -	b := parse.BucketSpecs{ -		// minimum confidence, name -		{ 0, "bad" }, -		{ 0.95, "95to98" }, -		{ 0.98, "98plus" }, -	} - -	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: bucket-lines-prob [-d dir] prob1 [prob2] [...]\n") -		fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") -		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") -		fmt.Fprintf(os.Stderr, "This uses the .prob files generated by ocropy-rpred's --probabilities\n") -		fmt.Fprintf(os.Stderr, "option, which it assumes will be in the same directory as the line's\n") -		fmt.Fprintf(os.Stderr, "image and text files.\n") -		flag.PrintDefaults() -	} -	dir := flag.String("d", "buckets", "Directory to store the buckets") -	flag.Parse() -	if flag.NArg() < 1 { -		flag.Usage() -		os.Exit(1) -	} - -	lines := make(parse.LineDetails, 0) - -	for _, f := range flag.Args() { -		newlines, err := prob.GetLineDetails(f) -		if err != nil { -			log.Fatal(err) -		} - -                for _, l := range newlines { -                        lines = append(lines, l) -                } -	} - -	stats, err := parse.BucketUp(lines, b, *dir) -	if err != nil { -		log.Fatal(err) -	} - -	parse.PrintBucketStats(os.Stdout, stats) -} diff --git a/bucket-lines-hocr/bucket-lines-hocr.go b/bucket-lines/bucket-lines.go index b35c824..ad73fcd 100644 --- a/bucket-lines-hocr/bucket-lines-hocr.go +++ b/bucket-lines/bucket-lines.go @@ -1,7 +1,5 @@  package main -// TODO: merge with -prob, using filename extension to determine what to do for each file -  import (  	"flag"  	"fmt" @@ -14,9 +12,11 @@ import (  	"git.rescribe.xyz/testingtools/parse"  	"git.rescribe.xyz/testingtools/parse/hocr" +	"git.rescribe.xyz/testingtools/parse/prob"  ) -func detailsFromFile(f string) (parse.LineDetails, error) { +// TODO: maybe move this into hocr.go +func detailsFromHocr(f string) (parse.LineDetails, error) {  	var newlines parse.LineDetails  	file, err := ioutil.ReadFile(f) @@ -45,6 +45,9 @@ func detailsFromFile(f string) (parse.LineDetails, error) {  }  func main() { +	// TODO: Allow different specs to be used for .prob vs .hocr. Do this +	//       by adding a field to LineDetails that is linked to a named +	//       BucketSpecs.  	b := parse.BucketSpecs{  		// minimum confidence, name  		{ 0, "bad" }, @@ -53,13 +56,15 @@ func main() {  	}  	flag.Usage = func() { -		fmt.Fprintf(os.Stderr, "Usage: bucket-lines-hocr [-d dir] hocr1 [hocr2] [...]\n") +		fmt.Fprintf(os.Stderr, "Usage: bucket-lines [-d dir] [hocr1] [prob1] [hocr2] [...]\n")  		fmt.Fprintf(os.Stderr, "Copies image-text line pairs into different directories according\n") -		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n") -		fmt.Fprintf(os.Stderr, "This uses the x_wconf data in .hocr files, which it assumes will be.\n") -		fmt.Fprintf(os.Stderr, "in the same directory as the line's image and text files. It can\n") -		fmt.Fprintf(os.Stderr, "handle hocr where each character is tagged separately and hocr where\n") -		fmt.Fprintf(os.Stderr, "only whole words are tagged.\n") +		fmt.Fprintf(os.Stderr, "to the average character probability for the line.\n\n") +		fmt.Fprintf(os.Stderr, "Both .hocr and .prob files can be processed.\n\n") +		fmt.Fprintf(os.Stderr, "For .hocr files, the x_wconf data is used to calculate confidence.\n\n") +		fmt.Fprintf(os.Stderr, "The .prob files are generated using ocropy-rpred's --probabilities\n") +		fmt.Fprintf(os.Stderr, "option.\n\n") +		fmt.Fprintf(os.Stderr, "The .prob and .hocr files are assumed to be in the same directory\n") +		fmt.Fprintf(os.Stderr, "as the line's image and text files.\n\n")  		flag.PrintDefaults()  	}  	dir := flag.String("d", "buckets", "Directory to store the buckets") @@ -69,10 +74,19 @@ func main() {  		os.Exit(1)  	} +	var err error  	lines := make(parse.LineDetails, 0)  	for _, f := range flag.Args() { -		newlines, err := detailsFromFile(f) +		var newlines parse.LineDetails +		switch ext := filepath.Ext(f); ext { +			case ".prob": +				newlines, err = prob.GetLineDetails(f) +			case ".hocr": +				newlines, err = detailsFromHocr(f) +			default: +				log.Printf("Skipping file '%s' as it isn't a .prob or .hocr\n", f) +		}  		if err != nil {  			log.Fatal(err)  		} diff --git a/parse/hocr/hocr.go b/parse/hocr/hocr.go index f7cac05..c03b73a 100644 --- a/parse/hocr/hocr.go +++ b/parse/hocr/hocr.go @@ -1,9 +1,10 @@  package hocr -// TODO: consider making GetLineDetails() a function of Hocr, so could do a -//       similar thing with prob format files too, and then fire them both -//       off a generic interface, potentially. -// TODO: Parse line name to zero pad line numbers, so they come out in the correct order +// TODO: Parse line name to zero pad line numbers, so they can +//       be sorted easily +// TODO: have same filename format as .prob uses, so include base +//       dirname, and don't include line numbers if there's only +//       one line in the hocr  import (  	"encoding/xml" | 
