From 01d5636bbb33edcace579b0026c89a86536b741d Mon Sep 17 00:00:00 2001 From: Nick White Date: Mon, 10 Feb 2020 16:11:16 +0000 Subject: Add pare-gt tool --- pare-gt/main.go | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 pare-gt/main.go diff --git a/pare-gt/main.go b/pare-gt/main.go new file mode 100644 index 0000000..d07e141 --- /dev/null +++ b/pare-gt/main.go @@ -0,0 +1,133 @@ +package main + +import ( + "flag" + "fmt" + "log" + "math/rand" + "os" + "path" + "path/filepath" + "strings" +) + +const usage = `Usage: pare-gt [-n num] gtdir movedir + +Moves some of the ground truth from gt-dir into movedir, +ensuring that the same proportions of each ground truth +source are represented in the moved section. Proportion of +ground truth source is calculated by taking the prefix of +the filename up to the first '-' character. +` + +// Prefixes is a map of the prefix string to a list of filenames +type Prefixes = map[string][]string + +// walker adds any .txt path to prefixes map, under the appropriate +// prefix (blank if no '-' separator was found) +func walker(prefixes *Prefixes) filepath.WalkFunc{ + return func(fpath string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if info.IsDir() { + return nil + } + ext := path.Ext(fpath) + if ext != ".txt" { + return nil + } + base := path.Base(fpath) + idx := strings.Index(base, "-") + var prefix string + if idx > -1 { + prefix = base[0:idx] + } + noext := strings.TrimSuffix(fpath, ext) + (*prefixes)[prefix] = append((*prefixes)[prefix], noext) + return nil + } +} + +// inStrSlice checks whether a given string is part of a slice of +// strings +func inStrSlice(sl []string, s string) bool { + for _, v := range sl { + if s == v { + return true + } + } + return false +} + +func main() { + flag.Usage = func() { + fmt.Fprintf(flag.CommandLine.Output(), usage) + flag.PrintDefaults() + } + numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.") + flag.Parse() + if flag.NArg() != 2 { + flag.Usage() + os.Exit(1) + } + + for _, d := range flag.Args() { + info, err := os.Stat(d) + if err != nil || !info.IsDir() { + log.Fatalln("Error accessing directory", flag.Arg(0), err) + } + } + + var prefixes Prefixes + prefixes = make(Prefixes) + err := filepath.Walk(flag.Arg(0), walker(&prefixes)) + if err != nil { + log.Fatalln("Failed to walk", flag.Arg(0), err) + } + + var total, sample int + for _, v := range prefixes { + total += len(v) + // fmt.Printf("\n%s:\n%s\n", i, v) + } + + sample = total / *numtopare + + // filestomove contains the names of files to move minus file extension + var filestomove []string + + // select random samples for each prefix, proportional to + // the amount of that prefix there are in the whole set + for _, prefix := range prefixes { + len := len(prefix) + if len == 1 { + continue + } + numtoget := int(float64(sample) / float64(total) * float64(len)) + if numtoget < 1 { + numtoget = 1 + } + for i:=0; i