diff options
Diffstat (limited to 'cmd/pare-gt')
-rw-r--r-- | cmd/pare-gt/main.go | 147 | ||||
-rw-r--r-- | cmd/pare-gt/main_test.go | 78 |
2 files changed, 225 insertions, 0 deletions
const usage = `Usage: pare-gt [-n num] gtdir movedir

Moves some of the ground truth from gt-dir into movedir,
ensuring that the same proportions of each ground truth
source are represented in the moved section. Proportion of
ground truth source is calculated by taking the prefix of
the filename up to the first '-' character.
`

// Prefixes is a map of the prefix string to a list of filenames.
type Prefixes = map[string][]string

// walker returns a filepath.WalkFunc that records every non-directory
// path ending in ".txt" in the prefixes map. The map key is the part of
// the file's base name before the first '-' (blank if the base name
// contains no '-'), and the stored value is the full path with the
// ".txt" extension removed.
//
// Any error handed to the returned function by the walk is returned
// unchanged, which aborts the walk.
func walker(prefixes *Prefixes) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		// fpath is an OS path handed over by filepath.Walk, so it has
		// to be taken apart with path/filepath rather than path.
		ext := filepath.Ext(fpath)
		if ext != ".txt" {
			return nil
		}
		base := filepath.Base(fpath)
		var prefix string
		if idx := strings.Index(base, "-"); idx > -1 {
			prefix = base[:idx]
		}
		noext := strings.TrimSuffix(fpath, ext)
		(*prefixes)[prefix] = append((*prefixes)[prefix], noext)
		return nil
	}
}

// inStrSlice checks whether a given string is part of a slice of
// strings.
func inStrSlice(sl []string, s string) bool {
	for _, v := range sl {
		if s == v {
			return true
		}
	}
	return false
}

// samplePrefixes selects random samples for each prefix, proportional
// to the amount of that prefix there are in the whole set, so that
// roughly perctosample% of all files are sampled.
//
// Prefixes holding fewer than two files are skipped. Every other prefix
// contributes at least one file and at most all but one of its files,
// so small percentages still yield one file per multi-file prefix.
//
// The filename slices held in prefixes are sorted in place. Prefixes
// are visited in sorted key order, so the result depends only on the
// input and on the state of the math/rand global generator.
//
// The filenames within a prefix are expected to be distinct: a
// candidate that was already chosen is redrawn until a new one turns
// up, which cannot terminate if a prefix runs out of unchosen names.
func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) {
	var total int
	keys := make([]string, 0, len(prefixes))
	for key, files := range prefixes {
		total += len(files)
		// needed for determinism
		sort.Strings(files)
		keys = append(keys, key)
	}

	sample := (total * perctosample) / 100

	// This ensures the map is looped over deterministically
	sort.Strings(keys)
	for _, key := range keys {
		files := prefixes[key]
		n := len(files)
		// An empty prefix has nothing to pick from (and would make the
		// modulo below divide by zero); a single file is never moved.
		if n <= 1 {
			continue
		}
		numtoget := int(float64(sample) / float64(total) * float64(n))
		if numtoget >= n {
			numtoget = n - 1
		}
		if numtoget < 1 {
			numtoget = 1
		}
		for i := 0; i < numtoget; i++ {
			selected := files[rand.Int()%n]
			// pick a different random selection if the first one is
			// already in the filestomove slice
			for inStrSlice(filestomove, selected) {
				selected = files[rand.Int()%n]
			}
			filestomove = append(filestomove, selected)
		}
	}

	return filestomove
}

// main parses the command line, collects the .txt ground truth files
// under the first directory argument, samples the requested percentage
// of them and moves each sampled .txt file, together with the .png of
// the same name, into the second directory argument.
func main() {
	flag.Usage = func() {
		// Fprint rather than Fprintf: the usage text is not a format
		// string.
		fmt.Fprint(flag.CommandLine.Output(), usage)
		flag.PrintDefaults()
	}
	numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.")
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}
	gtdir, movedir := flag.Arg(0), flag.Arg(1)

	for _, d := range []string{gtdir, movedir} {
		info, err := os.Stat(d)
		if err != nil {
			log.Fatalln("Error accessing directory", d, err)
		}
		if !info.IsDir() {
			log.Fatalln("Error accessing directory", d, "not a directory")
		}
	}

	prefixes := make(Prefixes)
	if err := filepath.Walk(gtdir, walker(&prefixes)); err != nil {
		log.Fatalln("Failed to walk", gtdir, err)
	}

	filestomove := samplePrefixes(*numtopare, prefixes)

	for _, f := range filestomove {
		fmt.Println("Moving ground truth", f)
		b := filepath.Base(f)
		for _, ext := range []string{".txt", ".png"} {
			if err := os.Rename(f+ext, filepath.Join(movedir, b+ext)); err != nil {
				log.Fatalln("Error moving file", f+ext, err)
			}
		}
	}
}
b/cmd/pare-gt/main_test.go new file mode 100644 index 0000000..c381a86 --- /dev/null +++ b/cmd/pare-gt/main_test.go @@ -0,0 +1,78 @@ +package main + +import ( + "fmt" + "testing" +) + +func TestSamplePrefixes(t *testing.T) { + prefixes := Prefixes{ + "1471-Orthographia": { + "1471-Orthographia-Tortellius_00001.txt", + "1471-Orthographia-Tortellius_00002.txt", + "1471-Orthographia-Tortellius_00003.txt", + "1471-Orthographia-Tortellius_00004.txt", + "1471-Orthographia-Tortellius_00005.txt", + "1471-Orthographia-Tortellius_00006.txt", + "1471-Orthographia-Tortellius_00007.txt", + "1471-Orthographia-Tortellius_00008.txt", + "1471-Orthographia-Tortellius_00009.txt", + "1471-Orthographia-Tortellius_000010.txt", + "1471-Orthographia-Tortellius_000011.txt", + "1471-Orthographia-Tortellius_000012.txt", + "1471-Orthographia-Tortellius_000013.txt", + "1471-Orthographia-Tortellius_000014.txt", + "1471-Orthographia-Tortellius_000015.txt", + "1471-Orthographia-Tortellius_000016.txt", + "1471-Orthographia-Tortellius_000017.txt", + "1471-Orthographia-Tortellius_000018.txt", + "1471-Orthographia-Tortellius_000019.txt", + "1471-Orthographia-Tortellius_000020.txt", + }, + "Kallimachos_1509": { + "Kallimachos_1509-ShipOfFools-Barclay_00121.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00123.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00124.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00125.txt", + "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", + }, + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": { + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt", + 
"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", + "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", + }, + } + + cases := []struct { + perc int + expected []string + }{ + {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}}, + {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}}, + {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}}, + } + + for _, c := range cases { + t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) { + actual := samplePrefixes(c.perc, prefixes) + if len(c.expected) != len(actual) { + t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual) + return + } + for i, v := range c.expected { + if actual[i] != v { + t.Fatalf("Difference in expected and actual files (first difference is in index %d of 
actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual) + } + } + }) + } +} |