diff options
Diffstat (limited to 'pare-gt')
-rw-r--r-- | pare-gt/main.go | 147 | ||||
-rw-r--r-- | pare-gt/main_test.go | 78 |
2 files changed, 0 insertions, 225 deletions
diff --git a/pare-gt/main.go b/pare-gt/main.go deleted file mode 100644 index a4d9600..0000000 --- a/pare-gt/main.go +++ /dev/null @@ -1,147 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "log" - "math/rand" - "os" - "path" - "path/filepath" - "sort" - "strings" -) - -const usage = `Usage: pare-gt [-n num] gtdir movedir - -Moves some of the ground truth from gt-dir into movedir, -ensuring that the same proportions of each ground truth -source are represented in the moved section. Proportion of -ground truth source is calculated by taking the prefix of -the filename up to the first '-' character. -` - -// Prefixes is a map of the prefix string to a list of filenames -type Prefixes = map[string][]string - -// walker adds any .txt path to prefixes map, under the appropriate -// prefix (blank if no '-' separator was found) -func walker(prefixes *Prefixes) filepath.WalkFunc { - return func(fpath string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if info.IsDir() { - return nil - } - ext := path.Ext(fpath) - if ext != ".txt" { - return nil - } - base := path.Base(fpath) - idx := strings.Index(base, "-") - var prefix string - if idx > -1 { - prefix = base[0:idx] - } - noext := strings.TrimSuffix(fpath, ext) - (*prefixes)[prefix] = append((*prefixes)[prefix], noext) - return nil - } -} - -// inStrSlice checks whether a given string is part of a slice of -// strings -func inStrSlice(sl []string, s string) bool { - for _, v := range sl { - if s == v { - return true - } - } - return false -} - -// samplePrefixes selects random samples for each prefix, proportional -// to the amount of that prefix there are in the whole set, so that a -// total of perctosample% are sampled. -func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) { - var total, sample int - var keys []string - for i, v := range prefixes { - total += len(v) - // needed for determinism - sort.Strings(prefixes[i]) - keys = append(keys, i) - } - - sample = (total * perctosample) / 100 - - // This ensures the map is looped over deterministically - sort.Strings(keys) - for _, key := range keys { - prefix := prefixes[key] - len := len(prefix) - if len == 1 { - continue - } - numtoget := int(float64(sample) / float64(total) * float64(len)) - if numtoget >= len { - numtoget = len - 1 - } - if numtoget < 1 { - numtoget = 1 - } - for i := 0; i < numtoget; i++ { - var selected string - selected = prefix[rand.Int()%len] - // pick a different random selection if the first one is - // already in the filestomove slice - for inStrSlice(filestomove, selected) { - selected = prefix[rand.Int()%len] - } - filestomove = append(filestomove, selected) - } - } - - return -} - -func main() { - flag.Usage = func() { - fmt.Fprintf(flag.CommandLine.Output(), usage) - flag.PrintDefaults() - } - numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.") - flag.Parse() - if flag.NArg() != 2 { - flag.Usage() - os.Exit(1) - } - - for _, d := range flag.Args() { - info, err := os.Stat(d) - if err != nil || !info.IsDir() { - log.Fatalln("Error accessing directory", flag.Arg(0), err) - } - } - - var prefixes Prefixes - prefixes = make(Prefixes) - err := filepath.Walk(flag.Arg(0), walker(&prefixes)) - if err != nil { - log.Fatalln("Failed to walk", flag.Arg(0), err) - } - - filestomove := samplePrefixes(*numtopare, prefixes) - - for _, f := range filestomove { - fmt.Println("Moving ground truth", f) - b := path.Base(f) - for _, ext := range []string{".txt", ".png"} { - err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext)) - if err != nil { - log.Fatalln("Error moving file", f+ext, err) - } - } - } -} diff --git a/pare-gt/main_test.go b/pare-gt/main_test.go deleted file mode 100644 index c381a86..0000000 --- a/pare-gt/main_test.go +++ /dev/null @@ -1,78 +0,0 @@ -package main - -import ( - "fmt" - "testing" -) - -func TestSamplePrefixes(t *testing.T) { - prefixes := Prefixes{ - "1471-Orthographia": { - "1471-Orthographia-Tortellius_00001.txt", - "1471-Orthographia-Tortellius_00002.txt", - "1471-Orthographia-Tortellius_00003.txt", - "1471-Orthographia-Tortellius_00004.txt", - "1471-Orthographia-Tortellius_00005.txt", - "1471-Orthographia-Tortellius_00006.txt", - "1471-Orthographia-Tortellius_00007.txt", - "1471-Orthographia-Tortellius_00008.txt", - "1471-Orthographia-Tortellius_00009.txt", - "1471-Orthographia-Tortellius_000010.txt", - "1471-Orthographia-Tortellius_000011.txt", - "1471-Orthographia-Tortellius_000012.txt", - "1471-Orthographia-Tortellius_000013.txt", - "1471-Orthographia-Tortellius_000014.txt", - "1471-Orthographia-Tortellius_000015.txt", - "1471-Orthographia-Tortellius_000016.txt", - "1471-Orthographia-Tortellius_000017.txt", - "1471-Orthographia-Tortellius_000018.txt", - "1471-Orthographia-Tortellius_000019.txt", - "1471-Orthographia-Tortellius_000020.txt", - }, - "Kallimachos_1509": { - "Kallimachos_1509-ShipOfFools-Barclay_00121.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00123.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00124.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00125.txt", - "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", - }, - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": { - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", - "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", - }, - } - - cases := []struct { - perc int - expected []string - }{ - {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}}, - {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}}, - {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}}, - } - - for _, c := range cases { - t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) { - actual := samplePrefixes(c.perc, prefixes) - if len(c.expected) != len(actual) { - t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual) - return - } - for i, v := range c.expected { - if actual[i] != v { - t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual) - } - } - }) - } -} |