summaryrefslogtreecommitdiff
path: root/cmd/pare-gt
diff options
context:
space:
mode:
authorNick White <git@njw.name>2020-02-27 17:45:16 +0000
committerNick White <git@njw.name>2020-02-27 17:45:16 +0000
commit3880414bbf2d6f2cd05e208abf919ae5ceabeddc (patch)
treedee30a151048de65a3e42cfdae7739c4502e148f /cmd/pare-gt
parentcda45588cfb796fdd2af27b1851685270df2c02b (diff)
Reorganise all commands to be behind cmd/
Diffstat (limited to 'cmd/pare-gt')
-rw-r--r--cmd/pare-gt/main.go147
-rw-r--r--cmd/pare-gt/main_test.go78
2 files changed, 225 insertions, 0 deletions
diff --git a/cmd/pare-gt/main.go b/cmd/pare-gt/main.go
new file mode 100644
index 0000000..a4d9600
--- /dev/null
+++ b/cmd/pare-gt/main.go
@@ -0,0 +1,147 @@
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "math/rand"
+ "os"
+ "path"
+ "path/filepath"
+ "sort"
+ "strings"
+)
+
// usage is the help text that main installs as the body of flag.Usage;
// the flag defaults are printed after it. The positional arguments are
// referred to as gtdir and movedir throughout.
const usage = `Usage: pare-gt [-n num] gtdir movedir

Moves some of the ground truth from gtdir into movedir,
ensuring that the same proportions of each ground truth
source are represented in the moved section. Proportion of
ground truth source is calculated by taking the prefix of
the filename up to the first '-' character.
`
+
// Prefixes maps a ground truth source prefix to the files that carry
// it. As filled in by walker, the key is the part of a file's base
// name before its first '-' ("" when the name contains no '-') and
// each value is the file's path with the ".txt" extension removed.
type Prefixes = map[string][]string

// walker returns a filepath.WalkFunc that records every non-directory
// entry whose name ends in ".txt" in *prefixes, filed under its prefix
// (blank if no '-' separator was found in the base name).
//
// Any error handed to the callback by the walk is returned unchanged,
// which aborts the walk. If *prefixes is nil a map is allocated before
// the first entry is stored.
func walker(prefixes *Prefixes) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		// Only the ".txt" comparison matters here, and that gives the
		// same answer whichever path separator the OS uses.
		ext := path.Ext(fpath)
		if ext != ".txt" {
			return nil
		}

		// fpath is an OS path supplied by filepath.Walk, so the base
		// name has to be split off with filepath, not path; otherwise
		// a '-' in a directory name could be mistaken for the prefix
		// separator on systems that do not use '/'.
		base := filepath.Base(fpath)
		var prefix string
		if idx := strings.Index(base, "-"); idx > -1 {
			prefix = base[:idx]
		}

		if *prefixes == nil {
			*prefixes = make(Prefixes)
		}
		noext := strings.TrimSuffix(fpath, ext)
		(*prefixes)[prefix] = append((*prefixes)[prefix], noext)
		return nil
	}
}
+
// inStrSlice reports whether s is equal to at least one element of
// sl. A nil or empty slice never contains anything.
func inStrSlice(sl []string, s string) bool {
	for i := 0; i < len(sl); i++ {
		if sl[i] != s {
			continue
		}
		return true
	}
	return false
}
+
+// samplePrefixes selects random samples for each prefix, proportional
+// to the amount of that prefix there are in the whole set, so that a
+// total of perctosample% are sampled.
+func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) {
+ var total, sample int
+ var keys []string
+ for i, v := range prefixes {
+ total += len(v)
+ // needed for determinism
+ sort.Strings(prefixes[i])
+ keys = append(keys, i)
+ }
+
+ sample = (total * perctosample) / 100
+
+ // This ensures the map is looped over deterministically
+ sort.Strings(keys)
+ for _, key := range keys {
+ prefix := prefixes[key]
+ len := len(prefix)
+ if len == 1 {
+ continue
+ }
+ numtoget := int(float64(sample) / float64(total) * float64(len))
+ if numtoget >= len {
+ numtoget = len - 1
+ }
+ if numtoget < 1 {
+ numtoget = 1
+ }
+ for i := 0; i < numtoget; i++ {
+ var selected string
+ selected = prefix[rand.Int()%len]
+ // pick a different random selection if the first one is
+ // already in the filestomove slice
+ for inStrSlice(filestomove, selected) {
+ selected = prefix[rand.Int()%len]
+ }
+ filestomove = append(filestomove, selected)
+ }
+ }
+
+ return
+}
+
+func main() {
+ flag.Usage = func() {
+ fmt.Fprintf(flag.CommandLine.Output(), usage)
+ flag.PrintDefaults()
+ }
+ numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.")
+ flag.Parse()
+ if flag.NArg() != 2 {
+ flag.Usage()
+ os.Exit(1)
+ }
+
+ for _, d := range flag.Args() {
+ info, err := os.Stat(d)
+ if err != nil || !info.IsDir() {
+ log.Fatalln("Error accessing directory", flag.Arg(0), err)
+ }
+ }
+
+ var prefixes Prefixes
+ prefixes = make(Prefixes)
+ err := filepath.Walk(flag.Arg(0), walker(&prefixes))
+ if err != nil {
+ log.Fatalln("Failed to walk", flag.Arg(0), err)
+ }
+
+ filestomove := samplePrefixes(*numtopare, prefixes)
+
+ for _, f := range filestomove {
+ fmt.Println("Moving ground truth", f)
+ b := path.Base(f)
+ for _, ext := range []string{".txt", ".png"} {
+ err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext))
+ if err != nil {
+ log.Fatalln("Error moving file", f+ext, err)
+ }
+ }
+ }
+}
diff --git a/cmd/pare-gt/main_test.go b/cmd/pare-gt/main_test.go
new file mode 100644
index 0000000..c381a86
--- /dev/null
+++ b/cmd/pare-gt/main_test.go
@@ -0,0 +1,78 @@
+package main
+
+import (
+ "fmt"
+ "testing"
+)
+
+func TestSamplePrefixes(t *testing.T) {
+ prefixes := Prefixes{
+ "1471-Orthographia": {
+ "1471-Orthographia-Tortellius_00001.txt",
+ "1471-Orthographia-Tortellius_00002.txt",
+ "1471-Orthographia-Tortellius_00003.txt",
+ "1471-Orthographia-Tortellius_00004.txt",
+ "1471-Orthographia-Tortellius_00005.txt",
+ "1471-Orthographia-Tortellius_00006.txt",
+ "1471-Orthographia-Tortellius_00007.txt",
+ "1471-Orthographia-Tortellius_00008.txt",
+ "1471-Orthographia-Tortellius_00009.txt",
+ "1471-Orthographia-Tortellius_000010.txt",
+ "1471-Orthographia-Tortellius_000011.txt",
+ "1471-Orthographia-Tortellius_000012.txt",
+ "1471-Orthographia-Tortellius_000013.txt",
+ "1471-Orthographia-Tortellius_000014.txt",
+ "1471-Orthographia-Tortellius_000015.txt",
+ "1471-Orthographia-Tortellius_000016.txt",
+ "1471-Orthographia-Tortellius_000017.txt",
+ "1471-Orthographia-Tortellius_000018.txt",
+ "1471-Orthographia-Tortellius_000019.txt",
+ "1471-Orthographia-Tortellius_000020.txt",
+ },
+ "Kallimachos_1509": {
+ "Kallimachos_1509-ShipOfFools-Barclay_00121.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00122.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00123.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00124.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00125.txt",
+ "Kallimachos_1509-ShipOfFools-Barclay_00126.txt",
+ },
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": {
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt",
+ "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt",
+ },
+ }
+
+ cases := []struct {
+ perc int
+ expected []string
+ }{
+ {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}},
+ {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}},
+ {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}},
+ }
+
+ for _, c := range cases {
+ t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) {
+ actual := samplePrefixes(c.perc, prefixes)
+ if len(c.expected) != len(actual) {
+ t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual)
+ return
+ }
+ for i, v := range c.expected {
+ if actual[i] != v {
+ t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual)
+ }
+ }
+ })
+ }
+}