summaryrefslogtreecommitdiff
path: root/pare-gt
diff options
context:
space:
mode:
Diffstat (limited to 'pare-gt')
-rw-r--r--pare-gt/main.go147
-rw-r--r--pare-gt/main_test.go78
2 files changed, 0 insertions, 225 deletions
diff --git a/pare-gt/main.go b/pare-gt/main.go
deleted file mode 100644
index a4d9600..0000000
--- a/pare-gt/main.go
+++ /dev/null
@@ -1,147 +0,0 @@
-package main
-
-import (
- "flag"
- "fmt"
- "log"
- "math/rand"
- "os"
- "path"
- "path/filepath"
- "sort"
- "strings"
-)
-
// usage is the help text printed ahead of the flag defaults. The
// argument names in the description match the synopsis line.
const usage = `Usage: pare-gt [-n num] gtdir movedir

Moves some of the ground truth from gtdir into movedir,
ensuring that the same proportions of each ground truth
source are represented in the moved section. Proportion of
ground truth source is calculated by taking the prefix of
the filename up to the first '-' character.
`
-
// Prefixes maps a ground truth source prefix to the list of file names
// recorded under that prefix.
type Prefixes = map[string][]string

// walker returns a filepath.WalkFunc that records every non-directory
// entry whose name ends in ".txt" in the map prefixes points to.
//
// The key is the part of the file's base name before the first '-', or
// the empty string when the base name contains no '-'. The stored value
// is the visited path with the ".txt" extension removed. A nil map is
// replaced by a fresh one on the first match, so callers may pass a
// pointer to a zero-valued Prefixes. Errors reported by the walk are
// returned unchanged, which stops the walk.
func walker(prefixes *Prefixes) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		// A plain suffix test: only the final dot matters here, so the
		// separator style of fpath is irrelevant to path.Ext.
		ext := path.Ext(fpath)
		if ext != ".txt" {
			return nil
		}
		// filepath.Walk supplies OS-native paths, so the base name must be
		// taken with path/filepath; the slash-only path package would
		// return the whole path on systems that use another separator.
		base := filepath.Base(fpath)
		var prefix string
		if idx := strings.Index(base, "-"); idx > -1 {
			prefix = base[:idx]
		}
		// Assigning into a nil map panics; create it on demand.
		if *prefixes == nil {
			*prefixes = make(Prefixes)
		}
		noext := strings.TrimSuffix(fpath, ext)
		(*prefixes)[prefix] = append((*prefixes)[prefix], noext)
		return nil
	}
}
-
// inStrSlice reports whether s is equal to at least one element of sl.
// A nil or empty slice never contains anything.
func inStrSlice(sl []string, s string) bool {
	for i := 0; i < len(sl); i++ {
		if sl[i] != s {
			continue
		}
		return true
	}
	return false
}
-
-// samplePrefixes selects random samples for each prefix, proportional
-// to the amount of that prefix there are in the whole set, so that a
-// total of perctosample% are sampled.
-func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) {
- var total, sample int
- var keys []string
- for i, v := range prefixes {
- total += len(v)
- // needed for determinism
- sort.Strings(prefixes[i])
- keys = append(keys, i)
- }
-
- sample = (total * perctosample) / 100
-
- // This ensures the map is looped over deterministically
- sort.Strings(keys)
- for _, key := range keys {
- prefix := prefixes[key]
- len := len(prefix)
- if len == 1 {
- continue
- }
- numtoget := int(float64(sample) / float64(total) * float64(len))
- if numtoget >= len {
- numtoget = len - 1
- }
- if numtoget < 1 {
- numtoget = 1
- }
- for i := 0; i < numtoget; i++ {
- var selected string
- selected = prefix[rand.Int()%len]
- // pick a different random selection if the first one is
- // already in the filestomove slice
- for inStrSlice(filestomove, selected) {
- selected = prefix[rand.Int()%len]
- }
- filestomove = append(filestomove, selected)
- }
- }
-
- return
-}
-
-func main() {
- flag.Usage = func() {
- fmt.Fprintf(flag.CommandLine.Output(), usage)
- flag.PrintDefaults()
- }
- numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.")
- flag.Parse()
- if flag.NArg() != 2 {
- flag.Usage()
- os.Exit(1)
- }
-
- for _, d := range flag.Args() {
- info, err := os.Stat(d)
- if err != nil || !info.IsDir() {
- log.Fatalln("Error accessing directory", flag.Arg(0), err)
- }
- }
-
- var prefixes Prefixes
- prefixes = make(Prefixes)
- err := filepath.Walk(flag.Arg(0), walker(&prefixes))
- if err != nil {
- log.Fatalln("Failed to walk", flag.Arg(0), err)
- }
-
- filestomove := samplePrefixes(*numtopare, prefixes)
-
- for _, f := range filestomove {
- fmt.Println("Moving ground truth", f)
- b := path.Base(f)
- for _, ext := range []string{".txt", ".png"} {
- err = os.Rename(f+ext, path.Join(flag.Arg(1), b+ext))
- if err != nil {
- log.Fatalln("Error moving file", f+ext, err)
- }
- }
- }
-}
diff --git a/pare-gt/main_test.go b/pare-gt/main_test.go
deleted file mode 100644
index c381a86..0000000
--- a/pare-gt/main_test.go
+++ /dev/null
@@ -1,78 +0,0 @@
-package main
-
-import (
- "fmt"
- "testing"
-)
-
-func TestSamplePrefixes(t *testing.T) {
- prefixes := Prefixes{
- "1471-Orthographia": {
- "1471-Orthographia-Tortellius_00001.txt",
- "1471-Orthographia-Tortellius_00002.txt",
- "1471-Orthographia-Tortellius_00003.txt",
- "1471-Orthographia-Tortellius_00004.txt",
- "1471-Orthographia-Tortellius_00005.txt",
- "1471-Orthographia-Tortellius_00006.txt",
- "1471-Orthographia-Tortellius_00007.txt",
- "1471-Orthographia-Tortellius_00008.txt",
- "1471-Orthographia-Tortellius_00009.txt",
- "1471-Orthographia-Tortellius_000010.txt",
- "1471-Orthographia-Tortellius_000011.txt",
- "1471-Orthographia-Tortellius_000012.txt",
- "1471-Orthographia-Tortellius_000013.txt",
- "1471-Orthographia-Tortellius_000014.txt",
- "1471-Orthographia-Tortellius_000015.txt",
- "1471-Orthographia-Tortellius_000016.txt",
- "1471-Orthographia-Tortellius_000017.txt",
- "1471-Orthographia-Tortellius_000018.txt",
- "1471-Orthographia-Tortellius_000019.txt",
- "1471-Orthographia-Tortellius_000020.txt",
- },
- "Kallimachos_1509": {
- "Kallimachos_1509-ShipOfFools-Barclay_00121.txt",
- "Kallimachos_1509-ShipOfFools-Barclay_00122.txt",
- "Kallimachos_1509-ShipOfFools-Barclay_00123.txt",
- "Kallimachos_1509-ShipOfFools-Barclay_00124.txt",
- "Kallimachos_1509-ShipOfFools-Barclay_00125.txt",
- "Kallimachos_1509-ShipOfFools-Barclay_00126.txt",
- },
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": {
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt",
- "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt",
- },
- }
-
- cases := []struct {
- perc int
- expected []string
- }{
- {1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}},
- {10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}},
- {20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}},
- }
-
- for _, c := range cases {
- t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) {
- actual := samplePrefixes(c.perc, prefixes)
- if len(c.expected) != len(actual) {
- t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual)
- return
- }
- for i, v := range c.expected {
- if actual[i] != v {
- t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual)
- }
- }
- })
- }
-}