diff options
Diffstat (limited to 'pare-gt')
| -rw-r--r-- | pare-gt/main.go | 147 | ||||
| -rw-r--r-- | pare-gt/main_test.go | 78 | 
2 files changed, 0 insertions, 225 deletions
// pare-gt/main.go -- file removed by this commit
// (diff --git a/pare-gt/main.go b/pare-gt/main.go, deleted file mode 100644,
// index a4d9600..0000000, hunk @@ -1,147 +0,0 @@).
//
// Package clause and imports of the removed file:
//
//	package main
//
//	import (
//		"flag"
//		"fmt"
//		"log"
//		"math/rand"
//		"os"
//		"path"
//		"path/filepath"
//		"sort"
//		"strings"
//	)
//
// The listing below handles file system paths with path/filepath throughout,
// so it no longer references the slash-only "path" package.

const usage = `Usage: pare-gt [-n num] gtdir movedir

Moves some of the ground truth from gt-dir into movedir,
ensuring that the same proportions of each ground truth
source are represented in the moved section. Proportion of
ground truth source is calculated by taking the prefix of
the filename up to the first '-' character.
`

// Prefixes is a map of the prefix string to a list of filenames
type Prefixes = map[string][]string

// sampler is the random source used by samplePrefixes. It is seeded with a
// fixed value so that the same input always yields the same selection, which
// is what the sorting in samplePrefixes is there to guarantee. The top-level
// math/rand functions cannot be relied on for that: since Go 1.20 they are
// seeded randomly at program start.
var sampler = rand.New(rand.NewSource(1))

// walker returns a filepath.WalkFunc that records every regular ".txt" path
// it visits in the prefixes map. The key is the part of the file's base name
// before the first '-' (the empty string if there is no '-'); the value that
// is appended is the full path with the ".txt" extension removed. Errors
// handed in by filepath.Walk are returned unchanged, which aborts the walk.
func walker(prefixes *Prefixes) filepath.WalkFunc {
	return func(fpath string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		// fpath is an OS path, so use path/filepath rather than path to
		// split it; path only understands forward slashes.
		ext := filepath.Ext(fpath)
		if ext != ".txt" {
			return nil
		}
		base := filepath.Base(fpath)
		var prefix string
		if idx := strings.Index(base, "-"); idx > -1 {
			prefix = base[:idx]
		}
		noext := strings.TrimSuffix(fpath, ext)
		(*prefixes)[prefix] = append((*prefixes)[prefix], noext)
		return nil
	}
}

// inStrSlice checks whether a given string is part of a slice of
// strings
func inStrSlice(sl []string, s string) bool {
	for _, v := range sl {
		if s == v {
			return true
		}
	}
	return false
}

// samplePrefixes selects random samples for each prefix, proportional
// to the amount of that prefix there are in the whole set, so that a
// total of roughly perctosample% are sampled.
//
// Prefixes are visited in sorted key order and each prefix's file list is
// sorted in place first, so the result depends only on the input and on the
// state of the package's seeded random source. For every prefix with at
// least two files, at least one and at most len-1 files are chosen; prefixes
// with zero or one file are skipped entirely. No name is returned twice, and
// a nil slice is returned when nothing qualifies.
func samplePrefixes(perctosample int, prefixes Prefixes) (filestomove []string) {
	total := 0
	keys := make([]string, 0, len(prefixes))
	for key, files := range prefixes {
		total += len(files)
		// Sorting the shared backing array in place is needed for
		// determinism: the walk order of the caller must not matter.
		sort.Strings(files)
		keys = append(keys, key)
	}
	// Map iteration order is random; fix the order the groups are sampled in.
	sort.Strings(keys)

	sample := (total * perctosample) / 100

	// picked mirrors filestomove as a set so membership tests are O(1).
	picked := make(map[string]bool)
	for _, key := range keys {
		files := prefixes[key]
		n := len(files)
		// A lone file is never moved (its source would vanish from the
		// remaining set), and an empty group has nothing to pick from --
		// without this guard the modulo below would divide by zero.
		if n <= 1 {
			continue
		}

		numtoget := int(float64(sample) / float64(total) * float64(n))
		if numtoget >= n {
			numtoget = n - 1
		}
		if numtoget < 1 {
			numtoget = 1
		}

		// The loop below redraws until it hits a name that has not been
		// picked yet. Cap the request at the number of such names so that
		// duplicate entries in a group cannot make it spin forever.
		available := make(map[string]bool, n)
		for _, f := range files {
			if !picked[f] {
				available[f] = true
			}
		}
		if numtoget > len(available) {
			numtoget = len(available)
		}

		for i := 0; i < numtoget; i++ {
			selected := files[sampler.Int()%n]
			// pick a different random selection if the first one is
			// already in the filestomove slice
			for picked[selected] {
				selected = files[sampler.Int()%n]
			}
			picked[selected] = true
			filestomove = append(filestomove, selected)
		}
	}

	return filestomove
}

// main parses the command line, collects the ".txt" ground truth files below
// gtdir grouped by prefix, samples -n percent of them and renames each chosen
// file's ".txt" and ".png" into movedir. Any failure is fatal.
func main() {
	flag.Usage = func() {
		// usage is plain text, not a format string.
		fmt.Fprint(flag.CommandLine.Output(), usage)
		flag.PrintDefaults()
	}
	numtopare := flag.Int("n", 10, "Percentage of the ground truth to pare away.")
	flag.Parse()
	if flag.NArg() != 2 {
		flag.Usage()
		os.Exit(1)
	}

	for _, d := range flag.Args() {
		info, err := os.Stat(d)
		// Report the directory that actually failed, not always the first
		// argument, and distinguish "cannot stat" from "not a directory".
		if err != nil {
			log.Fatalln("Error accessing directory", d, err)
		}
		if !info.IsDir() {
			log.Fatalln("Error accessing directory", d, "is not a directory")
		}
	}

	prefixes := make(Prefixes)
	if err := filepath.Walk(flag.Arg(0), walker(&prefixes)); err != nil {
		log.Fatalln("Failed to walk", flag.Arg(0), err)
	}

	filestomove := samplePrefixes(*numtopare, prefixes)

	for _, f := range filestomove {
		fmt.Println("Moving ground truth", f)
		b := filepath.Base(f)
		// Each ground truth item is a transcription plus its line image.
		for _, ext := range []string{".txt", ".png"} {
			src := f + ext
			if err := os.Rename(src, filepath.Join(flag.Arg(1), b+ext)); err != nil {
				log.Fatalln("Error moving file", src, err)
			}
		}
	}
}
diff --git a/pare-gt/main_test.go b/pare-gt/main_test.go deleted file mode 100644 index c381a86..0000000 --- a/pare-gt/main_test.go +++ /dev/null @@ -1,78 +0,0 @@ -package main - -import ( -	"fmt" -	"testing" -) - -func TestSamplePrefixes(t *testing.T) { -	prefixes := Prefixes{ -		"1471-Orthographia": { -			"1471-Orthographia-Tortellius_00001.txt", -			"1471-Orthographia-Tortellius_00002.txt", -			"1471-Orthographia-Tortellius_00003.txt", -			"1471-Orthographia-Tortellius_00004.txt", -			"1471-Orthographia-Tortellius_00005.txt", -			"1471-Orthographia-Tortellius_00006.txt", -			"1471-Orthographia-Tortellius_00007.txt", -			"1471-Orthographia-Tortellius_00008.txt", -			"1471-Orthographia-Tortellius_00009.txt", -			"1471-Orthographia-Tortellius_000010.txt", -			"1471-Orthographia-Tortellius_000011.txt", -			"1471-Orthographia-Tortellius_000012.txt", -			"1471-Orthographia-Tortellius_000013.txt", -			"1471-Orthographia-Tortellius_000014.txt", -			"1471-Orthographia-Tortellius_000015.txt", -			"1471-Orthographia-Tortellius_000016.txt", -			"1471-Orthographia-Tortellius_000017.txt", -			"1471-Orthographia-Tortellius_000018.txt", -			"1471-Orthographia-Tortellius_000019.txt", -			"1471-Orthographia-Tortellius_000020.txt", -		}, -		"Kallimachos_1509": { -			"Kallimachos_1509-ShipOfFools-Barclay_00121.txt", -			"Kallimachos_1509-ShipOfFools-Barclay_00122.txt", -			"Kallimachos_1509-ShipOfFools-Barclay_00123.txt", -			"Kallimachos_1509-ShipOfFools-Barclay_00124.txt", -			"Kallimachos_1509-ShipOfFools-Barclay_00125.txt", -			"Kallimachos_1509-ShipOfFools-Barclay_00126.txt", -		}, -		"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4": { -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_12_49.txt", -			
"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_1_415.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_14_6628571428571429.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_16_865.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_17_62.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_18_6366666666666666.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", -			"buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_19_7857142857142857.txt", -		}, -	} - -	cases := []struct { -		perc     int -		expected []string -	}{ -		{1, []string{"1471-Orthographia-Tortellius_000019.txt", "Kallimachos_1509-ShipOfFools-Barclay_00122.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_13_9033333333333333.txt"}}, -		{10, []string{"1471-Orthographia-Tortellius_00002.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt"}}, -		{20, []string{"1471-Orthographia-Tortellius_00008.txt", "1471-Orthographia-Tortellius_000017.txt", "1471-Orthographia-Tortellius_00006.txt", "Kallimachos_1509-ShipOfFools-Barclay_00126.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_11_27.txt", "buckets_1678_DuHAMEL_PhilosophiaVetusEtNova_Vol4_0008_bin0.4-copy_line_1_10_59125.txt"}}, -	} - -	for _, c := range cases { -		t.Run(fmt.Sprintf("%d%%", c.perc), func(t *testing.T) { -			actual := samplePrefixes(c.perc, prefixes) -			if len(c.expected) != len(actual) { -				t.Fatalf("Number of files picked (%d) differs from expected (%d):\nExpected: %s\nActual: %s\n", len(actual), len(c.expected), c.expected, actual) -				return -			} -			for 
i, v := range c.expected { -				if actual[i] != v { -					t.Fatalf("Difference in expected and actual files (first difference is in index %d of actual):\n\nExpected:\n%s\n\nActual:\n%s\n", i, c.expected, actual) -				} -			} -		}) -	} -} | 
