author | Alberto Bertogli
<albertito@blitiri.com.ar> 2025-03-15 14:32:58 UTC |
committer | Alberto Bertogli
<albertito@blitiri.com.ar> 2025-03-16 11:41:36 UTC |
parent | 36f1cb944ece0ee8cf6a3a5b82a0ab371f631a7f |
errors_test.go | +5 | -0 |
subset.go | +58 | -0 |
subset_test.go | +67 | -0 |
summer.go | +8 | -0 |
test/help.t | +8 | -0 |
test/subset.t | +53 | -0 |
walk.go | +5 | -0 |
diff --git a/errors_test.go b/errors_test.go index 3c9cd8a..c62bd4d 100644 --- a/errors_test.go +++ b/errors_test.go @@ -13,6 +13,11 @@ import ( // can't easily simulate Read seeing an "xattr not found" in an end-to-end // test. +func init() { + // Initialize the subset options, since they are used as part of the walk. + options.subset, _ = NewSubset() +} + func TestDBReadError(t *testing.T) { f, err := os.Open("/dev/null") if err != nil { diff --git a/subset.go b/subset.go new file mode 100644 index 0000000..633abfe --- /dev/null +++ b/subset.go @@ -0,0 +1,58 @@ +package main + +import ( + "flag" + "fmt" + "math/rand/v2" +) + +// Flags. +var ( + subsetPct = flag.Uint("subsetpct", 100, + "percentage of files to process (0 = none, 100 = all)") + randSeed = flag.Uint64("subsetseed", 0, + "seed for the subset selection PRNG, useful for testing (0 = random)") +) + +type Subset struct { + // Percentage of files to process (0 = none, 100 = all). + percent uint + + // Random source for subset selection. + rand *rand.Rand +} + +func NewSubset() (*Subset, error) { + if *subsetPct > 100 { + return nil, fmt.Errorf( + "subset percentage %d must be in the [0, 100] range", + *subsetPct) + } + + // Seed the PRNG. If the user didn't specify a seed, use two random + // numbers. + seed1, seed2 := rand.Uint64(), rand.Uint64() + if *randSeed != 0 { + seed1 = 0 + seed2 = *randSeed + } + + return &Subset{ + percent: *subsetPct, + rand: rand.New(rand.NewPCG(seed1, seed2)), + }, nil +} + +func (s *Subset) ShouldProcess() bool { + // Special-case 0% and 100% to avoid picking a random number + // unnecessarily. + if s.percent == 100 { + return true + } else if s.percent == 0 { + return false + } + + // Note this is NOT thread-safe, but the caller is single threaded so it's + // fine for our use case. + return s.rand.UintN(100) < s.percent +} diff --git a/subset_test.go b/subset_test.go new file mode 100644 index 0000000..feed056 --- /dev/null +++ b/subset_test.go @@ -0,0 +1,67 @@ +package main + +import ( + "fmt" + "testing" +) + +// Create a Subset instance with a variety of percents, and then confirm that +// after 100k runs, the distribution of ShouldProcess() calls is as expected. +func TestSubset(t *testing.T) { + pcts := []uint{0, 1, 10, 50, 70, 90, 99, 100} + for _, pct := range pcts { + t.Run(fmt.Sprintf("percent=%d", pct), func(t *testing.T) { + testSubset(t, pct) + }) + } +} + +func testSubset(t *testing.T, pct uint) { + *subsetPct = pct + subset, err := NewSubset() + if err != nil { + t.Fatal(err) + } + + count := uint64(1_000_000) + selected := uint64(0) + for range count { + if subset.ShouldProcess() { + selected++ + } + } + + // Confirm that the number of selected items is within 3% of the expected. + // With these many samples, we don't expect false positives. + expected_min := count * uint64(max(int(pct)-3, 0)) / 100 + expected_max := count * uint64(min(int(pct)+3, 100)) / 100 + if selected < expected_min || selected > expected_max { + t.Errorf("selected %d items, expected [%d, %d]", + selected, expected_min, expected_max) + } +} + +// Benchmark the performance of the ShouldProcess() method, for 0%, 50%, and +// 100%. 0% and 100% should be faster since they don't pick a random number, +// and the 50% case should give us a sense of the overhead of the random +// decision. +func BenchmarkSubset(b *testing.B) { + for _, pct := range []uint{0, 50, 100} { + b.Run(fmt.Sprintf("percent=%d", pct), func(b *testing.B) { + benchmarkSubset(b, pct) + }) + } +} + +func benchmarkSubset(b *testing.B, pct uint) { + *subsetPct = pct + subset, err := NewSubset() + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + subset.ShouldProcess() + } +} diff --git a/summer.go b/summer.go index 131ab2e..cfdc86a 100644 --- a/summer.go +++ b/summer.go @@ -70,6 +70,9 @@ var options = struct { // How many files to process in parallel. parallel int + + // Subset to decide which files to process. + subset *Subset }{} func Usage() { @@ -105,6 +108,11 @@ func main() { options.parallel = runtime.NumCPU() } + options.subset, err = NewSubset() + if err != nil { + Fatalf("%v", err) + } + op := flag.Arg(0) roots := []string{} if flag.NArg() > 1 { diff --git a/test/help.t b/test/help.t index 4a2ec9f..d1f1622 100644 --- a/test/help.t +++ b/test/help.t @@ -41,6 +41,10 @@ No arguments. -parallel int \tnumber of files to process in parallel (0 = number of CPUs) (esc) -q\tquiet mode (esc) + -subsetpct uint + \tpercentage of files to process (0 = none, 100 = all) (default 100) (esc) + -subsetseed uint + \tseed for the subset selection PRNG, useful for testing (0 = random) (esc) -v\tverbose mode (list each file) (esc) -x\tdon't cross filesystem boundaries (esc) [1] @@ -85,6 +89,10 @@ Too few arguments. -parallel int \tnumber of files to process in parallel (0 = number of CPUs) (esc) -q\tquiet mode (esc) + -subsetpct uint + \tpercentage of files to process (0 = none, 100 = all) (default 100) (esc) + -subsetseed uint + \tseed for the subset selection PRNG, useful for testing (0 = random) (esc) -v\tverbose mode (list each file) (esc) -x\tdon't cross filesystem boundaries (esc) [1] diff --git a/test/subset.t b/test/subset.t new file mode 100644 index 0000000..24015be --- /dev/null +++ b/test/subset.t @@ -0,0 +1,53 @@ +Tests for only working on a subset of the files. + + $ alias summer="$TESTDIR/../summer" + +Test data, with some directories (so we can check we go in depth regardless of +the random subset). + + $ mkdir dir1 dir2 dir3 + $ touch dir1/file1 dir1/file2 dir1/file3 + $ touch dir2/file1 dir2/file2 dir2/file3 + $ touch dir3/file1 dir3/file2 dir3/file3 + +Pick a 50% subset, verify that only the files we expect are considered. Note we +need to use a fixed seed and disable parallelism so we have fully predictable +output to test against. + + $ summer -v \ + > -subsetseed=69 -subsetpct=50 -parallel=1 \ + > generate . + "dir1/file1": writing checksum \(checksum:0, mtime:\d+\) (re) + "dir1/file2": writing checksum \(checksum:0, mtime:\d+\) (re) + "dir1/file3": writing checksum \(checksum:0, mtime:\d+\) (re) + "dir2/file1": writing checksum \(checksum:0, mtime:\d+\) (re) + "dir3/file2": writing checksum \(checksum:0, mtime:\d+\) (re) + 0s: 0 matched, 0 modified, 5 new, 0 corrupted + +Verify using same subset and seed. + + $ summer -v \ + > -subsetseed=69 -subsetpct=50 -parallel=1 \ + > verify . + "dir1/file1": match \(checksum:0, mtime:\d+\) (re) + "dir1/file2": match \(checksum:0, mtime:\d+\) (re) + "dir1/file3": match \(checksum:0, mtime:\d+\) (re) + "dir2/file1": match \(checksum:0, mtime:\d+\) (re) + "dir3/file2": match \(checksum:0, mtime:\d+\) (re) + 0s: 5 matched, 0 modified, 0 new, 0 corrupted + +Check -subset flag validation. + + $ summer -subsetpct=101 verify . + subset percentage 101 must be in the [0, 100] range + [1] + +Test that with a 100% subset, all files are included. + + $ summer -subsetpct=100 verify . + 0s: 5 matched, 0 modified, 4 new, 0 corrupted + +Test that with a 0% subset, no files are included. + + $ summer -subsetpct=0 verify . + 0s: 0 matched, 0 modified, 0 new, 0 corrupted diff --git a/walk.go b/walk.go index fa6faa3..f8f0383 100644 --- a/walk.go +++ b/walk.go @@ -26,6 +26,11 @@ func openAndInfo(path string, d fs.DirEntry, err error, rootDev deviceID) (bool, return false, nil, nil, nil } + // If we are only processing a subset of the files, skip some of them. + if !options.subset.ShouldProcess() { + return false, nil, nil, nil + } + // It is important that we obtain fs.FileInfo at this point, before // reading any of the file contents, because the file could be modified // while we do so. See the comment on ChecksumV1.ModTimeUsec for more