git » summer » commit c87601c

Implement support for working on a random subset of files

author Alberto Bertogli
2025-03-15 14:32:58 UTC
committer Alberto Bertogli
2025-03-16 11:41:36 UTC
parent 36f1cb944ece0ee8cf6a3a5b82a0ab371f631a7f

Implement support for working on a random subset of files

This patch implements a new `-subsetpct` flag, that makes summer only
work on a randomly selected subset of the files, targeting to include
the given % of files.

Due to randomness, the actual files selected will be different each run,
and can be higher or lower than the given target percentage.

This is mainly useful to verify a random selection of files (for
example, to schedule a verification of a random 20% of files per day).

errors_test.go +5 -0
subset.go +58 -0
subset_test.go +67 -0
summer.go +8 -0
test/help.t +8 -0
test/subset.t +53 -0
walk.go +5 -0

diff --git a/errors_test.go b/errors_test.go
index 3c9cd8a..c62bd4d 100644
--- a/errors_test.go
+++ b/errors_test.go
@@ -13,6 +13,11 @@ import (
 // can't easily simulate Read seeing an "xattr not found" in an end-to-end
 // test.
 
+func init() {
+	// Initialize the subset options, since they are used as part of the walk.
+	options.subset, _ = NewSubset()
+}
+
 func TestDBReadError(t *testing.T) {
 	f, err := os.Open("/dev/null")
 	if err != nil {
diff --git a/subset.go b/subset.go
new file mode 100644
index 0000000..633abfe
--- /dev/null
+++ b/subset.go
@@ -0,0 +1,58 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"math/rand/v2"
+)
+
+// Flags.
+var (
+	subsetPct = flag.Uint("subsetpct", 100,
+		"percentage of files to process (0 = none, 100 = all)")
+	randSeed = flag.Uint64("subsetseed", 0,
+		"seed for the subset selection PRNG, useful for testing (0 = random)")
+)
+
+type Subset struct {
+	// Percentage of files to process (0 = none, 100 = all).
+	percent uint
+
+	// Random source for subset selection.
+	rand *rand.Rand
+}
+
+func NewSubset() (*Subset, error) {
+	if *subsetPct > 100 {
+		return nil, fmt.Errorf(
+			"subset percentage %d must be in the [0, 100] range",
+			*subsetPct)
+	}
+
+	// Seed the PRNG. If the user didn't specify a seed, use two random
+	// numbers.
+	seed1, seed2 := rand.Uint64(), rand.Uint64()
+	if *randSeed != 0 {
+		seed1 = 0
+		seed2 = *randSeed
+	}
+
+	return &Subset{
+		percent: *subsetPct,
+		rand:    rand.New(rand.NewPCG(seed1, seed2)),
+	}, nil
+}
+
+func (s *Subset) ShouldProcess() bool {
+	// Special-case 0% and 100% to avoid picking a random number
+	// unnecessarily.
+	if s.percent == 100 {
+		return true
+	} else if s.percent == 0 {
+		return false
+	}
+
+	// Note this is NOT thread-safe, but the caller is single threaded so it's
+	// fine for our use case.
+	return s.rand.UintN(100) < s.percent
+}
diff --git a/subset_test.go b/subset_test.go
new file mode 100644
index 0000000..feed056
--- /dev/null
+++ b/subset_test.go
@@ -0,0 +1,67 @@
+package main
+
+import (
+	"fmt"
+	"testing"
+)
+
+// Create a Subset instance with a variety of percents, and then confirm that
+// after 100k runs, the distribution of ShouldProcess() calls is as expected.
+func TestSubset(t *testing.T) {
+	pcts := []uint{0, 1, 10, 50, 70, 90, 99, 100}
+	for _, pct := range pcts {
+		t.Run(fmt.Sprintf("percent=%d", pct), func(t *testing.T) {
+			testSubset(t, pct)
+		})
+	}
+}
+
+func testSubset(t *testing.T, pct uint) {
+	*subsetPct = pct
+	subset, err := NewSubset()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	count := uint64(1_000_000)
+	selected := uint64(0)
+	for range count {
+		if subset.ShouldProcess() {
+			selected++
+		}
+	}
+
+	// Confirm that the number of selected items is within 3% of the expected.
+	// With these many samples, we don't expect false positives.
+	expected_min := count * uint64(max(int(pct)-3, 0)) / 100
+	expected_max := count * uint64(min(int(pct)+3, 100)) / 100
+	if selected < expected_min || selected > expected_max {
+		t.Errorf("selected %d items, expected [%d, %d]",
+			selected, expected_min, expected_max)
+	}
+}
+
+// Benchmark the performance of the ShouldProcess() method, for 0%, 50%, and
+// 100%. 0% and 100% should be faster since they don't pick a random number,
+// and the 50% case should give us a sense of the overhead of the random
+// decision.
+func BenchmarkSubset(b *testing.B) {
+	for _, pct := range []uint{0, 50, 100} {
+		b.Run(fmt.Sprintf("percent=%d", pct), func(b *testing.B) {
+			benchmarkSubset(b, pct)
+		})
+	}
+}
+
+func benchmarkSubset(b *testing.B, pct uint) {
+	*subsetPct = pct
+	subset, err := NewSubset()
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		subset.ShouldProcess()
+	}
+}
diff --git a/summer.go b/summer.go
index 131ab2e..cfdc86a 100644
--- a/summer.go
+++ b/summer.go
@@ -70,6 +70,9 @@ var options = struct {
 
 	// How many files to process in parallel.
 	parallel int
+
+	// Subset to decide which files to process.
+	subset *Subset
 }{}
 
 func Usage() {
@@ -105,6 +108,11 @@ func main() {
 		options.parallel = runtime.NumCPU()
 	}
 
+	options.subset, err = NewSubset()
+	if err != nil {
+		Fatalf("%v", err)
+	}
+
 	op := flag.Arg(0)
 	roots := []string{}
 	if flag.NArg() > 1 {
diff --git a/test/help.t b/test/help.t
index 4a2ec9f..d1f1622 100644
--- a/test/help.t
+++ b/test/help.t
@@ -41,6 +41,10 @@ No arguments.
     -parallel int
       \tnumber of files to process in parallel (0 = number of CPUs) (esc)
     -q\tquiet mode (esc)
+    -subsetpct uint
+      \tpercentage of files to process (0 = none, 100 = all) (default 100) (esc)
+    -subsetseed uint
+      \tseed for the subset selection PRNG, useful for testing (0 = random) (esc)
     -v\tverbose mode (list each file) (esc)
     -x\tdon't cross filesystem boundaries (esc)
   [1]
@@ -85,6 +89,10 @@ Too few arguments.
     -parallel int
       \tnumber of files to process in parallel (0 = number of CPUs) (esc)
     -q\tquiet mode (esc)
+    -subsetpct uint
+      \tpercentage of files to process (0 = none, 100 = all) (default 100) (esc)
+    -subsetseed uint
+      \tseed for the subset selection PRNG, useful for testing (0 = random) (esc)
     -v\tverbose mode (list each file) (esc)
     -x\tdon't cross filesystem boundaries (esc)
   [1]
diff --git a/test/subset.t b/test/subset.t
new file mode 100644
index 0000000..24015be
--- /dev/null
+++ b/test/subset.t
@@ -0,0 +1,53 @@
+Tests for only working on a subset of the files.
+
+  $ alias summer="$TESTDIR/../summer"
+
+Test data, with some directories (so we can check we go in depth regardless of
+the random subset).
+
+  $ mkdir dir1 dir2 dir3
+  $ touch dir1/file1 dir1/file2 dir1/file3
+  $ touch dir2/file1 dir2/file2 dir2/file3
+  $ touch dir3/file1 dir3/file2 dir3/file3
+
+Pick a 50% subset, verify that only the files we expect are considered. Note we
+need to use a fixed seed and disable parallelism so we have fully predictable
+output to test against.
+
+  $ summer -v \
+  >   -subsetseed=69 -subsetpct=50 -parallel=1 \
+  >   generate .
+  "dir1/file1": writing checksum \(checksum:0, mtime:\d+\) (re)
+  "dir1/file2": writing checksum \(checksum:0, mtime:\d+\) (re)
+  "dir1/file3": writing checksum \(checksum:0, mtime:\d+\) (re)
+  "dir2/file1": writing checksum \(checksum:0, mtime:\d+\) (re)
+  "dir3/file2": writing checksum \(checksum:0, mtime:\d+\) (re)
+  0s: 0 matched, 0 modified, 5 new, 0 corrupted
+
+Verify using same subset and seed.
+
+  $ summer -v \
+  >   -subsetseed=69 -subsetpct=50 -parallel=1 \
+  >   verify .
+  "dir1/file1": match \(checksum:0, mtime:\d+\) (re)
+  "dir1/file2": match \(checksum:0, mtime:\d+\) (re)
+  "dir1/file3": match \(checksum:0, mtime:\d+\) (re)
+  "dir2/file1": match \(checksum:0, mtime:\d+\) (re)
+  "dir3/file2": match \(checksum:0, mtime:\d+\) (re)
+  0s: 5 matched, 0 modified, 0 new, 0 corrupted
+
+Check -subset flag validation.
+
+  $ summer -subsetpct=101 verify .
+  subset percentage 101 must be in the [0, 100] range
+  [1]
+
+Test that with a 100% subset, all files are included.
+
+  $ summer -subsetpct=100 verify .
+  0s: 5 matched, 0 modified, 4 new, 0 corrupted
+
+Test that with a 0% subset, no files are included.
+
+  $ summer -subsetpct=0 verify .
+  0s: 0 matched, 0 modified, 0 new, 0 corrupted
diff --git a/walk.go b/walk.go
index fa6faa3..f8f0383 100644
--- a/walk.go
+++ b/walk.go
@@ -26,6 +26,11 @@ func openAndInfo(path string, d fs.DirEntry, err error, rootDev deviceID) (bool,
 		return false, nil, nil, nil
 	}
 
+	// If we are only processing a subset of the files, skip some of them.
+	if !options.subset.ShouldProcess() {
+		return false, nil, nil, nil
+	}
+
 	// It is important that we obtain fs.FileInfo at this point, before
 	// reading any of the file contents, because the file could be modified
 	// while we do so. See the comment on ChecksumV1.ModTimeUsec for more