// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan_test

import (
	"fmt"
	"internal/cpu"
	"internal/goarch"
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
	"math/bits"
	"math/rand/v2"
	"slices"
	"sync"
	"testing"
	"unsafe"
)

type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)

func testScanSpanPacked(t *testing.T, scanF scanFunc) {
	scanR := scan.ScanSpanPackedReference

	// Construct a fake memory
	mem, free := makeMem(t, 1)
	defer free()
	for i := range mem {
		// Use values > gc.PageSize because a scan function can discard
		// pointers smaller than this.
		mem[i] = uintptr(int(gc.PageSize) + i + 1)
	}

	// Construct a random pointer mask
	rnd := rand.New(rand.NewPCG(42, 42))
	var ptrs gc.PtrMask
	for i := range ptrs {
		ptrs[i] = uintptr(rnd.Uint64())
	}

	// Run the function under test and the reference implementation on the
	// same inputs and compare the results.
	bufF := make([]uintptr, gc.PageWords)
	bufR := make([]uintptr, gc.PageWords)
	testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
		nF := scanF(unsafe.Pointer(&mem[0]), &bufF[0], objs, uintptr(sizeClass), &ptrs)
		nR := scanR(unsafe.Pointer(&mem[0]), &bufR[0], objs, uintptr(sizeClass), &ptrs)
		if nR != nF {
			t.Errorf("want %d count, got %d", nR, nF)
		} else if !slices.Equal(bufF[:nF], bufR[:nR]) {
			t.Errorf("want scanned pointers %d, got %d", bufR[:nR], bufF[:nF])
		}
	})
}

func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) {
	for sizeClass := range gc.NumSizeClasses {
		if sizeClass == 0 {
			continue
		}
		size := uintptr(gc.SizeClassToSize[sizeClass])
		if size > gc.MinSizeForMallocHeader {
			break // Pointer/scalar metadata is not packed for larger sizes.
		}
		t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) {
			// Scan a few objects near i to test boundary conditions.
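			// objMask has bits 0 and 8 set, so each case marks two objects
			// spaced 8 apart and shifts that pair across the span. The mask
			// is written into a single goarch.PtrBits-wide word of the mark
			// bitmap, so when the pair would cross a word boundary only the
			// lower object remains marked.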
			const objMask = 0x101
			nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
			for i := range nObj - uintptr(bits.Len(objMask)-1) {
				t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) {
					var objs gc.ObjMask
					objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits)
					f(t, sizeClass, &objs)
				})
			}
		})
	}
}

var dataCacheSizes = sync.OnceValue(func() []uintptr {
	cs := cpu.DataCacheSizes()
	for i, c := range cs {
		fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize)
	}
	return cs
})

func BenchmarkScanSpanPacked(b *testing.B) {
	benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses)
}

func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) {
	cacheSizes := dataCacheSizes()
	b.Run("cache=tiny/pages=1", func(b *testing.B) {
		fn(b, 1)
	})
	for i, cacheBytes := range cacheSizes {
		// Use 3/4 of each cache level so the working set fits comfortably
		// within that level.
		pages := int(cacheBytes*3/4) / gc.PageSize
		b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) {
			fn(b, pages)
		})
	}
	if len(cacheSizes) == 0 {
		return
	}
	// Use 1.5x the last-level cache so the working set spills out to RAM.
	ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize
	b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) {
		fn(b, ramPages)
	})
}

func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) {
	for sc := range gc.NumSizeClasses {
		if sc == 0 {
			continue
		}
		if uintptr(gc.SizeClassToSize[sc]) > gc.MinSizeForMallocHeader {
			break // Pointer/scalar metadata is not packed for larger sizes.
		}
		b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) {
			benchmarkScanSpanPacked(b, nPages, sc)
		})
	}
}

func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
	rnd := rand.New(rand.NewPCG(42, 42))

	// Construct a fake memory
	mem, free := makeMem(b, nPages)
	defer free()
	for i := range mem {
		// Use values > gc.PageSize because a scan function can discard
		// pointers smaller than this.
		mem[i] = uintptr(int(gc.PageSize) + i + 1)
	}

	// Construct a random pointer mask
	ptrs := make([]gc.PtrMask, nPages)
	for i := range ptrs {
		for j := range ptrs[i] {
			ptrs[i][j] = uintptr(rnd.Uint64())
		}
	}

	// Visit the pages in a random order
	pageOrder := rnd.Perm(nPages)

	// Create the scan buffer.
	buf := make([]uintptr, gc.PageWords)

	// Sweep from 0 marks to all marks. Use the same marks for each page;
	// predictability shouldn't matter here.
	objBytes := uintptr(gc.SizeClassToSize[sizeClass])
	nObj := gc.PageSize / objBytes
	markOrder := rnd.Perm(int(nObj))
	const steps = 11
	for i := 0; i < steps; i++ {
		frac := float64(i) / float64(steps-1)

		// Set frac marks.
		nMarks := int(float64(len(markOrder))*frac + 0.5)
		var objMarks gc.ObjMask
		for _, mark := range markOrder[:nMarks] {
			objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits)
		}
		greyClusters := 0
		for page := range ptrs {
			greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page])
		}

		// Report MB/s in terms of how much memory the scan actually hits.
		// This assumes 64 byte cache lines (TODO: Should it assume 128 byte
		// cache lines?) and expands each access to the whole cache line.
		// This is useful for comparing against memory bandwidth.
		//
		// TODO: Add a benchmark that just measures single core memory bandwidth
		// for comparison. (See runtime memcpy benchmarks.)
		//
		// TODO: Should there be a separate measure where we don't expand to
		// cache lines?
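		// avgBytes is the average number of cache-line bytes a single page
		// scan touches: each grey cluster is charged one full cache line and
		// the total is divided by the number of pages. For example (numbers
		// illustrative only), 64 pages with 4096 grey clusters in total and
		// 64-byte lines give 4096*64/64 = 4096 bytes per scanned page.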
		avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs))

		b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) {
			b.Run("impl=Reference", func(b *testing.B) {
				b.SetBytes(avgBytes)
				for i := range b.N {
					page := pageOrder[i%len(pageOrder)]
					scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
				}
			})
			b.Run("impl=Go", func(b *testing.B) {
				b.SetBytes(avgBytes)
				for i := range b.N {
					page := pageOrder[i%len(pageOrder)]
					scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
				}
			})
			if scan.HasFastScanSpanPacked() {
				b.Run("impl=Platform", func(b *testing.B) {
					b.SetBytes(avgBytes)
					for i := range b.N {
						page := pageOrder[i%len(pageOrder)]
						scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
					}
				})
			}
		})
	}
}

func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int {
	clusters := 0
	lastCluster := -1

	expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
	for word := range gc.PageWords {
		objI := uintptr(word) / expandBy
		if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
			continue
		}
		if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
			continue
		}
		c := word * 8 / goarch.PtrBits
		if c != lastCluster {
			lastCluster = c
			clusters++
		}
	}

	return clusters
}

func BenchmarkScanMaxBandwidth(b *testing.B) {
	// Measure the theoretical "maximum" bandwidth of scanning by reproducing
	// the memory access pattern of a full page scan, but using memcpy as the
	// kernel instead of scanning.
	benchmarkCacheSizes(b, func(b *testing.B, heapPages int) {
		mem, free := makeMem(b, heapPages)
		defer free()
		for i := range mem {
			mem[i] = uintptr(int(gc.PageSize) + i + 1)
		}
		buf := make([]uintptr, gc.PageWords)

		// Visit the pages in a random order
		rnd := rand.New(rand.NewPCG(42, 42))
		pageOrder := rnd.Perm(heapPages)

		b.SetBytes(int64(gc.PageSize))
		b.ResetTimer()
		for i := range b.N {
			page := pageOrder[i%len(pageOrder)]
			copy(buf, mem[gc.PageWords*page:])
		}
	})
}
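
// makeMem is not defined in this file; the tests and benchmarks above assume
// a helper elsewhere in this package that returns a []uintptr spanning nPages
// Go heap pages plus a release function. The sketch below is a hypothetical
// stand-in (note the different name) showing one way such a helper could be
// written under those assumptions; the real helper may instead map memory
// directly from the OS.
func makeMemSketch(tb testing.TB, nPages int) (mem []uintptr, free func()) {
	tb.Helper()
	// Over-allocate by one page so a gc.PageSize-aligned window of exactly
	// nPages pages can be sliced out of the backing store.
	backing := make([]uintptr, (nPages+1)*int(gc.PageWords))
	base := uintptr(unsafe.Pointer(&backing[0]))
	off := 0
	if rem := base % gc.PageSize; rem != 0 {
		off = int((gc.PageSize - rem) / goarch.PtrSize)
	}
	// The backing slice stays reachable through mem, so there is nothing to
	// unmap and free is a no-op.
	return backing[off : off+nPages*int(gc.PageWords)], func() {}
}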