Source file src/runtime/metrics_test.go

     1  // Copyright 2020 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package runtime_test
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"internal/abi"
    11  	"internal/goexperiment"
    12  	"internal/profile"
    13  	"internal/testenv"
    14  	"os"
    15  	"reflect"
    16  	"runtime"
    17  	"runtime/debug"
    18  	"runtime/metrics"
    19  	"runtime/pprof"
    20  	"runtime/trace"
    21  	"slices"
    22  	"sort"
    23  	"strings"
    24  	"sync"
    25  	"sync/atomic"
    26  	"testing"
    27  	"time"
    28  	"unsafe"
    29  )
    30  
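         // prepareAllMetricsSamples returns a Sample slice covering every supported
         // metric, plus a map from metric name to its Description for looking up
         // each metric's expected kind.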
    31  func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) {
    32  	all := metrics.All()
    33  	samples := make([]metrics.Sample, len(all))
    34  	descs := make(map[string]metrics.Description)
    35  	for i := range all {
    36  		samples[i].Name = all[i].Name
    37  		descs[all[i].Name] = all[i]
    38  	}
    39  	return descs, samples
    40  }
    41  
    42  func TestReadMetrics(t *testing.T) {
    43  	// Run a GC cycle to get some of the stats to be non-zero.
    44  	runtime.GC()
    45  
    46  	// Set an arbitrary memory limit to check the metric for it
    47  	limit := int64(512 * 1024 * 1024)
    48  	oldLimit := debug.SetMemoryLimit(limit)
    49  	defer debug.SetMemoryLimit(oldLimit)
    50  
    51  	// Set a GC percent to check the metric for it
    52  	gcPercent := 99
    53  	oldGCPercent := debug.SetGCPercent(gcPercent)
    54  	defer debug.SetGCPercent(oldGCPercent)
    55  
    56  	// Tests whether readMetrics produces values aligning
    57  	// with ReadMemStats while the world is stopped.
    58  	var mstats runtime.MemStats
    59  	_, samples := prepareAllMetricsSamples()
    60  	runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples))
    61  
    62  	checkUint64 := func(t *testing.T, m string, got, want uint64) {
    63  		t.Helper()
    64  		if got != want {
    65  			t.Errorf("metric %q: got %d, want %d", m, got, want)
    66  		}
    67  	}
    68  
    69  	// Check to make sure the values we read line up with other values we read.
    70  	var allocsBySize, gcPauses, schedPausesTotalGC *metrics.Float64Histogram
    71  	var tinyAllocs uint64
    72  	var mallocs, frees uint64
    73  	for i := range samples {
    74  		switch name := samples[i].Name; name {
    75  		case "/cgo/go-to-c-calls:calls":
    76  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(runtime.NumCgoCall()))
    77  		case "/memory/classes/heap/free:bytes":
    78  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased)
    79  		case "/memory/classes/heap/released:bytes":
    80  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased)
    81  		case "/memory/classes/heap/objects:bytes":
    82  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc)
    83  		case "/memory/classes/heap/unused:bytes":
    84  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc)
    85  		case "/memory/classes/heap/stacks:bytes":
    86  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse)
    87  		case "/memory/classes/metadata/mcache/free:bytes":
    88  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse)
    89  		case "/memory/classes/metadata/mcache/inuse:bytes":
    90  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse)
    91  		case "/memory/classes/metadata/mspan/free:bytes":
    92  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse)
    93  		case "/memory/classes/metadata/mspan/inuse:bytes":
    94  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse)
    95  		case "/memory/classes/metadata/other:bytes":
    96  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys)
    97  		case "/memory/classes/os-stacks:bytes":
    98  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse)
    99  		case "/memory/classes/other:bytes":
   100  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys)
   101  		case "/memory/classes/profiling/buckets:bytes":
   102  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys)
   103  		case "/memory/classes/total:bytes":
   104  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys)
   105  		case "/gc/heap/allocs-by-size:bytes":
   106  			hist := samples[i].Value.Float64Histogram()
   107  			// Skip size class 0 in BySize, because it's always empty and not represented
   108  			// in the histogram.
   109  			for i, sc := range mstats.BySize[1:] {
   110  				if b, s := hist.Buckets[i+1], float64(sc.Size+1); b != s {
   111  					t.Errorf("bucket does not match size class: got %f, want %f", b, s)
   112  					// The rest of the checks aren't expected to work anyway.
   113  					continue
   114  				}
   115  				if c, m := hist.Counts[i], sc.Mallocs; c != m {
    116  					t.Errorf("histogram counts do not match BySize for class %d: got %d, want %d", i, c, m)
   117  				}
   118  			}
   119  			allocsBySize = hist
   120  		case "/gc/heap/allocs:bytes":
   121  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc)
   122  		case "/gc/heap/frees-by-size:bytes":
   123  			hist := samples[i].Value.Float64Histogram()
   124  			// Skip size class 0 in BySize, because it's always empty and not represented
   125  			// in the histogram.
   126  			for i, sc := range mstats.BySize[1:] {
   127  				if b, s := hist.Buckets[i+1], float64(sc.Size+1); b != s {
   128  					t.Errorf("bucket does not match size class: got %f, want %f", b, s)
   129  					// The rest of the checks aren't expected to work anyway.
   130  					continue
   131  				}
   132  				if c, f := hist.Counts[i], sc.Frees; c != f {
   133  					t.Errorf("histogram counts do not match BySize for class %d: got %d, want %d", i, c, f)
   134  				}
   135  			}
   136  		case "/gc/heap/frees:bytes":
   137  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc-mstats.HeapAlloc)
   138  		case "/gc/heap/tiny/allocs:objects":
   139  			// Currently, MemStats adds tiny alloc count to both Mallocs AND Frees.
    140  			// The reason for this is that MemStats couldn't be extended at the time,
   141  			// but there was a desire to have Mallocs at least be a little more representative,
   142  			// while having Mallocs - Frees still represent a live object count.
   143  			// Unfortunately, MemStats doesn't actually export a large allocation count,
   144  			// so it's impossible to pull this number out directly.
   145  			//
   146  			// Check tiny allocation count outside of this loop, by using the allocs-by-size
   147  			// histogram in order to figure out how many large objects there are.
   148  			tinyAllocs = samples[i].Value.Uint64()
   149  			// Because the next two metrics tests are checking against Mallocs and Frees,
   150  			// we can't check them directly for the same reason: we need to account for tiny
   151  			// allocations included in Mallocs and Frees.
   152  		case "/gc/heap/allocs:objects":
   153  			mallocs = samples[i].Value.Uint64()
   154  		case "/gc/heap/frees:objects":
   155  			frees = samples[i].Value.Uint64()
   156  		case "/gc/heap/live:bytes":
   157  			// Check for "obviously wrong" values. We can't check a stronger invariant,
   158  			// such as live <= HeapAlloc, because live is not 100% accurate. It's computed
   159  			// under racy conditions, and some objects may be double-counted (this is
   160  			// intentional and necessary for GC performance).
   161  			//
   162  			// Instead, check against a much more reasonable upper-bound: the amount of
   163  			// mapped heap memory. We can't possibly overcount to the point of exceeding
   164  			// total mapped heap memory, except if there's an accounting bug.
   165  			if live := samples[i].Value.Uint64(); live > mstats.HeapSys {
   166  				t.Errorf("live bytes: %d > heap sys: %d", live, mstats.HeapSys)
   167  			} else if live == 0 {
   168  				// Might happen if we don't call runtime.GC() above.
   169  				t.Error("live bytes is 0")
   170  			}
   171  		case "/gc/gomemlimit:bytes":
   172  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(limit))
   173  		case "/gc/heap/objects:objects":
   174  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
   175  		case "/gc/heap/goal:bytes":
   176  			checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC)
   177  		case "/gc/gogc:percent":
   178  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(gcPercent))
   179  		case "/gc/cycles/automatic:gc-cycles":
   180  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC))
   181  		case "/gc/cycles/forced:gc-cycles":
   182  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC))
   183  		case "/gc/cycles/total:gc-cycles":
   184  			checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
   185  		case "/gc/pauses:seconds":
   186  			gcPauses = samples[i].Value.Float64Histogram()
   187  		case "/sched/pauses/total/gc:seconds":
   188  			schedPausesTotalGC = samples[i].Value.Float64Histogram()
   189  		}
   190  	}
   191  
   192  	// Check tinyAllocs.
   193  	nonTinyAllocs := uint64(0)
   194  	for _, c := range allocsBySize.Counts {
   195  		nonTinyAllocs += c
   196  	}
   197  	checkUint64(t, "/gc/heap/tiny/allocs:objects", tinyAllocs, mstats.Mallocs-nonTinyAllocs)
   198  
   199  	// Check allocation and free counts.
   200  	checkUint64(t, "/gc/heap/allocs:objects", mallocs, mstats.Mallocs-tinyAllocs)
   201  	checkUint64(t, "/gc/heap/frees:objects", frees, mstats.Frees-tinyAllocs)
   202  
   203  	// Verify that /gc/pauses:seconds is a copy of /sched/pauses/total/gc:seconds
   204  	if !reflect.DeepEqual(gcPauses.Buckets, schedPausesTotalGC.Buckets) {
    205  		t.Errorf("/gc/pauses:seconds buckets %v do not match /sched/pauses/total/gc:seconds buckets %v", gcPauses.Buckets, schedPausesTotalGC.Buckets)
   206  	}
   207  	if !reflect.DeepEqual(gcPauses.Counts, schedPausesTotalGC.Counts) {
   208  		t.Errorf("/gc/pauses:seconds counts %v do not match /sched/pauses/total/gc:seconds counts %v", gcPauses.Counts, schedPausesTotalGC.Counts)
   209  	}
   210  }
   211  
   212  func TestReadMetricsConsistency(t *testing.T) {
   213  	// Tests whether readMetrics produces consistent, sensible values.
   214  	// The values are read concurrently with the runtime doing other
    215  	// things (e.g. allocating), so what we read can't reasonably be compared
   216  	// to other runtime values (e.g. MemStats).
   217  
   218  	// Run a few GC cycles to get some of the stats to be non-zero.
   219  	runtime.GC()
   220  	runtime.GC()
   221  	runtime.GC()
   222  
   223  	// Set GOMAXPROCS high then sleep briefly to ensure we generate
   224  	// some idle time.
   225  	oldmaxprocs := runtime.GOMAXPROCS(10)
   226  	time.Sleep(time.Millisecond)
   227  	runtime.GOMAXPROCS(oldmaxprocs)
   228  
   229  	// Read all the supported metrics through the metrics package.
   230  	descs, samples := prepareAllMetricsSamples()
   231  	metrics.Read(samples)
   232  
   233  	// Check to make sure the values we read make sense.
   234  	var totalVirtual struct {
   235  		got, want uint64
   236  	}
   237  	var objects struct {
   238  		alloc, free             *metrics.Float64Histogram
   239  		allocs, frees           uint64
   240  		allocdBytes, freedBytes uint64
   241  		total, totalBytes       uint64
   242  	}
   243  	var gc struct {
   244  		numGC  uint64
   245  		pauses uint64
   246  	}
   247  	var totalScan struct {
   248  		got, want uint64
   249  	}
   250  	var cpu struct {
   251  		gcAssist    float64
   252  		gcDedicated float64
   253  		gcIdle      float64
   254  		gcPause     float64
   255  		gcTotal     float64
   256  
   257  		idle float64
   258  		user float64
   259  
   260  		scavengeAssist float64
   261  		scavengeBg     float64
   262  		scavengeTotal  float64
   263  
   264  		total float64
   265  	}
   266  	for i := range samples {
   267  		kind := samples[i].Value.Kind()
   268  		if want := descs[samples[i].Name].Kind; kind != want {
   269  			t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want)
   270  			continue
   271  		}
   272  		if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") {
   273  			v := samples[i].Value.Uint64()
   274  			totalVirtual.want += v
   275  
   276  			// None of these stats should ever get this big.
   277  			// If they do, there's probably overflow involved,
   278  			// usually due to bad accounting.
   279  			if int64(v) < 0 {
   280  				t.Errorf("%q has high/negative value: %d", samples[i].Name, v)
   281  			}
   282  		}
   283  		switch samples[i].Name {
   284  		case "/cpu/classes/gc/mark/assist:cpu-seconds":
   285  			cpu.gcAssist = samples[i].Value.Float64()
   286  		case "/cpu/classes/gc/mark/dedicated:cpu-seconds":
   287  			cpu.gcDedicated = samples[i].Value.Float64()
   288  		case "/cpu/classes/gc/mark/idle:cpu-seconds":
   289  			cpu.gcIdle = samples[i].Value.Float64()
   290  		case "/cpu/classes/gc/pause:cpu-seconds":
   291  			cpu.gcPause = samples[i].Value.Float64()
   292  		case "/cpu/classes/gc/total:cpu-seconds":
   293  			cpu.gcTotal = samples[i].Value.Float64()
   294  		case "/cpu/classes/idle:cpu-seconds":
   295  			cpu.idle = samples[i].Value.Float64()
   296  		case "/cpu/classes/scavenge/assist:cpu-seconds":
   297  			cpu.scavengeAssist = samples[i].Value.Float64()
   298  		case "/cpu/classes/scavenge/background:cpu-seconds":
   299  			cpu.scavengeBg = samples[i].Value.Float64()
   300  		case "/cpu/classes/scavenge/total:cpu-seconds":
   301  			cpu.scavengeTotal = samples[i].Value.Float64()
   302  		case "/cpu/classes/total:cpu-seconds":
   303  			cpu.total = samples[i].Value.Float64()
   304  		case "/cpu/classes/user:cpu-seconds":
   305  			cpu.user = samples[i].Value.Float64()
   306  		case "/memory/classes/total:bytes":
   307  			totalVirtual.got = samples[i].Value.Uint64()
   308  		case "/memory/classes/heap/objects:bytes":
   309  			objects.totalBytes = samples[i].Value.Uint64()
   310  		case "/gc/heap/objects:objects":
   311  			objects.total = samples[i].Value.Uint64()
   312  		case "/gc/heap/allocs:bytes":
   313  			objects.allocdBytes = samples[i].Value.Uint64()
   314  		case "/gc/heap/allocs:objects":
   315  			objects.allocs = samples[i].Value.Uint64()
   316  		case "/gc/heap/allocs-by-size:bytes":
   317  			objects.alloc = samples[i].Value.Float64Histogram()
   318  		case "/gc/heap/frees:bytes":
   319  			objects.freedBytes = samples[i].Value.Uint64()
   320  		case "/gc/heap/frees:objects":
   321  			objects.frees = samples[i].Value.Uint64()
   322  		case "/gc/heap/frees-by-size:bytes":
   323  			objects.free = samples[i].Value.Float64Histogram()
   324  		case "/gc/cycles:gc-cycles":
   325  			gc.numGC = samples[i].Value.Uint64()
   326  		case "/gc/pauses:seconds":
   327  			h := samples[i].Value.Float64Histogram()
   328  			gc.pauses = 0
   329  			for i := range h.Counts {
   330  				gc.pauses += h.Counts[i]
   331  			}
   332  		case "/gc/scan/heap:bytes":
   333  			totalScan.want += samples[i].Value.Uint64()
   334  		case "/gc/scan/globals:bytes":
   335  			totalScan.want += samples[i].Value.Uint64()
   336  		case "/gc/scan/stack:bytes":
   337  			totalScan.want += samples[i].Value.Uint64()
   338  		case "/gc/scan/total:bytes":
   339  			totalScan.got = samples[i].Value.Uint64()
   340  		case "/sched/gomaxprocs:threads":
   341  			if got, want := samples[i].Value.Uint64(), uint64(runtime.GOMAXPROCS(-1)); got != want {
   342  				t.Errorf("gomaxprocs doesn't match runtime.GOMAXPROCS: got %d, want %d", got, want)
   343  			}
   344  		case "/sched/goroutines:goroutines":
   345  			if samples[i].Value.Uint64() < 1 {
   346  				t.Error("number of goroutines is less than one")
   347  			}
   348  		}
   349  	}
   350  	// Only check this on Linux where we can be reasonably sure we have a high-resolution timer.
   351  	if runtime.GOOS == "linux" {
   352  		if cpu.gcDedicated <= 0 && cpu.gcAssist <= 0 && cpu.gcIdle <= 0 {
   353  			t.Errorf("found no time spent on GC work: %#v", cpu)
   354  		}
   355  		if cpu.gcPause <= 0 {
   356  			t.Errorf("found no GC pauses: %f", cpu.gcPause)
   357  		}
   358  		if cpu.idle <= 0 {
   359  			t.Errorf("found no idle time: %f", cpu.idle)
   360  		}
   361  		if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.001) {
    362  			t.Errorf("calculated total GC CPU time not within 0.1%% of total: %f vs. %f", total, cpu.gcTotal)
   363  		}
   364  		if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.001) {
    365  			t.Errorf("calculated total scavenge CPU not within 0.1%% of total: %f vs. %f", total, cpu.scavengeTotal)
   366  		}
   367  		if cpu.total <= 0 {
   368  			t.Errorf("found no total CPU time passed")
   369  		}
   370  		if cpu.user <= 0 {
   371  			t.Errorf("found no user time passed")
   372  		}
   373  		if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.001) {
    374  			t.Errorf("calculated total CPU not within 0.1%% of total: %f vs. %f", total, cpu.total)
   375  		}
   376  	}
   377  	if totalVirtual.got != totalVirtual.want {
   378  		t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
   379  	}
   380  	if got, want := objects.allocs-objects.frees, objects.total; got != want {
   381  		t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
   382  	}
   383  	if got, want := objects.allocdBytes-objects.freedBytes, objects.totalBytes; got != want {
   384  		t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
   385  	}
   386  	if b, c := len(objects.alloc.Buckets), len(objects.alloc.Counts); b != c+1 {
   387  		t.Errorf("allocs-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
   388  	}
   389  	if b, c := len(objects.free.Buckets), len(objects.free.Counts); b != c+1 {
   390  		t.Errorf("frees-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
   391  	}
   392  	if len(objects.alloc.Buckets) != len(objects.free.Buckets) {
   393  		t.Error("allocs-by-size and frees-by-size buckets don't match in length")
   394  	} else if len(objects.alloc.Counts) != len(objects.free.Counts) {
   395  		t.Error("allocs-by-size and frees-by-size counts don't match in length")
   396  	} else {
   397  		for i := range objects.alloc.Buckets {
   398  			ba := objects.alloc.Buckets[i]
   399  			bf := objects.free.Buckets[i]
   400  			if ba != bf {
   401  				t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf)
   402  			}
   403  		}
   404  		if !t.Failed() {
   405  			var gotAlloc, gotFree uint64
   406  			want := objects.total
   407  			for i := range objects.alloc.Counts {
   408  				if objects.alloc.Counts[i] < objects.free.Counts[i] {
   409  					t.Errorf("found more allocs than frees in object dist bucket %d", i)
   410  					continue
   411  				}
   412  				gotAlloc += objects.alloc.Counts[i]
   413  				gotFree += objects.free.Counts[i]
   414  			}
   415  			if got := gotAlloc - gotFree; got != want {
   416  				t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
   417  			}
   418  			if gotAlloc != objects.allocs {
   419  				t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotAlloc, objects.allocs)
   420  			}
   421  			if gotFree != objects.frees {
    422  				t.Errorf("object distribution counts don't match total frees: got %d, want %d", gotFree, objects.frees)
   423  			}
   424  		}
   425  	}
    426  	// The current GC performs at least 2 pauses per cycle.
    427  	// Check that the observed pause count is consistent with that.
   428  	if gc.pauses < gc.numGC*2 {
   429  		t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2)
   430  	}
   431  	if totalScan.got <= 0 {
   432  		t.Errorf("scannable GC space is empty: %d", totalScan.got)
   433  	}
   434  	if totalScan.got != totalScan.want {
   435  		t.Errorf("/gc/scan/total:bytes doesn't line up with sum of /gc/scan*: total %d vs. sum %d", totalScan.got, totalScan.want)
   436  	}
   437  }
   438  
   439  func BenchmarkReadMetricsLatency(b *testing.B) {
   440  	stop := applyGCLoad(b)
   441  
    442  	// Collect the latency of each metrics.Read call below.
   443  	latencies := make([]time.Duration, 0, 1024)
   444  	_, samples := prepareAllMetricsSamples()
   445  
   446  	// Hit metrics.Read continuously and measure.
   447  	b.ResetTimer()
   448  	for i := 0; i < b.N; i++ {
   449  		start := time.Now()
   450  		metrics.Read(samples)
   451  		latencies = append(latencies, time.Since(start))
   452  	}
   453  	// Make sure to stop the timer before we wait! The load created above
   454  	// is very heavy-weight and not easy to stop, so we could end up
   455  	// confusing the benchmarking framework for small b.N.
   456  	b.StopTimer()
   457  	stop()
   458  
   459  	// Disable the default */op metrics.
    460  	// ns/op doesn't mean anything here because it's an average, and the
    461  	// heavy GC load running in the background skews it significantly.
   462  	b.ReportMetric(0, "ns/op")
   463  	b.ReportMetric(0, "B/op")
   464  	b.ReportMetric(0, "allocs/op")
   465  
   466  	// Sort latencies then report percentiles.
   467  	sort.Slice(latencies, func(i, j int) bool {
   468  		return latencies[i] < latencies[j]
   469  	})
   470  	b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns")
   471  	b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
   472  	b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
   473  }
   474  
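         // readMetricsSink is a package-level sink for the background allocations in
         // TestReadMetricsCumulative, ensuring they are not optimized away.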
   475  var readMetricsSink [1024]interface{}
   476  
   477  func TestReadMetricsCumulative(t *testing.T) {
   478  	// Set up the set of metrics marked cumulative.
   479  	descs := metrics.All()
   480  	var samples [2][]metrics.Sample
   481  	samples[0] = make([]metrics.Sample, len(descs))
   482  	samples[1] = make([]metrics.Sample, len(descs))
   483  	total := 0
   484  	for i := range samples[0] {
   485  		if !descs[i].Cumulative {
   486  			continue
   487  		}
   488  		samples[0][total].Name = descs[i].Name
   489  		total++
   490  	}
   491  	samples[0] = samples[0][:total]
   492  	samples[1] = samples[1][:total]
   493  	copy(samples[1], samples[0])
   494  
   495  	// Start some noise in the background.
   496  	var wg sync.WaitGroup
   497  	wg.Add(1)
   498  	done := make(chan struct{})
   499  	go func() {
   500  		defer wg.Done()
   501  		for {
   502  			// Add more things here that could influence metrics.
   503  			for i := 0; i < len(readMetricsSink); i++ {
   504  				readMetricsSink[i] = make([]byte, 1024)
   505  				select {
   506  				case <-done:
   507  					return
   508  				default:
   509  				}
   510  			}
   511  			runtime.GC()
   512  		}
   513  	}()
   514  
   515  	sum := func(us []uint64) uint64 {
   516  		total := uint64(0)
   517  		for _, u := range us {
   518  			total += u
   519  		}
   520  		return total
   521  	}
   522  
   523  	// Populate the first generation.
   524  	metrics.Read(samples[0])
   525  
   526  	// Check to make sure that these metrics only grow monotonically.
   527  	for gen := 1; gen < 10; gen++ {
   528  		metrics.Read(samples[gen%2])
   529  		for i := range samples[gen%2] {
   530  			name := samples[gen%2][i].Name
   531  			vNew, vOld := samples[gen%2][i].Value, samples[1-(gen%2)][i].Value
   532  
   533  			switch vNew.Kind() {
   534  			case metrics.KindUint64:
   535  				new := vNew.Uint64()
   536  				old := vOld.Uint64()
   537  				if new < old {
   538  					t.Errorf("%s decreased: %d < %d", name, new, old)
   539  				}
   540  			case metrics.KindFloat64:
   541  				new := vNew.Float64()
   542  				old := vOld.Float64()
   543  				if new < old {
   544  					t.Errorf("%s decreased: %f < %f", name, new, old)
   545  				}
   546  			case metrics.KindFloat64Histogram:
   547  				new := sum(vNew.Float64Histogram().Counts)
   548  				old := sum(vOld.Float64Histogram().Counts)
   549  				if new < old {
   550  					t.Errorf("%s counts decreased: %d < %d", name, new, old)
   551  				}
   552  			}
   553  		}
   554  	}
   555  	close(done)
   556  
   557  	wg.Wait()
   558  }
   559  
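         // withinEpsilon reports whether v1 is within a relative tolerance e of v2.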
   560  func withinEpsilon(v1, v2, e float64) bool {
   561  	return v2-v2*e <= v1 && v1 <= v2+v2*e
   562  }
   563  
   564  func TestMutexWaitTimeMetric(t *testing.T) {
   565  	var sample [1]metrics.Sample
   566  	sample[0].Name = "/sync/mutex/wait/total:seconds"
   567  
   568  	locks := []locker2{
   569  		new(mutex),
   570  		new(rwmutexWrite),
   571  		new(rwmutexReadWrite),
   572  		new(rwmutexWriteRead),
   573  	}
   574  	for _, lock := range locks {
   575  		t.Run(reflect.TypeOf(lock).Elem().Name(), func(t *testing.T) {
   576  			metrics.Read(sample[:])
   577  			before := time.Duration(sample[0].Value.Float64() * 1e9)
   578  
   579  			minMutexWaitTime := generateMutexWaitTime(lock)
   580  
   581  			metrics.Read(sample[:])
   582  			after := time.Duration(sample[0].Value.Float64() * 1e9)
   583  
   584  			if wt := after - before; wt < minMutexWaitTime {
   585  				t.Errorf("too little mutex wait time: got %s, want %s", wt, minMutexWaitTime)
   586  			}
   587  		})
   588  	}
   589  }
   590  
   591  // locker2 represents an API surface of two concurrent goroutines
   592  // locking the same resource, but through different APIs. It's intended
   593  // to abstract over the relationship of two Lock calls or an RLock
   594  // and a Lock call.
   595  type locker2 interface {
   596  	Lock1()
   597  	Unlock1()
   598  	Lock2()
   599  	Unlock2()
   600  }
   601  
   602  type mutex struct {
   603  	mu sync.Mutex
   604  }
   605  
   606  func (m *mutex) Lock1()   { m.mu.Lock() }
   607  func (m *mutex) Unlock1() { m.mu.Unlock() }
   608  func (m *mutex) Lock2()   { m.mu.Lock() }
   609  func (m *mutex) Unlock2() { m.mu.Unlock() }
   610  
   611  type rwmutexWrite struct {
   612  	mu sync.RWMutex
   613  }
   614  
   615  func (m *rwmutexWrite) Lock1()   { m.mu.Lock() }
   616  func (m *rwmutexWrite) Unlock1() { m.mu.Unlock() }
   617  func (m *rwmutexWrite) Lock2()   { m.mu.Lock() }
   618  func (m *rwmutexWrite) Unlock2() { m.mu.Unlock() }
   619  
   620  type rwmutexReadWrite struct {
   621  	mu sync.RWMutex
   622  }
   623  
   624  func (m *rwmutexReadWrite) Lock1()   { m.mu.RLock() }
   625  func (m *rwmutexReadWrite) Unlock1() { m.mu.RUnlock() }
   626  func (m *rwmutexReadWrite) Lock2()   { m.mu.Lock() }
   627  func (m *rwmutexReadWrite) Unlock2() { m.mu.Unlock() }
   628  
   629  type rwmutexWriteRead struct {
   630  	mu sync.RWMutex
   631  }
   632  
   633  func (m *rwmutexWriteRead) Lock1()   { m.mu.Lock() }
   634  func (m *rwmutexWriteRead) Unlock1() { m.mu.Unlock() }
   635  func (m *rwmutexWriteRead) Lock2()   { m.mu.RLock() }
   636  func (m *rwmutexWriteRead) Unlock2() { m.mu.RUnlock() }
   637  
    638  // generateMutexWaitTime causes one goroutine to block on the provided
    639  // locker2 while another goroutine holds it, returning the minimum
    640  // amount of wait time that should be visible in the
    641  // /sync/mutex/wait/total:seconds metric.
   642  func generateMutexWaitTime(mu locker2) time.Duration {
   643  	// Set up the runtime to always track casgstatus transitions for metrics.
   644  	*runtime.CasGStatusAlwaysTrack = true
   645  
   646  	mu.Lock1()
   647  
   648  	// Start up a goroutine to wait on the lock.
   649  	gc := make(chan *runtime.G)
   650  	done := make(chan bool)
   651  	go func() {
   652  		gc <- runtime.Getg()
   653  
   654  		for {
   655  			mu.Lock2()
   656  			mu.Unlock2()
   657  			if <-done {
   658  				return
   659  			}
   660  		}
   661  	}()
   662  	gp := <-gc
   663  
   664  	// Set the block time high enough so that it will always show up, even
   665  	// on systems with coarse timer granularity.
   666  	const blockTime = 100 * time.Millisecond
   667  
   668  	// Make sure the goroutine spawned above actually blocks on the lock.
   669  	for {
   670  		if runtime.GIsWaitingOnMutex(gp) {
   671  			break
   672  		}
   673  		runtime.Gosched()
   674  	}
   675  
   676  	// Let some amount of time pass.
   677  	time.Sleep(blockTime)
   678  
   679  	// Let the other goroutine acquire the lock.
   680  	mu.Unlock1()
   681  	done <- true
   682  
   683  	// Reset flag.
   684  	*runtime.CasGStatusAlwaysTrack = false
   685  	return blockTime
   686  }
   687  
   688  // See issue #60276.
   689  func TestCPUMetricsSleep(t *testing.T) {
   690  	if runtime.GOOS == "wasip1" {
   691  		// Since wasip1 busy-waits in the scheduler, there's no meaningful idle
   692  		// time. This is accurately reflected in the metrics, but it means this
   693  		// test is basically meaningless on this platform.
   694  		t.Skip("wasip1 currently busy-waits in idle time; test not applicable")
   695  	}
   696  
   697  	names := []string{
   698  		"/cpu/classes/idle:cpu-seconds",
   699  
   700  		"/cpu/classes/gc/mark/assist:cpu-seconds",
   701  		"/cpu/classes/gc/mark/dedicated:cpu-seconds",
   702  		"/cpu/classes/gc/mark/idle:cpu-seconds",
   703  		"/cpu/classes/gc/pause:cpu-seconds",
   704  		"/cpu/classes/gc/total:cpu-seconds",
   705  		"/cpu/classes/scavenge/assist:cpu-seconds",
   706  		"/cpu/classes/scavenge/background:cpu-seconds",
   707  		"/cpu/classes/scavenge/total:cpu-seconds",
   708  		"/cpu/classes/total:cpu-seconds",
   709  		"/cpu/classes/user:cpu-seconds",
   710  	}
   711  	prep := func() []metrics.Sample {
   712  		mm := make([]metrics.Sample, len(names))
   713  		for i := range names {
   714  			mm[i].Name = names[i]
   715  		}
   716  		return mm
   717  	}
   718  	m1, m2 := prep(), prep()
   719  
   720  	const (
   721  		// Expected time spent idle.
   722  		dur = 100 * time.Millisecond
   723  
    724  		// maxFailures is the number of consecutive failures required to cause the test to fail.
   725  		maxFailures = 10
   726  	)
   727  
   728  	failureIdleTimes := make([]float64, 0, maxFailures)
   729  
   730  	// If the bug we expect is happening, then the Sleep CPU time will be accounted for
   731  	// as user time rather than idle time. In an ideal world we'd expect the whole application
   732  	// to go instantly idle the moment this goroutine goes to sleep, and stay asleep for that
   733  	// duration. However, the Go runtime can easily eat into idle time while this goroutine is
    734  	// blocked in a sleep. For example, slow platforms might spend more time than expected in the
   735  	// scheduler. Another example is that a Go runtime background goroutine could run while
   736  	// everything else is idle. Lastly, if a running goroutine is descheduled by the OS, enough
   737  	// time may pass such that the goroutine is ready to wake, even though the runtime couldn't
   738  	// observe itself as idle with nanotime.
   739  	//
   740  	// To deal with all this, we give a half-proc's worth of leniency.
   741  	//
   742  	// We also retry multiple times to deal with the fact that the OS might deschedule us before
   743  	// we yield and go idle. That has a rare enough chance that retries should resolve it.
   744  	// If the issue we expect is happening, it should be persistent.
   745  	minIdleCPUSeconds := dur.Seconds() * (float64(runtime.GOMAXPROCS(-1)) - 0.5)
   746  
   747  	// Let's make sure there's no background scavenge work to do.
   748  	//
   749  	// The runtime.GC calls below ensure the background sweeper
   750  	// will not run during the idle period.
   751  	debug.FreeOSMemory()
   752  
   753  	for retries := 0; retries < maxFailures; retries++ {
   754  		// Read 1.
   755  		runtime.GC() // Update /cpu/classes metrics.
   756  		metrics.Read(m1)
   757  
   758  		// Sleep.
   759  		time.Sleep(dur)
   760  
   761  		// Read 2.
   762  		runtime.GC() // Update /cpu/classes metrics.
   763  		metrics.Read(m2)
   764  
   765  		dt := m2[0].Value.Float64() - m1[0].Value.Float64()
   766  		if dt >= minIdleCPUSeconds {
   767  			// All is well. Test passed.
   768  			return
   769  		}
   770  		failureIdleTimes = append(failureIdleTimes, dt)
   771  		// Try again.
   772  	}
   773  
   774  	// We couldn't observe the expected idle time even once.
   775  	for i, dt := range failureIdleTimes {
   776  		t.Logf("try %2d: idle time = %.5fs\n", i+1, dt)
   777  	}
   778  	t.Logf("try %d breakdown:\n", len(failureIdleTimes))
   779  	for i := range names {
   780  		if m1[i].Value.Kind() == metrics.KindBad {
   781  			continue
   782  		}
   783  		t.Logf("\t%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64())
   784  	}
   785  	t.Errorf(`time.Sleep did not contribute enough to "idle" class: minimum idle time = %.5fs`, minIdleCPUSeconds)
   786  }
   787  
    788  // testSchedPauseMetrics calls fn and verifies that the correct STW metrics increment.
    789  // If isGC is true, fn is expected to trigger a GC STW; otherwise, an "other" (non-GC) STW.
   790  func testSchedPauseMetrics(t *testing.T, fn func(t *testing.T), isGC bool) {
   791  	m := []metrics.Sample{
   792  		{Name: "/sched/pauses/stopping/gc:seconds"},
   793  		{Name: "/sched/pauses/stopping/other:seconds"},
   794  		{Name: "/sched/pauses/total/gc:seconds"},
   795  		{Name: "/sched/pauses/total/other:seconds"},
   796  	}
   797  
   798  	stoppingGC := &m[0]
   799  	stoppingOther := &m[1]
   800  	totalGC := &m[2]
   801  	totalOther := &m[3]
   802  
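         	// sampleCount returns the total number of observations recorded in a
         	// pause-time histogram sample.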
   803  	sampleCount := func(s *metrics.Sample) uint64 {
   804  		h := s.Value.Float64Histogram()
   805  
   806  		var n uint64
   807  		for _, c := range h.Counts {
   808  			n += c
   809  		}
   810  		return n
   811  	}
   812  
   813  	// Read baseline.
   814  	metrics.Read(m)
   815  
   816  	baselineStartGC := sampleCount(stoppingGC)
   817  	baselineStartOther := sampleCount(stoppingOther)
   818  	baselineTotalGC := sampleCount(totalGC)
   819  	baselineTotalOther := sampleCount(totalOther)
   820  
   821  	fn(t)
   822  
   823  	metrics.Read(m)
   824  
   825  	if isGC {
   826  		if got := sampleCount(stoppingGC); got <= baselineStartGC {
   827  			t.Errorf("/sched/pauses/stopping/gc:seconds sample count %d did not increase from baseline of %d", got, baselineStartGC)
   828  		}
   829  		if got := sampleCount(totalGC); got <= baselineTotalGC {
   830  			t.Errorf("/sched/pauses/total/gc:seconds sample count %d did not increase from baseline of %d", got, baselineTotalGC)
   831  		}
   832  
   833  		if got := sampleCount(stoppingOther); got != baselineStartOther {
   834  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d changed from baseline of %d", got, baselineStartOther)
   835  		}
   836  		if got := sampleCount(totalOther); got != baselineTotalOther {
    837  			t.Errorf("/sched/pauses/total/other:seconds sample count %d changed from baseline of %d", got, baselineTotalOther)
   838  		}
   839  	} else {
   840  		if got := sampleCount(stoppingGC); got != baselineStartGC {
   841  			t.Errorf("/sched/pauses/stopping/gc:seconds sample count %d changed from baseline of %d", got, baselineStartGC)
   842  		}
   843  		if got := sampleCount(totalGC); got != baselineTotalGC {
   844  			t.Errorf("/sched/pauses/total/gc:seconds sample count %d changed from baseline of %d", got, baselineTotalGC)
   845  		}
   846  
   847  		if got := sampleCount(stoppingOther); got <= baselineStartOther {
   848  			t.Errorf("/sched/pauses/stopping/other:seconds sample count %d did not increase from baseline of %d", got, baselineStartOther)
   849  		}
   850  		if got := sampleCount(totalOther); got <= baselineTotalOther {
    851  			t.Errorf("/sched/pauses/total/other:seconds sample count %d did not increase from baseline of %d", got, baselineTotalOther)
   852  		}
   853  	}
   854  }
   855  
   856  func TestSchedPauseMetrics(t *testing.T) {
   857  	tests := []struct {
   858  		name string
   859  		isGC bool
   860  		fn   func(t *testing.T)
   861  	}{
   862  		{
   863  			name: "runtime.GC",
   864  			isGC: true,
   865  			fn: func(t *testing.T) {
   866  				runtime.GC()
   867  			},
   868  		},
   869  		{
   870  			name: "runtime.GOMAXPROCS",
   871  			fn: func(t *testing.T) {
   872  				if runtime.GOARCH == "wasm" {
   873  					t.Skip("GOMAXPROCS >1 not supported on wasm")
   874  				}
   875  
   876  				n := runtime.GOMAXPROCS(0)
   877  				defer runtime.GOMAXPROCS(n)
   878  
   879  				runtime.GOMAXPROCS(n + 1)
   880  			},
   881  		},
   882  		{
   883  			name: "runtime.GoroutineProfile",
   884  			fn: func(t *testing.T) {
   885  				var s [1]runtime.StackRecord
   886  				runtime.GoroutineProfile(s[:])
   887  			},
   888  		},
   889  		{
   890  			name: "runtime.ReadMemStats",
   891  			fn: func(t *testing.T) {
   892  				var mstats runtime.MemStats
   893  				runtime.ReadMemStats(&mstats)
   894  			},
   895  		},
   896  		{
   897  			name: "runtime.Stack",
   898  			fn: func(t *testing.T) {
   899  				var b [64]byte
   900  				runtime.Stack(b[:], true)
   901  			},
   902  		},
   903  		{
   904  			name: "runtime/debug.WriteHeapDump",
   905  			fn: func(t *testing.T) {
   906  				if runtime.GOOS == "js" {
   907  					t.Skip("WriteHeapDump not supported on js")
   908  				}
   909  
   910  				f, err := os.CreateTemp(t.TempDir(), "heapdumptest")
   911  				if err != nil {
   912  					t.Fatalf("os.CreateTemp failed: %v", err)
   913  				}
   914  				defer os.Remove(f.Name())
   915  				defer f.Close()
   916  				debug.WriteHeapDump(f.Fd())
   917  			},
   918  		},
   919  		{
   920  			name: "runtime/trace.Start",
   921  			fn: func(t *testing.T) {
   922  				if trace.IsEnabled() {
   923  					t.Skip("tracing already enabled")
   924  				}
   925  
   926  				var buf bytes.Buffer
   927  				if err := trace.Start(&buf); err != nil {
   928  					t.Errorf("trace.Start err got %v want nil", err)
   929  				}
   930  				trace.Stop()
   931  			},
   932  		},
   933  	}
   934  
   935  	// These tests count STW pauses, classified based on whether they're related
   936  	// to the GC or not. Disable automatic GC cycles during the test so we don't
   937  	// have an incidental GC pause when we're trying to observe only
   938  	// non-GC-related pauses. This is especially important for the
   939  	// runtime/trace.Start test, since (as of this writing) that will block
   940  	// until any active GC mark phase completes.
   941  	defer debug.SetGCPercent(debug.SetGCPercent(-1))
   942  	runtime.GC()
   943  
   944  	for _, tc := range tests {
   945  		t.Run(tc.name, func(t *testing.T) {
   946  			testSchedPauseMetrics(t, tc.fn, tc.isGC)
   947  		})
   948  	}
   949  }
   950  
   951  func TestRuntimeLockMetricsAndProfile(t *testing.T) {
   952  	old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
   953  	defer runtime.SetMutexProfileFraction(old)
   954  	if old != 0 {
    955  		t.Fatalf("need MutexProfileFraction 0, got %d", old)
   956  	}
   957  
   958  	{
   959  		before := os.Getenv("GODEBUG")
   960  		for _, s := range strings.Split(before, ",") {
   961  			if strings.HasPrefix(s, "runtimecontentionstacks=") {
   962  				t.Logf("GODEBUG includes explicit setting %q", s)
   963  			}
   964  		}
   965  		defer func() { os.Setenv("GODEBUG", before) }()
   966  		os.Setenv("GODEBUG", fmt.Sprintf("%s,runtimecontentionstacks=1", before))
   967  	}
   968  
   969  	t.Logf("NumCPU %d", runtime.NumCPU())
   970  	t.Logf("GOMAXPROCS %d", runtime.GOMAXPROCS(0))
   971  	if minCPU := 2; runtime.NumCPU() < minCPU {
   972  		t.Skipf("creating and observing contention on runtime-internal locks requires NumCPU >= %d", minCPU)
   973  	}
   974  
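         	// loadProfile reads and parses the current mutex profile.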
   975  	loadProfile := func(t *testing.T) *profile.Profile {
   976  		var w bytes.Buffer
   977  		pprof.Lookup("mutex").WriteTo(&w, 0)
   978  		p, err := profile.Parse(&w)
   979  		if err != nil {
   980  			t.Fatalf("failed to parse profile: %v", err)
   981  		}
   982  		if err := p.CheckValid(); err != nil {
   983  			t.Fatalf("invalid profile: %v", err)
   984  		}
   985  		return p
   986  	}
   987  
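         	// measureDelta runs fn and reports how much the /sync/mutex/wait/total:seconds
         	// metric and the mutex profile grew across the call, along with a delta profile.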
   988  	measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
   989  		beforeProfile := loadProfile(t)
   990  		beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
   991  		metrics.Read(beforeMetrics)
   992  
   993  		fn()
   994  
   995  		afterProfile := loadProfile(t)
   996  		afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
   997  		metrics.Read(afterMetrics)
   998  
   999  		sumSamples := func(p *profile.Profile, i int) int64 {
  1000  			var sum int64
  1001  			for _, s := range p.Sample {
  1002  				sum += s.Value[i]
  1003  			}
  1004  			return sum
  1005  		}
  1006  
  1007  		metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
  1008  		profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()
  1009  
  1010  		// The internal/profile package does not support compaction; this delta
  1011  		// profile will include separate positive and negative entries.
  1012  		p = afterProfile.Copy()
  1013  		if len(beforeProfile.Sample) > 0 {
  1014  			err := p.Merge(beforeProfile, -1)
  1015  			if err != nil {
  1016  				t.Fatalf("Merge profiles: %v", err)
  1017  			}
  1018  		}
  1019  
  1020  		return metricGrowth, profileGrowth, p
  1021  	}
  1022  
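         	// testcase returns a test body that runs fn on the given number of
         	// contentionWorkers and checks that the resulting contention shows up
         	// in both runtime/metrics and the mutex profile, attributed to one of
         	// the acceptStacks call stacks.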
  1023  	testcase := func(strictTiming bool, acceptStacks [][]string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
  1024  		return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
  1025  			metricGrowth, profileGrowth, p := measureDelta(t, func() {
  1026  				var started, stopped sync.WaitGroup
  1027  				started.Add(workers)
  1028  				stopped.Add(workers)
  1029  				for i := 0; i < workers; i++ {
  1030  					w := &contentionWorker{
  1031  						before: func() {
  1032  							started.Done()
  1033  							started.Wait()
  1034  						},
  1035  						after: func() {
  1036  							stopped.Done()
  1037  						},
  1038  						fn: fn,
  1039  					}
  1040  					go w.run()
  1041  				}
  1042  				stopped.Wait()
  1043  			})
  1044  
  1045  			if profileGrowth == 0 {
  1046  				t.Errorf("no increase in mutex profile")
  1047  			}
  1048  			if metricGrowth == 0 && strictTiming {
  1049  				// If the critical section is very short, systems with low timer
  1050  				// resolution may be unable to measure it via nanotime.
  1051  				t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
  1052  			}
  1053  			// This comparison is possible because the time measurements in support of
  1054  			// runtime/pprof and runtime/metrics for runtime-internal locks are so close
  1055  			// together. It doesn't work as well for user-space contention, where the
  1056  			// involved goroutines are not _Grunnable the whole time and so need to pass
  1057  			// through the scheduler.
  1058  			t.Logf("lock contention growth in runtime/pprof's view  (%fs)", profileGrowth)
  1059  			t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
  1060  
  1061  			acceptStacks = append([][]string(nil), acceptStacks...)
  1062  			for i, stk := range acceptStacks {
  1063  				if goexperiment.StaticLockRanking {
  1064  					if !slices.ContainsFunc(stk, func(s string) bool {
  1065  						return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
  1066  					}) {
  1067  						// stk is a call stack that is still on the user stack when
  1068  						// it calls runtime.unlock. Add the extra function that
  1069  						// we'll see, when the static lock ranking implementation of
  1070  						// runtime.unlockWithRank switches to the system stack.
  1071  						stk = append([]string{"runtime.unlockWithRank"}, stk...)
  1072  					}
  1073  				}
  1074  				acceptStacks[i] = stk
  1075  			}
  1076  
  1077  			var stks [][]string
  1078  			values := make([][2]int64, len(acceptStacks))
  1079  			for _, s := range p.Sample {
  1080  				var have []string
  1081  				for _, loc := range s.Location {
  1082  					for _, line := range loc.Line {
  1083  						have = append(have, line.Function.Name)
  1084  					}
  1085  				}
  1086  				stks = append(stks, have)
  1087  				for i, stk := range acceptStacks {
  1088  					if slices.Equal(have, stk) {
  1089  						values[i][0] += s.Value[0]
  1090  						values[i][1] += s.Value[1]
  1091  					}
  1092  				}
  1093  			}
  1094  			for i, stk := range acceptStacks {
  1095  				n += values[i][0]
  1096  				value += values[i][1]
  1097  				t.Logf("stack %v has samples totaling n=%d value=%d", stk, values[i][0], values[i][1])
  1098  			}
  1099  			if n == 0 && value == 0 {
  1100  				t.Logf("profile:\n%s", p)
  1101  				for _, have := range stks {
  1102  					t.Logf("have stack %v", have)
  1103  				}
  1104  				for _, stk := range acceptStacks {
  1105  					t.Errorf("want stack %v", stk)
  1106  				}
  1107  			}
  1108  
  1109  			return metricGrowth, profileGrowth, n, value
  1110  		}
  1111  	}
  1112  
  1113  	name := t.Name()
  1114  
  1115  	t.Run("runtime.lock", func(t *testing.T) {
  1116  		mus := make([]runtime.Mutex, 100)
  1117  		var needContention atomic.Int64
  1118  		delay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
  1119  		delayMicros := delay.Microseconds()
  1120  
  1121  		// The goroutine that acquires the lock will only proceed when it
  1122  		// detects that its partner is contended for the lock. That will lead to
  1123  		// live-lock if anything (such as a STW) prevents the partner goroutine
  1124  		// from running. Allowing the contention workers to pause and restart
  1125  		// (to allow a STW to proceed) makes it harder to confirm that we're
  1126  		// counting the correct number of contention events, since some locks
  1127  		// will end up contended twice. Instead, disable the GC.
  1128  		defer debug.SetGCPercent(debug.SetGCPercent(-1))
  1129  
  1130  		const workers = 2
  1131  		if runtime.GOMAXPROCS(0) < workers {
  1132  			t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
  1133  		}
  1134  
  1135  		fn := func() bool {
  1136  			n := int(needContention.Load())
  1137  			if n < 0 {
  1138  				return false
  1139  			}
  1140  			mu := &mus[n]
  1141  
  1142  			runtime.Lock(mu)
  1143  			for int(needContention.Load()) == n {
  1144  				if runtime.MutexContended(mu) {
  1145  					// make them wait a little while
  1146  					for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
  1147  						runtime.Usleep(uint32(delayMicros))
  1148  					}
  1149  					break
  1150  				}
  1151  			}
  1152  			runtime.Unlock(mu)
  1153  			needContention.Store(int64(n - 1))
  1154  
  1155  			return true
  1156  		}
  1157  
  1158  		stks := [][]string{{
  1159  			"runtime.unlock",
  1160  			"runtime_test." + name + ".func5.1",
  1161  			"runtime_test.(*contentionWorker).run",
  1162  		}}
  1163  
  1164  		t.Run("sample-1", func(t *testing.T) {
  1165  			old := runtime.SetMutexProfileFraction(1)
  1166  			defer runtime.SetMutexProfileFraction(old)
  1167  
  1168  			needContention.Store(int64(len(mus) - 1))
  1169  			metricGrowth, profileGrowth, n, _ := testcase(true, stks, workers, fn)(t)
  1170  
  1171  			if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
  1172  				// The test imposes a delay with usleep, verified with calls to
  1173  				// nanotime. Compare against the runtime/metrics package's view
  1174  				// (based on nanotime) rather than runtime/pprof's view (based
  1175  				// on cputicks).
  1176  				t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
  1177  			}
  1178  			if have, want := n, int64(len(mus)); have != want {
  1179  				t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
  1180  			}
  1181  
  1182  			const slop = 1.5 // account for nanotime vs cputicks
  1183  			t.Run("compare timers", func(t *testing.T) {
  1184  				testenv.SkipFlaky(t, 64253)
  1185  				if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
  1186  					t.Errorf("views differ by more than %fx", slop)
  1187  				}
  1188  			})
  1189  		})
  1190  
  1191  		t.Run("sample-2", func(t *testing.T) {
  1192  			testenv.SkipFlaky(t, 64253)
  1193  
  1194  			old := runtime.SetMutexProfileFraction(2)
  1195  			defer runtime.SetMutexProfileFraction(old)
  1196  
  1197  			needContention.Store(int64(len(mus) - 1))
  1198  			metricGrowth, profileGrowth, n, _ := testcase(true, stks, workers, fn)(t)
  1199  
  1200  			// With 100 trials and profile fraction of 2, we expect to capture
  1201  			// 50 samples. Allow the test to pass if we get at least 20 samples;
  1202  			// the CDF of the binomial distribution says there's less than a
  1203  			// 1e-9 chance of that, which is an acceptably low flakiness rate.
  1204  			const samplingSlop = 2.5
  1205  
  1206  			if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
  1207  				// The test imposes a delay with usleep, verified with calls to
  1208  				// nanotime. Compare against the runtime/metrics package's view
  1209  				// (based on nanotime) rather than runtime/pprof's view (based
  1210  				// on cputicks).
  1211  				t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
  1212  			}
  1213  			if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
  1214  				t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
  1215  			}
  1216  
  1217  			const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
  1218  			if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
  1219  				t.Errorf("views differ by more than %fx", timerSlop)
  1220  			}
  1221  		})
  1222  	})
  1223  
  1224  	t.Run("runtime.semrelease", func(t *testing.T) {
  1225  		testenv.SkipFlaky(t, 64253)
  1226  
  1227  		old := runtime.SetMutexProfileFraction(1)
  1228  		defer runtime.SetMutexProfileFraction(old)
  1229  
  1230  		const workers = 3
  1231  		if runtime.GOMAXPROCS(0) < workers {
  1232  			t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
  1233  		}
  1234  
  1235  		var sem uint32 = 1
  1236  		var tries atomic.Int32
  1237  		tries.Store(10_000_000) // prefer controlled failure to timeout
  1238  		var sawContention atomic.Int32
  1239  		var need int32 = 1
  1240  		fn := func() bool {
  1241  			if sawContention.Load() >= need {
  1242  				return false
  1243  			}
  1244  			if tries.Add(-1) < 0 {
  1245  				return false
  1246  			}
  1247  
  1248  			runtime.Semacquire(&sem)
  1249  			runtime.Semrelease1(&sem, false, 0)
  1250  			if runtime.MutexContended(runtime.SemRootLock(&sem)) {
  1251  				sawContention.Add(1)
  1252  			}
  1253  			return true
  1254  		}
  1255  
  1256  		stks := [][]string{
  1257  			{
  1258  				"runtime.unlock",
  1259  				"runtime.semrelease1",
  1260  				"runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
  1261  				"runtime_test.(*contentionWorker).run",
  1262  			},
  1263  			{
  1264  				"runtime.unlock",
  1265  				"runtime.semacquire1",
  1266  				"runtime.semacquire",
  1267  				"runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
  1268  				"runtime_test.(*contentionWorker).run",
  1269  			},
  1270  		}
  1271  
   1272  		// Verify that we get the call stacks we expect, with anything more than zero
  1273  		// cycles / zero samples. The duration of each contention event is too
  1274  		// small relative to the expected overhead for us to verify its value
  1275  		// more directly. Leave that to the explicit lock/unlock test.
  1276  
  1277  		testcase(false, stks, workers, fn)(t)
  1278  
  1279  		if remaining := tries.Load(); remaining >= 0 {
  1280  			t.Logf("finished test early (%d tries remaining)", remaining)
  1281  		}
  1282  	})
  1283  }
  1284  
  1285  // contentionWorker provides cleaner call stacks for lock contention profile tests
  1286  type contentionWorker struct {
  1287  	before func()
  1288  	fn     func() bool
  1289  	after  func()
  1290  }
  1291  
  1292  func (w *contentionWorker) run() {
  1293  	defer w.after()
  1294  	w.before()
  1295  
  1296  	for w.fn() {
  1297  	}
  1298  }
  1299  
  1300  func TestCPUStats(t *testing.T) {
  1301  	// Run a few GC cycles to get some of the stats to be non-zero.
  1302  	runtime.GC()
  1303  	runtime.GC()
  1304  	runtime.GC()
  1305  
  1306  	// Set GOMAXPROCS high then sleep briefly to ensure we generate
  1307  	// some idle time.
  1308  	oldmaxprocs := runtime.GOMAXPROCS(10)
  1309  	time.Sleep(time.Millisecond)
  1310  	runtime.GOMAXPROCS(oldmaxprocs)
  1311  
  1312  	stats := runtime.ReadCPUStats()
  1313  	gcTotal := stats.GCAssistTime + stats.GCDedicatedTime + stats.GCIdleTime + stats.GCPauseTime
  1314  	if gcTotal != stats.GCTotalTime {
  1315  		t.Errorf("manually computed total does not match GCTotalTime: %d cpu-ns vs. %d cpu-ns", gcTotal, stats.GCTotalTime)
  1316  	}
  1317  	scavTotal := stats.ScavengeAssistTime + stats.ScavengeBgTime
  1318  	if scavTotal != stats.ScavengeTotalTime {
  1319  		t.Errorf("manually computed total does not match ScavengeTotalTime: %d cpu-ns vs. %d cpu-ns", scavTotal, stats.ScavengeTotalTime)
  1320  	}
  1321  	total := gcTotal + scavTotal + stats.IdleTime + stats.UserTime
  1322  	if total != stats.TotalTime {
  1323  		t.Errorf("manually computed overall total does not match TotalTime: %d cpu-ns vs. %d cpu-ns", total, stats.TotalTime)
  1324  	}
  1325  	if total == 0 {
  1326  		t.Error("total time is zero")
  1327  	}
  1328  	if gcTotal == 0 {
  1329  		t.Error("GC total time is zero")
  1330  	}
  1331  	if stats.IdleTime == 0 {
  1332  		t.Error("idle time is zero")
  1333  	}
  1334  }
  1335  
  1336  func TestMetricHeapUnusedLargeObjectOverflow(t *testing.T) {
  1337  	// This test makes sure /memory/classes/heap/unused:bytes
  1338  	// doesn't overflow when allocating and deallocating large
  1339  	// objects. It is a regression test for #67019.
  1340  	done := make(chan struct{})
  1341  	var wg sync.WaitGroup
  1342  	wg.Add(1)
  1343  	go func() {
  1344  		defer wg.Done()
  1345  		for {
  1346  			for range 10 {
  1347  				abi.Escape(make([]byte, 1<<20))
  1348  			}
  1349  			runtime.GC()
  1350  			select {
  1351  			case <-done:
  1352  				return
  1353  			default:
  1354  			}
  1355  		}
  1356  	}()
  1357  	s := []metrics.Sample{
  1358  		{Name: "/memory/classes/heap/unused:bytes"},
  1359  	}
  1360  	for range 1000 {
  1361  		metrics.Read(s)
  1362  		if s[0].Value.Uint64() > 1<<40 {
  1363  			t.Errorf("overflow")
  1364  			break
  1365  		}
  1366  	}
  1367  	done <- struct{}{}
  1368  	wg.Wait()
  1369  }
  1370  
