Source file src/runtime/trace.go
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// # Go execution tracer
//
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack trace are
// captured for most events.
//
// ## Design
//
// The basic idea behind the execution tracer is to have per-M buffers that
// trace data may be written into. Each M maintains a write flag indicating whether
// its trace buffer is currently in use.
//
// Tracing is initiated by StartTrace and proceeds in "generations," with each
// generation ended by a call to traceAdvance, which moves tracing on to the next
// generation. Generations are a global synchronization point for trace data,
// and we proceed to a new generation by advancing trace.gen. Each M reads
// trace.gen under its own write flag to determine which generation it is writing
// trace data for. To this end, each M has 2 slots for buffers: one slot for the
// previous generation, one slot for the current one. It uses the generation it
// observed (tl.gen) to select which buffer slot to write to. Simultaneously,
// traceAdvance uses the write flag to determine whether every thread is guaranteed
// to observe an updated trace.gen. Once it is sure, it may then safely flush any
// buffers that are left over from the previous generation, since it knows the Ms
// will not mutate them.
//
// Flushed buffers are processed by the ReadTrace function, which is called by
// the trace reader goroutine. The first goroutine to call ReadTrace is designated
// as the trace reader goroutine until tracing completes. (There may only be one at
// a time.)
//
// Once all buffers are flushed, any extra post-processing is complete, and the
// flushed buffers have been processed by the trace reader goroutine, the trace
// emits an EndOfGeneration event to mark the global synchronization point in the
// trace.
//
// All other trace features, including CPU profile samples, stack information,
// string tables, etc., revolve around this generation system, and typically
// appear in pairs: one for the previous generation, and one for the current one.
// Like the per-M buffers, which of the two is written to is selected using trace.gen,
// and anything managed this way must similarly be mutated only in traceAdvance or
// under the M's write flag.
//
// Trace events themselves are simple. They consist of a single byte for the event type,
// followed by zero or more LEB128-encoded unsigned varints. They are decoded using
// a pre-determined table for each trace version: internal/trace/tracev2.specs.
//
// To avoid relying on timestamps for correctness and validation, each G and P has
// a sequence counter that is written into trace events to encode a partial order.
// The sequence counters reset on each generation. Ms do not need sequence counters
// because they are the source of truth for execution: trace events, and even whole
// buffers, are guaranteed to appear in order in the trace data stream, simply because
// that's the order the thread emitted them in.
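//
// As a purely illustrative sketch of that wire format (the event type byte and
// argument values below are made up, not taken from internal/trace/tracev2.specs):
// an event with type byte 0x14 and two unsigned arguments, 1 and 300, would be
// written out as
//
//	0x14      // event type
//	0x01      // LEB128(1)
//	0xac 0x02 // LEB128(300): low 7 bits 0x2c with the continuation bit set, then 0x02
//
// with any further arguments for that event encoded the same way.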
//
// See traceruntime.go for the API the tracer exposes to the runtime for emitting events.
//
// In each generation, we ensure that we enumerate all goroutines, such that each
// generation's data is fully self-contained. This makes features like the flight
// recorder easy to implement. To this end, we guarantee that every live goroutine is
// listed at least once by emitting a status event for the goroutine, indicating its
// starting state. These status events are emitted based on context, generally based
// on the event that's about to be emitted.
//
// The traceEventWriter type encapsulates these details, and is the backbone of
// the API exposed in traceruntime.go, though there are deviations where necessary.
//
// This is the overall design, but as always, there are many details. Beyond this,
// look to the invariants and select corner cases below, and to the code itself, for the
// source of truth.
//
// See https://go.dev/issue/60773 for a link to a more complete design with rationale,
// though parts of it are out-of-date.
//
// ## Invariants
//
// 1. An M that has a trace buffer MUST be on either the allm or sched.freem lists.
//
// Otherwise, traceAdvance might miss an M with a buffer that needs to be flushed.
//
// 2. Trace buffers MUST only be mutated in traceAdvance or under a traceAcquire/traceRelease pair.
//
// Otherwise, traceAdvance may race with Ms writing trace data when trying to flush buffers.
//
// 3. traceAdvance MUST NOT return until all of the current generation's buffers are flushed.
//
// Otherwise, callers cannot rely on all the data they need being available (for example, for
// the flight recorder).
//
// 4. P and goroutine state transition events MUST be emitted by an M that owns the ability
// to perform that transition.
//
// What this means is that the M must either be the owner of the P, the owner of the goroutine,
// or the owner of a non-running goroutine's _Gscan bit. There are a lot of bad things that can
// happen if this invariant isn't maintained, mostly around generating inconsistencies in the
// trace due to racy emission of events.
//
// 5. Acquisition of a P (pidleget or takeP/gcstopP) MUST NOT be performed under a traceAcquire/traceRelease pair.
//
// Notably, it's important that traceAcquire/traceRelease not cover a state in which the
// goroutine or P is not yet owned. For example, if traceAcquire is held across both wirep and
// pidleget, then we could end up emitting an event in the wrong generation. Suppose T1
// traceAcquires in generation 1, a generation transition happens, T2 emits a ProcStop and
// executes pidleput in generation 2, and finally T1 calls pidleget and emits ProcStart.
// The ProcStart must follow the ProcStop in the trace to make any sense, but the ProcStop
// would appear in a later generation than the ProcStart.
//
// 6. Goroutine state transitions, with the exception of transitions into _Grunning, MUST be
// performed under the traceAcquire/traceRelease pair where the event is emitted.
//
// Otherwise, traceAdvance may observe a goroutine state that is inconsistent with the
// events being emitted. traceAdvance inspects all goroutines' states in order to emit
// a status event for any goroutine that did not have an event emitted for it already.
// If the generation then advances in between that observation and the event being emitted,
// then the trace will contain a status that doesn't line up with the event. For example,
// if the event is emitted after the state transition _Gwaiting -> _Grunnable, then
// traceAdvance may observe the goroutine in _Grunnable, emit a status event, advance the
// generation, and the following generation then contains a GoUnblock event. The trace parser
// will get confused because it sees that goroutine in _Grunnable in the previous generation
// trying to be transitioned from _Gwaiting into _Grunnable in the following one. Something
// similar happens if the trace event is emitted before the state transition, so that does
// not help either.
//
// Transitions to _Grunning do not have the same problem because traceAdvance is unable to
// observe running goroutines directly. It must stop them, or wait for them to emit an event.
// Note that it cannot even stop them with asynchronous preemption in any "bad" window between
// the state transition to _Grunning and the event emission, because async preemption cannot
// stop goroutines in the runtime.
//
// 7. Goroutine state transitions into _Grunning MUST emit an event for the transition after
// the state transition.
//
// This follows from invariants (4) and (5) and the explanation of (6).
// The relevant part of the previous invariant is that in order for the tracer to be unable to
// stop a goroutine, it must be in _Grunning and in the runtime. So to close any windows between
// event emission and the state transition, the event emission must happen *after* the transition
// to _Grunning.
//
// ## Select corner cases
//
// ### CGO calls / system calls
//
// CGO calls and system calls are mostly straightforward, except for P stealing. For historical
// reasons, there is a trace-level P state called ProcSyscall, which used to model _Psyscall
// (now _Psyscall_unused). This state is used to indicate in the trace that a P
// is eligible for stealing as part of the parser's ordering logic.
//
// Another quirk of this corner case is the ProcSyscallAbandoned trace-level P state, which
// is used only in status events to indicate a relaxation of verification requirements. It
// tells the parser that if it can't find the corresponding thread that the P was stolen from
// in the state it expects, it should accept the trace anyway. This is also
// historical. When _Psyscall still existed, one would steal the P and then emit a ProcSteal
// event, and there was no ordering between the ProcSteal and the subsequent GoSyscallEndBlocked.
// One clearly happened before the other, but since P stealing was a single atomic, there was
// no way to enforce the order. The GoSyscallEndBlocked thread could move on and end up in any
// state, and the GoSyscallEndBlocked could be in a completely different generation from the
// ProcSteal. Today this is no longer possible, as the ProcSteal is always ordered before
// the GoSyscallEndBlocked event in the runtime.
//
// Both ProcSyscall and ProcSyscallAbandoned are likely no longer necessary.
//
// ### CGO callbacks
//
// When a C thread calls into Go, the execution tracer models that as the creation of a new
// goroutine. When the thread exits back into C, that is modeled as the destruction of that
// goroutine. These are the GoCreateSyscall and GoDestroySyscall events, which represent the
// creation and destruction of a goroutine with its starting and ending states being _Gsyscall.
//
// This model is simple to reason about but contradicts the runtime implementation, which
// doesn't do this directly for performance reasons. The runtime implementation instead caches
// a G on the M created for the C thread. On Linux this M is then cached in the thread's TLS,
// and on other systems the M is put on a global list on exit from Go. We need to do some
// extra work to make sure that this is modeled correctly in the tracer. For example,
// a C thread exiting Go may leave a P hanging off of its M (whether that M is kept in TLS
// or placed back on a list). In order to correctly model goroutine creation and destruction,
// we must behave as if the P was at some point stolen by the runtime, if the C thread
// reenters Go with the same M (and thus, same P) once more.

package runtime

import (
	"internal/runtime/atomic"
	"internal/trace/tracev2"
	"unsafe"
)

// Trace state.

// trace is the global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If not, buffers
	// are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen              atomic.Uintptr // the generation the reader is currently reading for
	flushedGen             atomic.Uintptr // the last completed generation
	headerWritten          bool           // whether ReadTrace has emitted the trace header
	endOfGenerationWritten bool           // whether ReadTrace has emitted the end-of-generation signal

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids
	typeTab   [2]traceTypeTable   // maps type pointers to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample.
	// Because profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure to make the P field always non-zero: the ID of a real P will
	// start at bit 1, and bit 0 will be set. Samples that arrive while no P is
	// running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID of
	// the active G directly in the second field, even though that will be 0
	// when sampling g0.
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// enabled indicates whether tracing is enabled, but it is only an optimization,
	// NOT the source of truth on whether tracing is enabled. Tracing is only truly
	// enabled if gen != 0. This is used as an optimistic fast path check.
	//
	// Transitioning this value from true -> false is easy (once gen is 0)
	// because it's OK for enabled to have a stale "true" value. traceAcquire will
	// always double-check gen.
	//
	// Transitioning this value from false -> true is harder. We need to make sure
	// this is observable as true strictly before gen != 0. To maintain this invariant
	// we only make this transition with the world stopped and use the store to gen
	// as a publication barrier.
	enabled bool

	// enabledWithAllocFree is set if debug.traceallocfree is != 0 when tracing begins.
	// It follows the same synchronization protocol as enabled.
	enabledWithAllocFree bool

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in the syscall-exit slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64

	// minPageHeapAddr is the minimum address of the page heap when tracing started.
	minPageHeapAddr uint64

	// debugMalloc is the value of debug.malloc before tracing began.
	debugMalloc bool
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
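//
// As a rough usage sketch of this API (this mirrors what the [runtime/trace] package
// does on the caller's behalf; the writer w below is hypothetical):
//
//	if err := StartTrace(); err != nil {
//		return err // tracing was already enabled
//	}
//	go func() {
//		for {
//			data := ReadTrace() // blocks until data is available
//			if data == nil {
//				return // tracing has stopped and all data has been read
//			}
//			w.Write(data) // copy out the data before the next ReadTrace call
//		}
//	}()
//	// ... run the workload to be traced ...
//	StopTrace()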
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// N.B. This may block for quite a while to get a frequency estimate. Do it
	// here to minimize the time that the world is stopped.
	frequency := traceClockUnitsPerSecond()

	// Stop the world.
	//
	// What we need to successfully begin tracing is to make sure that the next time
	// *any goroutine* hits a traceAcquire, it sees that the trace is enabled.
	//
	// Stopping the world gets us most of the way there, since it makes sure that goroutines
	// stop executing. There is however one exception: goroutines without Ps concurrently
	// exiting a syscall. We handle this by making sure that, after we update trace.gen,
	// there isn't a single goroutine calling traceAcquire on the syscall slow path, by checking
	// trace.exitingSyscall. See the comment on the check below for more details.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Grab the minimum page heap address. All Ps are stopped, so it's safe to read this since
	// nothing can allocate heap memory.
	trace.minPageHeapAddr = uint64(mheap_.pages.inUse.ranges[0].base.addr())

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if
	// trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for
	// performance, we need trace.enabled to be read without any synchronization.
	//
	// We ensure this is safe by stopping the world, which acts as a global barrier on almost
	// every M, and by explicitly synchronizing with any other Ms that could be running concurrently
	// with us. Today, there are only two such cases:
	// - sysmon, which we synchronize with by acquiring sysmonlock.
	// - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
	//
	// After trace.gen is updated, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.enabled = true
	if debug.traceallocfree.Load() != 0 {
		// Enable memory events since the GODEBUG is set.
		trace.debugMalloc = debug.malloc
		trace.enabledWithAllocFree = true
		debug.malloc = true
	}
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *before* traceAcquire (which checks gen), we can be certain that once exitingSyscall is zero,
	// any goroutine that goes on to exit a syscall from then on *must* observe the new gen as
	// well as trace.enabled being set to true.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	traceSyncBatch(firstGen, frequency) // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.Gomaxprocs(gomaxprocs)           // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace)          // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Dump a snapshot of memory, if enabled.
	if trace.enabledWithAllocFree {
		traceSnapshotMemory(firstGen)
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
//
// traceAdvance is called by runtime/trace and golang.org/x/exp/trace using linkname.
//
//go:linkname traceAdvance
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		stackID      uint64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status
		// traced bookkeeping might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, there's nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaitingForSuspendG(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is if we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
				ug.stackID = traceStack(0, gp, &trace.stackTab[gen%2])
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// N.B. This may block for quite a while to get a frequency estimate. Do it
	// here to minimize the time that we prevent the world from stopping.
	frequency := traceClockUnitsPerSecond()

	// Prevent the world from stopping.
	//
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add an STWActive event, but the
	// cost of maintaining this consistency is low.
	//
	// This is also a good time to preempt all the Ps and ensure they have had a status traced.
	semacquire(&worldsema)

	// Go over each P and emit a status event for it if necessary.
	//
	// TODO(mknyszek): forEachP is very heavyweight. We could do better by integrating
	// the statusWasTraced check into it, to avoid preempting unnecessarily.
	forEachP(waitReasonTraceProcStatus, func(pp *p) {
		tl := traceAcquire()
		if !pp.trace.statusWasTraced(tl.gen) {
			tl.writer().writeProcStatusForP(pp, false).end()
		}
		traceRelease(tl)
	})

	// While we're still holding worldsema (preventing a STW and thus a
	// change in the number of Ps), reset the status on dead Ps.
	// They just appear as idle.
	//
	// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
	// events to indicate whether a P exists, rather than just making its
	// existence implicit.
	for _, pp := range allp[len(allp):cap(allp)] {
		pp.trace.readyNextGen(gen)
	}

	// Prevent preemption to make sure we're not interrupted.
	//
	// We want to get through the rest as soon as possible.
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)

			// Clear trace.enabled. It is totally OK for this value to be stale,
			// because traceAcquire will always double-check gen.
			trace.enabled = false
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a sync batch, which contains a ClockSnapshot. Also emit a ProcsChange
	// event so we have one on record for each generation. Let's emit it as soon
	// as possible so that downstream tools can rely on the value being there
	// fairly early in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		traceSyncBatch(tl.gen, frequency)
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. If the world stops, or anything else happens, it's fine.
	// We're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this, for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's write flag, which one of us must win, so at least its old-gen buffer
	// will be flushed in time for the new generation), or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the write flag is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.writing.Load() {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock is needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				for exp, buf := range mp.trace.buf[gen%2] {
					if buf != nil {
						traceBufFlush(buf, gen)
						mp.trace.buf[gen%2][exp] = nil
					}
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed, minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist, ug.stackID)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	// Flush CPU samples, stacks, and strings for the last generation. This is safe,
	// because we're now certain no M is writing to the last generation.
	//
	// Ordering is important here. traceCPUFlush may generate new stacks, and dumping
	// stacks may generate new strings.
	traceCPUFlush(gen)
	trace.stackTab[gen%2].dump(gen)
	trace.typeTab[gen%2].dump(gen)
	trace.stringTab[gen%2].reset(gen)

	// That's it. This generation is done producing buffers.
	systemstack(func() {
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	if stopTrace {
		// Acquire the shutdown sema to begin the shutdown process.
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()

		// Reset debug.malloc if necessary. Note that this is set in a racy
		// way; that's OK. Some mallocs may still enter into the debug.malloc
		// block, but they won't generate events because tracing is disabled.
		// That is, it's OK if mallocs read a stale debug.malloc or
		// trace.enabledWithAllocFree value.
		if trace.enabledWithAllocFree {
			trace.enabledWithAllocFree = false
			debug.malloc = trace.debugMalloc
		}
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both %2 and %3 and we want both patterns to continue when we loop around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() (buf []byte) {
top:
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}
	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call into the heap allocator, which will try to emit a trace
	// event while holding the heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine reads the trace. This is bad.
		// But we'd rather not crash the program because of tracing,
		// because tracing can be enabled at runtime on prod servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write the trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.26 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer for
		// readerGen's generation is flushed, flushedGen will rise to meet it. At that
		// point, the tracer is waiting on the reader to finish flushing the last
		// generation so that it can continue to advance.
		if trace.flushedGen.Load() == gen {
			// Write out the internal in-band end-of-generation signal.
			if !trace.endOfGenerationWritten {
				trace.endOfGenerationWritten = true
				unlock(&trace.lock)
				return []byte{byte(tracev2.EvEndOfGeneration)}, false
			}

			// Reset the flag.
			trace.endOfGenerationWritten = false

			// Handle shutdown.
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// Handle advancing to the next generation.

			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See the comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also, a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check that (traceEnabled() || traceShuttingDown()) is true.
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are two conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	//
	// We also want to be careful not to schedule the reader if there's no
	// reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must call newWakeableSleep before first use and
// close to free up resources. Once close is called, a new wakeableSleep
// must be created before another use.
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	f := func(s any, _ uintptr, _ int64) {
		s.(*wakeableSleep).wake()
	}
	s.timer.init(f, s)
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	s.timer.reset(nanotime()+ns, 0)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	s.timer.stop()
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	return
}
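// For reference, a minimal lifecycle sketch for wakeableSleep. The loop condition and
// the work function below are hypothetical stand-ins; traceAdvancerState.start and stop
// above are the real users in this file.
//
//	s := newWakeableSleep()
//	done := make(chan struct{})
//	go func() {
//		for keepRunning() { // hypothetical condition
//			s.sleep(1e9)     // one second, in nanoseconds, unless wake is called sooner
//			doPeriodicWork() // hypothetical work
//		}
//		done <- struct{}{}
//	}()
//	// ...
//	s.wake()  // cut any in-progress sleep short
//	<-done    // now nothing is sleeping on s or about to call wake
//	s.close()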