Source file src/runtime/trace.go
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack trace are
// captured for most events.
//
// Tracer invariants (to keep the synchronization sound):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"internal/runtime/atomic"
	"internal/trace/tracev2"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If not, buffers
	// are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen              atomic.Uintptr // the generation the reader is currently reading for
	flushedGen             atomic.Uintptr // the last completed generation
	headerWritten          bool           // whether ReadTrace has emitted the trace header
	endOfGenerationWritten bool           // whether readTrace has emitted the end-of-generation signal

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids
	typeTab   [2]traceTypeTable   // maps type pointers to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample. Because
	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure to make the P field always non-zero: The ID of a real P will
	// start at bit 1, and bit 0 will be set. Samples that arrive while no P is
	// running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID
	// of the active G directly in the second field, even though that will be 0
	// when sampling g0.
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead [2]*profBuf
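
	// A minimal sketch of the first-header-field encoding described above,
	// for illustration only; the encodeProfP/decodeProfP helpers are
	// hypothetical and not part of the runtime.
	//
	//	func encodeProfP(pid int32, haveP bool) uint64 {
	//		if !haveP {
	//			return 0b10 // non-zero, but bit 0 clear: no P was running
	//		}
	//		return uint64(pid)<<1 | 0b1 // real P: ID starts at bit 1, bit 0 set
	//	}
	//
	//	func decodeProfP(hdr0 uint64) (pid int32, haveP bool) {
	//		if hdr0&0b1 == 0 {
	//			return -1, false
	//		}
	//		return int32(hdr0 >> 1), true
	//	}
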
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// enabled indicates whether tracing is enabled, but it is only an optimization,
	// NOT the source of truth on whether tracing is enabled. Tracing is only truly
	// enabled if gen != 0. This is used as an optimistic fast path check.
	//
	// Transitioning this value from true -> false is easy (once gen is 0)
	// because it's OK for enabled to have a stale "true" value. traceAcquire will
	// always double-check gen.
	//
	// Transitioning this value from false -> true is harder. We need to make sure
	// this is observable as true strictly before gen != 0. To maintain this invariant
	// we only make this transition with the world stopped and use the store to gen
	// as a publication barrier.
	enabled bool

	// enabledWithAllocFree is set if debug.traceallocfree is != 0 when tracing begins.
	// It follows the same synchronization protocol as enabled.
	enabledWithAllocFree bool

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in the syscall-exiting slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64

	// minPageHeapAddr is the minimum address of the page heap when tracing started.
	minPageHeapAddr uint64

	// debugMalloc is the value of debug.malloc before tracing began.
	debugMalloc bool
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)
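
// For illustration only (not part of the runtime): a sketch of how a client
// could drive StartTrace, ReadTrace, and StopTrace directly, writing the trace
// to a file. Most programs should use the runtime/trace package instead, which
// wraps this same protocol.
//
//	package main
//
//	import (
//		"os"
//		"runtime"
//	)
//
//	func main() {
//		f, err := os.Create("prog.trace")
//		if err != nil {
//			panic(err)
//		}
//		defer f.Close()
//
//		if err := runtime.StartTrace(); err != nil {
//			panic(err)
//		}
//		done := make(chan struct{})
//		go func() {
//			// Consume buffers until ReadTrace reports end of trace (nil).
//			for {
//				data := runtime.ReadTrace()
//				if data == nil {
//					break
//				}
//				f.Write(data) // copy/use data before the next ReadTrace call
//			}
//			close(done)
//		}()
//
//		// ... run the workload being traced ...
//
//		runtime.StopTrace()
//		<-done
//	}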

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// N.B. This may block for quite a while to get a frequency estimate. Do it
	// here to minimize the time that the world is stopped.
	frequency := traceClockUnitsPerSecond()

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exceptions to this rule are goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscalling slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, so when tracing is off the
	// tracing seqlock is left untouched). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//	// <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Grab the minimum page heap address. All Ps are stopped, so it's safe to read this since
	// nothing can allocate heap memory.
	trace.minPageHeapAddr = uint64(mheap_.pages.inUse.ranges[0].base.addr())

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if
	// trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for
	// performance, we need trace.enabled to be read without any synchronization.
	//
	// We ensure this is safe by stopping the world, which acts as a global barrier on almost
	// every M, and explicitly synchronize with any other Ms that could be running concurrently
	// with us. Today, there are only two such cases:
	// - sysmon, which we synchronize with by acquiring sysmonlock.
	// - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
	//
	// After trace.gen is updated, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.enabled = true
	if debug.traceallocfree.Load() != 0 {
		// Enable memory events since the GODEBUG is set.
		trace.debugMalloc = debug.malloc
		trace.enabledWithAllocFree = true
		debug.malloc = true
	}
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *before* traceAcquire (which checks gen), we can be certain that, once exitingSyscall is
	// zero, any goroutine that goes to exit a syscall from then on *must* observe the new gen
	// as well as trace.enabled being set to true.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	traceSyncBatch(firstGen, frequency) // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.Gomaxprocs(gomaxprocs)           // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace)          // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Dump a snapshot of memory, if enabled.
	if trace.enabledWithAllocFree {
		traceSnapshotMemory(firstGen)
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
//
// traceAdvance is called by runtime/trace and golang.org/x/exp/trace using linkname.
//
//go:linkname traceAdvance
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		stackID      uint64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status
		// traced bookkeeping might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaitingForSuspendG(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is if we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
				ug.stackID = traceStack(0, gp, gen)
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// N.B. This may block for quite a while to get a frequency estimate. Do it
	// here to minimize the time that we prevent the world from stopping.
	frequency := traceClockUnitsPerSecond()

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)

			// Clear trace.enabled. It is totally OK for this value to be stale,
			// because traceAcquire will always double-check gen.
			trace.enabled = false
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a sync batch which contains a ClockSnapshot. Also emit a ProcsChange
	// event so we have one on record for each generation. Let's emit it as soon
	// as possible so that downstream tools can rely on the value being there
	// fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		traceSyncBatch(tl.gen, frequency)
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. If the world stops or whatever it's fine.
	// We're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				for exp, buf := range mp.trace.buf[gen%2] {
					if buf != nil {
						traceBufFlush(buf, gen)
						mp.trace.buf[gen%2][exp] = nil
					}
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist, ug.stackID)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	// Flush CPU samples, stacks, and strings for the last generation. This is safe,
	// because we're now certain no M is writing to the last generation.
	//
	// Ordering is important here. traceCPUFlush may generate new stacks and dumping
	// stacks may generate new strings.
	traceCPUFlush(gen)
	trace.stackTab[gen%2].dump(gen)
	trace.typeTab[gen%2].dump(gen)
	trace.stringTab[gen%2].reset(gen)

	// That's it. This generation is done producing buffers.
	systemstack(func() {
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	// Perform status reset on dead Ps because they just appear as idle.
	//
	// Preventing preemption is sufficient to access allp safely. allp is only
	// mutated by GOMAXPROCS calls, which require a STW.
	//
	// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
	// events to indicate whether a P exists, rather than just making its
	// existence implicit.
	mp = acquirem()
	for _, pp := range allp[len(allp):cap(allp)] {
		pp.trace.readyNextGen(traceNextGen(gen))
	}
	releasem(mp)

	if stopTrace {
		// Acquire the shutdown sema to begin the shutdown process.
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()

		// Reset debug.malloc if necessary. Note that this is set in a racy
		// way; that's OK. Some mallocs may still enter into the debug.malloc
		// block, but they won't generate events because tracing is disabled.
		// That is, it's OK if mallocs read a stale debug.malloc or
		// trace.enabledWithAllocFree value.
		if trace.enabledWithAllocFree {
			trace.enabledWithAllocFree = false
			debug.malloc = trace.debugMalloc
		}
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, might as well stay consistent with StartTrace
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both mod 2 and mod 3, and we want both patterns to continue when we loop around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}
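
// The wrap-around reasoning above can be checked directly. For illustration
// only (not part of the runtime): a hypothetical test along these lines would
// exercise it.
//
//	func TestTraceNextGenWrap(t *testing.T) {
//		last := ^uintptr(0)
//		// The last generation before wrapping is odd and a multiple of 3...
//		if last%2 != 1 || last%3 != 0 {
//			t.Fatalf("unexpected ^uintptr(0): %d", last)
//		}
//		// ...so the next generation must be even and one more than a
//		// multiple of 3, which is exactly what traceNextGen returns.
//		next := traceNextGen(last)
//		if next != 4 || next%2 != 0 || next%3 != 1 {
//			t.Fatalf("traceNextGen(^uintptr(0)) = %d", next)
//		}
//	}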

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
	for {
		buf := readTrace()

		// Skip over the end-of-generation signal, which must not appear
		// in the final trace.
		if len(buf) == 1 && tracev2.EventType(buf[0]) == tracev2.EvEndOfGeneration {
			continue
		}
		return buf
	}
}

// readTrace is the implementation of ReadTrace, except with an additional
// in-band signal as to when the buffer is for a new generation.
//
//go:linkname readTrace runtime/trace.runtime_readTrace
func readTrace() (buf []byte) {
top:
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}
	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call into the heap allocator, which will try to emit a
	// trace event while holding the heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine is reading the trace. This is bad.
		// But we'd rather not crash the program because of tracing,
		// since tracing can be enabled at runtime on production servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
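	// The header written below is 16 bytes: the version string "go 1.25 trace"
	// (13 bytes) padded with three NUL bytes. For illustration only (not part
	// of the runtime), a consumer reading the stream from an io.Reader r might
	// sanity-check it roughly like this, assuming bytes, errors, and io are
	// imported:
	//
	//	var header [16]byte
	//	if _, err := io.ReadFull(r, header[:]); err != nil {
	//		return err
	//	}
	//	if !bytes.HasPrefix(header[:], []byte("go ")) {
	//		return errors.New("not a Go execution trace")
	//	}
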
	// Write the trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.25 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer for
		// readerGen is flushed, flushedGen will rise to meet it. At this point, the
		// tracer is waiting on the reader to finish flushing the last generation so
		// that it can continue to advance.
		if trace.flushedGen.Load() == gen {
			// Write out the internal in-band end-of-generation signal.
			if !trace.endOfGenerationWritten {
				trace.endOfGenerationWritten = true
				unlock(&trace.lock)
				return []byte{byte(tracev2.EvEndOfGeneration)}, false
			}

			// Reset the flag.
			trace.endOfGenerationWritten = false

			// Handle shutdown.
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// Handle advancing to the next generation.

			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are two conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	//
	// We also want to be careful not to schedule the reader if there's no
	// reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must construct it with newWakeableSleep before first
// use and call close to free up resources. Once close is called, a new
// wakeableSleep must be constructed before further use.
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	f := func(s any, _ uintptr, _ int64) {
		s.(*wakeableSleep).wake()
	}
	s.timer.init(f, s)
	return s
}
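
// For illustration only (not part of the runtime): the lifecycle that users of
// wakeableSleep are expected to follow, mirroring traceAdvancerState above.
// The w, done, keepGoing, and doWork names are hypothetical.
//
//	w := newWakeableSleep()
//	done := make(chan struct{})
//	go func() {
//		for keepGoing() {
//			w.sleep(1e9) // one second, in nanoseconds; returns early if wake is called
//			doWork()
//		}
//		done <- struct{}{}
//	}()
//
//	// Shutdown: cut the current sleep short, wait for the goroutine to
//	// exit, and only then close the wakeableSleep, since close must not
//	// race with sleep or wake.
//	w.wake()
//	<-done
//	w.close()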

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	s.timer.reset(nanotime()+ns, 0)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	s.timer.stop()
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	return
}