Source file src/runtime/trace.go
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A precise nanosecond-precision timestamp and a stack
// trace are captured for most events.
//
// Tracer invariants (to keep the synchronization making sense):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"internal/runtime/atomic"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If not, buffers
	// are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen     atomic.Uintptr // the generation the reader is currently reading for
	flushedGen    atomic.Uintptr // the last completed generation
	headerWritten bool           // whether ReadTrace has emitted the trace header

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids
	typeTab   [2]traceTypeTable   // maps type pointers to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample. Because
	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure to make the P field always non-zero: The ID of a real P will
	// start at bit 1, and bit 0 will be set. Samples that arrive while no P is
	// running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID of
	// the active G directly in the second field, even though that will be 0
	// when sampling g0.
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf
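
	// For illustration only (derived from the packing described above): a
	// sample taken while P 3 was running would carry a first header word of
	// roughly
	//
	//	uint64(3)<<1 | 1 == 0b111
	//
	// while a sample with no P carries 0b10. A header of all zeros therefore
	// never collides with a real sample and stays available to profBuf as its
	// overflow marker.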

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// enabled indicates whether tracing is enabled, but it is only an optimization,
	// NOT the source of truth on whether tracing is enabled. Tracing is only truly
	// enabled if gen != 0. This is used as an optimistic fast path check.
	//
	// Transitioning this value from true -> false is easy (once gen is 0)
	// because it's OK for enabled to have a stale "true" value. traceAcquire will
	// always double-check gen.
	//
	// Transitioning this value from false -> true is harder. We need to make sure
	// this is observable as true strictly before gen becomes non-zero. To maintain
	// this invariant we only make this transition with the world stopped and use
	// the store to gen as a publication barrier.
	enabled bool

	// enabledWithAllocFree is set if debug.traceallocfree is != 0 when tracing begins.
	// It follows the same synchronization protocol as enabled.
	enabledWithAllocFree bool

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in syscall exiting slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64

	// minPageHeapAddr is the minimum address of the page heap when tracing started.
	minPageHeapAddr uint64

	// debugMalloc is the value of debug.malloc before tracing began.
	debugMalloc bool
}
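
// As a worked example of the gen%2 indexing above (illustrative note only):
// while generation 5 is being written, event writers fill the buffers and
// tables indexed by 5%2 == 1, and traceAdvance and the trace reader drain
// what generation 4 left behind at index 0.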

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exception to this rule is goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscalling slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, so when tracing is off the
	// tracing seqlock is left untouched). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//	// <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Grab the minimum page heap address. All Ps are stopped, so it's safe to read this since
	// nothing can allocate heap memory.
	trace.minPageHeapAddr = uint64(mheap_.pages.inUse.ranges[0].base.addr())

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if
	// trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for
	// performance, we need trace.enabled to be read without any synchronization.
	//
	// We ensure this is safe by stopping the world, which acts as a global barrier on almost
	// every M, and explicitly synchronize with any other Ms that could be running concurrently
	// with us. Today, there are only two such cases:
	// - sysmon, which we synchronized with by acquiring sysmonlock.
	// - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
	//
	// After trace.gen is updated, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.enabled = true
	if debug.traceallocfree.Load() != 0 {
		// Enable memory events since the GODEBUG is set.
		trace.debugMalloc = debug.malloc
		trace.enabledWithAllocFree = true
		debug.malloc = true
	}
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *before* traceAcquire (which checks gen), we can be certain that when exitingSyscall is zero,
	// any goroutine that goes to exit a syscall from then on *must* observe the new gen as
	// well as trace.enabled being set to true.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Dump a snapshot of memory, if enabled.
	if trace.enabledWithAllocFree {
		traceSnapshotMemory()
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}
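
// As an informal illustration (not part of the runtime), the public API above
// and below is typically driven by a loop like the following; most programs
// should use the runtime/trace package rather than calling these functions
// directly. Here w is an assumed io.Writer and the goroutine structure is just
// one reasonable arrangement.
//
//	if err := runtime.StartTrace(); err != nil {
//		// tracing was already enabled
//	}
//	go func() {
//		for {
//			data := runtime.ReadTrace()
//			if data == nil {
//				return // tracing stopped and all buffered data has been read
//			}
//			w.Write(data) // consume data before the next ReadTrace call
//		}
//	}()
//	// ... run the workload ...
//	runtime.StopTrace() // returns only after the reader has drained everything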

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Write an EvFrequency event for this generation.
	//
	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
	// this here and not e.g. on the trace reader.
	traceFrequency(gen)

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		stackID      uint64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status-traced
		// bookkeeping might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaitingForGC(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is when we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
				ug.stackID = traceStack(0, gp, gen)
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)

			// Clear trace.enabled. It is totally OK for this value to be stale,
			// because traceAcquire will always double-check gen.
			trace.enabled = false
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a ProcsChange event so we have one on record for each generation.
	// Let's emit it as soon as possible so that downstream tools can rely on the value
	// being there fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. If the world stops again, that's fine:
	// we're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				bufp := &mp.trace.buf[gen%2]
				if *bufp != nil {
					traceBufFlush(*bufp, gen)
					*bufp = nil
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})
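
	// For orientation only: the writer side that the loop above waits out is
	// the bracket established by traceAcquire/traceRelease (defined elsewhere).
	// This note assumes it looks roughly like
	//
	//	mp.trace.seqlock.Add(1) // odd: the M is writing
	//	gen := trace.gen.Load() // gen is read while the seqlock is held
	//	... write events into mp.trace.buf[gen%2] ...
	//	mp.trace.seqlock.Add(1) // even: done writing
	//
	// so observing an even seqlock value above means the M holds no stale
	// generation and its buffer for gen can be flushed safely. This is a
	// sketch, not the actual traceAcquire/traceRelease implementation.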

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist, ug.stackID)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	// Flush CPU samples, stacks, and strings for the last generation. This is safe,
	// because we're now certain no M is writing to the last generation.
	//
	// Ordering is important here. traceCPUFlush may generate new stacks and dumping
	// stacks may generate new strings.
	traceCPUFlush(gen)
	trace.stackTab[gen%2].dump(gen)
	trace.typeTab[gen%2].dump(gen)
	trace.stringTab[gen%2].reset(gen)

	// That's it. This generation is done producing buffers.
	systemstack(func() {
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	// Perform status reset on dead Ps because they just appear as idle.
	//
	// Preventing preemption is sufficient to access allp safely. allp is only
	// mutated by GOMAXPROCS calls, which require a STW.
	//
	// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
	// events to indicate whether a P exists, rather than just making its
	// existence implicit.
	mp = acquirem()
	for _, pp := range allp[len(allp):cap(allp)] {
		pp.trace.readyNextGen(traceNextGen(gen))
	}
	releasem(mp)

	if stopTrace {
		// Acquire the shutdown sema to begin the shutdown process.
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()

		// Reset debug.malloc if necessary. Note that this is set in a racy
		// way; that's OK. Some mallocs may still enter into the debug.malloc
		// block, but they won't generate events because tracing is disabled.
		// That is, it's OK if mallocs read a stale debug.malloc or
		// trace.enabledWithAllocFree value.
		if trace.enabledWithAllocFree {
			trace.enabledWithAllocFree = false
			debug.malloc = trace.debugMalloc
		}
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, might as well stay consistent with StartTrace
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both %2 and %3 and we want both patterns to continue when we loop around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}
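
// To make the wraparound above concrete (illustrative note): ^uintptr(0)
// satisfies gen%2 == 1 and gen%3 == 0, so its successor must satisfy
// gen%2 == 0 and gen%3 == 1 for both indexing patterns to keep advancing as
// if we had simply added one. Checking small candidates: 1 fails %2, 2 fails
// %3, 3 fails %2, and 4 is the first value that passes both (0 is excluded
// because gen == 0 means tracing is off).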

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
top:
	var buf []byte
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}

	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call heap allocate, which will try to emit a trace
	// event while holding heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine reads trace. This is bad.
		// But we'd rather not crash the program because of tracing,
		// because tracing can be enabled at runtime on prod servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.23 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer for
		// readerGen is flushed, flushedGen will rise to meet it. At this point, the
		// tracer is waiting on the reader to finish flushing the last generation so
		// that it can continue to advance.
		if trace.flushedGen.Load() == gen {
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are three conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	// - The trace is shutting down. The trace stopper blocks on the reader
	//   to finish, much like trace advancement.
	//
	// We also want to be careful not to schedule the reader if there's no
	// reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must create it with newWakeableSleep and call
// close to free up resources. Once close is called, a new wakeableSleep
// must be created before another use.
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	f := func(s any, _ uintptr, _ int64) {
		s.(*wakeableSleep).wake()
	}
	s.timer.init(f, s)
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	s.timer.reset(nanotime()+ns, 0)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	s.timer.stop()
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	return
}
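
// Illustrative usage sketch (not part of the runtime): the advancer above
// drives a wakeableSleep roughly like this, where wake (from stop or a timer
// callback) cuts a sleep short. running, done, and doPeriodicWork are
// hypothetical placeholders for this sketch.
//
//	s := newWakeableSleep()
//	go func() {
//		for running() {
//			s.sleep(1e9) // ~1 second; returns early if s.wake() is called
//			doPeriodicWork()
//		}
//		done <- struct{}{}
//	}()
//	// ... later, from another goroutine:
//	s.wake()  // wake the sleeper promptly
//	<-done    // wait for the loop to exit, so nothing is sleeping or waking
//	s.close() // free resources; s must not be used again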