Source file src/runtime/mgcmark_greenteagc.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Green Tea mark algorithm
     6  //
     7  // The core idea behind Green Tea is simple: achieve better locality during
     8  // mark/scan by delaying scanning so that we can accumulate objects to scan
     9  // within the same span, then scan the objects that have accumulated on the
    10  // span all together.
    11  //
    12  // By batching objects this way, we increase the chance that adjacent objects
    13  // will be accessed, amortize the cost of accessing object metadata, and create
    14  // better opportunities for prefetching. We can take this even further and
    15  // optimize the scan loop by size class (not yet completed) all the way to the
    16  // point of applying SIMD techniques to really tear through the heap.
    17  //
     18  // Naturally, this depends on being able to create opportunities to batch objects
    19  // together. The basic idea here is to have two sets of mark bits. One set is the
    20  // regular set of mark bits ("marks"), while the other essentially says that the
    21  // objects have been scanned already ("scans"). When we see a pointer for the first
    22  // time we set its mark and enqueue its span. We track these spans in work queues
    23  // with a FIFO policy, unlike workbufs which have a LIFO policy. Empirically, a
    24  // FIFO policy appears to work best for accumulating objects to scan on a span.
     25  // Later, when we dequeue the span, we find both the union and difference of the
     26  // mark and scan bitsets. The union is then written back into the scan bits, while
     27  // the difference (marks &^ scans) is used to decide which objects need scanning, such that the GC
    28  // is still precise.
    29  //
    30  // Below is the bulk of the implementation, focusing on the worst case
    31  // for locality, small objects. Specifically, those that are smaller than
    32  // a few cache lines in size and whose metadata is stored the same way (at the
    33  // end of the span).
    34  
    35  //go:build goexperiment.greenteagc
    36  
    37  package runtime
    38  
    39  import (
    40  	"internal/goarch"
    41  	"internal/runtime/atomic"
    42  	"internal/runtime/gc"
    43  	"internal/runtime/gc/scan"
    44  	"internal/runtime/sys"
    45  	"unsafe"
    46  )
    47  
    48  const doubleCheckGreenTea = false
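
// To make the mark/scan bookkeeping described above concrete, here is a
// minimal illustrative sketch (a hypothetical helper, not used by the runtime;
// it operates on a single byte rather than a full uintptr word for clarity).
// Objects that are marked but not yet scanned form the set to grey, and the
// scan bits advance to the union of the two sets, mirroring what spanSetScans
// below does one uintptr at a time.
func exampleGreyAndScanByte(marks, scans uint8) (newScans, toGrey uint8) {
	toGrey = marks &^ scans  // marked, but not yet scanned: these need scanning
	newScans = scans | marks // union: everything here is now considered greyed
	return
}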
    49  
    50  // spanInlineMarkBits are mark bits that are inlined into the span
    51  // itself. gcUsesSpanInlineMarkBits may be used to check if objects
    52  // of a particular size use inline mark bits.
    53  //
    54  // Inline mark bits are a little bit more than just mark bits. They
    55  // consist of two parts: scans and marks. Marks are like pre-mark
    56  // bits. They're set once a pointer to an object is discovered for
    57  // the first time. The marks allow us to scan many objects in bulk
    58  // if we queue the whole span for scanning. Before we scan such objects
    59  // in bulk, we copy the marks to the scans, computing a diff along the
    60  // way. The resulting bitmap tells us which objects we should scan.
    61  //
    62  // The inlineMarkBits also hold state sufficient for scanning any
    63  // object in the span, as well as state for acquiring ownership of
    64  // the span for queuing. This avoids the need to look at the mspan when
    65  // scanning.
    66  type spanInlineMarkBits struct {
    67  	scans [63]uint8         // scanned bits.
    68  	owned spanScanOwnership // see the comment on spanScanOwnership.
    69  	marks [63]uint8         // mark bits.
    70  	class spanClass
    71  }
    72  
    73  // spanScanOwnership indicates whether some thread has acquired
    74  // the span for scanning, and whether there has been one or more
    75  // attempts to acquire the span. The latter information helps to
    76  // fast-track span scans that only apply to a single mark, skipping
    77  // the relatively costly merge-and-diff process for scans and marks
    78  // by allowing one to just set the mark directly.
    79  type spanScanOwnership uint8
    80  
    81  const (
    82  	spanScanUnowned  spanScanOwnership = 0         // Indicates the span is not acquired for scanning.
    83  	spanScanOneMark                    = 1 << iota // Indicates that only one mark bit is set relative to the scan bits.
     84  	spanScanManyMark                               // Indicates one or more mark bits may be set relative to the scan bits.
    85  	// "ManyMark" need not be exactly the value it has. In practice we just
    86  	// want to distinguish "none" from "one" from "many," so a comparison is
    87  	// sufficient (as opposed to a bit test) to check between these cases.
    88  )
    89  
    90  // load atomically loads from a pointer to a spanScanOwnership.
    91  func (o *spanScanOwnership) load() spanScanOwnership {
    92  	return spanScanOwnership(atomic.Load8((*uint8)(unsafe.Pointer(o))))
    93  }
    94  
    95  func (o *spanScanOwnership) or(v spanScanOwnership) spanScanOwnership {
    96  	// N.B. We round down the address and use Or32 because Or8 doesn't
    97  	// return a result, and it's strictly necessary for this protocol.
    98  	//
    99  	// Making Or8 return a result, while making the code look nicer, would
   100  	// not be strictly better on any supported platform, as an Or8 that
   101  	// returns a result is not a common instruction. On many platforms it
    102  	// would be implemented exactly as it is here, and since Or8 is
    103  	// used exclusively in the runtime, often in hot code, we want to keep
    104  	// using its no-result version elsewhere for performance.
   105  	o32 := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(o)) &^ 0b11))
   106  	off := (uintptr(unsafe.Pointer(o)) & 0b11) * 8
   107  	if goarch.BigEndian {
   108  		off = 32 - off - 8
   109  	}
   110  	return spanScanOwnership(atomic.Or32(o32, uint32(v)<<off) >> off)
   111  }
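
// For illustration only (a hypothetical helper, not used by the runtime): the
// address arithmetic used by the or method above, shown in isolation. Given
// the address of a byte, it returns the 4-byte-aligned word containing it and
// the shift, in bits, of that byte within the word on a little-endian machine
// (on big-endian machines the shift is mirrored, as in the code above).
func exampleSubWordShift(addr uintptr) (word, shift uintptr) {
	word = addr &^ 0b11       // round down to a 4-byte boundary
	shift = (addr & 0b11) * 8 // byte offset within the word, converted to bits
	return
}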
   112  
   113  func (imb *spanInlineMarkBits) init(class spanClass, needzero bool) {
   114  	if imb == nil {
   115  		// This nil check and throw is almost pointless. Normally we would
   116  		// expect imb to never be nil. However, this is called on potentially
   117  		// freshly-allocated virtual memory. As of 2025, the compiler-inserted
   118  		// nil check is not a branch but a memory read that we expect to fault
   119  		// if the pointer really is nil.
   120  		//
   121  		// However, this causes a read of the page, and operating systems may
   122  		// take it as a hint to back the accessed memory with a read-only zero
    123  		// page. But we immediately write to this memory, which can then
   124  		// force operating systems to have to update the page table and flush
   125  		// the TLB, causing a lot of churn for programs that are short-lived
   126  		// and monotonically grow in size.
   127  		//
   128  		// This nil check is thus an explicit branch instead of what the compiler
   129  		// would insert circa 2025, which is a memory read instruction.
   130  		//
   131  		// See go.dev/issue/74375 for details.
   132  		throw("runtime: span inline mark bits nil?")
   133  	}
   134  	if needzero {
   135  		// Use memclrNoHeapPointers to avoid having the compiler make a worse
   136  		// decision. We know that imb is both aligned and a nice power-of-two
   137  		// size that works well for wider SIMD instructions. The compiler likely
   138  		// has no idea that imb is aligned to 128 bytes.
   139  		memclrNoHeapPointers(unsafe.Pointer(imb), unsafe.Sizeof(spanInlineMarkBits{}))
   140  	}
   141  	imb.class = class
   142  }
   143  
   144  // tryAcquire attempts to acquire the span for scanning. On success, the caller
   145  // must queue the span for scanning or scan the span immediately.
   146  func (imb *spanInlineMarkBits) tryAcquire() bool {
   147  	switch imb.owned.load() {
   148  	case spanScanUnowned:
   149  		// Try to mark the span as having only one object marked.
   150  		if imb.owned.or(spanScanOneMark) == spanScanUnowned {
   151  			return true
   152  		}
   153  		// If we didn't see an old value of spanScanUnowned, then we must
   154  		// have raced with someone else and seen spanScanOneMark or greater.
   155  		// Fall through and try to set spanScanManyMark.
   156  		fallthrough
   157  	case spanScanOneMark:
   158  		// We may be the first to set *any* bit on owned. In such a case,
   159  		// we still need to make sure the span is queued.
   160  		return imb.owned.or(spanScanManyMark) == spanScanUnowned
   161  	}
   162  	return false
   163  }
   164  
   165  // release releases the span for scanning, allowing another thread to queue the span.
   166  //
   167  // Returns an upper bound on the number of mark bits set since the span was queued. The
   168  // upper bound is described as "one" (spanScanOneMark) or "many" (spanScanManyMark, with or
   169  // without spanScanOneMark). If the return value indicates only one mark bit was set, the
   170  // caller can be certain that it was the same mark bit that caused the span to get queued.
   171  // Take note of the fact that this is *only* an upper-bound. In particular, it may still
   172  // turn out that only one mark bit was set, even if the return value indicates "many".
   173  func (imb *spanInlineMarkBits) release() spanScanOwnership {
   174  	return spanScanOwnership(atomic.Xchg8((*uint8)(unsafe.Pointer(&imb.owned)), uint8(spanScanUnowned)))
   175  }
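
// A minimal walkthrough of the ownership protocol (illustrative only; the
// function below is an assumption for exposition and is not used by the
// runtime). The first marker to flip owned away from spanScanUnowned is the
// one that queues the span; later markers only upgrade it to "many"; release
// hands back an upper bound on how many marks arrived while the span was
// queued and resets the state.
func exampleOwnershipProtocol() (queued bool, upperBound spanScanOwnership) {
	var imb spanInlineMarkBits
	queued = imb.tryAcquire()  // true: owned goes spanScanUnowned -> spanScanOneMark
	_ = imb.tryAcquire()       // false: owned is upgraded to include spanScanManyMark
	upperBound = imb.release() // spanScanOneMark|spanScanManyMark, i.e. "many"; owned resets
	return
}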
   176  
   177  // spanInlineMarkBitsFromBase returns the spanInlineMarkBits for a span whose start address is base.
   178  //
   179  // The span must be gcUsesSpanInlineMarkBits(span.elemsize).
   180  func spanInlineMarkBitsFromBase(base uintptr) *spanInlineMarkBits {
   181  	return (*spanInlineMarkBits)(unsafe.Pointer(base + gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{})))
   182  }
   183  
   184  // initInlineMarkBits initializes the inlineMarkBits stored at the end of the span.
   185  func (s *mspan) initInlineMarkBits() {
   186  	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
   187  		throw("expected span with inline mark bits")
   188  	}
   189  	// Zeroing is only necessary if this span wasn't just freshly allocated from the OS.
   190  	s.inlineMarkBits().init(s.spanclass, s.needzero != 0)
   191  }
   192  
   193  // moveInlineMarks merges the span's inline mark bits into dst and clears them.
   194  //
   195  // gcUsesSpanInlineMarkBits(s.elemsize) must be true.
   196  func (s *mspan) moveInlineMarks(dst *gcBits) {
   197  	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
   198  		throw("expected span with inline mark bits")
   199  	}
   200  	bytes := divRoundUp(uintptr(s.nelems), 8)
   201  	imb := s.inlineMarkBits()
   202  	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
   203  	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
   204  		marks := bswapIfBigEndian(imbMarks[i/goarch.PtrSize])
   205  		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
   206  			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
   207  		}
   208  		*(*uintptr)(unsafe.Pointer(dst.bytep(i))) |= bswapIfBigEndian(marks)
   209  	}
   210  	if doubleCheckGreenTea && !s.spanclass.noscan() && imb.marks != imb.scans {
   211  		throw("marks don't match scans for span with pointer")
   212  	}
   213  
   214  	// Reset the inline mark bits.
   215  	imb.init(s.spanclass, true /* We know these bits are always dirty now. */)
   216  }
   217  
   218  // inlineMarkBits returns the inline mark bits for the span.
   219  //
   220  // gcUsesSpanInlineMarkBits(s.elemsize) must be true.
   221  func (s *mspan) inlineMarkBits() *spanInlineMarkBits {
   222  	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
   223  		throw("expected span with inline mark bits")
   224  	}
   225  	return spanInlineMarkBitsFromBase(s.base())
   226  }
   227  
   228  func (s *mspan) markBitsForIndex(objIndex uintptr) (bits markBits) {
   229  	if gcUsesSpanInlineMarkBits(s.elemsize) {
   230  		bits.bytep = &s.inlineMarkBits().marks[objIndex/8]
   231  	} else {
   232  		bits.bytep = s.gcmarkBits.bytep(objIndex / 8)
   233  	}
   234  	bits.mask = uint8(1) << (objIndex % 8)
   235  	bits.index = objIndex
   236  	return
   237  }
   238  
   239  func (s *mspan) markBitsForBase() markBits {
   240  	if gcUsesSpanInlineMarkBits(s.elemsize) {
   241  		return markBits{&s.inlineMarkBits().marks[0], uint8(1), 0}
   242  	}
   243  	return markBits{&s.gcmarkBits.x, uint8(1), 0}
   244  }
   245  
   246  // scannedBitsForIndex returns a markBits representing the scanned bit
   247  // for objIndex in the inline mark bits.
   248  func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits {
   249  	return markBits{&s.inlineMarkBits().scans[objIndex/8], uint8(1) << (objIndex % 8), objIndex}
   250  }
   251  
   252  // gcUsesSpanInlineMarkBits returns true if a span holding objects of a certain size
   253  // has inline mark bits. size must be the span's elemsize.
   254  //
   255  // nosplit because this is called from gcmarknewobject, which is nosplit.
   256  //
   257  //go:nosplit
   258  func gcUsesSpanInlineMarkBits(size uintptr) bool {
   259  	return heapBitsInSpan(size) && size >= 16
   260  }
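
// Illustrative only (a hypothetical helper, not part of the runtime): with the
// rule above, a typical small size class qualifies for inline mark bits while
// a large one does not, since large objects keep their metadata elsewhere.
func exampleUsesInlineMarkBits() (small, large bool) {
	small = gcUsesSpanInlineMarkBits(48)    // small object sizes: true
	large = gcUsesSpanInlineMarkBits(32768) // large object sizes: false
	return
}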
   261  
   262  // tryDeferToSpanScan tries to queue p on the span it points to, if it
    263  // points to a small object span (one for which gcUsesSpanInlineMarkBits is true).
   264  func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool {
   265  	if useCheckmark {
   266  		return false
   267  	}
   268  
    269  	// Quickly check to see if this is a span that has inline mark bits.
   270  	ha := heapArenaOf(p)
   271  	if ha == nil {
   272  		return false
   273  	}
   274  	pageIdx := ((p / pageSize) / 8) % uintptr(len(ha.pageInUse))
   275  	pageMask := byte(1 << ((p / pageSize) % 8))
   276  	if ha.pageUseSpanInlineMarkBits[pageIdx]&pageMask == 0 {
   277  		return false
   278  	}
   279  
   280  	// Find the object's index from the span class info stored in the inline mark bits.
   281  	base := alignDown(p, gc.PageSize)
   282  	q := spanInlineMarkBitsFromBase(base)
   283  	objIndex := uint16((uint64(p-base) * uint64(gc.SizeClassToDivMagic[q.class.sizeclass()])) >> 32)
   284  
   285  	// Set mark bit.
   286  	idx, mask := objIndex/8, uint8(1)<<(objIndex%8)
   287  	if atomic.Load8(&q.marks[idx])&mask != 0 {
   288  		return true
   289  	}
   290  	atomic.Or8(&q.marks[idx], mask)
   291  
   292  	// Fast-track noscan objects.
   293  	if q.class.noscan() {
   294  		gcw.bytesMarked += uint64(gc.SizeClassToSize[q.class.sizeclass()])
   295  		return true
   296  	}
   297  
   298  	// Queue up the pointer (as a representative for its span).
   299  	if q.tryAcquire() {
   300  		if gcw.spanq.put(makeObjPtr(base, objIndex)) {
   301  			if gcphase == _GCmark {
   302  				// This is intentionally racy; the bit set here might get
   303  				// stomped on by a stealing P. See the comment in tryStealSpan
   304  				// for an explanation as to why this is OK.
   305  				if !work.spanqMask.read(uint32(gcw.id)) {
   306  					work.spanqMask.set(gcw.id)
   307  				}
   308  				gcw.mayNeedWorker = true
   309  			}
   310  			gcw.flushedWork = true
   311  		}
   312  	}
   313  	return true
   314  }
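
// For illustration (a hypothetical helper, not used by the runtime): the
// divide-by-multiply trick used above to find an object's index. For offsets
// within a single page, multiplying by gc.SizeClassToDivMagic[c] and shifting
// right by 32 is equivalent to dividing by the size class's object size,
// which avoids a hardware divide on this hot path.
func exampleObjIndex(offsetInSpan uintptr, sizeclass int) uint16 {
	return uint16((uint64(offsetInSpan) * uint64(gc.SizeClassToDivMagic[sizeclass])) >> 32)
}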
   315  
   316  // tryGetSpanFast attempts to get an entire span to scan.
   317  func (w *gcWork) tryGetSpanFast() objptr {
   318  	return w.spanq.tryGetFast()
   319  }
   320  
   321  // tryGetSpan attempts to get an entire span to scan.
   322  func (w *gcWork) tryGetSpan() objptr {
   323  	if s := w.spanq.tryGetFast(); s != 0 {
   324  		return s
   325  	}
   326  	// "Steal" from ourselves.
   327  	if s := w.spanq.steal(&w.spanq); s != 0 {
   328  		return s
   329  	}
   330  	// We failed to get any local work, so we're fresh out.
   331  	// Nobody else is going to add work for us. Clear our bit.
   332  	if work.spanqMask.read(uint32(w.id)) {
   333  		work.spanqMask.clear(w.id)
   334  	}
   335  	return 0
   336  }
   337  
   338  // spanQueue is a P-local stealable span queue.
   339  type spanQueue struct {
   340  	// head, tail, and ring represent a local non-thread-safe ring buffer.
   341  	head, tail uint32
   342  	ring       [256]objptr
   343  
   344  	// putsSinceDrain counts the number of put calls since the last drain.
   345  	putsSinceDrain int
   346  
   347  	// chain contains state visible to other Ps.
   348  	//
   349  	// In particular, that means a linked chain of single-producer multi-consumer
   350  	// ring buffers where the single producer is this P only.
   351  	//
    352  	// This linked chain structure is based on the sync.Pool dequeue.
   353  	chain struct {
   354  		// head is the spanSPMC to put to. This is only accessed
   355  		// by the producer, so doesn't need to be synchronized.
   356  		head *spanSPMC
   357  
   358  		// tail is the spanSPMC to steal from. This is accessed
   359  		// by consumers, so reads and writes must be atomic.
   360  		tail atomic.UnsafePointer // *spanSPMC
   361  	}
   362  }
   363  
   364  // putFast tries to put s onto the queue, but may fail if it's full.
   365  func (q *spanQueue) putFast(s objptr) (ok bool) {
   366  	if q.tail-q.head == uint32(len(q.ring)) {
   367  		return false
   368  	}
   369  	q.ring[q.tail%uint32(len(q.ring))] = s
   370  	q.tail++
   371  	return true
   372  }
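
// Illustrative only (a hypothetical helper, not used by the runtime): head and
// tail are free-running counters, so the occupancy check above works even
// across uint32 wraparound, and a counter value c maps to ring slot c%len(ring).
func exampleRingOccupancy(head, tail uint32) uint32 {
	return tail - head // number of buffered spans; wraparound-safe for uint32
}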
   373  
   374  // put puts s onto the queue.
   375  //
   376  // Returns whether the caller should spin up a new worker.
   377  func (q *spanQueue) put(s objptr) bool {
    378  	// The constants below define how often, and how many,
   379  	// spans we spill to the spmc chain when the local queue is
   380  	// not full.
   381  	//
   382  	// spillPeriod must be > spillMax, otherwise that sets the
    383  	// effective maximum size of our local span queue. If we
    384  	// have a span ring of size N but flush K spans every
    385  	// K puts, then K becomes our effective maximum length. When
    386  	// spillPeriod > spillMax, we're always spilling spans
   387  	// at a slower rate than we're accumulating them.
   388  	const (
   389  		// spillPeriod defines how often to check if we should
   390  		// spill some spans, counted in the number of calls to put.
   391  		spillPeriod = 64
   392  
   393  		// spillMax defines, at most, how many spans to drain with
   394  		// each spill.
   395  		spillMax = 16
   396  	)
   397  
   398  	if q.putFast(s) {
   399  		// Occasionally try to spill some work to generate parallelism.
   400  		q.putsSinceDrain++
   401  		if q.putsSinceDrain >= spillPeriod {
   402  			// Reset even if we don't drain, so we don't check every time.
   403  			q.putsSinceDrain = 0
   404  
    405  			// Try to drain some spans. Don't bother if there are very
    406  			// few of them or there are already spans in the spmc chain.
   407  			n := min((q.tail-q.head)/2, spillMax)
   408  			if n > 4 && q.chainEmpty() {
   409  				q.drain(n)
   410  				return true
   411  			}
   412  		}
   413  		return false
   414  	}
   415  
   416  	// We're out of space. Drain out our local spans.
   417  	q.drain(uint32(len(q.ring)) / 2)
   418  	if !q.putFast(s) {
   419  		throw("failed putFast after drain")
   420  	}
   421  	return true
   422  }
   423  
   424  // flush publishes all spans in the local queue to the spmc chain.
   425  func (q *spanQueue) flush() {
   426  	n := q.tail - q.head
   427  	if n == 0 {
   428  		return
   429  	}
   430  	q.drain(n)
   431  }
   432  
   433  // empty returns true if there's no more work on the queue.
   434  //
   435  // Not thread-safe. Must only be called by the owner of q.
   436  func (q *spanQueue) empty() bool {
   437  	// Check the local queue for work.
   438  	if q.tail-q.head > 0 {
   439  		return false
   440  	}
   441  	return q.chainEmpty()
   442  }
   443  
   444  // chainEmpty returns true if the spmc chain is empty.
   445  //
   446  // Thread-safe.
   447  func (q *spanQueue) chainEmpty() bool {
   448  	// Check the rest of the rings for work.
   449  	r := (*spanSPMC)(q.chain.tail.Load())
   450  	for r != nil {
   451  		if !r.empty() {
   452  			return false
   453  		}
   454  		r = (*spanSPMC)(r.prev.Load())
   455  	}
   456  	return true
   457  }
   458  
   459  // drain publishes n spans from the local queue to the spmc chain.
   460  func (q *spanQueue) drain(n uint32) {
   461  	q.putsSinceDrain = 0
   462  
   463  	if q.chain.head == nil {
   464  		// N.B. We target 1024, but this may be bigger if the physical
   465  		// page size is bigger, or if we can fit more uintptrs into a
   466  		// physical page. See newSpanSPMC docs.
   467  		r := newSpanSPMC(1024)
   468  		q.chain.head = r
   469  		q.chain.tail.StoreNoWB(unsafe.Pointer(r))
   470  	}
   471  
   472  	// Try to drain some of the queue to the head spmc.
   473  	if q.tryDrain(q.chain.head, n) {
   474  		return
   475  	}
   476  	// No space. Create a bigger spmc and add it to the chain.
   477  
   478  	// Double the size of the next one, up to a maximum.
   479  	//
   480  	// We double each time so we can avoid taking this slow path
   481  	// in the future, which involves a global lock. Ideally we want
   482  	// to hit a steady-state where the deepest any queue goes during
   483  	// a mark phase can fit in the ring.
   484  	//
   485  	// However, we still set a maximum on this. We set the maximum
   486  	// to something large to amortize the cost of lock acquisition, but
   487  	// still at a reasonable size for big heaps and/or a lot of Ps (which
   488  	// tend to be correlated).
   489  	//
   490  	// It's not too bad to burn relatively large-but-fixed amounts of per-P
   491  	// memory if we need to deal with really, really deep queues, since the
   492  	// constants of proportionality are small. Simultaneously, we want to
   493  	// avoid a situation where a single worker ends up queuing O(heap)
   494  	// work and then forever retains a queue of that size.
   495  	const maxCap = 1 << 20 / goarch.PtrSize
   496  	newCap := q.chain.head.cap * 2
   497  	if newCap > maxCap {
   498  		newCap = maxCap
   499  	}
   500  	newHead := newSpanSPMC(newCap)
   501  	if !q.tryDrain(newHead, n) {
   502  		throw("failed to put span on newly-allocated spanSPMC")
   503  	}
   504  	q.chain.head.prev.StoreNoWB(unsafe.Pointer(newHead))
   505  	q.chain.head = newHead
   506  }
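
// Illustrative only (a hypothetical helper, not used by the runtime): the
// growth rule used above. Ring capacity doubles on each overflow until it
// reaches maxCap, which bounds per-P memory while amortizing the cost of
// taking the global lock to allocate a new ring.
func exampleNextRingCap(cur uint32) uint32 {
	const maxCap = 1 << 20 / goarch.PtrSize
	next := cur * 2
	if next > maxCap {
		next = maxCap
	}
	return next
}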
   507  
   508  // tryDrain attempts to drain n spans from q's local queue to the chain.
   509  //
   510  // Returns whether it succeeded.
   511  func (q *spanQueue) tryDrain(r *spanSPMC, n uint32) bool {
   512  	if q.head+n > q.tail {
   513  		throw("attempt to drain too many elements")
   514  	}
   515  	h := r.head.Load() // synchronize with consumers
   516  	t := r.tail.Load()
   517  	rn := t - h
   518  	if rn+n <= r.cap {
   519  		for i := uint32(0); i < n; i++ {
   520  			*r.slot(t + i) = q.ring[(q.head+i)%uint32(len(q.ring))]
   521  		}
   522  		r.tail.Store(t + n) // Makes the items avail for consumption.
   523  		q.head += n
   524  		return true
   525  	}
   526  	return false
   527  }
   528  
   529  // tryGetFast attempts to get a span from the local queue, but may fail if it's empty,
    530  // returning a zero objptr.
   531  func (q *spanQueue) tryGetFast() objptr {
   532  	if q.tail-q.head == 0 {
   533  		return 0
   534  	}
   535  	s := q.ring[q.head%uint32(len(q.ring))]
   536  	q.head++
   537  	return s
   538  }
   539  
   540  // steal takes some spans from the ring chain of another span queue.
   541  //
   542  // q == q2 is OK.
   543  func (q *spanQueue) steal(q2 *spanQueue) objptr {
   544  	r := (*spanSPMC)(q2.chain.tail.Load())
   545  	if r == nil {
   546  		return 0
   547  	}
   548  	for {
    549  		// It's important that we load the prev pointer (the
    550  		// next ring up the chain) *before* popping the tail.
    551  		// In general, r may be transiently empty, but if prev
    552  		// is non-nil before the pop and the pop fails, then r
    553  		// is permanently empty, which is the only condition
    554  		// under which it's safe to drop r from the chain.
   555  		r2 := (*spanSPMC)(r.prev.Load())
   556  
   557  		// Try to refill from one of the rings
   558  		if s := q.refill(r); s != 0 {
   559  			return s
   560  		}
   561  
   562  		if r2 == nil {
   563  			// This is the only ring. It's empty right
   564  			// now, but could be pushed to in the future.
   565  			return 0
   566  		}
   567  
   568  		// The tail of the chain has been drained, so move on
   569  		// to the next ring. Try to drop it from the chain
   570  		// so the next consumer doesn't have to look at the empty
   571  		// ring again.
   572  		if q2.chain.tail.CompareAndSwapNoWB(unsafe.Pointer(r), unsafe.Pointer(r2)) {
   573  			r.dead.Store(true)
   574  		}
   575  
   576  		r = r2
   577  	}
   578  }
   579  
   580  // refill takes some spans from r and puts them into q's local queue.
   581  //
   582  // One span is removed from the stolen spans and returned on success.
   583  // Failure to steal returns a zero objptr.
   584  //
    585  // refill is thread-safe with respect to r.
   586  func (q *spanQueue) refill(r *spanSPMC) objptr {
   587  	if q.tail-q.head != 0 {
   588  		throw("steal with local work available")
   589  	}
   590  
   591  	// Steal some spans.
   592  	var n uint32
   593  	for {
   594  		h := r.head.Load() // load-acquire, synchronize with other consumers
   595  		t := r.tail.Load() // load-acquire, synchronize with the producer
   596  		n = t - h
   597  		n = n - n/2
   598  		if n == 0 {
   599  			return 0
   600  		}
   601  		if n > r.cap { // read inconsistent h and t
   602  			continue
   603  		}
   604  		n = min(n, uint32(len(q.ring)/2))
   605  		for i := uint32(0); i < n; i++ {
   606  			q.ring[i] = *r.slot(h + i)
   607  		}
   608  		if r.head.CompareAndSwap(h, h+n) {
   609  			break
   610  		}
   611  	}
   612  
   613  	// Update local queue head and tail to reflect new buffered values.
   614  	q.head = 0
   615  	q.tail = n
   616  
   617  	// Pop off the head of the queue and return it.
   618  	return q.tryGetFast()
   619  }
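
// Illustrative only (a hypothetical helper, not used by the runtime): the
// "n = n - n/2" step above takes the rounded-up half of the available items,
// so a consumer always grabs at least one item when any are present.
func exampleStealHalf(avail uint32) uint32 {
	return avail - avail/2 // ceil(avail/2): 1 -> 1, 2 -> 1, 5 -> 3
}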
   620  
    621  // destroy frees all rings in the chain of an empty spanQueue.
   622  //
   623  // Preconditions:
   624  // - World is stopped.
   625  // - GC is outside of the mark phase.
   626  // - (Therefore) the queue is empty.
   627  func (q *spanQueue) destroy() {
   628  	assertWorldStopped()
   629  	if gcphase != _GCoff {
   630  		throw("spanQueue.destroy during the mark phase")
   631  	}
   632  	if !q.empty() {
   633  		throw("spanQueue.destroy on non-empty queue")
   634  	}
   635  
   636  	lock(&work.spanSPMCs.lock)
   637  
   638  	// Remove and free each ring.
   639  	for r := (*spanSPMC)(q.chain.tail.Load()); r != nil; r = (*spanSPMC)(r.prev.Load()) {
   640  		prev := r.allprev
   641  		next := r.allnext
   642  		if prev != nil {
   643  			prev.allnext = next
   644  		}
   645  		if next != nil {
   646  			next.allprev = prev
   647  		}
   648  		if work.spanSPMCs.all == r {
   649  			work.spanSPMCs.all = next
   650  		}
   651  
   652  		r.deinit()
   653  		mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
   654  	}
   655  
   656  	q.chain.head = nil
   657  	q.chain.tail.Store(nil)
   658  	q.putsSinceDrain = 0
   659  
   660  	unlock(&work.spanSPMCs.lock)
   661  }
   662  
   663  // spanSPMC is a ring buffer of objptrs that represent spans.
   664  // Accessed without a lock.
   665  //
   666  // Single-producer, multi-consumer. The only producer is the P that owns this
   667  // queue, but any other P may consume from it.
   668  //
   669  // ## Invariants for memory management
   670  //
   671  // 1. All spanSPMCs are allocated from mheap_.spanSPMCAlloc.
   672  // 2. All allocated spanSPMCs must be on the work.spanSPMCs list.
   673  // 3. spanSPMCs may only be allocated if gcphase != _GCoff.
   674  // 4. spanSPMCs may only be deallocated if gcphase == _GCoff.
   675  //
   676  // Invariants (3) and (4) ensure that we do not need to concern ourselves with
   677  // tricky reuse issues that stem from not knowing when a thread is truly done
   678  // with a spanSPMC. For example, two threads could load the same spanSPMC from
   679  // the tail of the chain. One thread is then paused while the other steals the
   680  // last few elements off of it. It's not safe to free at that point since the
    681  // other thread may still inspect that spanSPMC, and we have no way of knowing
    682  // when it is done without more complex and/or heavyweight synchronization.
   683  //
   684  // Instead, we rely on the global synchronization inherent to GC phases, and
   685  // the fact that spanSPMCs are only ever used during the mark phase, to ensure
   686  // memory safety. This means we temporarily waste some memory, but it's only
   687  // until the end of the mark phase.
   688  type spanSPMC struct {
   689  	_ sys.NotInHeap
   690  
   691  	// allnext is the link to the next spanSPMC on the work.spanSPMCs list.
   692  	// This is used to find and free dead spanSPMCs. Protected by
   693  	// work.spanSPMCs.lock.
   694  	allnext *spanSPMC
   695  
   696  	// allprev is the link to the previous spanSPMC on the work.spanSPMCs
   697  	// list. This is used to find and free dead spanSPMCs. Protected by
   698  	// work.spanSPMCs.lock.
   699  	allprev *spanSPMC
   700  
   701  	// dead indicates whether the spanSPMC is no longer in use.
   702  	// Protected by the CAS to the prev field of the spanSPMC pointing
   703  	// to this spanSPMC. That is, whoever wins that CAS takes ownership
   704  	// of marking this spanSPMC as dead. See spanQueue.steal for details.
   705  	dead atomic.Bool
   706  
   707  	// prev is the next link up a spanQueue's SPMC chain, from tail to head,
   708  	// hence the name "prev." Set by a spanQueue's producer, cleared by a
   709  	// CAS in spanQueue.steal.
   710  	prev atomic.UnsafePointer // *spanSPMC
   711  
   712  	// head, tail, cap, and ring together represent a fixed-size SPMC lock-free
   713  	// ring buffer of size cap. The ring buffer contains objptr values.
   714  	head atomic.Uint32
   715  	tail atomic.Uint32
    716  	cap  uint32 // cap(ring)
   717  	ring *objptr
   718  }
   719  
   720  // newSpanSPMC allocates and initializes a new spmc with the provided capacity.
   721  //
   722  // newSpanSPMC may override the capacity with a larger one if the provided one would
   723  // waste memory.
   724  func newSpanSPMC(cap uint32) *spanSPMC {
   725  	lock(&work.spanSPMCs.lock)
   726  	r := (*spanSPMC)(mheap_.spanSPMCAlloc.alloc())
   727  	next := work.spanSPMCs.all
   728  	r.allnext = next
   729  	if next != nil {
   730  		next.allprev = r
   731  	}
   732  	work.spanSPMCs.all = r
   733  	unlock(&work.spanSPMCs.lock)
   734  
   735  	// If cap < the capacity of a single physical page, round up.
   736  	pageCap := uint32(physPageSize / goarch.PtrSize) // capacity of a single page
   737  	if cap < pageCap {
   738  		cap = pageCap
   739  	}
   740  	if cap&(cap-1) != 0 {
   741  		throw("spmc capacity must be a power of 2")
   742  	}
   743  
   744  	r.cap = cap
   745  	ring := sysAlloc(uintptr(cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys, "GC span queue")
   746  	atomic.StorepNoWB(unsafe.Pointer(&r.ring), ring)
   747  	return r
   748  }
   749  
   750  // empty returns true if the spmc is empty.
   751  //
   752  // empty is thread-safe.
   753  func (r *spanSPMC) empty() bool {
   754  	h := r.head.Load()
   755  	t := r.tail.Load()
   756  	return t == h
   757  }
   758  
   759  // deinit frees any resources the spanSPMC is holding onto and zeroes it.
   760  func (r *spanSPMC) deinit() {
   761  	sysFree(unsafe.Pointer(r.ring), uintptr(r.cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys)
   762  	r.ring = nil
   763  	r.dead.Store(false)
   764  	r.prev.StoreNoWB(nil)
   765  	r.head.Store(0)
   766  	r.tail.Store(0)
   767  	r.cap = 0
   768  	r.allnext = nil
   769  	r.allprev = nil
   770  }
   771  
   772  // slot returns a pointer to slot i%r.cap.
   773  func (r *spanSPMC) slot(i uint32) *objptr {
   774  	idx := uintptr(i & (r.cap - 1))
   775  	return (*objptr)(unsafe.Add(unsafe.Pointer(r.ring), idx*unsafe.Sizeof(objptr(0))))
   776  }
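
// Illustrative only (a hypothetical helper, not used by the runtime): because
// the ring capacity is required to be a power of two (newSpanSPMC checks
// this), the mask in slot above is equivalent to the slower modulo operation.
func exampleSlotIndex(i, capacity uint32) uint32 {
	return i & (capacity - 1) // same as i % capacity when capacity is a power of two
}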
   777  
   778  // freeDeadSpanSPMCs frees dead spanSPMCs back to the OS.
   779  func freeDeadSpanSPMCs() {
   780  	// According to the SPMC memory management invariants, we can only free
   781  	// spanSPMCs outside of the mark phase. We ensure we do this in two ways.
   782  	//
   783  	// 1. We take the work.spanSPMCs lock, which we need anyway. This ensures
   784  	//    that we are non-preemptible. If this path becomes lock-free, we will
   785  	//    need to become non-preemptible in some other way.
   786  	// 2. Once we are non-preemptible, we check the gcphase, and back out if
   787  	//    it's not safe.
   788  	//
   789  	// This way, we ensure that we don't start freeing if we're in the wrong
   790  	// phase, and the phase can't change on us while we're freeing.
   791  	//
   792  	// TODO(go.dev/issue/75771): Due to the grow semantics in
   793  	// spanQueue.drain, we expect a steady-state of around one spanSPMC per
   794  	// P, with some spikes higher when Ps have more than one. For high
   795  	// GOMAXPROCS, or if this list otherwise gets long, it would be nice to
   796  	// have a way to batch work that allows preemption during processing.
   797  	lock(&work.spanSPMCs.lock)
   798  	if gcphase != _GCoff || work.spanSPMCs.all == nil {
   799  		unlock(&work.spanSPMCs.lock)
   800  		return
   801  	}
   802  	r := work.spanSPMCs.all
   803  	for r != nil {
   804  		next := r.allnext
   805  		if r.dead.Load() {
   806  			// It's dead. Deinitialize and free it.
   807  			prev := r.allprev
   808  			if prev != nil {
   809  				prev.allnext = next
   810  			}
   811  			if next != nil {
   812  				next.allprev = prev
   813  			}
   814  			if work.spanSPMCs.all == r {
   815  				work.spanSPMCs.all = next
   816  			}
   817  
   818  			r.deinit()
   819  			mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
   820  		}
   821  		r = next
   822  	}
   823  	unlock(&work.spanSPMCs.lock)
   824  }
   825  
   826  // tryStealSpan attempts to steal a span from another P's local queue.
   827  //
   828  // Returns a non-zero objptr on success.
   829  func (w *gcWork) tryStealSpan() objptr {
   830  	pp := getg().m.p.ptr()
   831  
   832  	for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() {
   833  		if !work.spanqMask.read(enum.position()) {
   834  			continue
   835  		}
   836  		p2 := allp[enum.position()]
   837  		if pp == p2 {
   838  			continue
   839  		}
   840  		if s := w.spanq.steal(&p2.gcw.spanq); s != 0 {
   841  			return s
   842  		}
   843  		// N.B. This is intentionally racy. We may stomp on a mask set by
   844  		// a P that just put a bunch of work into its local queue.
   845  		//
   846  		// This is OK because the ragged barrier in gcMarkDone will set
   847  		// the bit on each P if there's local work we missed. This race
   848  		// should generally be rare, since the window between noticing
   849  		// an empty local queue and this bit being set is quite small.
   850  		work.spanqMask.clear(int32(enum.position()))
   851  	}
   852  	return 0
   853  }
   854  
   855  // objptr consists of a span base and the index of the object in the span.
   856  type objptr uintptr
   857  
   858  // makeObjPtr creates an objptr from a span base address and an object index.
   859  func makeObjPtr(spanBase uintptr, objIndex uint16) objptr {
   860  	if doubleCheckGreenTea && spanBase&((1<<gc.PageShift)-1) != 0 {
   861  		throw("created objptr with address that is incorrectly aligned")
   862  	}
   863  	return objptr(spanBase | uintptr(objIndex))
   864  }
   865  
   866  func (p objptr) spanBase() uintptr {
   867  	return uintptr(p) &^ ((1 << gc.PageShift) - 1)
   868  }
   869  
   870  func (p objptr) objIndex() uint16 {
   871  	return uint16(p) & ((1 << gc.PageShift) - 1)
   872  }
   873  
    874  // scanSpan greys the objects indicated by marks&^scans and then scans those objects,
   875  // queuing the resulting pointers into gcw.
   876  func scanSpan(p objptr, gcw *gcWork) {
   877  	spanBase := p.spanBase()
   878  	imb := spanInlineMarkBitsFromBase(spanBase)
   879  	spanclass := imb.class
   880  	if spanclass.noscan() {
   881  		throw("noscan object in scanSpan")
   882  	}
   883  	elemsize := uintptr(gc.SizeClassToSize[spanclass.sizeclass()])
   884  
   885  	// Release span.
   886  	if imb.release() == spanScanOneMark {
   887  		// Nobody else set any mark bits on this span while it was acquired.
   888  		// That means p is the sole object we need to handle. Fast-track it.
   889  		objIndex := p.objIndex()
   890  		bytep := &imb.scans[objIndex/8]
   891  		mask := uint8(1) << (objIndex % 8)
   892  		if atomic.Load8(bytep)&mask != 0 {
   893  			return
   894  		}
   895  		atomic.Or8(bytep, mask)
   896  		gcw.bytesMarked += uint64(elemsize)
   897  		if debug.gctrace > 1 {
   898  			gcw.stats[spanclass.sizeclass()].sparseObjsScanned++
   899  		}
   900  		b := spanBase + uintptr(objIndex)*elemsize
   901  		scanObjectSmall(spanBase, b, elemsize, gcw)
   902  		return
   903  	}
   904  
   905  	// Compute nelems.
   906  	divMagic := uint64(gc.SizeClassToDivMagic[spanclass.sizeclass()])
   907  	usableSpanSize := uint64(gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{}))
   908  	if !spanclass.noscan() {
   909  		usableSpanSize -= gc.PageSize / goarch.PtrSize / 8
   910  	}
   911  	nelems := uint16((usableSpanSize * divMagic) >> 32)
   912  
   913  	// Grey objects and return if there's nothing else to do.
   914  	var toScan gc.ObjMask
   915  	objsMarked := spanSetScans(spanBase, nelems, imb, &toScan)
   916  	if objsMarked == 0 {
   917  		return
   918  	}
   919  	gcw.bytesMarked += uint64(objsMarked) * uint64(elemsize)
   920  
   921  	// Check if we have enough density to make a dartboard scan
   922  	// worthwhile. If not, just do what scanobject does, but
   923  	// localized to the span, using the dartboard.
   924  	if !scan.HasFastScanSpanPacked() || objsMarked < int(nelems/8) {
   925  		if debug.gctrace > 1 {
   926  			gcw.stats[spanclass.sizeclass()].spansSparseScanned++
   927  			gcw.stats[spanclass.sizeclass()].spanObjsSparseScanned += uint64(objsMarked)
   928  		}
   929  		scanObjectsSmall(spanBase, elemsize, nelems, gcw, &toScan)
   930  		return
   931  	}
   932  
   933  	// Scan the span.
   934  	//
    935  	// N.B. Use gcw.ptrBuf as the output buffer. This is a bit different
    936  	// from scanObjectsSmall, which fills gcw.ptrBuf with addresses to dereference.
    937  	// ScanSpanPacked, on the other hand, fills gcw.ptrBuf with already dereferenced pointers.
   938  	nptrs := scan.ScanSpanPacked(
   939  		unsafe.Pointer(spanBase),
   940  		&gcw.ptrBuf[0],
   941  		&toScan,
   942  		uintptr(spanclass.sizeclass()),
   943  		spanPtrMaskUnsafe(spanBase),
   944  	)
   945  	gcw.heapScanWork += int64(objsMarked) * int64(elemsize)
   946  
   947  	if debug.gctrace > 1 {
   948  		// Write down some statistics.
   949  		gcw.stats[spanclass.sizeclass()].spansDenseScanned++
   950  		gcw.stats[spanclass.sizeclass()].spanObjsDenseScanned += uint64(objsMarked)
   951  	}
   952  
   953  	// Process all the pointers we just got.
   954  	for _, p := range gcw.ptrBuf[:nptrs] {
   955  		if !tryDeferToSpanScan(p, gcw) {
   956  			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
   957  				greyobject(obj, 0, 0, span, gcw, objIndex)
   958  			}
   959  		}
   960  	}
   961  }
   962  
    963  // spanSetScans sets any unset scan bits whose corresponding mark bits are set in the inline mark bits.
    964  //
    965  // toScan is populated with bits indicating which objects are marked but not yet scanned.
    966  //
    967  // Returns the number of objects newly marked for scanning, which could be zero.
   968  func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toScan *gc.ObjMask) int {
   969  	arena, pageIdx, pageMask := pageIndexOf(spanBase)
   970  	if arena.pageMarks[pageIdx]&pageMask == 0 {
   971  		atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
   972  	}
   973  
   974  	bytes := divRoundUp(uintptr(nelems), 8)
   975  	objsMarked := 0
   976  
   977  	// Careful: these two structures alias since ObjMask is much bigger
   978  	// than marks or scans. We do these unsafe shenanigans so that we can
   979  	// access the marks and scans by uintptrs rather than by byte.
   980  	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
   981  	imbScans := (*gc.ObjMask)(unsafe.Pointer(&imb.scans))
   982  
    983  	// Iterate over one uintptr-sized chunk at a time, computing both
    984  	// the union and difference of marks and scans. Store the union
    985  	// into scans, and the difference into toScan.
   986  	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
   987  		scans := atomic.Loaduintptr(&imbScans[i/goarch.PtrSize])
   988  		marks := imbMarks[i/goarch.PtrSize]
   989  		scans = bswapIfBigEndian(scans)
   990  		marks = bswapIfBigEndian(marks)
   991  		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
   992  			scans &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out owned
   993  			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
   994  		}
   995  		toGrey := marks &^ scans
   996  		toScan[i/goarch.PtrSize] = toGrey
   997  
   998  		// If there's anything left to grey, do it.
   999  		if toGrey != 0 {
  1000  			toGrey = bswapIfBigEndian(toGrey)
  1001  			if goarch.PtrSize == 4 {
  1002  				atomic.Or32((*uint32)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint32(toGrey))
  1003  			} else {
  1004  				atomic.Or64((*uint64)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint64(toGrey))
  1005  			}
  1006  		}
  1007  		objsMarked += sys.OnesCount64(uint64(toGrey))
  1008  	}
  1009  	return objsMarked
  1010  }
  1011  
  1012  func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) {
  1013  	ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize)
  1014  	gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
  1015  	nptrs := 0
  1016  	n := sys.OnesCount64(uint64(ptrBits))
  1017  	for range n {
  1018  		k := sys.TrailingZeros64(uint64(ptrBits))
  1019  		ptrBits &^= 1 << k
  1020  		addr := b + uintptr(k)*goarch.PtrSize
  1021  
  1022  		// Prefetch addr since we're about to use it. This point for prefetching
  1023  		// was chosen empirically.
  1024  		sys.Prefetch(addr)
  1025  
  1026  		// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
  1027  		gcw.ptrBuf[nptrs] = addr
  1028  		nptrs++
  1029  	}
  1030  
  1031  	// Process all the pointers we just got.
  1032  	for _, p := range gcw.ptrBuf[:nptrs] {
  1033  		p = *(*uintptr)(unsafe.Pointer(p))
  1034  		if p == 0 {
  1035  			continue
  1036  		}
  1037  		if !tryDeferToSpanScan(p, gcw) {
  1038  			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
  1039  				greyobject(obj, 0, 0, span, gcw, objIndex)
  1040  			}
  1041  		}
  1042  	}
  1043  }
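
// Illustrative only (a hypothetical helper, not used by the runtime): the bit
// iteration pattern used in the scan loops above. Repeatedly taking the index
// of the lowest set bit and clearing it visits every set bit exactly once, so
// the count below always matches sys.OnesCount64 of the original value.
func exampleVisitSetBits(bits uint64) int {
	visited := 0
	for bits != 0 {
		k := sys.TrailingZeros64(bits)
		bits &^= 1 << k
		visited++
	}
	return visited
}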
  1044  
  1045  func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *gc.ObjMask) {
  1046  	nptrs := 0
  1047  	for i, bits := range scans {
  1048  		if i*(goarch.PtrSize*8) > int(elems) {
  1049  			break
  1050  		}
  1051  		n := sys.OnesCount64(uint64(bits))
  1052  		for range n {
  1053  			j := sys.TrailingZeros64(uint64(bits))
  1054  			bits &^= 1 << j
  1055  
  1056  			b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize
  1057  			ptrBits := heapBitsSmallForAddrInline(base, b, objSize)
  1058  			gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
  1059  
  1060  			n := sys.OnesCount64(uint64(ptrBits))
  1061  			for range n {
  1062  				k := sys.TrailingZeros64(uint64(ptrBits))
  1063  				ptrBits &^= 1 << k
  1064  				addr := b + uintptr(k)*goarch.PtrSize
  1065  
  1066  				// Prefetch addr since we're about to use it. This point for prefetching
  1067  				// was chosen empirically.
  1068  				sys.Prefetch(addr)
  1069  
  1070  				// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
  1071  				gcw.ptrBuf[nptrs] = addr
  1072  				nptrs++
  1073  			}
  1074  		}
  1075  	}
  1076  
  1077  	// Process all the pointers we just got.
  1078  	for _, p := range gcw.ptrBuf[:nptrs] {
  1079  		p = *(*uintptr)(unsafe.Pointer(p))
  1080  		if p == 0 {
  1081  			continue
  1082  		}
  1083  		if !tryDeferToSpanScan(p, gcw) {
  1084  			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
  1085  				greyobject(obj, 0, 0, span, gcw, objIndex)
  1086  			}
  1087  		}
  1088  	}
  1089  }
  1090  
  1091  func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr {
  1092  	hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize)
  1093  	hbits := (*byte)(unsafe.Pointer(hbitsBase))
  1094  
  1095  	// These objects are always small enough that their bitmaps
  1096  	// fit in a single word, so just load the word or two we need.
  1097  	//
  1098  	// Mirrors mspan.writeHeapBitsSmall.
  1099  	//
  1100  	// We should be using heapBits(), but unfortunately it introduces
   1101  	// both bounds-check panics and throws, which cause us to exceed
  1102  	// the nosplit limit in quite a few cases.
  1103  	i := (addr - spanBase) / goarch.PtrSize / ptrBits
  1104  	j := (addr - spanBase) / goarch.PtrSize % ptrBits
  1105  	bits := elemsize / goarch.PtrSize
  1106  	word0 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+0))))
  1107  	word1 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+1))))
  1108  
  1109  	var read uintptr
  1110  	if j+bits > ptrBits {
  1111  		// Two reads.
  1112  		bits0 := ptrBits - j
  1113  		bits1 := bits - bits0
  1114  		read = *word0 >> j
  1115  		read |= (*word1 & ((1 << bits1) - 1)) << bits0
  1116  	} else {
  1117  		// One read.
  1118  		read = (*word0 >> j) & ((1 << bits) - 1)
  1119  	}
  1120  	return read
  1121  }
  1122  
  1123  // spanPtrMaskUnsafe returns the pointer mask for a span with inline mark bits.
  1124  //
  1125  // The caller must ensure spanBase is the base of a span that:
   1126  // - Is 1 page in size,
  1127  // - Uses inline mark bits,
  1128  // - Contains pointers.
  1129  func spanPtrMaskUnsafe(spanBase uintptr) *gc.PtrMask {
  1130  	base := spanBase + gc.PageSize - unsafe.Sizeof(gc.PtrMask{}) - unsafe.Sizeof(spanInlineMarkBits{})
  1131  	return (*gc.PtrMask)(unsafe.Pointer(base))
  1132  }
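
// Illustrative only (a hypothetical helper, not part of the runtime): the
// layout assumed above places the pointer mask immediately before the inline
// mark bits at the very end of a one-page span, so together the two metadata
// blocks occupy the final stretch of the page computed below.
func exampleSpanMetadataBytes() uintptr {
	return unsafe.Sizeof(gc.PtrMask{}) + unsafe.Sizeof(spanInlineMarkBits{})
}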
  1133  
  1134  type sizeClassScanStats struct {
  1135  	spansDenseScanned     uint64 // Spans scanned with ScanSpanPacked.
  1136  	spanObjsDenseScanned  uint64 // Objects scanned with ScanSpanPacked.
  1137  	spansSparseScanned    uint64 // Spans scanned with scanObjectsSmall.
  1138  	spanObjsSparseScanned uint64 // Objects scanned with scanObjectsSmall.
  1139  	sparseObjsScanned     uint64 // Objects scanned with scanobject or scanObjectSmall.
  1140  	// Note: sparseObjsScanned is sufficient for both cases because
  1141  	// a particular size class either uses scanobject or scanObjectSmall,
  1142  	// not both. In the latter case, we also know that there was one
  1143  	// object scanned per span, so no need for a span counter.
  1144  }
  1145  
  1146  func dumpScanStats() {
  1147  	var (
  1148  		spansDenseScanned     uint64
  1149  		spanObjsDenseScanned  uint64
  1150  		spansSparseScanned    uint64
  1151  		spanObjsSparseScanned uint64
  1152  		sparseObjsScanned     uint64
  1153  	)
  1154  	for _, stats := range memstats.lastScanStats {
  1155  		spansDenseScanned += stats.spansDenseScanned
  1156  		spanObjsDenseScanned += stats.spanObjsDenseScanned
  1157  		spansSparseScanned += stats.spansSparseScanned
  1158  		spanObjsSparseScanned += stats.spanObjsSparseScanned
  1159  		sparseObjsScanned += stats.sparseObjsScanned
  1160  	}
  1161  	totalObjs := sparseObjsScanned + spanObjsSparseScanned + spanObjsDenseScanned
  1162  	totalSpans := spansSparseScanned + spansDenseScanned
  1163  	print("scan: total ", sparseObjsScanned, "+", spanObjsSparseScanned, "+", spanObjsDenseScanned, "=", totalObjs, " objs")
  1164  	print(", ", spansSparseScanned, "+", spansDenseScanned, "=", totalSpans, " spans\n")
  1165  	for i, stats := range memstats.lastScanStats {
  1166  		if stats == (sizeClassScanStats{}) {
  1167  			continue
  1168  		}
  1169  		totalObjs := stats.sparseObjsScanned + stats.spanObjsSparseScanned + stats.spanObjsDenseScanned
  1170  		totalSpans := stats.spansSparseScanned + stats.spansDenseScanned
  1171  		if i == 0 {
  1172  			print("scan: class L ")
  1173  		} else {
  1174  			print("scan: class ", gc.SizeClassToSize[i], "B ")
  1175  		}
  1176  		print(stats.sparseObjsScanned, "+", stats.spanObjsSparseScanned, "+", stats.spanObjsDenseScanned, "=", totalObjs, " objs")
  1177  		print(", ", stats.spansSparseScanned, "+", stats.spansDenseScanned, "=", totalSpans, " spans\n")
  1178  	}
  1179  }
  1180  
  1181  func (w *gcWork) flushScanStats(dst *[gc.NumSizeClasses]sizeClassScanStats) {
  1182  	for i := range w.stats {
  1183  		dst[i].spansDenseScanned += w.stats[i].spansDenseScanned
  1184  		dst[i].spanObjsDenseScanned += w.stats[i].spanObjsDenseScanned
  1185  		dst[i].spansSparseScanned += w.stats[i].spansSparseScanned
  1186  		dst[i].spanObjsSparseScanned += w.stats[i].spanObjsSparseScanned
  1187  		dst[i].sparseObjsScanned += w.stats[i].sparseObjsScanned
  1188  	}
  1189  	clear(w.stats[:])
  1190  }
  1191  
  1192  // gcMarkWorkAvailable reports whether there's any non-local work available to do.
  1193  //
  1194  // This is a heavyweight check and must only be used for correctness, not
  1195  // as a hint.
  1196  func gcMarkWorkAvailable() bool {
  1197  	if !work.full.empty() {
  1198  		return true // global work available
  1199  	}
  1200  	if work.markrootNext.Load() < work.markrootJobs.Load() {
  1201  		return true // root scan work available
  1202  	}
  1203  	if work.spanqMask.any() {
  1204  		return true // stealable local work available
  1205  	}
  1206  	return false
  1207  }
  1208  
  1209  // scanObject scans the object starting at b, adding pointers to gcw.
  1210  // b must point to the beginning of a heap object or an oblet.
  1211  // scanObject consults the GC bitmap for the pointer mask and the
  1212  // spans for the size of the object.
  1213  //
  1214  // Used only for !gcUsesSpanInlineMarkBits spans, but supports all
  1215  // object sizes and is safe to be called on all heap objects.
  1216  //
  1217  //go:nowritebarrier
  1218  func scanObject(b uintptr, gcw *gcWork) {
  1219  	// Prefetch object before we scan it.
  1220  	//
  1221  	// This will overlap fetching the beginning of the object with initial
  1222  	// setup before we start scanning the object.
  1223  	sys.Prefetch(b)
  1224  
  1225  	// Find the bits for b and the size of the object at b.
  1226  	//
  1227  	// b is either the beginning of an object, in which case this
  1228  	// is the size of the object to scan, or it points to an
  1229  	// oblet, in which case we compute the size to scan below.
  1230  	s := spanOfUnchecked(b)
  1231  	n := s.elemsize
  1232  	if n == 0 {
  1233  		throw("scanObject n == 0")
  1234  	}
  1235  	if s.spanclass.noscan() {
  1236  		// Correctness-wise this is ok, but it's inefficient
  1237  		// if noscan objects reach here.
  1238  		throw("scanObject of a noscan object")
  1239  	}
  1240  
  1241  	var tp typePointers
  1242  	if n > maxObletBytes {
  1243  		// Large object. Break into oblets for better
  1244  		// parallelism and lower latency.
  1245  		if b == s.base() {
  1246  			// Enqueue the other oblets to scan later.
  1247  			// Some oblets may be in b's scalar tail, but
  1248  			// these will be marked as "no more pointers",
  1249  			// so we'll drop out immediately when we go to
  1250  			// scan those.
  1251  			for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
  1252  				if !gcw.putObjFast(oblet) {
  1253  					gcw.putObj(oblet)
  1254  				}
  1255  			}
  1256  		}
  1257  
  1258  		// Compute the size of the oblet. Since this object
  1259  		// must be a large object, s.base() is the beginning
  1260  		// of the object.
  1261  		n = s.base() + s.elemsize - b
  1262  		n = min(n, maxObletBytes)
  1263  		tp = s.typePointersOfUnchecked(s.base())
  1264  		tp = tp.fastForward(b-tp.addr, b+n)
  1265  	} else {
  1266  		tp = s.typePointersOfUnchecked(b)
  1267  	}
  1268  
  1269  	var scanSize uintptr
  1270  	for {
  1271  		var addr uintptr
  1272  		if tp, addr = tp.nextFast(); addr == 0 {
  1273  			if tp, addr = tp.next(b + n); addr == 0 {
  1274  				break
  1275  			}
  1276  		}
  1277  
  1278  		// Keep track of farthest pointer we found, so we can
  1279  		// update heapScanWork. TODO: is there a better metric,
  1280  		// now that we can skip scalar portions pretty efficiently?
  1281  		scanSize = addr - b + goarch.PtrSize
  1282  
  1283  		// Work here is duplicated in scanblock and above.
  1284  		// If you make changes here, make changes there too.
  1285  		obj := *(*uintptr)(unsafe.Pointer(addr))
  1286  
  1287  		// At this point we have extracted the next potential pointer.
  1288  		// Quickly filter out nil and pointers back to the current object.
  1289  		if obj != 0 && obj-b >= n {
  1290  			// Test if obj points into the Go heap and, if so,
  1291  			// mark the object.
  1292  			//
  1293  			// Note that it's possible for findObject to
  1294  			// fail if obj points to a just-allocated heap
  1295  			// object because of a race with growing the
  1296  			// heap. In this case, we know the object was
  1297  			// just allocated and hence will be marked by
  1298  			// allocation itself.
  1299  			if !tryDeferToSpanScan(obj, gcw) {
  1300  				if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
  1301  					greyobject(obj, b, addr-b, span, gcw, objIndex)
  1302  				}
  1303  			}
  1304  		}
  1305  	}
  1306  	gcw.bytesMarked += uint64(n)
  1307  	gcw.heapScanWork += int64(scanSize)
  1308  	if debug.gctrace > 1 {
  1309  		gcw.stats[s.spanclass.sizeclass()].sparseObjsScanned++
  1310  	}
  1311  }
  1312  
