Source file src/runtime/mgcmark_greenteagc.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Green Tea mark algorithm
     6  //
     7  // The core idea behind Green Tea is simple: achieve better locality during
     8  // mark/scan by delaying scanning so that we can accumulate objects to scan
     9  // within the same span, then scan the objects that have accumulated on the
    10  // span all together.
    11  //
    12  // By batching objects this way, we increase the chance that adjacent objects
    13  // will be accessed, amortize the cost of accessing object metadata, and create
    14  // better opportunities for prefetching. We can take this even further and
    15  // optimize the scan loop by size class (not yet completed) all the way to the
    16  // point of applying SIMD techniques to really tear through the heap.
    17  //
    18  // Naturally, this depends on being able to create opportunities to batch objects
    19  // together. The basic idea here is to have two sets of mark bits. One set is the
    20  // regular set of mark bits ("marks"), while the other essentially says that the
    21  // objects have been scanned already ("scans"). When we see a pointer for the first
    22  // time we set its mark and enqueue its span. We track these spans in work queues
    23  // with a FIFO policy, unlike workbufs which have a LIFO policy. Empirically, a
    24  // FIFO policy appears to work best for accumulating objects to scan on a span.
    25  // Later, when we dequeue the span, we compute both the union and the difference of the
    26  // mark and scan bitsets. The union is written back into the scan bits, while the
    27  // difference (marks &^ scans) tells us which objects still need scanning, so the GC
    28  // remains precise.
    29  //
    30  // Below is the bulk of the implementation, focusing on the worst case
    31  // for locality, small objects. Specifically, those that are smaller than
    32  // a few cache lines in size and whose metadata is stored the same way (at the
    33  // end of the span).
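// For example, suppose a span's bits for eight objects currently read as
// follows (the bit values are arbitrary and purely illustrative):
//
//	marks: 1 1 0 1 0 0 1 0   (objects discovered so far)
//	scans: 1 0 0 1 0 0 0 0   (objects already scanned)
//
// When the span is dequeued, only marks&^scans = 0 1 0 0 0 0 1 0 (two
// objects) is scanned, and the union marks|scans is stored back into scans.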
    34  
    35  //go:build goexperiment.greenteagc
    36  
    37  package runtime
    38  
    39  import (
    40  	"internal/goarch"
    41  	"internal/runtime/atomic"
    42  	"internal/runtime/gc"
    43  	"internal/runtime/gc/scan"
    44  	"internal/runtime/sys"
    45  	"unsafe"
    46  )
    47  
    48  const doubleCheckGreenTea = false
    49  
    50  // spanInlineMarkBits are mark bits that are inlined into the span
    51  // itself. gcUsesSpanInlineMarkBits may be used to check if objects
    52  // of a particular size use inline mark bits.
    53  //
    54  // Inline mark bits are a little bit more than just mark bits. They
    55  // consist of two parts: scans and marks. Marks are like pre-mark
    56  // bits. They're set once a pointer to an object is discovered for
    57  // the first time. The marks allow us to scan many objects in bulk
    58  // if we queue the whole span for scanning. Before we scan such objects
    59  // in bulk, we copy the marks to the scans, computing a diff along the
    60  // way. The resulting bitmap tells us which objects we should scan.
    61  //
    62  // The inlineMarkBits also hold state sufficient for scanning any
    63  // object in the span, as well as state for acquiring ownership of
    64  // the span for queuing. This avoids the need to look at the mspan when
    65  // scanning.
    66  type spanInlineMarkBits struct {
    67  	scans [63]uint8         // scanned bits.
    68  	owned spanScanOwnership // see the comment on spanScanOwnership.
    69  	marks [63]uint8         // mark bits.
    70  	class spanClass
    71  }
    72  
    73  // spanScanOwnership indicates whether some thread has acquired
    74  // the span for scanning, and whether there have been one or more
    75  // attempts to acquire the span. The latter information helps to
    76  // fast-track span scans that only apply to a single mark, skipping
    77  // the relatively costly merge-and-diff process for scans and marks
    78  // by allowing one to just set the scan bit directly.
    79  type spanScanOwnership uint8
    80  
    81  const (
    82  	spanScanUnowned  spanScanOwnership = 0         // Indicates the span is not acquired for scanning.
    83  	spanScanOneMark                    = 1 << iota // Indicates that only one mark bit is set relative to the scan bits.
    84  	spanScanManyMark                               // Indicates one or more mark bits may be set relative to the scan bits.
    85  	// "ManyMark" need not be exactly the value it has. In practice we just
    86  	// want to distinguish "none" from "one" from "many," so a comparison is
    87  	// sufficient (as opposed to a bit test) to check between these cases.
    88  )
    89  
    90  // load atomically loads from a pointer to a spanScanOwnership.
    91  func (o *spanScanOwnership) load() spanScanOwnership {
    92  	return spanScanOwnership(atomic.Load8((*uint8)(unsafe.Pointer(o))))
    93  }
    94  
    95  func (o *spanScanOwnership) or(v spanScanOwnership) spanScanOwnership {
    96  	// N.B. We round down the address and use Or32 because Or8 doesn't
    97  	// return a result, and it's strictly necessary for this protocol.
    98  	//
    99  	// Making Or8 return a result, while making the code look nicer, would
   100  	// not be strictly better on any supported platform, as an Or8 that
   101  	// returns a result is not a common instruction. On many platforms it
   102  	// would be implemented exactly as it is here, and since Or8 is
   103  	// exclusively used in the runtime and a hot function, we want to keep
   104  	// using its no-result version elsewhere for performance.
   105  	o32 := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(o)) &^ 0b11))
   106  	off := (uintptr(unsafe.Pointer(o)) & 0b11) * 8
   107  	if goarch.BigEndian {
   108  		off = 32 - off - 8
   109  	}
   110  	return spanScanOwnership(atomic.Or32(o32, uint32(v)<<off) >> off)
   111  }
   112  
   113  func (imb *spanInlineMarkBits) init(class spanClass, needzero bool) {
   114  	if imb == nil {
   115  		// This nil check and throw is almost pointless. Normally we would
   116  		// expect imb to never be nil. However, this is called on potentially
   117  		// freshly-allocated virtual memory. As of 2025, the compiler-inserted
   118  		// nil check is not a branch but a memory read that we expect to fault
   119  		// if the pointer really is nil.
   120  		//
   121  		// However, this causes a read of the page, and operating systems may
   122  		// take it as a hint to back the accessed memory with a read-only zero
   123  		// page. But we immediately write to this memory, which then forces the
   124  		// operating system to update the page table and flush
   125  		// the TLB, causing a lot of churn for programs that are short-lived
   126  		// and monotonically grow in size.
   127  		//
   128  		// This nil check is thus an explicit branch instead of what the compiler
   129  		// would insert circa 2025, which is a memory read instruction.
   130  		//
   131  		// See go.dev/issue/74375 for details.
   132  		throw("runtime: span inline mark bits nil?")
   133  	}
   134  	if needzero {
   135  		// Use memclrNoHeapPointers to avoid having the compiler make a worse
   136  		// decision. We know that imb is both aligned and a nice power-of-two
   137  		// size that works well for wider SIMD instructions. The compiler likely
   138  		// has no idea that imb is aligned to 128 bytes.
   139  		memclrNoHeapPointers(unsafe.Pointer(imb), unsafe.Sizeof(spanInlineMarkBits{}))
   140  	}
   141  	imb.class = class
   142  }
   143  
   144  // tryAcquire attempts to acquire the span for scanning. On success, the caller
   145  // must queue the span for scanning or scan the span immediately.
   146  func (imb *spanInlineMarkBits) tryAcquire() bool {
   147  	switch imb.owned.load() {
   148  	case spanScanUnowned:
   149  		// Try to mark the span as having only one object marked.
   150  		if imb.owned.or(spanScanOneMark) == spanScanUnowned {
   151  			return true
   152  		}
   153  		// If we didn't see an old value of spanScanUnowned, then we must
   154  		// have raced with someone else and seen spanScanOneMark or greater.
   155  		// Fall through and try to set spanScanManyMark.
   156  		fallthrough
   157  	case spanScanOneMark:
   158  		// We may be the first to set *any* bit on owned. In such a case,
   159  		// we still need to make sure the span is queued.
   160  		return imb.owned.or(spanScanManyMark) == spanScanUnowned
   161  	}
   162  	return false
   163  }
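// A worked example of the acquire/release protocol (illustrative only):
//
//	P1 marks object A: or(spanScanOneMark) returns spanScanUnowned,
//	   so P1 queues the span.
//	P2 marks object B: load sees spanScanOneMark; or(spanScanManyMark)
//	   does not return spanScanUnowned, so P2 does not queue it again.
//	A scanner dequeues the span: release() reports "many", so it takes the
//	   general merge-and-diff path; had it reported exactly spanScanOneMark,
//	   only the single object that queued the span would need scanning.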
   164  
   165  // release releases the span for scanning, allowing another thread to queue the span.
   166  //
   167  // Returns an upper bound on the number of mark bits set since the span was queued. The
   168  // upper bound is described as "one" (spanScanOneMark) or "many" (spanScanManyMark, with or
   169  // without spanScanOneMark). If the return value indicates only one mark bit was set, the
   170  // caller can be certain that it was the same mark bit that caused the span to get queued.
   171  // Take note of the fact that this is *only* an upper-bound. In particular, it may still
   172  // turn out that only one mark bit was set, even if the return value indicates "many".
   173  func (imb *spanInlineMarkBits) release() spanScanOwnership {
   174  	return spanScanOwnership(atomic.Xchg8((*uint8)(unsafe.Pointer(&imb.owned)), uint8(spanScanUnowned)))
   175  }
   176  
   177  // spanInlineMarkBitsFromBase returns the spanInlineMarkBits for a span whose start address is base.
   178  //
   179  // The span must be gcUsesSpanInlineMarkBits(span.elemsize).
   180  func spanInlineMarkBitsFromBase(base uintptr) *spanInlineMarkBits {
   181  	return (*spanInlineMarkBits)(unsafe.Pointer(base + gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{})))
   182  }
   183  
   184  // initInlineMarkBits initializes the inlineMarkBits stored at the end of the span.
   185  func (s *mspan) initInlineMarkBits() {
   186  	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
   187  		throw("expected span with inline mark bits")
   188  	}
   189  	// Zeroing is only necessary if this span wasn't just freshly allocated from the OS.
   190  	s.inlineMarkBits().init(s.spanclass, s.needzero != 0)
   191  }
   192  
   193  // moveInlineMarks merges the span's inline mark bits into dst and clears them.
   194  //
   195  // gcUsesSpanInlineMarkBits(s.elemsize) must be true.
   196  func (s *mspan) moveInlineMarks(dst *gcBits) {
   197  	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
   198  		throw("expected span with inline mark bits")
   199  	}
   200  	bytes := divRoundUp(uintptr(s.nelems), 8)
   201  	imb := s.inlineMarkBits()
   202  	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
   203  	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
   204  		marks := bswapIfBigEndian(imbMarks[i/goarch.PtrSize])
   205  		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
   206  			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
   207  		}
   208  		*(*uintptr)(unsafe.Pointer(dst.bytep(i))) |= bswapIfBigEndian(marks)
   209  	}
   210  	if doubleCheckGreenTea && !s.spanclass.noscan() && imb.marks != imb.scans {
   211  		throw("marks don't match scans for span with pointer")
   212  	}
   213  
   214  	// Reset the inline mark bits.
   215  	imb.init(s.spanclass, true /* We know these bits are always dirty now. */)
   216  }
   217  
   218  // inlineMarkBits returns the inline mark bits for the span.
   219  //
   220  // gcUsesSpanInlineMarkBits(s.elemsize) must be true.
   221  func (s *mspan) inlineMarkBits() *spanInlineMarkBits {
   222  	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
   223  		throw("expected span with inline mark bits")
   224  	}
   225  	return spanInlineMarkBitsFromBase(s.base())
   226  }
   227  
   228  func (s *mspan) markBitsForIndex(objIndex uintptr) (bits markBits) {
   229  	if gcUsesSpanInlineMarkBits(s.elemsize) {
   230  		bits.bytep = &s.inlineMarkBits().marks[objIndex/8]
   231  	} else {
   232  		bits.bytep = s.gcmarkBits.bytep(objIndex / 8)
   233  	}
   234  	bits.mask = uint8(1) << (objIndex % 8)
   235  	bits.index = objIndex
   236  	return
   237  }
   238  
   239  func (s *mspan) markBitsForBase() markBits {
   240  	if gcUsesSpanInlineMarkBits(s.elemsize) {
   241  		return markBits{&s.inlineMarkBits().marks[0], uint8(1), 0}
   242  	}
   243  	return markBits{&s.gcmarkBits.x, uint8(1), 0}
   244  }
   245  
   246  // scannedBitsForIndex returns a markBits representing the scanned bit
   247  // for objIndex in the inline mark bits.
   248  func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits {
   249  	return markBits{&s.inlineMarkBits().scans[objIndex/8], uint8(1) << (objIndex % 8), objIndex}
   250  }
   251  
   252  // gcUsesSpanInlineMarkBits returns true if a span holding objects of a certain size
   253  // has inline mark bits. size must be the span's elemsize.
   254  //
   255  // nosplit because this is called from gcmarknewobject, which is nosplit.
   256  //
   257  //go:nosplit
   258  func gcUsesSpanInlineMarkBits(size uintptr) bool {
   259  	return heapBitsInSpan(size) && size >= 16
   260  }
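// On 64-bit platforms this currently corresponds to objects from 16 bytes up
// to the heapBitsInSpan threshold (512 bytes as of this writing); treat the
// exact upper bound as illustrative, since it is whatever heapBitsInSpan
// permits.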
   261  
   262  // tryDeferToSpanScan tries to queue p on the span it points to, if it
   263  // points to a small-object span (one for which gcUsesSpanInlineMarkBits is true).
   264  func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool {
   265  	if useCheckmark {
   266  		return false
   267  	}
   268  
   269  	// Quickly check to see if this is a span that has inline mark bits.
   270  	ha := heapArenaOf(p)
   271  	if ha == nil {
   272  		return false
   273  	}
   274  	pageIdx := ((p / pageSize) / 8) % uintptr(len(ha.pageInUse))
   275  	pageMask := byte(1 << ((p / pageSize) % 8))
   276  	if ha.pageUseSpanInlineMarkBits[pageIdx]&pageMask == 0 {
   277  		return false
   278  	}
   279  
   280  	// Find the object's index from the span class info stored in the inline mark bits.
   281  	base := alignDown(p, gc.PageSize)
   282  	q := spanInlineMarkBitsFromBase(base)
   283  	objIndex := uint16((uint64(p-base) * uint64(gc.SizeClassToDivMagic[q.class.sizeclass()])) >> 32)
   284  
   285  	// Set mark bit.
   286  	idx, mask := objIndex/8, uint8(1)<<(objIndex%8)
   287  	if atomic.Load8(&q.marks[idx])&mask != 0 {
   288  		return true
   289  	}
   290  	atomic.Or8(&q.marks[idx], mask)
   291  
   292  	// Fast-track noscan objects.
   293  	if q.class.noscan() {
   294  		gcw.bytesMarked += uint64(gc.SizeClassToSize[q.class.sizeclass()])
   295  		return true
   296  	}
   297  
   298  	// Queue up the pointer (as a representative for its span).
   299  	if q.tryAcquire() {
   300  		if gcw.spanq.put(makeObjPtr(base, objIndex)) {
   301  			if gcphase == _GCmark {
   302  				// This is intentionally racy; the bit set here might get
   303  				// stomped on by a stealing P. See the comment in tryStealSpan
   304  				// for an explanation as to why this is OK.
   305  				if !work.spanqMask.read(uint32(gcw.id)) {
   306  					work.spanqMask.set(gcw.id)
   307  				}
   308  				gcw.mayNeedWorker = true
   309  			}
   310  			gcw.flushedWork = true
   311  		}
   312  	}
   313  	return true
   314  }
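// A minimal sketch (illustrative, not part of the runtime) of the
// strength-reduced division used above: the object index is the offset within
// the span divided by the element size, computed with a multiply and a shift
// instead of a hardware divide. The magic constant below is one standard
// choice (ceil(2^32/elemSize)); the runtime uses the precomputed, verified
// table gc.SizeClassToDivMagic instead.
func exampleObjIndex(offsetInSpan, elemSize uint64) uint64 {
	magic := (uint64(1)<<32 + elemSize - 1) / elemSize
	// Equal to offsetInSpan / elemSize for all valid in-span offsets.
	return (offsetInSpan * magic) >> 32
}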
   315  
   316  // tryGetSpanFast attempts to get an entire span to scan from the local ring only.
   317  func (w *gcWork) tryGetSpanFast() objptr {
   318  	return w.spanq.tryGetFast()
   319  }
   320  
   321  // tryGetSpan attempts to get an entire span to scan.
   322  func (w *gcWork) tryGetSpan() objptr {
   323  	if s := w.spanq.tryGetFast(); s != 0 {
   324  		return s
   325  	}
   326  	// "Steal" from ourselves.
   327  	if s := w.spanq.steal(&w.spanq); s != 0 {
   328  		return s
   329  	}
   330  	// We failed to get any local work, so we're fresh out.
   331  	// Nobody else is going to add work for us. Clear our bit.
   332  	if work.spanqMask.read(uint32(w.id)) {
   333  		work.spanqMask.clear(w.id)
   334  	}
   335  	return 0
   336  }
   337  
   338  // spanQueue is a P-local stealable span queue.
   339  type spanQueue struct {
   340  	// head, tail, and ring represent a local non-thread-safe ring buffer.
   341  	head, tail uint32
   342  	ring       [256]objptr
   343  
   344  	// putsSinceDrain counts the number of put calls since the last drain.
   345  	putsSinceDrain int
   346  
   347  	// chain contains state visible to other Ps.
   348  	//
   349  	// In particular, that means a linked chain of single-producer multi-consumer
   350  	// ring buffers where the single producer is this P only.
   351  	//
   352  	// This linked chain structure is based off the sync.Pool dequeue.
   353  	chain struct {
   354  		// head is the spanSPMC to put to. This is only accessed
   355  		// by the producer, so doesn't need to be synchronized.
   356  		head *spanSPMC
   357  
   358  		// tail is the spanSPMC to steal from. This is accessed
   359  		// by consumers, so reads and writes must be atomic.
   360  		tail atomic.UnsafePointer // *spanSPMC
   361  	}
   362  }
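// An illustrative chain with three rings (the capacities are examples only):
//
//	chain.head                                      chain.tail
//	    |                                               |
//	    v           prev              prev              v
//	[cap 4096] <------------ [cap 2048] <------------ [cap 1024]
//
// The owning P drains its local ring into chain.head; consumers start at
// chain.tail and follow prev links toward the head, dropping rings they
// find permanently empty along the way.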
   363  
   364  // putFast tries to put s onto the queue, but may fail if it's full.
   365  func (q *spanQueue) putFast(s objptr) (ok bool) {
   366  	if q.tail-q.head == uint32(len(q.ring)) {
   367  		return false
   368  	}
   369  	q.ring[q.tail%uint32(len(q.ring))] = s
   370  	q.tail++
   371  	return true
   372  }
   373  
   374  // put puts s onto the queue.
   375  //
   376  // Returns whether the caller should spin up a new worker.
   377  func (q *spanQueue) put(s objptr) bool {
   378  	// The constants below define the period and volume of
   379  	// spans we spill to the spmc chain when the local queue is
   380  	// not full.
   381  	//
   382  	// spillPeriod must be > spillMax, otherwise spillMax sets the
   383  	// effective maximum size of our local span queue. If we
   384  	// have a span ring of size N but we flush K spans every
   385  	// K puts, then K becomes our effective maximum length. When
   386  	// spillPeriod > spillMax, we're always spilling spans
   387  	// at a slower rate than we're accumulating them.
   388  	const (
   389  		// spillPeriod defines how often to check if we should
   390  		// spill some spans, counted in the number of calls to put.
   391  		spillPeriod = 64
   392  
   393  		// spillMax defines, at most, how many spans to drain with
   394  		// each spill.
   395  		spillMax = 16
   396  	)
   397  
   398  	if q.putFast(s) {
   399  		// Occasionally try to spill some work to generate parallelism.
   400  		q.putsSinceDrain++
   401  		if q.putsSinceDrain >= spillPeriod {
   402  			// Reset even if we don't drain, so we don't check every time.
   403  			q.putsSinceDrain = 0
   404  
   405  			// Try to drain some spans. Don't bother if there are very
   406  			// few of them or there are already spans in the spmc chain.
   407  			n := min((q.tail-q.head)/2, spillMax)
   408  			if n > 4 && q.chainEmpty() {
   409  				q.drain(n)
   410  				return true
   411  			}
   412  		}
   413  		return false
   414  	}
   415  
   416  	// We're out of space. Drain out our local spans.
   417  	q.drain(uint32(len(q.ring)) / 2)
   418  	if !q.putFast(s) {
   419  		throw("failed putFast after drain")
   420  	}
   421  	return true
   422  }
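// For example, with spillPeriod = 64 and spillMax = 16, the periodic spill
// publishes at most 16 spans per 64 puts, and only when the spmc chain looks
// empty; the rest of the local ring is published only when it actually fills
// up (half of it at a time) or when flush is called.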
   423  
   424  // flush publishes all spans in the local queue to the spmc chain.
   425  func (q *spanQueue) flush() {
   426  	n := q.tail - q.head
   427  	if n == 0 {
   428  		return
   429  	}
   430  	q.drain(n)
   431  }
   432  
   433  // empty returns true if there's no more work on the queue.
   434  //
   435  // Not thread-safe. Must only be called by the owner of q.
   436  func (q *spanQueue) empty() bool {
   437  	// Check the local queue for work.
   438  	if q.tail-q.head > 0 {
   439  		return false
   440  	}
   441  	return q.chainEmpty()
   442  }
   443  
   444  // chainEmpty returns true if the spmc chain is empty.
   445  //
   446  // Thread-safe.
   447  func (q *spanQueue) chainEmpty() bool {
   448  	// Check the rest of the rings for work.
   449  	r := (*spanSPMC)(q.chain.tail.Load())
   450  	for r != nil {
   451  		if !r.empty() {
   452  			return false
   453  		}
   454  		r = (*spanSPMC)(r.prev.Load())
   455  	}
   456  	return true
   457  }
   458  
   459  // drain publishes n spans from the local queue to the spmc chain.
   460  func (q *spanQueue) drain(n uint32) {
   461  	q.putsSinceDrain = 0
   462  
   463  	if q.chain.head == nil {
   464  		// N.B. We target 1024, but this may be bigger if the physical
   465  		// page size is bigger, or if we can fit more uintptrs into a
   466  		// physical page. See newSpanSPMC docs.
   467  		r := newSpanSPMC(1024)
   468  		q.chain.head = r
   469  		q.chain.tail.StoreNoWB(unsafe.Pointer(r))
   470  	}
   471  
   472  	// Try to drain some of the queue to the head spmc.
   473  	if q.tryDrain(q.chain.head, n) {
   474  		return
   475  	}
   476  	// No space. Create a bigger spmc and add it to the chain.
   477  
   478  	// Double the size of the next one, up to a maximum.
   479  	//
   480  	// We double each time so we can avoid taking this slow path
   481  	// in the future, which involves a global lock. Ideally we want
   482  	// to hit a steady-state where the deepest any queue goes during
   483  	// a mark phase can fit in the ring.
   484  	//
   485  	// However, we still set a maximum on this. We set the maximum
   486  	// to something large to amortize the cost of lock acquisition, but
   487  	// still at a reasonable size for big heaps and/or a lot of Ps (which
   488  	// tend to be correlated).
   489  	//
   490  	// It's not too bad to burn relatively large-but-fixed amounts of per-P
   491  	// memory if we need to deal with really, really deep queues, since the
   492  	// constants of proportionality are small. Simultaneously, we want to
   493  	// avoid a situation where a single worker ends up queuing O(heap)
   494  	// work and then forever retains a queue of that size.
   495  	const maxCap = 1 << 20 / goarch.PtrSize
   496  	newCap := q.chain.head.cap * 2
   497  	if newCap > maxCap {
   498  		newCap = maxCap
   499  	}
   500  	newHead := newSpanSPMC(newCap)
   501  	if !q.tryDrain(newHead, n) {
   502  		throw("failed to put span on newly-allocated spanSPMC")
   503  	}
   504  	q.chain.head.prev.StoreNoWB(unsafe.Pointer(newHead))
   505  	q.chain.head = newHead
   506  }
   507  
   508  // tryDrain attempts to drain n spans from q's local queue into r.
   509  //
   510  // Returns whether it succeeded.
   511  func (q *spanQueue) tryDrain(r *spanSPMC, n uint32) bool {
   512  	if q.head+n > q.tail {
   513  		throw("attempt to drain too many elements")
   514  	}
   515  	h := r.head.Load() // synchronize with consumers
   516  	t := r.tail.Load()
   517  	rn := t - h
   518  	if rn+n <= r.cap {
   519  		for i := uint32(0); i < n; i++ {
   520  			*r.slot(t + i) = q.ring[(q.head+i)%uint32(len(q.ring))]
   521  		}
   522  		r.tail.Store(t + n) // Makes the items available for consumption.
   523  		q.head += n
   524  		return true
   525  	}
   526  	return false
   527  }
   528  
   529  // tryGetFast attempts to get a span from the local queue, but may fail if it's empty,
   530  // returning 0.
   531  func (q *spanQueue) tryGetFast() objptr {
   532  	if q.tail-q.head == 0 {
   533  		return 0
   534  	}
   535  	s := q.ring[q.head%uint32(len(q.ring))]
   536  	q.head++
   537  	return s
   538  }
   539  
   540  // steal takes some spans from the ring chain of another span queue.
   541  //
   542  // q == q2 is OK.
   543  func (q *spanQueue) steal(q2 *spanQueue) objptr {
   544  	r := (*spanSPMC)(q2.chain.tail.Load())
   545  	if r == nil {
   546  		return 0
   547  	}
   548  	for {
   549  		// It's important that we load the prev pointer (the
   550  		// next ring up the chain) *before* popping the tail. In
   551  		// general, r may be transiently empty, but if prev is
   552  		// non-nil before the pop and the pop fails, then r is
   553  		// permanently empty, which is the only condition under
   554  		// which it's safe to drop r from the chain.
   555  		r2 := (*spanSPMC)(r.prev.Load())
   556  
   557  		// Try to refill from one of the rings
   558  		if s := q.refill(r); s != 0 {
   559  			return s
   560  		}
   561  
   562  		if r2 == nil {
   563  			// This is the only ring. It's empty right
   564  			// now, but could be pushed to in the future.
   565  			return 0
   566  		}
   567  
   568  		// The tail of the chain has been drained, so move on
   569  		// to the next ring. Try to drop it from the chain
   570  		// so the next consumer doesn't have to look at the empty
   571  		// ring again.
   572  		if q2.chain.tail.CompareAndSwapNoWB(unsafe.Pointer(r), unsafe.Pointer(r2)) {
   573  			r.dead.Store(true)
   574  		}
   575  
   576  		r = r2
   577  	}
   578  }
   579  
   580  // refill takes some spans from r and puts them into q's local queue.
   581  //
   582  // One span is removed from the stolen spans and returned on success.
   583  // Failure to steal returns a zero objptr.
   584  //
   585  // refill is thread-safe with respect to r.
   586  func (q *spanQueue) refill(r *spanSPMC) objptr {
   587  	if q.tail-q.head != 0 {
   588  		throw("steal with local work available")
   589  	}
   590  
   591  	// Steal some spans.
   592  	var n uint32
   593  	for {
   594  		h := r.head.Load() // load-acquire, synchronize with other consumers
   595  		t := r.tail.Load() // load-acquire, synchronize with the producer
   596  		n = t - h
   597  		n = n - n/2
   598  		if n == 0 {
   599  			return 0
   600  		}
   601  		if n > r.cap { // read inconsistent h and t
   602  			continue
   603  		}
   604  		n = min(n, uint32(len(q.ring)/2))
   605  		for i := uint32(0); i < n; i++ {
   606  			q.ring[i] = *r.slot(h + i)
   607  		}
   608  		if r.head.CompareAndSwap(h, h+n) {
   609  			break
   610  		}
   611  	}
   612  
   613  	// Update local queue head and tail to reflect new buffered values.
   614  	q.head = 0
   615  	q.tail = n
   616  
   617  	// Pop off the head of the queue and return it.
   618  	return q.tryGetFast()
   619  }
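// Illustrative interleaving for the loop above: two consumers can read the
// same (h, t) snapshot and copy overlapping slots into their local rings,
// but only one CompareAndSwap(h, h+n) succeeds. The loser never publishes
// what it copied (q.head and q.tail are only updated after a successful CAS),
// so each span is handed out exactly once.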
   620  
   621  // destroy frees all chains in an empty spanQueue.
   622  //
   623  // Preconditions:
   624  // - World is stopped.
   625  // - GC is outside of the mark phase.
   626  // - (Therefore) the queue is empty.
   627  func (q *spanQueue) destroy() {
   628  	assertWorldStopped()
   629  	if gcphase != _GCoff {
   630  		throw("spanQueue.destroy during the mark phase")
   631  	}
   632  	if !q.empty() {
   633  		throw("spanQueue.destroy on non-empty queue")
   634  	}
   635  
   636  	lock(&work.spanSPMCs.lock)
   637  
   638  	// Remove, deinitialize, and free each ring.
   639  	for r := (*spanSPMC)(q.chain.tail.Load()); r != nil; r = (*spanSPMC)(r.prev.Load()) {
   640  		work.spanSPMCs.list.remove(unsafe.Pointer(r))
   641  		r.deinit()
   642  		mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
   643  	}
   644  
   645  	q.chain.head = nil
   646  	q.chain.tail.Store(nil)
   647  	q.putsSinceDrain = 0
   648  
   649  	unlock(&work.spanSPMCs.lock)
   650  }
   651  
   652  // spanSPMC is a ring buffer of objptrs that represent spans.
   653  // Accessed without a lock.
   654  //
   655  // Single-producer, multi-consumer. The only producer is the P that owns this
   656  // queue, but any other P may consume from it.
   657  //
   658  // ## Invariants for memory management
   659  //
   660  // 1. All spanSPMCs are allocated from mheap_.spanSPMCAlloc.
   661  // 2. All allocated spanSPMCs must be on the work.spanSPMCs list.
   662  // 3. spanSPMCs may only be allocated if gcphase != _GCoff.
   663  // 4. spanSPMCs may only be deallocated if gcphase == _GCoff.
   664  //
   665  // Invariants (3) and (4) ensure that we do not need to concern ourselves with
   666  // tricky reuse issues that stem from not knowing when a thread is truly done
   667  // with a spanSPMC. For example, two threads could load the same spanSPMC from
   668  // the tail of the chain. One thread is then paused while the other steals the
   669  // last few elements off of it. It's not safe to free at that point since the
   670  // other thread will still inspect that spanSPMC, and we have no way of knowing
   671  // without more complex and/or heavyweight synchronization.
   672  //
   673  // Instead, we rely on the global synchronization inherent to GC phases, and
   674  // the fact that spanSPMCs are only ever used during the mark phase, to ensure
   675  // memory safety. This means we temporarily waste some memory, but it's only
   676  // until the end of the mark phase.
   677  type spanSPMC struct {
   678  	_ sys.NotInHeap
   679  
   680  	// allnode is the linked list node for work.spanSPMCs list. This is
   681  	// used to find and free dead spanSPMCs. Protected by
   682  	// work.spanSPMCs.lock.
   683  	allnode listNodeManual
   684  
   685  	// dead indicates whether the spanSPMC is no longer in use.
   686  	// Protected by the CAS to the prev field of the spanSPMC pointing
   687  	// to this spanSPMC. That is, whoever wins that CAS takes ownership
   688  	// of marking this spanSPMC as dead. See spanQueue.steal for details.
   689  	dead atomic.Bool
   690  
   691  	// prev is the next link up a spanQueue's SPMC chain, from tail to head,
   692  	// hence the name "prev." Set by a spanQueue's producer, cleared by a
   693  	// CAS in spanQueue.steal.
   694  	prev atomic.UnsafePointer // *spanSPMC
   695  
   696  	// head, tail, cap, and ring together represent a fixed-size SPMC lock-free
   697  	// ring buffer of size cap. The ring buffer contains objptr values.
   698  	head atomic.Uint32
   699  	tail atomic.Uint32
   700  	cap  uint32 // cap(ring)
   701  	ring *objptr
   702  }
   703  
   704  // newSpanSPMC allocates and initializes a new spmc with the provided capacity.
   705  //
   706  // newSpanSPMC may override the capacity with a larger one if the provided one would
   707  // waste memory.
   708  func newSpanSPMC(cap uint32) *spanSPMC {
   709  	lock(&work.spanSPMCs.lock)
   710  	r := (*spanSPMC)(mheap_.spanSPMCAlloc.alloc())
   711  	work.spanSPMCs.list.push(unsafe.Pointer(r))
   712  	unlock(&work.spanSPMCs.lock)
   713  
   714  	// If cap < the capacity of a single physical page, round up.
   715  	pageCap := uint32(physPageSize / goarch.PtrSize) // capacity of a single page
   716  	if cap < pageCap {
   717  		cap = pageCap
   718  	}
   719  	if cap&(cap-1) != 0 {
   720  		throw("spmc capacity must be a power of 2")
   721  	}
   722  
   723  	r.cap = cap
   724  	ring := sysAlloc(uintptr(cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys, "GC span queue")
   725  	atomic.StorepNoWB(unsafe.Pointer(&r.ring), ring)
   726  	return r
   727  }
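// For example, with 4 KiB physical pages and 8-byte pointers, pageCap is 512,
// so a requested capacity of 1024 is kept as-is; with 64 KiB pages, pageCap is
// 8192 and the request is rounded up so the page-granular backing allocation
// wastes no space.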
   728  
   729  // empty returns true if the spmc is empty.
   730  //
   731  // empty is thread-safe.
   732  func (r *spanSPMC) empty() bool {
   733  	h := r.head.Load()
   734  	t := r.tail.Load()
   735  	return t == h
   736  }
   737  
   738  // deinit frees any resources the spanSPMC is holding onto and zeroes it.
   739  func (r *spanSPMC) deinit() {
   740  	sysFree(unsafe.Pointer(r.ring), uintptr(r.cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys)
   741  	r.ring = nil
   742  	r.dead.Store(false)
   743  	r.prev.StoreNoWB(nil)
   744  	r.head.Store(0)
   745  	r.tail.Store(0)
   746  	r.cap = 0
   747  	r.allnode = listNodeManual{}
   748  }
   749  
   750  // slot returns a pointer to slot i%r.cap.
   751  func (r *spanSPMC) slot(i uint32) *objptr {
   752  	idx := uintptr(i & (r.cap - 1))
   753  	return (*objptr)(unsafe.Add(unsafe.Pointer(r.ring), idx*unsafe.Sizeof(objptr(0))))
   754  }
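// Note that i & (r.cap - 1) == i % r.cap only because r.cap is a power of
// two, which is exactly what newSpanSPMC enforces; head and tail are free to
// wrap around uint32 because the mask keeps the indexing correct.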
   755  
   756  // freeDeadSpanSPMCs frees dead spanSPMCs back to the OS.
   757  func freeDeadSpanSPMCs() {
   758  	// According to the SPMC memory management invariants, we can only free
   759  	// spanSPMCs outside of the mark phase. We ensure we do this in two ways.
   760  	//
   761  	// 1. We take the work.spanSPMCs lock, which we need anyway. This ensures
   762  	//    that we are non-preemptible. If this path becomes lock-free, we will
   763  	//    need to become non-preemptible in some other way.
   764  	// 2. Once we are non-preemptible, we check the gcphase, and back out if
   765  	//    it's not safe.
   766  	//
   767  	// This way, we ensure that we don't start freeing if we're in the wrong
   768  	// phase, and the phase can't change on us while we're freeing.
   769  	//
   770  	// TODO(go.dev/issue/75771): Due to the grow semantics in
   771  	// spanQueue.drain, we expect a steady-state of around one spanSPMC per
   772  	// P, with some spikes higher when Ps have more than one. For high
   773  	// GOMAXPROCS, or if this list otherwise gets long, it would be nice to
   774  	// have a way to batch work that allows preemption during processing.
   775  	lock(&work.spanSPMCs.lock)
   776  	if gcphase != _GCoff || work.spanSPMCs.list.empty() {
   777  		unlock(&work.spanSPMCs.lock)
   778  		return
   779  	}
   780  	r := (*spanSPMC)(work.spanSPMCs.list.head())
   781  	for r != nil {
   782  		next := (*spanSPMC)(unsafe.Pointer(r.allnode.next))
   783  		if r.dead.Load() {
   784  			// It's dead. Remove, deinitialize and free it.
   785  			work.spanSPMCs.list.remove(unsafe.Pointer(r))
   786  			r.deinit()
   787  			mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
   788  		}
   789  		r = next
   790  	}
   791  	unlock(&work.spanSPMCs.lock)
   792  }
   793  
   794  // tryStealSpan attempts to steal a span from another P's local queue.
   795  //
   796  // Returns a non-zero objptr on success.
   797  func (w *gcWork) tryStealSpan() objptr {
   798  	pp := getg().m.p.ptr()
   799  
   800  	for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() {
   801  		if !work.spanqMask.read(enum.position()) {
   802  			continue
   803  		}
   804  		p2 := allp[enum.position()]
   805  		if pp == p2 {
   806  			continue
   807  		}
   808  		if s := w.spanq.steal(&p2.gcw.spanq); s != 0 {
   809  			return s
   810  		}
   811  		// N.B. This is intentionally racy. We may stomp on a mask set by
   812  		// a P that just put a bunch of work into its local queue.
   813  		//
   814  		// This is OK because the ragged barrier in gcMarkDone will set
   815  		// the bit on each P if there's local work we missed. This race
   816  		// should generally be rare, since the window between noticing
   817  		// an empty local queue and this bit being set is quite small.
   818  		work.spanqMask.clear(int32(enum.position()))
   819  	}
   820  	return 0
   821  }
   822  
   823  // objptr consists of a span base and the index of the object in the span.
   824  type objptr uintptr
   825  
   826  // makeObjPtr creates an objptr from a span base address and an object index.
   827  func makeObjPtr(spanBase uintptr, objIndex uint16) objptr {
   828  	if doubleCheckGreenTea && spanBase&((1<<gc.PageShift)-1) != 0 {
   829  		throw("created objptr with address that is incorrectly aligned")
   830  	}
   831  	return objptr(spanBase | uintptr(objIndex))
   832  }
   833  
   834  func (p objptr) spanBase() uintptr {
   835  	return uintptr(p) &^ ((1 << gc.PageShift) - 1)
   836  }
   837  
   838  func (p objptr) objIndex() uint16 {
   839  	return uint16(p) & ((1 << gc.PageShift) - 1)
   840  }
   841  
   842  // scanSpan scans the objects indicated by marks&^scans in the span's inline
   843  // mark bits, queuing the resulting pointers into gcw.
   844  func scanSpan(p objptr, gcw *gcWork) {
   845  	spanBase := p.spanBase()
   846  	imb := spanInlineMarkBitsFromBase(spanBase)
   847  	spanclass := imb.class
   848  	if spanclass.noscan() {
   849  		throw("noscan object in scanSpan")
   850  	}
   851  	elemsize := uintptr(gc.SizeClassToSize[spanclass.sizeclass()])
   852  
   853  	// Release span.
   854  	if imb.release() == spanScanOneMark {
   855  		// Nobody else set any mark bits on this span while it was acquired.
   856  		// That means p is the sole object we need to handle. Fast-track it.
   857  		objIndex := p.objIndex()
   858  		bytep := &imb.scans[objIndex/8]
   859  		mask := uint8(1) << (objIndex % 8)
   860  		if atomic.Load8(bytep)&mask != 0 {
   861  			return
   862  		}
   863  		atomic.Or8(bytep, mask)
   864  		gcw.bytesMarked += uint64(elemsize)
   865  		if debug.gctrace > 1 {
   866  			gcw.stats[spanclass.sizeclass()].sparseObjsScanned++
   867  		}
   868  		b := spanBase + uintptr(objIndex)*elemsize
   869  		scanObjectSmall(spanBase, b, elemsize, gcw)
   870  		return
   871  	}
   872  
   873  	// Compute nelems.
   874  	divMagic := uint64(gc.SizeClassToDivMagic[spanclass.sizeclass()])
   875  	usableSpanSize := uint64(gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{}))
   876  	if !spanclass.noscan() {
   877  		usableSpanSize -= gc.PageSize / goarch.PtrSize / 8
   878  	}
   879  	nelems := uint16((usableSpanSize * divMagic) >> 32)
   880  
   881  	// Grey objects and return if there's nothing else to do.
   882  	var toScan gc.ObjMask
   883  	objsMarked := spanSetScans(spanBase, nelems, imb, &toScan)
   884  	if objsMarked == 0 {
   885  		return
   886  	}
   887  	gcw.bytesMarked += uint64(objsMarked) * uint64(elemsize)
   888  
   889  	// Check if we have enough density to make a dartboard scan
   890  	// worthwhile. If not, just do what scanobject does, but
   891  	// localized to the span, using the dartboard.
   892  	if !scan.HasFastScanSpanPacked() || objsMarked < int(nelems/8) {
   893  		if debug.gctrace > 1 {
   894  			gcw.stats[spanclass.sizeclass()].spansSparseScanned++
   895  			gcw.stats[spanclass.sizeclass()].spanObjsSparseScanned += uint64(objsMarked)
   896  		}
   897  		scanObjectsSmall(spanBase, elemsize, nelems, gcw, &toScan)
   898  		return
   899  	}
   900  
   901  	// Scan the span.
   902  	//
   903  	// N.B. Use gcw.ptrBuf as the output buffer. This is a bit different
   904  	// from scanObjectsSmall, which fills gcw.ptrBuf with addresses to dereference.
   905  	// ScanSpanPacked, on the other hand, fills it with already-dereferenced pointers.
   906  	nptrs := scan.ScanSpanPacked(
   907  		unsafe.Pointer(spanBase),
   908  		&gcw.ptrBuf[0],
   909  		&toScan,
   910  		uintptr(spanclass.sizeclass()),
   911  		spanPtrMaskUnsafe(spanBase),
   912  	)
   913  	gcw.heapScanWork += int64(objsMarked) * int64(elemsize)
   914  
   915  	if debug.gctrace > 1 {
   916  		// Write down some statistics.
   917  		gcw.stats[spanclass.sizeclass()].spansDenseScanned++
   918  		gcw.stats[spanclass.sizeclass()].spanObjsDenseScanned += uint64(objsMarked)
   919  	}
   920  
   921  	// Process all the pointers we just got.
   922  	for _, p := range gcw.ptrBuf[:nptrs] {
   923  		if !tryDeferToSpanScan(p, gcw) {
   924  			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
   925  				greyobject(obj, 0, 0, span, gcw, objIndex)
   926  			}
   927  		}
   928  	}
   929  }
   930  
   931  // spanSetScans sets any unset scan bits that have their mark bits set in the inline mark bits.
   932  //
   933  // toScan is populated with bits indicating which objects are marked but not yet scanned.
   934  //
   935  // Returns the number of objects marked, which could be zero.
   936  func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toScan *gc.ObjMask) int {
   937  	arena, pageIdx, pageMask := pageIndexOf(spanBase)
   938  	if arena.pageMarks[pageIdx]&pageMask == 0 {
   939  		atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
   940  	}
   941  
   942  	bytes := divRoundUp(uintptr(nelems), 8)
   943  	objsMarked := 0
   944  
   945  	// Careful: these two structures alias since ObjMask is much bigger
   946  	// than marks or scans. We do these unsafe shenanigans so that we can
   947  	// access the marks and scans by uintptrs rather than by byte.
   948  	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
   949  	imbScans := (*gc.ObjMask)(unsafe.Pointer(&imb.scans))
   950  
   951  	// Iterate over the bitmaps one uintptr-sized chunk at a time,
   952  	// computing both the union and the difference of marks and scans.
   953  	// Store the union into scans, and the difference (marks &^ scans) into toScan.
   954  	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
   955  		scans := atomic.Loaduintptr(&imbScans[i/goarch.PtrSize])
   956  		marks := imbMarks[i/goarch.PtrSize]
   957  		scans = bswapIfBigEndian(scans)
   958  		marks = bswapIfBigEndian(marks)
   959  		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
   960  			scans &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out owned
   961  			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
   962  		}
   963  		toGrey := marks &^ scans
   964  		toScan[i/goarch.PtrSize] = toGrey
   965  
   966  		// If there's anything left to grey, do it.
   967  		if toGrey != 0 {
   968  			toGrey = bswapIfBigEndian(toGrey)
   969  			if goarch.PtrSize == 4 {
   970  				atomic.Or32((*uint32)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint32(toGrey))
   971  			} else {
   972  				atomic.Or64((*uint64)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint64(toGrey))
   973  			}
   974  		}
   975  		objsMarked += sys.OnesCount64(uint64(toGrey))
   976  	}
   977  	return objsMarked
   978  }
   979  
   980  func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) {
   981  	hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, objSize)
   982  	hbits := (*byte)(unsafe.Pointer(hbitsBase))
   983  	ptrBits := extractHeapBitsSmall(hbits, spanBase, b, objSize)
   984  	gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
   985  	nptrs := 0
   986  	n := sys.OnesCount64(uint64(ptrBits))
   987  	for range n {
   988  		k := sys.TrailingZeros64(uint64(ptrBits))
   989  		ptrBits &^= 1 << k
   990  		addr := b + uintptr(k)*goarch.PtrSize
   991  
   992  		// Prefetch addr since we're about to use it. This point for prefetching
   993  		// was chosen empirically.
   994  		sys.Prefetch(addr)
   995  
   996  		// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
   997  		gcw.ptrBuf[nptrs] = addr
   998  		nptrs++
   999  	}
  1000  
  1001  	// Process all the pointers we just got.
  1002  	for _, p := range gcw.ptrBuf[:nptrs] {
  1003  		p = *(*uintptr)(unsafe.Pointer(p))
  1004  		if p == 0 {
  1005  			continue
  1006  		}
  1007  		if !tryDeferToSpanScan(p, gcw) {
  1008  			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
  1009  				greyobject(obj, 0, 0, span, gcw, objIndex)
  1010  			}
  1011  		}
  1012  	}
  1013  }
  1014  
  1015  func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *gc.ObjMask) {
  1016  	nptrs := 0
  1017  	for i, bits := range scans {
  1018  		if i*(goarch.PtrSize*8) > int(elems) {
  1019  			break
  1020  		}
  1021  		n := sys.OnesCount64(uint64(bits))
  1022  		hbitsBase, _ := spanHeapBitsRange(base, gc.PageSize, objSize)
  1023  		hbits := (*byte)(unsafe.Pointer(hbitsBase))
  1024  		for range n {
  1025  			j := sys.TrailingZeros64(uint64(bits))
  1026  			bits &^= 1 << j
  1027  
  1028  			b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize
  1029  			ptrBits := extractHeapBitsSmall(hbits, base, b, objSize)
  1030  			gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
  1031  
  1032  			n := sys.OnesCount64(uint64(ptrBits))
  1033  			for range n {
  1034  				k := sys.TrailingZeros64(uint64(ptrBits))
  1035  				ptrBits &^= 1 << k
  1036  				addr := b + uintptr(k)*goarch.PtrSize
  1037  
  1038  				// Prefetch addr since we're about to use it. This point for prefetching
  1039  				// was chosen empirically.
  1040  				sys.Prefetch(addr)
  1041  
  1042  				// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
  1043  				gcw.ptrBuf[nptrs] = addr
  1044  				nptrs++
  1045  			}
  1046  		}
  1047  	}
  1048  
  1049  	// Process all the pointers we just got.
  1050  	for _, p := range gcw.ptrBuf[:nptrs] {
  1051  		p = *(*uintptr)(unsafe.Pointer(p))
  1052  		if p == 0 {
  1053  			continue
  1054  		}
  1055  		if !tryDeferToSpanScan(p, gcw) {
  1056  			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
  1057  				greyobject(obj, 0, 0, span, gcw, objIndex)
  1058  			}
  1059  		}
  1060  	}
  1061  }
  1062  
  1063  func extractHeapBitsSmall(hbits *byte, spanBase, addr, elemsize uintptr) uintptr {
  1064  	// These objects are always small enough that their bitmaps
  1065  	// fit in a single word, so just load the word or two we need.
  1066  	//
  1067  	// Mirrors mspan.writeHeapBitsSmall.
  1068  	//
  1069  	// We should be using heapBits(), but unfortunately it introduces
  1070  	// both bounds-check panics and throws, which cause us to exceed
  1071  	// the nosplit limit in quite a few cases.
  1072  	i := (addr - spanBase) / goarch.PtrSize / ptrBits
  1073  	j := (addr - spanBase) / goarch.PtrSize % ptrBits
  1074  	bits := elemsize / goarch.PtrSize
  1075  	word0 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+0))))
  1076  	word1 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+1))))
  1077  
  1078  	var read uintptr
  1079  	if j+bits > ptrBits {
  1080  		// Two reads.
  1081  		bits0 := ptrBits - j
  1082  		bits1 := bits - bits0
  1083  		read = *word0 >> j
  1084  		read |= (*word1 & ((1 << bits1) - 1)) << bits0
  1085  	} else {
  1086  		// One read.
  1087  		read = (*word0 >> j) & ((1 << bits) - 1)
  1088  	}
  1089  	return read
  1090  }
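// Worked example, assuming a 64-bit platform (ptrBits == 64): for a 48-byte
// object (bits == 6) whose first word sits at bitmap position j == 61 within
// word i, j+bits > 64, so the two-read path stitches together the top 3 bits
// of word0 (bits0 == 3) and the low 3 bits of word1 (bits1 == 3).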
  1091  
  1092  // spanPtrMaskUnsafe returns the pointer mask for a span with inline mark bits.
  1093  //
  1094  // The caller must ensure spanBase is the base of a span that:
  1095  // - Is 1 page in size,
  1096  // - Uses inline mark bits,
  1097  // - Contains pointers.
  1098  func spanPtrMaskUnsafe(spanBase uintptr) *gc.PtrMask {
  1099  	base := spanBase + gc.PageSize - unsafe.Sizeof(gc.PtrMask{}) - unsafe.Sizeof(spanInlineMarkBits{})
  1100  	return (*gc.PtrMask)(unsafe.Pointer(base))
  1101  }
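// End-of-span layout for a small-object span containing pointers, assuming
// 8 KiB pages and 8-byte pointers (so both trailing blocks are 128 bytes):
//
//	[ objects ... | gc.PtrMask (128 B) | spanInlineMarkBits (128 B) ]
//	^ span base                                      span base + 8 KiB ^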
  1102  
  1103  type sizeClassScanStats struct {
  1104  	spansDenseScanned     uint64 // Spans scanned with ScanSpanPacked.
  1105  	spanObjsDenseScanned  uint64 // Objects scanned with ScanSpanPacked.
  1106  	spansSparseScanned    uint64 // Spans scanned with scanObjectsSmall.
  1107  	spanObjsSparseScanned uint64 // Objects scanned with scanObjectsSmall.
  1108  	sparseObjsScanned     uint64 // Objects scanned with scanobject or scanObjectSmall.
  1109  	// Note: sparseObjsScanned is sufficient for both cases because
  1110  	// a particular size class either uses scanobject or scanObjectSmall,
  1111  	// not both. In the latter case, we also know that there was one
  1112  	// object scanned per span, so no need for a span counter.
  1113  }
  1114  
  1115  func dumpScanStats() {
  1116  	var (
  1117  		spansDenseScanned     uint64
  1118  		spanObjsDenseScanned  uint64
  1119  		spansSparseScanned    uint64
  1120  		spanObjsSparseScanned uint64
  1121  		sparseObjsScanned     uint64
  1122  	)
  1123  	for _, stats := range memstats.lastScanStats {
  1124  		spansDenseScanned += stats.spansDenseScanned
  1125  		spanObjsDenseScanned += stats.spanObjsDenseScanned
  1126  		spansSparseScanned += stats.spansSparseScanned
  1127  		spanObjsSparseScanned += stats.spanObjsSparseScanned
  1128  		sparseObjsScanned += stats.sparseObjsScanned
  1129  	}
  1130  	totalObjs := sparseObjsScanned + spanObjsSparseScanned + spanObjsDenseScanned
  1131  	totalSpans := spansSparseScanned + spansDenseScanned
  1132  	print("scan: total ", sparseObjsScanned, "+", spanObjsSparseScanned, "+", spanObjsDenseScanned, "=", totalObjs, " objs")
  1133  	print(", ", spansSparseScanned, "+", spansDenseScanned, "=", totalSpans, " spans\n")
  1134  	for i, stats := range memstats.lastScanStats {
  1135  		if stats == (sizeClassScanStats{}) {
  1136  			continue
  1137  		}
  1138  		totalObjs := stats.sparseObjsScanned + stats.spanObjsSparseScanned + stats.spanObjsDenseScanned
  1139  		totalSpans := stats.spansSparseScanned + stats.spansDenseScanned
  1140  		if i == 0 {
  1141  			print("scan: class L ")
  1142  		} else {
  1143  			print("scan: class ", gc.SizeClassToSize[i], "B ")
  1144  		}
  1145  		print(stats.sparseObjsScanned, "+", stats.spanObjsSparseScanned, "+", stats.spanObjsDenseScanned, "=", totalObjs, " objs")
  1146  		print(", ", stats.spansSparseScanned, "+", stats.spansDenseScanned, "=", totalSpans, " spans\n")
  1147  	}
  1148  }
  1149  
  1150  func (w *gcWork) flushScanStats(dst *[gc.NumSizeClasses]sizeClassScanStats) {
  1151  	for i := range w.stats {
  1152  		dst[i].spansDenseScanned += w.stats[i].spansDenseScanned
  1153  		dst[i].spanObjsDenseScanned += w.stats[i].spanObjsDenseScanned
  1154  		dst[i].spansSparseScanned += w.stats[i].spansSparseScanned
  1155  		dst[i].spanObjsSparseScanned += w.stats[i].spanObjsSparseScanned
  1156  		dst[i].sparseObjsScanned += w.stats[i].sparseObjsScanned
  1157  	}
  1158  	clear(w.stats[:])
  1159  }
  1160  
  1161  // gcMarkWorkAvailable reports whether there's any non-local work available to do.
  1162  //
  1163  // This is a heavyweight check and must only be used for correctness, not
  1164  // as a hint.
  1165  func gcMarkWorkAvailable() bool {
  1166  	if !work.full.empty() {
  1167  		return true // global work available
  1168  	}
  1169  	if work.markrootNext.Load() < work.markrootJobs.Load() {
  1170  		return true // root scan work available
  1171  	}
  1172  	if work.spanqMask.any() {
  1173  		return true // stealable local work available
  1174  	}
  1175  	return false
  1176  }
  1177  
  1178  // scanObject scans the object starting at b, adding pointers to gcw.
  1179  // b must point to the beginning of a heap object or an oblet.
  1180  // scanObject consults the GC bitmap for the pointer mask and the
  1181  // spans for the size of the object.
  1182  //
  1183  // Used only for !gcUsesSpanInlineMarkBits spans, but supports all
  1184  // object sizes and is safe to be called on all heap objects.
  1185  //
  1186  //go:nowritebarrier
  1187  func scanObject(b uintptr, gcw *gcWork) {
  1188  	// Prefetch object before we scan it.
  1189  	//
  1190  	// This will overlap fetching the beginning of the object with initial
  1191  	// setup before we start scanning the object.
  1192  	sys.Prefetch(b)
  1193  
  1194  	// Find the bits for b and the size of the object at b.
  1195  	//
  1196  	// b is either the beginning of an object, in which case this
  1197  	// is the size of the object to scan, or it points to an
  1198  	// oblet, in which case we compute the size to scan below.
  1199  	s := spanOfUnchecked(b)
  1200  	n := s.elemsize
  1201  	if n == 0 {
  1202  		throw("scanObject n == 0")
  1203  	}
  1204  	if s.spanclass.noscan() {
  1205  		// Correctness-wise this is ok, but it's inefficient
  1206  		// if noscan objects reach here.
  1207  		throw("scanObject of a noscan object")
  1208  	}
  1209  
  1210  	var tp typePointers
  1211  	if n > maxObletBytes {
  1212  		// Large object. Break into oblets for better
  1213  		// parallelism and lower latency.
  1214  		if b == s.base() {
  1215  			// Enqueue the other oblets to scan later.
  1216  			// Some oblets may be in b's scalar tail, but
  1217  			// these will be marked as "no more pointers",
  1218  			// so we'll drop out immediately when we go to
  1219  			// scan those.
  1220  			for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
  1221  				if !gcw.putObjFast(oblet) {
  1222  					gcw.putObj(oblet)
  1223  				}
  1224  			}
  1225  		}
  1226  
  1227  		// Compute the size of the oblet. Since this object
  1228  		// must be a large object, s.base() is the beginning
  1229  		// of the object.
  1230  		n = s.base() + s.elemsize - b
  1231  		n = min(n, maxObletBytes)
  1232  		tp = s.typePointersOfUnchecked(s.base())
  1233  		tp = tp.fastForward(b-tp.addr, b+n)
  1234  	} else {
  1235  		tp = s.typePointersOfUnchecked(b)
  1236  	}
  1237  
  1238  	var scanSize uintptr
  1239  	for {
  1240  		var addr uintptr
  1241  		if tp, addr = tp.nextFast(); addr == 0 {
  1242  			if tp, addr = tp.next(b + n); addr == 0 {
  1243  				break
  1244  			}
  1245  		}
  1246  
  1247  		// Keep track of farthest pointer we found, so we can
  1248  		// update heapScanWork. TODO: is there a better metric,
  1249  		// now that we can skip scalar portions pretty efficiently?
  1250  		scanSize = addr - b + goarch.PtrSize
  1251  
  1252  		// Work here is duplicated in scanblock and above.
  1253  		// If you make changes here, make changes there too.
  1254  		obj := *(*uintptr)(unsafe.Pointer(addr))
  1255  
  1256  		// At this point we have extracted the next potential pointer.
  1257  		// Quickly filter out nil and pointers back to the current object.
  1258  		if obj != 0 && obj-b >= n {
  1259  			// Test if obj points into the Go heap and, if so,
  1260  			// mark the object.
  1261  			//
  1262  			// Note that it's possible for findObject to
  1263  			// fail if obj points to a just-allocated heap
  1264  			// object because of a race with growing the
  1265  			// heap. In this case, we know the object was
  1266  			// just allocated and hence will be marked by
  1267  			// allocation itself.
  1268  			if !tryDeferToSpanScan(obj, gcw) {
  1269  				if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
  1270  					greyobject(obj, b, addr-b, span, gcw, objIndex)
  1271  				}
  1272  			}
  1273  		}
  1274  	}
  1275  	gcw.bytesMarked += uint64(n)
  1276  	gcw.heapScanWork += int64(scanSize)
  1277  	if debug.gctrace > 1 {
  1278  		gcw.stats[s.spanclass.sizeclass()].sparseObjsScanned++
  1279  	}
  1280  }
  1281  