// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Green Tea mark algorithm
//
// The core idea behind Green Tea is simple: achieve better locality during
// mark/scan by delaying scanning so that we can accumulate objects to scan
// within the same span, then scan the objects that have accumulated on the
// span all together.
//
// By batching objects this way, we increase the chance that adjacent objects
// will be accessed, amortize the cost of accessing object metadata, and create
// better opportunities for prefetching. We can take this even further and
// optimize the scan loop by size class (not yet completed) all the way to the
// point of applying SIMD techniques to really tear through the heap.
//
// Naturally, this depends on being able to create opportunities to batch objects
// together. The basic idea here is to have two sets of mark bits. One set is the
// regular set of mark bits ("marks"), while the other essentially says that the
// objects have been scanned already ("scans"). When we see a pointer for the first
// time we set its mark and enqueue its span. We track these spans in work queues
// with a FIFO policy, unlike workbufs which have a LIFO policy. Empirically, a
// FIFO policy appears to work best for accumulating objects to scan on a span.
// Later, when we dequeue the span, we compute both the union and the difference
// of the mark and scan bitsets. The union is written back into the scan bits,
// while the difference (objects marked but not yet scanned) tells us which
// objects need scanning, such that the GC is still precise.
//
// Below is the bulk of the implementation, focusing on the worst case
// for locality, small objects. Specifically, those that are smaller than
// a few cache lines in size and whose metadata is stored the same way (at the
// end of the span).

//go:build goexperiment.greenteagc

package runtime

import (
	"internal/goarch"
	"internal/runtime/atomic"
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
	"internal/runtime/sys"
	"unsafe"
)

const doubleCheckGreenTea = false

// spanInlineMarkBits are mark bits that are inlined into the span
// itself. gcUsesSpanInlineMarkBits may be used to check if objects
// of a particular size use inline mark bits.
//
// Inline mark bits are a little bit more than just mark bits. They
// consist of two parts: scans and marks. Marks are like pre-mark
// bits. They're set once a pointer to an object is discovered for
// the first time. The marks allow us to scan many objects in bulk
// if we queue the whole span for scanning. Before we scan such objects
// in bulk, we copy the marks to the scans, computing a diff along the
// way. The resulting bitmap tells us which objects we should scan.
//
// The inlineMarkBits also hold state sufficient for scanning any
// object in the span, as well as state for acquiring ownership of
// the span for queuing. This avoids the need to look at the mspan when
// scanning.
type spanInlineMarkBits struct {
	scans [63]uint8         // scanned bits.
	owned spanScanOwnership // see the comment on spanScanOwnership.
	marks [63]uint8         // mark bits.
	class spanClass
}
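
// For illustration only (not part of the runtime): a minimal sketch of the
// merge-and-diff step described above, for a single uintptr-sized word of the
// two bitmaps. spanSetScans below performs this atomically, and also masks out
// the class and owned bytes that share the final word of the structure.
//
//	func mergeAndDiff(marks, scans uintptr) (newScans, toScan uintptr) {
//		toScan = marks &^ scans // marked since queuing, not yet scanned
//		newScans = scans | marks
//		return
//	}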
// spanScanOwnership indicates whether some thread has acquired
// the span for scanning, and whether there has been one or more
// attempts to acquire the span. The latter information helps to
// fast-track span scans that only apply to a single mark, skipping
// the relatively costly merge-and-diff process for scans and marks
// by allowing one to just set the mark directly.
type spanScanOwnership uint8

const (
	spanScanUnowned  spanScanOwnership = 0         // Indicates the span is not acquired for scanning.
	spanScanOneMark                    = 1 << iota // Indicates that only one mark bit is set relative to the scan bits.
	spanScanManyMark                               // Indicates one or more mark bits may be set relative to the scan bits.

	// "ManyMark" need not be exactly the value it has. In practice we just
	// want to distinguish "none" from "one" from "many," so a comparison is
	// sufficient (as opposed to a bit test) to check between these cases.
)

// load atomically loads from a pointer to a spanScanOwnership.
func (o *spanScanOwnership) load() spanScanOwnership {
	return spanScanOwnership(atomic.Load8((*uint8)(unsafe.Pointer(o))))
}

func (o *spanScanOwnership) or(v spanScanOwnership) spanScanOwnership {
	// N.B. We round down the address and use Or32 because Or8 doesn't
	// return a result, and a result is strictly necessary for this protocol.
	//
	// Making Or8 return a result, while making the code look nicer, would
	// not be strictly better on any supported platform, as an Or8 that
	// returns a result is not a common instruction. On many platforms it
	// would be implemented exactly as it is here, and since Or8 is
	// exclusively used in the runtime and a hot function, we want to keep
	// using its no-result version elsewhere for performance.
	o32 := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(o)) &^ 0b11))
	off := (uintptr(unsafe.Pointer(o)) & 0b11) * 8
	if goarch.BigEndian {
		off = 32 - off - 8
	}
	return spanScanOwnership(atomic.Or32(o32, uint32(v)<<off) >> off)
}

func (imb *spanInlineMarkBits) init(class spanClass, needzero bool) {
	if imb == nil {
		// This nil check and throw is almost pointless. Normally we would
		// expect imb to never be nil. However, this is called on potentially
		// freshly-allocated virtual memory. As of 2025, the compiler-inserted
		// nil check is not a branch but a memory read that we expect to fault
		// if the pointer really is nil.
		//
		// That read touches the page, and operating systems may take it as a
		// hint to back the accessed memory with a read-only zero page. We then
		// immediately write to this memory, which can force operating systems
		// to update the page table and flush the TLB, causing a lot of churn
		// for programs that are short-lived and monotonically grow in size.
		//
		// This nil check is thus an explicit branch instead of what the compiler
		// would insert circa 2025, which is a memory read instruction.
		//
		// See go.dev/issue/74375 for details.
		throw("runtime: span inline mark bits nil?")
	}
	if needzero {
		// Use memclrNoHeapPointers to avoid having the compiler make a worse
		// decision. We know that imb is both aligned and a nice power-of-two
		// size that works well for wider SIMD instructions. The compiler likely
		// has no idea that imb is aligned to 128 bytes.
		memclrNoHeapPointers(unsafe.Pointer(imb), unsafe.Sizeof(spanInlineMarkBits{}))
	}
	imb.class = class
}
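
// For illustration only (not part of the runtime): the address arithmetic in
// spanScanOwnership.or above, shown for the little-endian case. The owned byte
// lives somewhere inside a 4-byte-aligned word, so we OR a shifted copy of v
// into that word and shift the old word back down to recover the byte's
// previous value (big-endian platforms just mirror the byte offset).
//
//	word := (*uint32)(unsafe.Pointer(addr &^ 0b11)) // aligned word holding the byte
//	off := (addr & 0b11) * 8                        // bit offset of the byte in that word
//	old := atomic.Or32(word, uint32(v)<<off) >> off // byte's value before the OR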
// tryAcquire attempts to acquire the span for scanning. On success, the caller
// must queue the span for scanning or scan the span immediately.
func (imb *spanInlineMarkBits) tryAcquire() bool {
	switch imb.owned.load() {
	case spanScanUnowned:
		// Try to mark the span as having only one object marked.
		if imb.owned.or(spanScanOneMark) == spanScanUnowned {
			return true
		}
		// If we didn't see an old value of spanScanUnowned, then we must
		// have raced with someone else and seen spanScanOneMark or greater.
		// Fall through and try to set spanScanManyMark.
		fallthrough
	case spanScanOneMark:
		// We may be the first to set *any* bit on owned. In such a case,
		// we still need to make sure the span is queued.
		return imb.owned.or(spanScanManyMark) == spanScanUnowned
	}
	return false
}

// release releases the span for scanning, allowing another thread to queue the span.
//
// Returns an upper bound on the number of mark bits set since the span was queued. The
// upper bound is described as "one" (spanScanOneMark) or "many" (spanScanManyMark, with or
// without spanScanOneMark). If the return value indicates only one mark bit was set, the
// caller can be certain that it was the same mark bit that caused the span to get queued.
// Take note of the fact that this is *only* an upper bound. In particular, it may still
// turn out that only one mark bit was set, even if the return value indicates "many".
func (imb *spanInlineMarkBits) release() spanScanOwnership {
	return spanScanOwnership(atomic.Xchg8((*uint8)(unsafe.Pointer(&imb.owned)), uint8(spanScanUnowned)))
}

// spanInlineMarkBitsFromBase returns the spanInlineMarkBits for a span whose start address is base.
//
// The span must be gcUsesSpanInlineMarkBits(span.elemsize).
func spanInlineMarkBitsFromBase(base uintptr) *spanInlineMarkBits {
	return (*spanInlineMarkBits)(unsafe.Pointer(base + gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{})))
}

// initInlineMarkBits initializes the inlineMarkBits stored at the end of the span.
func (s *mspan) initInlineMarkBits() {
	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
		throw("expected span with inline mark bits")
	}
	// Zeroing is only necessary if this span wasn't just freshly allocated from the OS.
	s.inlineMarkBits().init(s.spanclass, s.needzero != 0)
}

// moveInlineMarks merges the span's inline mark bits into dst and clears them.
//
// gcUsesSpanInlineMarkBits(s.elemsize) must be true.
func (s *mspan) moveInlineMarks(dst *gcBits) {
	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
		throw("expected span with inline mark bits")
	}
	bytes := divRoundUp(uintptr(s.nelems), 8)
	imb := s.inlineMarkBits()
	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
		marks := bswapIfBigEndian(imbMarks[i/goarch.PtrSize])
		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
		}
		*(*uintptr)(unsafe.Pointer(dst.bytep(i))) |= bswapIfBigEndian(marks)
	}
	if doubleCheckGreenTea && !s.spanclass.noscan() && imb.marks != imb.scans {
		throw("marks don't match scans for span with pointer")
	}

	// Reset the inline mark bits.
	imb.init(s.spanclass, true /* We know these bits are always dirty now. */)
}
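
// For illustration only (not part of the runtime): how a marker uses the
// acquire/release protocol above. Whoever observes spanScanUnowned as the old
// value of owned is responsible for queuing the span; everyone else just
// leaves their mark bit behind and moves on.
//
//	if imb.tryAcquire() {
//		// This thread must queue the span (or scan it immediately).
//	}
//	...
//	// Later, when the span is dequeued for scanning:
//	if imb.release() == spanScanOneMark {
//		// Only the enqueuing mark could have been set; fast-track that one object.
//	} else {
//		// Possibly many marks accumulated; do the full merge-and-diff scan.
//	}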
// inlineMarkBits returns the inline mark bits for the span.
//
// gcUsesSpanInlineMarkBits(s.elemsize) must be true.
func (s *mspan) inlineMarkBits() *spanInlineMarkBits {
	if doubleCheckGreenTea && !gcUsesSpanInlineMarkBits(s.elemsize) {
		throw("expected span with inline mark bits")
	}
	return spanInlineMarkBitsFromBase(s.base())
}

func (s *mspan) markBitsForIndex(objIndex uintptr) (bits markBits) {
	if gcUsesSpanInlineMarkBits(s.elemsize) {
		bits.bytep = &s.inlineMarkBits().marks[objIndex/8]
	} else {
		bits.bytep = s.gcmarkBits.bytep(objIndex / 8)
	}
	bits.mask = uint8(1) << (objIndex % 8)
	bits.index = objIndex
	return
}

func (s *mspan) markBitsForBase() markBits {
	if gcUsesSpanInlineMarkBits(s.elemsize) {
		return markBits{&s.inlineMarkBits().marks[0], uint8(1), 0}
	}
	return markBits{&s.gcmarkBits.x, uint8(1), 0}
}

// scannedBitsForIndex returns a markBits representing the scanned bit
// for objIndex in the inline mark bits.
func (s *mspan) scannedBitsForIndex(objIndex uintptr) markBits {
	return markBits{&s.inlineMarkBits().scans[objIndex/8], uint8(1) << (objIndex % 8), objIndex}
}

// gcUsesSpanInlineMarkBits returns true if a span holding objects of a certain size
// has inline mark bits. size must be the span's elemsize.
//
// nosplit because this is called from gcmarknewobject, which is nosplit.
//
//go:nosplit
func gcUsesSpanInlineMarkBits(size uintptr) bool {
	return heapBitsInSpan(size) && size >= 16
}

// tryDeferToSpanScan tries to queue p on the span it points to, if it
// points to a small object span (gcUsesSpanQueue size).
func tryDeferToSpanScan(p uintptr, gcw *gcWork) bool {
	if useCheckmark {
		return false
	}

	// Quickly check to see if this is a span that has inline mark bits.
	ha := heapArenaOf(p)
	if ha == nil {
		return false
	}
	pageIdx := ((p / pageSize) / 8) % uintptr(len(ha.pageInUse))
	pageMask := byte(1 << ((p / pageSize) % 8))
	if ha.pageUseSpanInlineMarkBits[pageIdx]&pageMask == 0 {
		return false
	}

	// Find the object's index from the span class info stored in the inline mark bits.
	base := alignDown(p, gc.PageSize)
	q := spanInlineMarkBitsFromBase(base)
	objIndex := uint16((uint64(p-base) * uint64(gc.SizeClassToDivMagic[q.class.sizeclass()])) >> 32)

	// Set mark bit.
	idx, mask := objIndex/8, uint8(1)<<(objIndex%8)
	if atomic.Load8(&q.marks[idx])&mask != 0 {
		return true
	}
	atomic.Or8(&q.marks[idx], mask)

	// Fast-track noscan objects.
	if q.class.noscan() {
		gcw.bytesMarked += uint64(gc.SizeClassToSize[q.class.sizeclass()])
		return true
	}

	// Queue up the pointer (as a representative for its span).
	if q.tryAcquire() {
		if gcw.spanq.put(makeObjPtr(base, objIndex)) {
			if gcphase == _GCmark {
				// This is intentionally racy; the bit set here might get
				// stomped on by a stealing P. See the comment in tryStealSpan
				// for an explanation as to why this is OK.
				if !work.spanqMask.read(uint32(gcw.id)) {
					work.spanqMask.set(gcw.id)
				}
				gcw.mayNeedWorker = true
			}
			gcw.flushedWork = true
		}
	}
	return true
}

// tryGetSpanFast attempts to get an entire span to scan.
func (w *gcWork) tryGetSpanFast() objptr {
	return w.spanq.tryGetFast()
}

// tryGetSpan attempts to get an entire span to scan.
func (w *gcWork) tryGetSpan() objptr {
	if s := w.spanq.tryGetFast(); s != 0 {
		return s
	}

	// "Steal" from ourselves.
	if s := w.spanq.steal(&w.spanq); s != 0 {
		return s
	}

	// We failed to get any local work, so we're fresh out.
	// Nobody else is going to add work for us. Clear our bit.
	if work.spanqMask.read(uint32(w.id)) {
		work.spanqMask.clear(w.id)
	}
	return 0
}
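
// For illustration only (not part of the runtime): the divide-by-multiply trick
// used by tryDeferToSpanScan above to turn an offset into an object index
// without an integer divide. SizeClassToDivMagic[c] is chosen so that, for any
// in-span offset of size class c, (offset*magic)>>32 == offset/elemsize. For
// example, assuming a 48-byte size class, a pointer 96 bytes into the span
// yields object index 2:
//
//	offset := p - base                                          // e.g. 96
//	objIndex := uint16((uint64(offset) * uint64(magic)) >> 32)  // == 96/48 == 2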
// spanQueue is a P-local stealable span queue.
type spanQueue struct {
	// head, tail, and ring represent a local non-thread-safe ring buffer.
	head, tail uint32
	ring       [256]objptr

	// putsSinceDrain counts the number of put calls since the last drain.
	putsSinceDrain int

	// chain contains state visible to other Ps.
	//
	// In particular, that means a linked chain of single-producer multi-consumer
	// ring buffers where the single producer is this P only.
	//
	// This linked chain structure is based off the sync.Pool dequeue.
	chain struct {
		// head is the spanSPMC to put to. This is only accessed
		// by the producer, so doesn't need to be synchronized.
		head *spanSPMC

		// tail is the spanSPMC to steal from. This is accessed
		// by consumers, so reads and writes must be atomic.
		tail atomic.UnsafePointer // *spanSPMC
	}
}

// putFast tries to put s onto the queue, but may fail if it's full.
func (q *spanQueue) putFast(s objptr) (ok bool) {
	if q.tail-q.head == uint32(len(q.ring)) {
		return false
	}
	q.ring[q.tail%uint32(len(q.ring))] = s
	q.tail++
	return true
}

// put puts s onto the queue.
//
// Returns whether the caller should spin up a new worker.
func (q *spanQueue) put(s objptr) bool {
	// The constants below define the period and volume of the spans we
	// spill to the spmc chain when the local queue is not full.
	//
	// spillPeriod must be > spillMax, otherwise that sets the
	// effective maximum size of our local span queue. Even if
	// we have a span ring of size N, if we flush K spans every
	// K puts, then K becomes our effective maximum length. When
	// spillPeriod > spillMax, then we're always spilling spans
	// at a slower rate than we're accumulating them.
	const (
		// spillPeriod defines how often to check if we should
		// spill some spans, counted in the number of calls to put.
		spillPeriod = 64

		// spillMax defines, at most, how many spans to drain with
		// each spill.
		spillMax = 16
	)
	if q.putFast(s) {
		// Occasionally try to spill some work to generate parallelism.
		q.putsSinceDrain++
		if q.putsSinceDrain >= spillPeriod {
			// Reset even if we don't drain, so we don't check every time.
			q.putsSinceDrain = 0

			// Try to drain some spans. Don't bother if there are very
			// few of them or there are already spans in the spmc chain.
			n := min((q.tail-q.head)/2, spillMax)
			if n > 4 && q.chainEmpty() {
				q.drain(n)
				return true
			}
		}
		return false
	}

	// We're out of space. Drain out our local spans.
	q.drain(uint32(len(q.ring)) / 2)
	if !q.putFast(s) {
		throw("failed putFast after drain")
	}
	return true
}

// flush publishes all spans in the local queue to the spmc chain.
func (q *spanQueue) flush() {
	n := q.tail - q.head
	if n == 0 {
		return
	}
	q.drain(n)
}

// empty returns true if there's no more work on the queue.
//
// Not thread-safe. Must only be called by the owner of q.
func (q *spanQueue) empty() bool {
	// Check the local queue for work.
	if q.tail-q.head > 0 {
		return false
	}
	return q.chainEmpty()
}

// chainEmpty returns true if the spmc chain is empty.
//
// Thread-safe.
func (q *spanQueue) chainEmpty() bool {
	// Check the rest of the rings for work.
	r := (*spanSPMC)(q.chain.tail.Load())
	for r != nil {
		if !r.empty() {
			return false
		}
		r = (*spanSPMC)(r.prev.Load())
	}
	return true
}
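
// For illustration only (not part of the runtime): the local ring buffer above
// uses free-running head/tail counters, so tail-head is the current length and
// indices are reduced modulo len(ring). A minimal sketch of the same idiom:
//
//	type ring struct {
//		head, tail uint32
//		buf        [256]uintptr
//	}
//
//	func (r *ring) put(v uintptr) bool {
//		if r.tail-r.head == uint32(len(r.buf)) {
//			return false // full
//		}
//		r.buf[r.tail%uint32(len(r.buf))] = v
//		r.tail++
//		return true
//	}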
// drain publishes n spans from the local queue to the spmc chain.
func (q *spanQueue) drain(n uint32) {
	q.putsSinceDrain = 0
	if q.chain.head == nil {
		// N.B. We target 1024, but this may be bigger if the physical
		// page size is bigger, or if we can fit more uintptrs into a
		// physical page. See newSpanSPMC docs.
		r := newSpanSPMC(1024)
		q.chain.head = r
		q.chain.tail.StoreNoWB(unsafe.Pointer(r))
	}

	// Try to drain some of the queue to the head spmc.
	if q.tryDrain(q.chain.head, n) {
		return
	}

	// No space. Create a bigger spmc and add it to the chain.
	//
	// Double the size of the next one, up to a maximum.
	//
	// We double each time so we can avoid taking this slow path
	// in the future, which involves a global lock. Ideally we want
	// to hit a steady-state where the deepest any queue goes during
	// a mark phase can fit in the ring.
	//
	// However, we still set a maximum on this. We set the maximum
	// to something large to amortize the cost of lock acquisition, but
	// still at a reasonable size for big heaps and/or a lot of Ps (which
	// tend to be correlated).
	//
	// It's not too bad to burn relatively large-but-fixed amounts of per-P
	// memory if we need to deal with really, really deep queues, since the
	// constants of proportionality are small. Simultaneously, we want to
	// avoid a situation where a single worker ends up queuing O(heap)
	// work and then forever retains a queue of that size.
	const maxCap = 1 << 20 / goarch.PtrSize
	newCap := q.chain.head.cap * 2
	if newCap > maxCap {
		newCap = maxCap
	}
	newHead := newSpanSPMC(newCap)
	if !q.tryDrain(newHead, n) {
		throw("failed to put span on newly-allocated spanSPMC")
	}
	q.chain.head.prev.StoreNoWB(unsafe.Pointer(newHead))
	q.chain.head = newHead
}

// tryDrain attempts to drain n spans from q's local queue to the chain.
//
// Returns whether it succeeded.
func (q *spanQueue) tryDrain(r *spanSPMC, n uint32) bool {
	if q.head+n > q.tail {
		throw("attempt to drain too many elements")
	}
	h := r.head.Load() // synchronize with consumers
	t := r.tail.Load()
	rn := t - h
	if rn+n <= r.cap {
		for i := uint32(0); i < n; i++ {
			*r.slot(t + i) = q.ring[(q.head+i)%uint32(len(q.ring))]
		}
		r.tail.Store(t + n) // Makes the items available for consumption.
		q.head += n
		return true
	}
	return false
}

// tryGetFast attempts to get a span from the local queue, but may fail if it's
// empty, returning a zero objptr.
func (q *spanQueue) tryGetFast() objptr {
	if q.tail-q.head == 0 {
		return 0
	}
	s := q.ring[q.head%uint32(len(q.ring))]
	q.head++
	return s
}

// steal takes some spans from the ring chain of another span queue.
//
// q == q2 is OK.
func (q *spanQueue) steal(q2 *spanQueue) objptr {
	r := (*spanSPMC)(q2.chain.tail.Load())
	if r == nil {
		return 0
	}
	for {
		// It's important that we load the next pointer
		// *before* popping the tail. In general, r may be
		// transiently empty, but if next is non-nil before
		// the pop and the pop fails, then r is permanently
		// empty, which is the only condition under which it's
		// safe to drop r from the chain.
		r2 := (*spanSPMC)(r.prev.Load())

		// Try to refill from one of the rings.
		if s := q.refill(r); s != 0 {
			return s
		}
		if r2 == nil {
			// This is the only ring. It's empty right
			// now, but could be pushed to in the future.
			return 0
		}

		// The tail of the chain has been drained, so move on
		// to the next ring. Try to drop it from the chain
		// so the next consumer doesn't have to look at the empty
		// ring again.
		if q2.chain.tail.CompareAndSwapNoWB(unsafe.Pointer(r), unsafe.Pointer(r2)) {
			r.dead.Store(true)
		}
		r = r2
	}
}
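
// For illustration only (not part of the runtime): how the chain's ring
// capacities grow in drain above, assuming a 64-bit platform (goarch.PtrSize
// == 8) and the default starting capacity of 1024.
//
//	maxCap = 1<<20 / 8 = 131072 entries (1 MiB of objptrs per ring)
//	growth: 1024 -> 2048 -> 4096 -> ... -> 131072, then capped
//
// Doubling means the global lock in newSpanSPMC is taken only O(log(depth))
// times even for very deep queues.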
// refill takes some spans from r and puts them into q's local queue.
//
// One span is removed from the stolen spans and returned on success.
// Failure to steal returns a zero objptr.
//
// refill is thread-safe with respect to r.
func (q *spanQueue) refill(r *spanSPMC) objptr {
	if q.tail-q.head != 0 {
		throw("steal with local work available")
	}

	// Steal some spans.
	var n uint32
	for {
		h := r.head.Load() // load-acquire, synchronize with other consumers
		t := r.tail.Load() // load-acquire, synchronize with the producer
		n = t - h
		n = n - n/2
		if n == 0 {
			return 0
		}
		if n > r.cap { // read inconsistent h and t
			continue
		}
		n = min(n, uint32(len(q.ring)/2))
		for i := uint32(0); i < n; i++ {
			q.ring[i] = *r.slot(h + i)
		}
		if r.head.CompareAndSwap(h, h+n) {
			break
		}
	}

	// Update local queue head and tail to reflect new buffered values.
	q.head = 0
	q.tail = n

	// Pop off the head of the queue and return it.
	return q.tryGetFast()
}

// destroy frees all chains in an empty spanQueue.
//
// Preconditions:
//   - World is stopped.
//   - GC is outside of the mark phase.
//   - (Therefore) the queue is empty.
func (q *spanQueue) destroy() {
	assertWorldStopped()

	if gcphase != _GCoff {
		throw("spanQueue.destroy during the mark phase")
	}
	if !q.empty() {
		throw("spanQueue.destroy on non-empty queue")
	}
	lock(&work.spanSPMCs.lock)

	// Remove and free each ring.
	r := (*spanSPMC)(q.chain.tail.Load())
	for r != nil {
		// Grab the next link up the chain before deinit clears it.
		rp := (*spanSPMC)(r.prev.Load())

		// Unlink r from the work.spanSPMCs list.
		prev := r.allprev
		next := r.allnext
		if prev != nil {
			prev.allnext = next
		}
		if next != nil {
			next.allprev = prev
		}
		if work.spanSPMCs.all == r {
			work.spanSPMCs.all = next
		}
		r.deinit()
		mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
		r = rp
	}
	q.chain.head = nil
	q.chain.tail.Store(nil)
	q.putsSinceDrain = 0

	unlock(&work.spanSPMCs.lock)
}
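
// For illustration only (not part of the runtime): the shape of the consumer
// side of the SPMC ring that refill above implements. Consumers race on head
// with a CAS; the (head, tail) snapshot may be momentarily inconsistent, which
// is why the count is re-checked against the capacity before copying.
//
//	for {
//		h, t := head.Load(), tail.Load()
//		n := (t - h) - (t-h)/2              // take roughly half
//		if n == 0 { return 0 }              // nothing to steal
//		if n > cap { continue }             // inconsistent snapshot; retry
//		// copy n slots starting at h
//		if head.CompareAndSwap(h, h+n) {    // we now own those n slots
//			break
//		}
//	}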
// spanSPMC is a ring buffer of objptrs that represent spans.
// Accessed without a lock.
//
// Single-producer, multi-consumer. The only producer is the P that owns this
// queue, but any other P may consume from it.
//
// ## Invariants for memory management
//
// 1. All spanSPMCs are allocated from mheap_.spanSPMCAlloc.
// 2. All allocated spanSPMCs must be on the work.spanSPMCs list.
// 3. spanSPMCs may only be allocated if gcphase != _GCoff.
// 4. spanSPMCs may only be deallocated if gcphase == _GCoff.
//
// Invariants (3) and (4) ensure that we do not need to concern ourselves with
// tricky reuse issues that stem from not knowing when a thread is truly done
// with a spanSPMC. For example, two threads could load the same spanSPMC from
// the tail of the chain. One thread is then paused while the other steals the
// last few elements off of it. It's not safe to free at that point since the
// other thread will still inspect that spanSPMC, and we have no way of knowing
// without more complex and/or heavyweight synchronization.
//
// Instead, we rely on the global synchronization inherent to GC phases, and
// the fact that spanSPMCs are only ever used during the mark phase, to ensure
// memory safety. This means we temporarily waste some memory, but it's only
// until the end of the mark phase.
type spanSPMC struct {
	_ sys.NotInHeap

	// allnext is the link to the next spanSPMC on the work.spanSPMCs list.
	// This is used to find and free dead spanSPMCs. Protected by
	// work.spanSPMCs.lock.
	allnext *spanSPMC

	// allprev is the link to the previous spanSPMC on the work.spanSPMCs
	// list. This is used to find and free dead spanSPMCs. Protected by
	// work.spanSPMCs.lock.
	allprev *spanSPMC

	// dead indicates whether the spanSPMC is no longer in use.
	// Protected by the CAS to the prev field of the spanSPMC pointing
	// to this spanSPMC. That is, whoever wins that CAS takes ownership
	// of marking this spanSPMC as dead. See spanQueue.steal for details.
	dead atomic.Bool

	// prev is the next link up a spanQueue's SPMC chain, from tail to head,
	// hence the name "prev." Set by a spanQueue's producer, cleared by a
	// CAS in spanQueue.steal.
	prev atomic.UnsafePointer // *spanSPMC

	// head, tail, cap, and ring together represent a fixed-size SPMC lock-free
	// ring buffer of size cap. The ring buffer contains objptr values.
	head atomic.Uint32
	tail atomic.Uint32
	cap  uint32 // cap(ring)
	ring *objptr
}

// newSpanSPMC allocates and initializes a new spmc with the provided capacity.
//
// newSpanSPMC may override the capacity with a larger one if the provided one would
// waste memory.
func newSpanSPMC(cap uint32) *spanSPMC {
	lock(&work.spanSPMCs.lock)
	r := (*spanSPMC)(mheap_.spanSPMCAlloc.alloc())
	next := work.spanSPMCs.all
	r.allnext = next
	if next != nil {
		next.allprev = r
	}
	work.spanSPMCs.all = r
	unlock(&work.spanSPMCs.lock)

	// If cap < the capacity of a single physical page, round up.
	pageCap := uint32(physPageSize / goarch.PtrSize) // capacity of a single page
	if cap < pageCap {
		cap = pageCap
	}
	if cap&(cap-1) != 0 {
		throw("spmc capacity must be a power of 2")
	}
	r.cap = cap
	ring := sysAlloc(uintptr(cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys, "GC span queue")
	atomic.StorepNoWB(unsafe.Pointer(&r.ring), ring)
	return r
}

// empty returns true if the spmc is empty.
//
// empty is thread-safe.
func (r *spanSPMC) empty() bool {
	h := r.head.Load()
	t := r.tail.Load()
	return t == h
}

// deinit frees any resources the spanSPMC is holding onto and zeroes it.
func (r *spanSPMC) deinit() {
	sysFree(unsafe.Pointer(r.ring), uintptr(r.cap)*unsafe.Sizeof(objptr(0)), &memstats.gcMiscSys)
	r.ring = nil
	r.dead.Store(false)
	r.prev.StoreNoWB(nil)
	r.head.Store(0)
	r.tail.Store(0)
	r.cap = 0
	r.allnext = nil
	r.allprev = nil
}

// slot returns a pointer to slot i%r.cap.
func (r *spanSPMC) slot(i uint32) *objptr {
	idx := uintptr(i & (r.cap - 1))
	return (*objptr)(unsafe.Add(unsafe.Pointer(r.ring), idx*unsafe.Sizeof(objptr(0))))
}

// freeDeadSpanSPMCs frees dead spanSPMCs back to the OS.
func freeDeadSpanSPMCs() {
	// According to the SPMC memory management invariants, we can only free
	// spanSPMCs outside of the mark phase. We ensure we do this in two ways.
	//
	// 1. We take the work.spanSPMCs lock, which we need anyway. This ensures
	//    that we are non-preemptible. If this path becomes lock-free, we will
	//    need to become non-preemptible in some other way.
	// 2. Once we are non-preemptible, we check the gcphase, and back out if
	//    it's not safe.
	//
	// This way, we ensure that we don't start freeing if we're in the wrong
	// phase, and the phase can't change on us while we're freeing.
	//
	// TODO(go.dev/issue/75771): Due to the grow semantics in
	// spanQueue.drain, we expect a steady-state of around one spanSPMC per
	// P, with some spikes higher when Ps have more than one. For high
	// GOMAXPROCS, or if this list otherwise gets long, it would be nice to
	// have a way to batch work that allows preemption during processing.
	lock(&work.spanSPMCs.lock)
	if gcphase != _GCoff || work.spanSPMCs.all == nil {
		unlock(&work.spanSPMCs.lock)
		return
	}
	r := work.spanSPMCs.all
	for r != nil {
		next := r.allnext
		if r.dead.Load() {
			// It's dead. Deinitialize and free it.
			prev := r.allprev
			if prev != nil {
				prev.allnext = next
			}
			if next != nil {
				next.allprev = prev
			}
			if work.spanSPMCs.all == r {
				work.spanSPMCs.all = next
			}
			r.deinit()
			mheap_.spanSPMCAlloc.free(unsafe.Pointer(r))
		}
		r = next
	}
	unlock(&work.spanSPMCs.lock)
}
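
// For illustration only (not part of the runtime): the capacity rounding in
// newSpanSPMC above, assuming a 4 KiB physical page and 8-byte pointers.
//
//	pageCap = 4096 / 8 = 512 entries
//
// Any requested capacity below 512 is rounded up to 512, since the ring's
// backing store comes from sysAlloc, which operates on whole pages anyway.
// Capacities must remain powers of two so slot can reduce indices with a
// simple mask (i & (cap-1)).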
// tryStealSpan attempts to steal a span from another P's local queue.
//
// Returns a non-zero objptr on success.
func (w *gcWork) tryStealSpan() objptr {
	pp := getg().m.p.ptr()

	for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() {
		if !work.spanqMask.read(enum.position()) {
			continue
		}
		p2 := allp[enum.position()]
		if pp == p2 {
			continue
		}
		if s := w.spanq.steal(&p2.gcw.spanq); s != 0 {
			return s
		}
		// N.B. This is intentionally racy. We may stomp on a mask set by
		// a P that just put a bunch of work into its local queue.
		//
		// This is OK because the ragged barrier in gcMarkDone will set
		// the bit on each P if there's local work we missed. This race
		// should generally be rare, since the window between noticing
		// an empty local queue and this bit being set is quite small.
		work.spanqMask.clear(int32(enum.position()))
	}
	return 0
}

// objptr consists of a span base and the index of the object in the span.
type objptr uintptr

// makeObjPtr creates an objptr from a span base address and an object index.
func makeObjPtr(spanBase uintptr, objIndex uint16) objptr {
	if doubleCheckGreenTea && spanBase&((1<<gc.PageShift)-1) != 0 {
		throw("created objptr with unaligned span base")
	}
	return objptr(spanBase | uintptr(objIndex))
}

func (p objptr) spanBase() uintptr {
	return uintptr(p) &^ ((1 << gc.PageShift) - 1)
}

func (p objptr) objIndex() uint16 {
	return uint16(uintptr(p) & ((1 << gc.PageShift) - 1))
}

// scanSpan scans the objects on the span represented by p that have been
// marked but not yet scanned, queueing the pointers it finds into gcw.
func scanSpan(p objptr, gcw *gcWork) {
	spanBase := p.spanBase()
	imb := spanInlineMarkBitsFromBase(spanBase)
	spanclass := imb.class
	elemsize := uintptr(gc.SizeClassToSize[spanclass.sizeclass()])

	// Release the span. If only one mark bit could have been set since the
	// span was queued, it must be the one that caused the span to be queued,
	// so fast-track scanning just that object.
	if imb.release() == spanScanOneMark {
		objIndex := p.objIndex()
		bytep := &imb.scans[objIndex/8]
		mask := uint8(1) << (objIndex % 8)
		if atomic.Load8(bytep)&mask != 0 {
			return
		}
		atomic.Or8(bytep, mask)
		gcw.bytesMarked += uint64(elemsize)
		if debug.gctrace > 1 {
			gcw.stats[spanclass.sizeclass()].sparseObjsScanned++
		}
		b := spanBase + uintptr(objIndex)*elemsize
		scanObjectSmall(spanBase, b, elemsize, gcw)
		return
	}

	// Compute nelems.
	divMagic := uint64(gc.SizeClassToDivMagic[spanclass.sizeclass()])
	usableSpanSize := uint64(gc.PageSize - unsafe.Sizeof(spanInlineMarkBits{}))
	if !spanclass.noscan() {
		usableSpanSize -= gc.PageSize / goarch.PtrSize / 8
	}
	nelems := uint16((usableSpanSize * divMagic) >> 32)

	// Grey objects and return if there's nothing else to do.
	var toScan gc.ObjMask
	objsMarked := spanSetScans(spanBase, nelems, imb, &toScan)
	if objsMarked == 0 {
		return
	}
	gcw.bytesMarked += uint64(objsMarked) * uint64(elemsize)

	// Check if we have enough density to make a dartboard scan
	// worthwhile. If not, just do what scanobject does, but
	// localized to the span, using the dartboard.
	if !scan.HasFastScanSpanPacked() || objsMarked < int(nelems/8) {
		if debug.gctrace > 1 {
			gcw.stats[spanclass.sizeclass()].spansSparseScanned++
			gcw.stats[spanclass.sizeclass()].spanObjsSparseScanned += uint64(objsMarked)
		}
		scanObjectsSmall(spanBase, elemsize, nelems, gcw, &toScan)
		return
	}

	// Scan the span.
	//
	// N.B. Use gcw.ptrBuf as the output buffer. This is a bit different
	// from scanObjectsSmall, which puts addresses to dereference into the
	// buffer. ScanSpanPacked, on the other hand, fills gcw.ptrBuf with
	// already dereferenced pointers.
	nptrs := scan.ScanSpanPacked(
		unsafe.Pointer(spanBase),
		&gcw.ptrBuf[0],
		&toScan,
		uintptr(spanclass.sizeclass()),
		spanPtrMaskUnsafe(spanBase),
	)
	gcw.heapScanWork += int64(objsMarked) * int64(elemsize)

	if debug.gctrace > 1 {
		// Write down some statistics.
		gcw.stats[spanclass.sizeclass()].spansDenseScanned++
		gcw.stats[spanclass.sizeclass()].spanObjsDenseScanned += uint64(objsMarked)
	}

	// Process all the pointers we just got.
	for _, p := range gcw.ptrBuf[:nptrs] {
		if !tryDeferToSpanScan(p, gcw) {
			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
				greyobject(obj, 0, 0, span, gcw, objIndex)
			}
		}
	}
}
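
// For illustration only (not part of the runtime): the nelems computation in
// scanSpan above, worked through for a pointer-bearing 48-byte size class and
// an 8 KiB span page (gc.PageSize == 8192) on a 64-bit platform.
//
//	inline mark bits: unsafe.Sizeof(spanInlineMarkBits{}) = 128 bytes
//	pointer bitmap:   8192/8/8                            = 128 bytes
//	usableSpanSize  = 8192 - 128 - 128                    = 7936 bytes
//	nelems          = (7936 * divMagic) >> 32             = 7936/48 = 165
//
// With HasFastScanSpanPacked reporting true, the dense ScanSpanPacked path is
// then only taken when at least nelems/8 (here, 20) of those objects were
// marked while the span sat on the queue.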
// spanSetScans sets any unset scan bits that have their mark bits set in the inline mark bits.
//
// toScan is populated with bits indicating whether a particular mark bit was set.
//
// Returns the number of objects marked, which could be zero.
func spanSetScans(spanBase uintptr, nelems uint16, imb *spanInlineMarkBits, toScan *gc.ObjMask) int {
	arena, pageIdx, pageMask := pageIndexOf(spanBase)
	if arena.pageMarks[pageIdx]&pageMask == 0 {
		atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
	}

	bytes := divRoundUp(uintptr(nelems), 8)
	objsMarked := 0

	// Careful: these two structures alias since ObjMask is much bigger
	// than marks or scans. We do these unsafe shenanigans so that we can
	// access the marks and scans by uintptrs rather than by byte.
	imbMarks := (*gc.ObjMask)(unsafe.Pointer(&imb.marks))
	imbScans := (*gc.ObjMask)(unsafe.Pointer(&imb.scans))

	// Iterate over one uintptr-sized chunk at a time, computing both
	// the union and difference of marks and scans. Store the union
	// into scans, and the difference into toScan.
	for i := uintptr(0); i < bytes; i += goarch.PtrSize {
		scans := atomic.Loaduintptr(&imbScans[i/goarch.PtrSize])
		marks := imbMarks[i/goarch.PtrSize]
		scans = bswapIfBigEndian(scans)
		marks = bswapIfBigEndian(marks)
		if i/goarch.PtrSize == uintptr(len(imb.marks)+1)/goarch.PtrSize-1 {
			scans &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out owned
			marks &^= 0xff << ((goarch.PtrSize - 1) * 8) // mask out class
		}
		toGrey := marks &^ scans
		toScan[i/goarch.PtrSize] = toGrey

		// If there's anything left to grey, do it.
		if toGrey != 0 {
			toGrey = bswapIfBigEndian(toGrey)
			if goarch.PtrSize == 4 {
				atomic.Or32((*uint32)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint32(toGrey))
			} else {
				atomic.Or64((*uint64)(unsafe.Pointer(&imbScans[i/goarch.PtrSize])), uint64(toGrey))
			}
		}
		objsMarked += sys.OnesCount64(uint64(toGrey))
	}
	return objsMarked
}

func scanObjectSmall(spanBase, b, objSize uintptr, gcw *gcWork) {
	ptrBits := heapBitsSmallForAddrInline(spanBase, b, objSize)
	gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)
	nptrs := 0
	n := sys.OnesCount64(uint64(ptrBits))
	for range n {
		k := sys.TrailingZeros64(uint64(ptrBits))
		ptrBits &^= 1 << k
		addr := b + uintptr(k)*goarch.PtrSize

		// Prefetch addr since we're about to use it. This point for prefetching
		// was chosen empirically.
		sys.Prefetch(addr)

		// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
		gcw.ptrBuf[nptrs] = addr
		nptrs++
	}

	// Process all the pointers we just got.
	for _, p := range gcw.ptrBuf[:nptrs] {
		p = *(*uintptr)(unsafe.Pointer(p))
		if p == 0 {
			continue
		}
		if !tryDeferToSpanScan(p, gcw) {
			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
				greyobject(obj, 0, 0, span, gcw, objIndex)
			}
		}
	}
}
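
// For illustration only (not part of the runtime): the set-bit iteration idiom
// used by scanObjectSmall above and scanObjectsSmall below. Each iteration
// peels off the lowest set bit, so the loop body runs once per pointer word.
//
//	for bits != 0 {
//		k := sys.TrailingZeros64(bits) // index of the lowest set bit
//		bits &^= 1 << k                // clear it
//		// ... handle word k ...
//	}
//
// The runtime's version counts the bits up front with OnesCount64 and loops
// "for range n", which is equivalent since exactly n bits get cleared.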
func scanObjectsSmall(base, objSize uintptr, elems uint16, gcw *gcWork, scans *gc.ObjMask) {
	nptrs := 0
	for i, bits := range scans {
		if i*(goarch.PtrSize*8) > int(elems) {
			break
		}
		n := sys.OnesCount64(uint64(bits))
		for range n {
			j := sys.TrailingZeros64(uint64(bits))
			bits &^= 1 << j

			b := base + uintptr(i*(goarch.PtrSize*8)+j)*objSize
			ptrBits := heapBitsSmallForAddrInline(base, b, objSize)
			gcw.heapScanWork += int64(sys.Len64(uint64(ptrBits)) * goarch.PtrSize)

			n := sys.OnesCount64(uint64(ptrBits))
			for range n {
				k := sys.TrailingZeros64(uint64(ptrBits))
				ptrBits &^= 1 << k
				addr := b + uintptr(k)*goarch.PtrSize

				// Prefetch addr since we're about to use it. This point for prefetching
				// was chosen empirically.
				sys.Prefetch(addr)

				// N.B. ptrBuf is always large enough to hold pointers for an entire 1-page span.
				gcw.ptrBuf[nptrs] = addr
				nptrs++
			}
		}
	}

	// Process all the pointers we just got.
	for _, p := range gcw.ptrBuf[:nptrs] {
		p = *(*uintptr)(unsafe.Pointer(p))
		if p == 0 {
			continue
		}
		if !tryDeferToSpanScan(p, gcw) {
			if obj, span, objIndex := findObject(p, 0, 0); obj != 0 {
				greyobject(obj, 0, 0, span, gcw, objIndex)
			}
		}
	}
}

func heapBitsSmallForAddrInline(spanBase, addr, elemsize uintptr) uintptr {
	hbitsBase, _ := spanHeapBitsRange(spanBase, gc.PageSize, elemsize)
	hbits := (*byte)(unsafe.Pointer(hbitsBase))

	// These objects are always small enough that their bitmaps
	// fit in a single word, so just load the word or two we need.
	//
	// Mirrors mspan.writeHeapBitsSmall.
	//
	// We should be using heapBits(), but unfortunately it introduces
	// bounds check panics and a throw, which cause us to exceed
	// the nosplit limit in quite a few cases.
	i := (addr - spanBase) / goarch.PtrSize / ptrBits
	j := (addr - spanBase) / goarch.PtrSize % ptrBits
	bits := elemsize / goarch.PtrSize
	word0 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+0))))
	word1 := (*uintptr)(unsafe.Pointer(addb(hbits, goarch.PtrSize*(i+1))))

	var read uintptr
	if j+bits > ptrBits {
		// Two reads.
		bits0 := ptrBits - j
		bits1 := bits - bits0
		read = *word0 >> j
		read |= (*word1 & ((1 << bits1) - 1)) << bits0
	} else {
		// One read.
		read = (*word0 >> j) & ((1 << bits) - 1)
	}
	return read
}

// spanPtrMaskUnsafe returns the pointer mask for a span with inline mark bits.
//
// The caller must ensure spanBase is the base of a span that:
//   - Is 1 page in size,
//   - Uses inline mark bits,
//   - Contains pointers.
func spanPtrMaskUnsafe(spanBase uintptr) *gc.PtrMask {
	base := spanBase + gc.PageSize - unsafe.Sizeof(gc.PtrMask{}) - unsafe.Sizeof(spanInlineMarkBits{})
	return (*gc.PtrMask)(unsafe.Pointer(base))
}
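
// For illustration only (not part of the runtime): the straddling bitmap read
// in heapBitsSmallForAddrInline above, assuming 64-bit words (ptrBits == 64).
// An object whose pointer/scalar bits start at bit j of word i and cover
// "bits" bits may spill into word i+1. For j = 58 and bits = 12:
//
//	word i:   bits 58..63 -> low 6 bits of the result (bits0 = 64-58 = 6)
//	word i+1: bits 0..5   -> next 6 bits of the result (bits1 = 12-6 = 6)
//
//	read := word0>>58 | (word1 & (1<<6 - 1)) << 6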
type sizeClassScanStats struct {
	spansDenseScanned     uint64 // Spans scanned with ScanSpanPacked.
	spanObjsDenseScanned  uint64 // Objects scanned with ScanSpanPacked.
	spansSparseScanned    uint64 // Spans scanned with scanObjectsSmall.
	spanObjsSparseScanned uint64 // Objects scanned with scanObjectsSmall.
	sparseObjsScanned     uint64 // Objects scanned with scanobject or scanObjectSmall.
	// Note: sparseObjsScanned is sufficient for both cases because
	// a particular size class either uses scanobject or scanObjectSmall,
	// not both. In the latter case, we also know that there was one
	// object scanned per span, so no need for a span counter.
}

func dumpScanStats() {
	var (
		spansDenseScanned     uint64
		spanObjsDenseScanned  uint64
		spansSparseScanned    uint64
		spanObjsSparseScanned uint64
		sparseObjsScanned     uint64
	)
	for _, stats := range memstats.lastScanStats {
		spansDenseScanned += stats.spansDenseScanned
		spanObjsDenseScanned += stats.spanObjsDenseScanned
		spansSparseScanned += stats.spansSparseScanned
		spanObjsSparseScanned += stats.spanObjsSparseScanned
		sparseObjsScanned += stats.sparseObjsScanned
	}
	totalObjs := sparseObjsScanned + spanObjsSparseScanned + spanObjsDenseScanned
	totalSpans := spansSparseScanned + spansDenseScanned
	print("scan: total ", sparseObjsScanned, "+", spanObjsSparseScanned, "+", spanObjsDenseScanned, "=", totalObjs, " objs")
	print(", ", spansSparseScanned, "+", spansDenseScanned, "=", totalSpans, " spans\n")
	for i, stats := range memstats.lastScanStats {
		if stats == (sizeClassScanStats{}) {
			continue
		}
		totalObjs := stats.sparseObjsScanned + stats.spanObjsSparseScanned + stats.spanObjsDenseScanned
		totalSpans := stats.spansSparseScanned + stats.spansDenseScanned
		if i == 0 {
			print("scan: class L ")
		} else {
			print("scan: class ", gc.SizeClassToSize[i], "B ")
		}
		print(stats.sparseObjsScanned, "+", stats.spanObjsSparseScanned, "+", stats.spanObjsDenseScanned, "=", totalObjs, " objs")
		print(", ", stats.spansSparseScanned, "+", stats.spansDenseScanned, "=", totalSpans, " spans\n")
	}
}

func (w *gcWork) flushScanStats(dst *[gc.NumSizeClasses]sizeClassScanStats) {
	for i := range w.stats {
		dst[i].spansDenseScanned += w.stats[i].spansDenseScanned
		dst[i].spanObjsDenseScanned += w.stats[i].spanObjsDenseScanned
		dst[i].spansSparseScanned += w.stats[i].spansSparseScanned
		dst[i].spanObjsSparseScanned += w.stats[i].spanObjsSparseScanned
		dst[i].sparseObjsScanned += w.stats[i].sparseObjsScanned
	}
	clear(w.stats[:])
}

// gcMarkWorkAvailable reports whether there's any non-local work available to do.
//
// This is a heavyweight check and must only be used for correctness, not
// as a hint.
func gcMarkWorkAvailable() bool {
	if !work.full.empty() {
		return true // global work available
	}
	if work.markrootNext.Load() < work.markrootJobs.Load() {
		return true // root scan work available
	}
	if work.spanqMask.any() {
		return true // stealable local work available
	}
	return false
}

// scanObject scans the object starting at b, adding pointers to gcw.
// b must point to the beginning of a heap object or an oblet.
// scanObject consults the GC bitmap for the pointer mask and the
// spans for the size of the object.
//
// Used only for !gcUsesSpanInlineMarkBits spans, but supports all
// object sizes and is safe to be called on all heap objects.
//
//go:nowritebarrier
func scanObject(b uintptr, gcw *gcWork) {
	// Prefetch object before we scan it.
	//
	// This will overlap fetching the beginning of the object with initial
	// setup before we start scanning the object.
	sys.Prefetch(b)

	// Find the bits for b and the size of the object at b.
	//
	// b is either the beginning of an object, in which case this
	// is the size of the object to scan, or it points to an
	// oblet, in which case we compute the size to scan below.
	s := spanOfUnchecked(b)
	n := s.elemsize
	if n == 0 {
		throw("scanObject n == 0")
	}
	if s.spanclass.noscan() {
		// Correctness-wise this is ok, but it's inefficient
		// if noscan objects reach here.
		throw("scanObject of a noscan object")
	}
	var tp typePointers
	if n > maxObletBytes {
		// Large object. Break into oblets for better
		// parallelism and lower latency.
		if b == s.base() {
			// Enqueue the other oblets to scan later.
			// Some oblets may be in b's scalar tail, but
			// these will be marked as "no more pointers",
			// so we'll drop out immediately when we go to
			// scan those.
			for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
				if !gcw.putObjFast(oblet) {
					gcw.putObj(oblet)
				}
			}
		}

		// Compute the size of the oblet. Since this object
		// must be a large object, s.base() is the beginning
		// of the object.
		n = s.base() + s.elemsize - b
		n = min(n, maxObletBytes)
		tp = s.typePointersOfUnchecked(s.base())
		tp = tp.fastForward(b-tp.addr, b+n)
	} else {
		tp = s.typePointersOfUnchecked(b)
	}

	var scanSize uintptr
	for {
		var addr uintptr
		if tp, addr = tp.nextFast(); addr == 0 {
			if tp, addr = tp.next(b + n); addr == 0 {
				break
			}
		}

		// Keep track of farthest pointer we found, so we can
		// update heapScanWork. TODO: is there a better metric,
		// now that we can skip scalar portions pretty efficiently?
		scanSize = addr - b + goarch.PtrSize

		// Work here is duplicated in scanblock and above.
		// If you make changes here, make changes there too.
		obj := *(*uintptr)(unsafe.Pointer(addr))

		// At this point we have extracted the next potential pointer.
		// Quickly filter out nil and pointers back to the current object.
		if obj != 0 && obj-b >= n {
			// Test if obj points into the Go heap and, if so,
			// mark the object.
			//
			// Note that it's possible for findObject to
			// fail if obj points to a just-allocated heap
			// object because of a race with growing the
			// heap. In this case, we know the object was
			// just allocated and hence will be marked by
			// allocation itself.
			if !tryDeferToSpanScan(obj, gcw) {
				if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
					greyobject(obj, b, addr-b, span, gcw, objIndex)
				}
			}
		}
	}
	gcw.bytesMarked += uint64(n)
	gcw.heapScanWork += int64(scanSize)
	if debug.gctrace > 1 {
		gcw.stats[s.spanclass.sizeclass()].sparseObjsScanned++
	}
}
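
// For illustration only (not part of the runtime): how scanObject above breaks
// a large object into oblets, assuming maxObletBytes == 128 KiB. For an object
// whose elemsize is 320 KiB (40 pages) at base b:
//
//	[b,       b+128K)  scanned by the call with b == s.base()
//	[b+128K,  b+256K)  enqueued, later scanned as a 128 KiB oblet
//	[b+256K,  b+320K)  enqueued, later scanned as a 64 KiB oblet
//
// Each enqueued oblet recomputes its own length as
// min(s.base()+s.elemsize-b, maxObletBytes).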