// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains stub functions that are not meant to be called directly,
// but that will be assembled together using the inlining logic in runtime/_mkmalloc
// to produce a full mallocgc function that's specialized for a span class,
// or for a specific size in the case of the tiny allocator.
//
// To assemble a mallocgc function, the mallocStub function is cloned, and the call to
// inlinedMalloc is replaced with the inlined body of smallScanNoHeaderStub,
// smallNoScanStub, or tinyStub, depending on the parameters being specialized.
//
// The size_ (for the tiny case) and elemsize_, sizeclass_, and noscanint_ (for all three cases)
// identifiers are replaced with the value of the parameter in the specialized case.
// The nextFreeFastStub, nextFreeFastTiny, heapSetTypeNoHeaderStub, and writeHeapBitsSmallStub
// functions are also inlined by _mkmalloc.

package runtime

import (
	"internal/goarch"
	"internal/runtime/sys"
	"unsafe"
)

// These identifiers will all be replaced by the inliner, so their values don't
// really matter: they just need to be set so that the stub functions, which
// will never be used on their own, can compile. elemsize_ can't be set to
// zero because we divide by it in nextFreeFastTiny, and the compiler would
// complain about a division by zero. Its replaced value will always be greater
// than zero.
const elemsize_ = 8
const sizeclass_ = 0
const noscanint_ = 0
const size_ = 0

func malloc0(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if doubleCheckMalloc {
		if gcphase == _GCmarktermination {
			throw("mallocgc called with gcphase == _GCmarktermination")
		}
	}
	// Short-circuit zero-sized allocation requests.
	return unsafe.Pointer(&zerobase)
}

func mallocPanic(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	panic("not defined for sizeclass")
}

// WARNING: mallocStub does not do any work for sanitizers, so callers need
// to steer out of this code path early if sanitizers are enabled.
func mallocStub(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
	if doubleCheckMalloc {
		if gcphase == _GCmarktermination {
			throw("mallocgc called with gcphase == _GCmarktermination")
		}
	}

	// It's possible for any malloc to trigger sweeping, which may in
	// turn queue finalizers. Record this dynamic lock edge.
	// N.B. Compiled away if the lockrank experiment is not enabled.
	lockRankMayQueueFinalizer()

	// Pre-malloc debug hooks.
	if debug.malloc {
		if x := preMallocgcDebug(size, typ); x != nil {
			return x
		}
	}

	// Assist the GC if needed.
	if gcBlackenEnabled != 0 {
		deductAssistCredit(size)
	}

	// Actually do the allocation.
	x, elemsize := inlinedMalloc(size, typ, needzero)

	// Notify valgrind, if enabled.
	// To keep the compiler from needing to know about valgrind, we do the
	// valgrind instrumentation here, unlike the other sanitizers.
	if valgrindenabled {
		valgrindMalloc(x, size)
	}

	// Adjust our GC assist debt to account for internal fragmentation.
	if gcBlackenEnabled != 0 && elemsize != 0 {
		if assistG := getg().m.curg; assistG != nil {
			assistG.gcAssistBytes -= int64(elemsize - size)
		}
	}

	// Post-malloc debug hooks.
	if debug.malloc {
		postMallocgcDebug(x, elemsize, typ)
	}
	return x
}
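
// For illustration only: the sketch below is not part of the runtime, and the
// function name is hypothetical. If _mkmalloc specializes mallocStub for, say,
// size class 5 (48-byte elements) in the noscan case, the generated function
// behaves roughly as if it were written as:
//
//	func mallocgcSmallNoScanSC5(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
//		// ... mallocStub prologue (debug hooks, GC assist) ...
//		// Body of smallNoScanStub inlined here, with sizeclass_ -> 5,
//		// elemsize_ -> 48, and noscanint_ -> 1 substituted as constants.
//		// ... mallocStub epilogue (assist debt adjustment, debug hooks) ...
//		return x
//	}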

// inlinedMalloc will never be called. It is defined just so that the compiler can compile
// the mallocStub function, which will also never be called, but is instead used as a template
// to generate a size-specialized malloc function. The call to inlinedMalloc in mallocStub
// will be replaced with the inlined body of smallScanNoHeaderStub, smallNoScanStub, or tinyStub
// when generating the size-specialized malloc function. See the comment at the top of this
// file for more information.
func inlinedMalloc(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	return unsafe.Pointer(uintptr(0)), 0
}

func doubleCheckSmallScanNoHeader(size uintptr, typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ == nil || !typ.Pointers() {
		throw("noscan allocated in scan-only path")
	}
	if !heapBitsInSpan(size) {
		throw("heap bits not in span for non-header-only path")
	}
}

func smallScanNoHeaderStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	const sizeclass = sizeclass_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckSmallScanNoHeader(size, typ, mp)
	}
	mp.mallocing = 1

	checkGCTrigger := false
	c := getMCache(mp)
	const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
	span := c.alloc[spc]
	v := nextFreeFastStub(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(spc)
	}
	x := unsafe.Pointer(v)
	if span.needzero != 0 {
		memclrNoHeapPointers(x, elemsize)
	}
	if goarch.PtrSize == 8 && sizeclass == 1 {
		// initHeapBits already set the pointer bits for the 8-byte sizeclass
		// on 64-bit platforms.
		c.scanAlloc += 8
	} else {
		dataSize := size // make the inliner happy
		x := uintptr(x)
		scanSize := heapSetTypeNoHeaderStub(x, dataSize, typ, span)
		c.scanAlloc += scanSize
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}
	return x, elemsize
}
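
// The spanClass arithmetic used by the stubs packs the size class and the
// noscan bit into one value: class<<1 | noscan. A small illustrative example
// (the concrete class number is hypothetical):
//
//	spc := spanClass(5<<1) | spanClass(1) // size class 5, noscan => 0b1011
//	// spc>>1 == 5 recovers the size class; spc&1 != 0 reports noscan.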

func doubleCheckSmallNoScan(typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ != nil && typ.Pointers() {
		throw("expected noscan type for noscan alloc")
	}
}

func smallNoScanStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	// TODO(matloob): Add functionality to mkmalloc to allow us to inline non-constant
	// sizeclass_ and elemsize_ values (instead, just set them to the expressions that
	// look up the size class and elemsize). We'd also need to teach mkmalloc that values
	// derived from these (specifically spc below) should turn into vars. This would allow
	// us to generate mallocgcSmallNoScan itself, so that its code could not diverge from
	// the generated functions.
	const sizeclass = sizeclass_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckSmallNoScan(typ, mp)
	}
	mp.mallocing = 1

	checkGCTrigger := false
	c := getMCache(mp)
	const spc = spanClass(sizeclass<<1) | spanClass(noscanint_)
	span := c.alloc[spc]
	v := nextFreeFastStub(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(spc)
	}
	x := unsafe.Pointer(v)
	if needzero && span.needzero != 0 {
		memclrNoHeapPointers(x, elemsize)
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}
	return x, elemsize
}
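
// Illustrative only (the helper names here are hypothetical, not runtime
// API): the c.nextSample countdown in the stubs above implements byte-based
// profiling. Each allocation subtracts its slot size, and a sample is taken
// when the counter crosses zero, so on average roughly one sample is
// recorded per MemProfileRate bytes allocated:
//
//	nextSample -= int64(elemsize)
//	if nextSample < 0 {
//		recordSample()              // hypothetical: record this allocation
//		nextSample = nextInterval() // hypothetical: draw the next threshold
//	}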

func doubleCheckTiny(size uintptr, typ *_type, mp *m) {
	if mp.mallocing != 0 {
		throw("malloc deadlock")
	}
	if mp.gsignal == getg() {
		throw("malloc during signal")
	}
	if typ != nil && typ.Pointers() {
		throw("expected noscan for tiny alloc")
	}
}

func tinyStub(size uintptr, typ *_type, needzero bool) (unsafe.Pointer, uintptr) {
	const constsize = size_
	const elemsize = elemsize_

	// Set mp.mallocing to keep from being preempted by GC.
	mp := acquirem()
	if doubleCheckMalloc {
		doubleCheckTiny(constsize, typ, mp)
	}
	mp.mallocing = 1

	// Tiny allocator.
	//
	// The tiny allocator combines several tiny allocation requests
	// into a single memory block. The resulting memory block
	// is freed when all subobjects are unreachable. The subobjects
	// must be noscan (don't have pointers), which ensures that
	// the amount of potentially wasted memory is bounded.
	//
	// The size of the memory block used for combining (maxTinySize) is tunable.
	// The current setting is 16 bytes, which relates to 2x worst case memory
	// wastage (when all but one subobject is unreachable).
	// 8 bytes would result in no wastage at all, but provides fewer
	// opportunities for combining.
	// 32 bytes provides more opportunities for combining,
	// but can lead to 4x worst case wastage.
	// The best case saving is 8x regardless of block size.
	//
	// Objects obtained from the tiny allocator must not be freed explicitly.
	// So when an object will be freed explicitly, we ensure that
	// its size >= maxTinySize.
	//
	// SetFinalizer has a special case for objects potentially coming
	// from the tiny allocator; in such a case it allows setting finalizers
	// for an inner byte of a memory block.
	//
	// The main targets of the tiny allocator are small strings and
	// standalone escaping variables. On a json benchmark
	// the allocator reduces the number of allocations by ~12% and
	// reduces the heap size by ~20%.
	c := getMCache(mp)
	off := c.tinyoffset
	// Align tiny pointer for required (conservative) alignment.
	if constsize&7 == 0 {
		off = alignUp(off, 8)
	} else if goarch.PtrSize == 4 && constsize == 12 {
		// Conservatively align 12-byte objects to 8 bytes on 32-bit
		// systems so that objects whose first field is a 64-bit
		// value is aligned to 8 bytes and does not cause a fault on
		// atomic access. See issue 37262.
		// TODO(mknyszek): Remove this workaround if/when issue 36606
		// is resolved.
		off = alignUp(off, 8)
	} else if constsize&3 == 0 {
		off = alignUp(off, 4)
	} else if constsize&1 == 0 {
		off = alignUp(off, 2)
	}
	if off+constsize <= maxTinySize && c.tiny != 0 {
		// The object fits into the existing tiny block.
		x := unsafe.Pointer(c.tiny + off)
		c.tinyoffset = off + constsize
		c.tinyAllocs++
		mp.mallocing = 0
		releasem(mp)
		return x, 0
	}
	// Allocate a new maxTinySize block.
	checkGCTrigger := false
	span := c.alloc[tinySpanClass]
	v := nextFreeFastTiny(span)
	if v == 0 {
		v, span, checkGCTrigger = c.nextFree(tinySpanClass)
	}
	x := unsafe.Pointer(v)
	(*[2]uint64)(x)[0] = 0 // Always zero
	(*[2]uint64)(x)[1] = 0
	// See if we need to replace the existing tiny block with the new one
	// based on the amount of remaining free space.
	if !raceenabled && (constsize < c.tinyoffset || c.tiny == 0) {
		// Note: disabled when the race detector is on; see comment near the end of this function.
		c.tiny = uintptr(x)
		c.tinyoffset = constsize
	}

	// Ensure that the stores above that initialize x to
	// type-safe memory and set the heap bits occur before
	// the caller can make x observable to the garbage
	// collector. Otherwise, on weakly ordered machines,
	// the garbage collector could follow a pointer to x,
	// but see uninitialized memory or stale heap bits.
	publicationBarrier()

	if writeBarrier.enabled {
		// Allocate black during GC.
		// All slots hold nil so no scanning is needed.
		// This may be racing with GC so do it atomically if there can be
		// a race marking the bit.
		gcmarknewobject(span, uintptr(x))
	} else {
		// Track the last free index before the mark phase. This field
		// is only used by the garbage collector. During the mark phase
		// this is used by the conservative scanner to filter out objects
		// that are both free and recently-allocated. It's safe to do that
		// because we allocate-black if the GC is enabled. The conservative
		// scanner produces pointers out of thin air, so without additional
		// synchronization it might otherwise observe a partially-initialized
		// object, which could crash the program.
		span.freeIndexForScan = span.freeindex
	}

	// Note cache c only valid while m acquired; see #47302
	//
	// N.B. Use the full size because that matches how the GC
	// will update the mem profile on the "free" side.
	//
	// TODO(mknyszek): We should really count the header as part
	// of gc_sys or something. The code below just pretends it is
	// internal fragmentation and matches the GC's accounting by
	// using the whole allocation slot.
	c.nextSample -= int64(elemsize)
	if c.nextSample < 0 || MemProfileRate != c.memProfRate {
		profilealloc(mp, x, elemsize)
	}
	mp.mallocing = 0
	releasem(mp)

	if checkGCTrigger {
		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
			gcStart(t)
		}
	}

	if raceenabled {
		// Pad tinysize allocations so they are aligned with the end
		// of the tinyalloc region. This ensures that any arithmetic
		// that goes off the top end of the object will be detectable
		// by checkptr (issue 38872).
		// Note that we disable tinyalloc when raceenabled for this to work.
		// TODO: This padding is only performed when the race detector
		// is enabled. It would be nice to enable it if any package
		// was compiled with checkptr, but there's no easy way to
		// detect that (especially at compile time).
		// TODO: enable this padding for all allocations, not just
		// tinyalloc ones. It's tricky because of pointer maps.
		// Maybe just all noscan objects?
		x = add(x, elemsize-constsize)
	}
	return x, elemsize
}
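
// Illustrative only (hypothetical request sizes, not runtime code): the tiny
// path above is a bump allocator inside a 16-byte (maxTinySize) block. Two
// consecutive requests might pack like this:
//
//	// fresh block: c.tinyoffset = 0
//	p1 := tinyAlloc(5) // off = 0 (1-byte aligned), tinyoffset becomes 5
//	p2 := tinyAlloc(8) // off = alignUp(5, 8) = 8,  tinyoffset becomes 16
//	// a third request no longer fits and allocates a new block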

// TODO(matloob): Should we let the Go compiler inline this instead of using mkmalloc?
// We won't be able to use elemsize_, but that's probably OK.
func nextFreeFastTiny(span *mspan) gclinkptr {
	const nbytes = 8192
	const nelems = uint16((nbytes - unsafe.Sizeof(spanInlineMarkBits{})) / elemsize_)
	var nextFreeFastResult gclinkptr
	if span.allocCache != 0 {
		theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
		result := span.freeindex + uint16(theBit)
		if result < nelems {
			freeidx := result + 1
			if !(freeidx%64 == 0 && freeidx != nelems) {
				span.allocCache >>= uint(theBit + 1)
				span.freeindex = freeidx
				span.allocCount++
				nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base())
			}
		}
	}
	return nextFreeFastResult
}

func nextFreeFastStub(span *mspan) gclinkptr {
	var nextFreeFastResult gclinkptr
	if span.allocCache != 0 {
		theBit := sys.TrailingZeros64(span.allocCache) // Is there a free object in the allocCache?
		result := span.freeindex + uint16(theBit)
		if result < span.nelems {
			freeidx := result + 1
			if !(freeidx%64 == 0 && freeidx != span.nelems) {
				span.allocCache >>= uint(theBit + 1)
				span.freeindex = freeidx
				span.allocCount++
				nextFreeFastResult = gclinkptr(uintptr(result)*elemsize_ + span.base())
			}
		}
	}
	return nextFreeFastResult
}
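
// Illustrative only (hypothetical values): allocCache is a 64-bit window of
// the span's allocation bitmap in which a 1 bit marks a free slot, so
// TrailingZeros64 finds the first free slot in O(1):
//
//	allocCache = 0b10100    // slots freeindex+2 and freeindex+4 are free
//	theBit     = 2          // sys.TrailingZeros64(allocCache)
//	result     = freeindex + 2 // object address = span.base() + result*elemsize
//	// shifting the cache right by theBit+1 consumes the slot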

func heapSetTypeNoHeaderStub(x, dataSize uintptr, typ *_type, span *mspan) uintptr {
	if doubleCheckHeapSetType && (!heapBitsInSpan(dataSize) || !heapBitsInSpan(elemsize_)) {
		throw("tried to write heap bits, but no heap bits in span")
	}
	scanSize := writeHeapBitsSmallStub(span, x, dataSize, typ)
	if doubleCheckHeapSetType {
		doubleCheckHeapType(x, dataSize, typ, nil, span)
	}
	return scanSize
}

// writeHeapBitsSmallStub writes the heap bits for small objects whose ptr/scalar data is
// stored as a bitmap at the end of the span.
//
// Assumes dataSize is <= ptrBits*goarch.PtrSize. x must be a pointer into the span.
// heapBitsInSpan(dataSize) must be true. dataSize must be >= typ.Size_.
//
//go:nosplit
func writeHeapBitsSmallStub(span *mspan, x, dataSize uintptr, typ *_type) uintptr {
	// The objects here are always really small, so a single load is sufficient.
	src0 := readUintptr(getGCMask(typ))

	const elemsize = elemsize_

	// Create repetitions of the bitmap if we have a small slice backing store.
	scanSize := typ.PtrBytes
	src := src0
	if typ.Size_ == goarch.PtrSize {
		src = (1 << (dataSize / goarch.PtrSize)) - 1
	} else {
		// N.B. We rely on dataSize being an exact multiple of the type size.
		// The alternative is to be defensive and mask out src to the length
		// of dataSize. The purpose is to save on one additional masking operation.
		if doubleCheckHeapSetType && !asanenabled && dataSize%typ.Size_ != 0 {
			throw("runtime: (*mspan).writeHeapBitsSmall: dataSize is not a multiple of typ.Size_")
		}
		for i := typ.Size_; i < dataSize; i += typ.Size_ {
			src |= src0 << (i / goarch.PtrSize)
			scanSize += typ.Size_
		}
	}

	// Since we're never writing more than one uintptr's worth of bits, we're either going
	// to do one or two writes.
	dstBase, _ := spanHeapBitsRange(span.base(), pageSize, elemsize)
	dst := unsafe.Pointer(dstBase)
	o := (x - span.base()) / goarch.PtrSize
	i := o / ptrBits
	j := o % ptrBits
	const bits uintptr = elemsize / goarch.PtrSize
	// In the if statement below, we have to do two uintptr writes if the bits
	// we need to write straddle two different memory locations. But if
	// the number of bits we're writing divides evenly into the number of bits
	// in the uintptr we're writing, this can never happen. Since bitsIsPowerOfTwo
	// is a compile-time constant in the generated code, in the case where the size is
	// a power of two less than or equal to ptrBits, the compiler can remove the
	// 'two writes' branch of the if statement and always do only one write, without
	// the check.
	const bitsIsPowerOfTwo = bits&(bits-1) == 0
	if bits > ptrBits || (!bitsIsPowerOfTwo && j+bits > ptrBits) {
		// Two writes.
		bits0 := ptrBits - j
		bits1 := bits - bits0
		dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize))
		dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize))
		*dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j)
		*dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0)
	} else {
		// One write.
		dst := (*uintptr)(add(dst, i*goarch.PtrSize))
		*dst = (*dst)&^(((1<<(min(bits, ptrBits)))-1)<<j) | (src << j) // min avoids a constant overflow: when bits > ptrBits we always take the other branch
	}

	const doubleCheck = false
	if doubleCheck {
		writeHeapBitsDoubleCheck(span, x, dataSize, src, src0, i, j, bits, typ)
	}
	return scanSize
}
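
// Illustrative only (hypothetical numbers): for a 24-byte element size on a
// 64-bit system, bits = 24/8 = 3, which does not divide ptrBits = 64, so the
// three heap bits can straddle a word boundary. For example, with j = 62:
//
//	bits0 = 64 - 62 = 2 // bits written into the top of the first word
//	bits1 = 3 - 2 = 1   // bits written into the low end of the second word
//
// For a 16-byte element size, bits = 2 divides 64 evenly, so j+bits can never
// cross a word and the generated code keeps only the one-write branch.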

func writeHeapBitsDoubleCheck(span *mspan, x, dataSize, src, src0, i, j, bits uintptr, typ *_type) {
	srcRead := span.heapBitsSmallForAddr(x)
	if srcRead != src {
		print("runtime: x=", hex(x), " i=", i, " j=", j, " bits=", bits, "\n")
		print("runtime: dataSize=", dataSize, " typ.Size_=", typ.Size_, " typ.PtrBytes=", typ.PtrBytes, "\n")
		print("runtime: src0=", hex(src0), " src=", hex(src), " srcRead=", hex(srcRead), "\n")
		throw("bad pointer bits written for small object")
	}
}
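
// Illustrative only (hypothetical type): writeHeapBitsSmallStub repeats the
// type's pointer bitmap when one allocation holds several elements, e.g. a
// small slice backing store. For a 16-byte type with pointer mask 0b01
// allocated with dataSize = 48 bytes on a 64-bit system:
//
//	src0 = 0b01
//	src  = 0b01 | 0b01<<2 | 0b01<<4 // = 0b010101, one mask copy per element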