// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// ExpandAVX512 is a test-only entry point. It runs the expander selected
// by sizeClass and copies the 128 bytes the expander leaves in Z1+Z2 out
// to memory.
//
// Arguments (offsets from FP):
//	sizeClass+0   index into the ·gcExpandersAVX512 table
//	packed+8      handed to the expander in AX
//	unpacked+16   destination of the 128 bytes of expanded output
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
	// Set up the expander's input and pick the table entry for this
	// size class, then call through it.
	MOVQ packed+8(FP), AX
	MOVQ sizeClass+0(FP), CX
	LEAQ ·gcExpandersAVX512(SB), BX
	CALL (BX)(CX*8)

	// The output pointer is only loaded after the call, so nothing but
	// AX, BX and CX has to be live across the expander.
	MOVQ unpacked+16(FP), DI
	VMOVDQU64 Z1, 0(DI)
	VMOVDQU64 Z2, 64(DI)

	VZEROUPPER
	RET

// scanSpanPackedAVX512 collects the words of a span that are selected by
// both the expanded grey object mask and the pointer mask, writes them
// contiguously into the scan buffer, and returns how many were written.
//
// Arguments (offsets from FP):
//	mem+0         address of the span; read with aligned 64 byte loads
//	bufp+8        base of the scan buffer
//	objMarks+16   grey object mask, handed to the expander in AX
//	sizeClass+24  index into the ·gcExpandersAVX512 table
//	ptrMask+32    128 byte pointer mask
//	count+40      result (32 bits): number of words written to the buffer
//
// Frame layout (256 bytes):
//	0(SP)    [128]uint8 scan mask, one bit per word of the span
//	128(SP)  [128]uint8 number of set bits in each scan mask byte
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
	// Z1+Z2 = grey word mask, produced by expanding the grey object mask
	// with the expander for this size class.
	LEAQ ·gcExpandersAVX512(SB), BX
	MOVQ sizeClass+24(FP), CX
	MOVQ objMarks+16(FP), AX
	CALL (BX)(CX*8)

	// AX = pointer mask; Z3 and Z4 hold its two halves until they are
	// reused for the popcounts below.
	MOVQ ptrMask+32(FP), AX

	// Every bit of the scan mask stands for one word of the span, so each
	// mask byte describes 64 bytes of memory, which is exactly as much as
	// we can fit in a Z register. The span is therefore processed as a
	// sequence of 64 byte frames, one load/compress per frame.
	//
	// Low half:
	//   Z1 = grey word mask AND pointer mask (the scan mask)
	//   Z3 = per-byte popcount of Z1, i.e. the number of words to scan
	//        in each frame (VPOPCNTB requires BITALG)
	// Both are spilled to the frame so the loop can read them bytewise.
	//
	// TODO: Is it better to read directly from the registers?
	VMOVDQU64 0(AX), Z3
	VPANDQ Z1, Z3, Z1
	VPOPCNTB Z1, Z3
	VMOVDQU64 Z1, 0(SP)
	VMOVDQU64 Z3, 128(SP)

	// High half: the same steps with Z2 and Z4.
	VMOVDQU64 64(AX), Z4
	VPANDQ Z2, Z4, Z2
	VPOPCNTB Z2, Z4
	VMOVDQU64 Z2, 64(SP)
	VMOVDQU64 Z4, 192(SP)

	// Loop state:
	//   AX = cursor into the scan mask; 128(AX) is the matching popcount
	//   BX = one past the last scan mask byte
	//   SI = address of the current frame in the span
	//   DI = scan buffer base
	//   DX = number of words written so far; (DI)(DX*8) is the next slot
	LEAQ 0(SP), AX
	LEAQ 128(SP), BX
	MOVQ bufp+8(FP), DI
	MOVQ mem+0(FP), SI
	MOVQ $0, DX

	// This is a hot function in the GC and the loop below is tight.
	// Starting it on a cache line boundary keeps its performance from
	// swinging with wherever unrelated changes happen to place this
	// function in memory.
	PCALIGN $64
frame:
	// CX = mask of the words to take from this frame.
	MOVBQZX 0(AX), CX
	// Nothing selected in this frame: move straight on to the next one.
	TESTQ CX, CX
	JZ next

	// Z1 = the whole 64 byte frame, K1 = its word mask.
	KMOVB CX, K1
	VMOVDQA64 0(SI), Z1

	// Store only the words of Z1 whose bit is set in K1, packed together
	// at the current scan buffer position.
	VPCOMPRESSQ Z1, K1, (DI)(DX*8)
	// Bump the scan buffer index by the number of words just stored.
	MOVBQZX 128(AX), CX
	ADDQ CX, DX

next:
	// Step to the next frame and its mask byte; stop after the last one.
	ADDQ $64, SI
	ADDQ $1, AX
	CMPQ AX, BX
	JB frame

	// Return the low 32 bits of the word count.
	MOVL DX, count+40(FP)
	VZEROUPPER
	RET