Text file src/internal/runtime/gc/scan/scan_amd64.s

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  // Test-only. ExpandAVX512 calls the gcExpandersAVX512 table entry selected by sizeClass with AX = packed, then stores the Z1 and Z2 registers (128 bytes total) at unpacked.
     9  TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24 // No local frame; 24 bytes of arguments: sizeClass+0, packed+8, unpacked+16
    10  	MOVQ sizeClass+0(FP), CX // CX = index into the expander table
    11  	MOVQ packed+8(FP), AX // AX = packed argument; presumably the expander's input, its contract is not visible here
    12  
    13  	// Call the expander for this size class
    14  	LEAQ ·gcExpandersAVX512(SB), BX // BX = base address of the expander table
    15  	CALL (BX)(CX*8) // Indirect call through the 8-byte table entry at index CX
    16  
    17  	MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
    18  	VMOVDQU64 Z1, 0(DI) // First 64 bytes of the output, unaligned store
    19  	VMOVDQU64 Z2, 64(DI) // Second 64 bytes of the output, unaligned store
    20  	VZEROUPPER // Zero the upper bits of the vector registers before returning
    21  	RET
    22  
    23  TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44 // Compresses the masked 8-byte words of the span at mem into the buffer at bufp and returns their number in count; 256-byte frame holds the scan mask at 0(SP) and per-frame popcounts at 128(SP)
    24  	// Z1+Z2 = Expand the grey object mask into a grey word mask
    25  	MOVQ objMarks+16(FP), AX // AX = objMarks, set up before the expander call
    26  	MOVQ sizeClass+24(FP), CX // CX = index into the expander table
    27  	LEAQ ·gcExpandersAVX512(SB), BX // BX = base address of the expander table
    28  	CALL (BX)(CX*8) // Indirect call through the 8-byte table entry at index CX
    29  
    30  	// Z3+Z4 = Load the pointer mask
    31  	MOVQ ptrMask+32(FP), AX
    32  	VMOVDQU64 0(AX), Z3
    33  	VMOVDQU64 64(AX), Z4
    34  
    35  	// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
    36  	VPANDQ Z1, Z3, Z1
    37  	VPANDQ Z2, Z4, Z2
    38  
    39  	// Now each bit of Z1+Z2 represents one word of the span.
    40  	// Thus, each byte covers 64 bytes of memory, which is also how
    41  	// much we can fit in a Z register.
    42  	//
    43  	// We do a load/compress for each 64 byte frame.
    44  	//
    45  	// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
    46  	VPOPCNTB Z1, Z3 // Requires BITALG
    47  	VPOPCNTB Z2, Z4 // Same per-byte popcount for the second half of the scan mask
    48  
    49  	// Store the scan mask and word counts at 0(SP) and 128(SP).
    50  	//
    51  	// TODO: Is it better to read directly from the registers?
    52  	VMOVDQU64 Z1, 0(SP)
    53  	VMOVDQU64 Z2, 64(SP)
    54  	VMOVDQU64 Z3, 128(SP)
    55  	VMOVDQU64 Z4, 192(SP)
    56  
    57  	// SI = Current address in span
    58  	MOVQ mem+0(FP), SI
    59  	// DI = Scan buffer base
    60  	MOVQ bufp+8(FP), DI
    61  	// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
    62  	MOVQ $0, DX // Nothing has been written to the scan buffer yet
    63  
    64  	// AX = address in scan mask, 128(AX) = address in popcount
    65  	LEAQ 0(SP), AX
    66  
    67  	// Loop over the 64 byte frames in this span.
    68  	// BX = 1 past the end of the scan mask
    69  	LEAQ 128(SP), BX // 128 mask bytes, so the loop walks 128 frames of 64 bytes each
    70  
    71  	// Align loop to a cache line so that performance is less sensitive
    72  	// to how this function ends up laid out in memory. This is a hot
    73  	// function in the GC, and this is a tight loop. We don't want
    74  	// performance to waver wildly due to unrelated changes.
    75  	PCALIGN $64
    76  loop:
    77  	// CX = Fetch the mask of words to load from this frame.
    78  	MOVBQZX 0(AX), CX // One mask byte = 8 words = one 64 byte frame
    79  	// Skip empty frames.
    80  	TESTQ CX, CX
    81  	JZ skip
    82  
    83  	// Load the 64 byte frame.
    84  	KMOVB CX, K1 // K1 = per-word selection mask for the compress below
    85  	VMOVDQA64 0(SI), Z1 // Aligned-load form: assumes SI is 64-byte aligned (TODO confirm against caller); Z1 is reusable because the scan mask was spilled to the stack
    86  
    87  	// Collect just the pointers from the greyed objects into the scan buffer,
    88  	// i.e., copy the word indices in the mask from Z1 into contiguous memory.
    89  	VPCOMPRESSQ Z1, K1, (DI)(DX*8)
    90  	// Advance the scan buffer position by the number of pointers.
    91  	MOVBQZX 128(AX), CX // CX = popcount of this frame's mask byte, stored 128 bytes past the mask byte
    92  	ADDQ CX, DX
    93  
    94  skip:
    95  	ADDQ $64, SI // Advance to the next 64 byte frame of the span
    96  	ADDQ $1, AX // Advance to the next mask byte (and its popcount at 128(AX))
    97  	CMPQ AX, BX
    98  	JB loop // Unsigned compare: continue until AX reaches the end of the scan mask
    99  
   100  end: // Reached by fall-through only; no branch in this function targets this label
   101  	MOVL DX, count+40(FP) // Return the low 32 bits of the buffer index as the 4-byte count result
   102  	VZEROUPPER // Zero the upper bits of the vector registers before returning
   103  	RET
   104  

View as plain text