1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
8 // Test-only.
//
// ExpandAVX512 runs the expander selected by sizeClass and stores the
// contents of Z1 and Z2 (2 x 64 bytes) to the buffer at unpacked.
//
// Frame: no locals, 24 bytes of arguments:
//   sizeClass+0(FP)   index into the ·gcExpandersAVX512 table
//   packed+8(FP)      loaded into AX before the expander is called
//   unpacked+16(FP)   destination for the 128 bytes stored from Z1 and Z2
9 TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
10 MOVQ sizeClass+0(FP), CX
11 MOVQ packed+8(FP), AX
12
13 // Call the expander for this size class: an indirect call through table entry CX (8 bytes per entry).
// NOTE(review): the expander's register contract (presumably input in AX,
// result in Z1+Z2) is defined outside this file view; confirm it there.
14 LEAQ ·gcExpandersAVX512(SB), BX
15 CALL (BX)(CX*8)
16
17 MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
18 VMOVDQU64 Z1, 0(DI) // First 64 bytes of the output (unaligned store)
19 VMOVDQU64 Z2, 64(DI) // Second 64 bytes of the output
20 VZEROUPPER // Clear the upper bits of the vector registers before returning
21 RET
22
23 TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
// scanSpanPackedAVX512 copies the 8-byte words of the memory at mem that are
// selected by (expanded objMarks AND ptrMask) contiguously into the buffer at
// bufp, and returns the number of words written in count.
//
// Arguments (44 bytes): mem+0(FP), bufp+8(FP), objMarks+16(FP),
// sizeClass+24(FP), ptrMask+32(FP); 32-bit result at count+40(FP).
// Locals (256 bytes): 0(SP)..127(SP) hold the scan mask, 128(SP)..255(SP)
// hold the per-frame word counts.
//
24 // Z1+Z2 = Expand the grey object mask into a grey word mask
// The expander is reached by an indirect call through table entry CX
// (8 bytes per entry), with the objMarks pointer in AX.
25 MOVQ objMarks+16(FP), AX
26 MOVQ sizeClass+24(FP), CX
27 LEAQ ·gcExpandersAVX512(SB), BX
28 CALL (BX)(CX*8)
29
30 // Z3+Z4 = Load the pointer mask (128 bytes, unaligned loads)
31 MOVQ ptrMask+32(FP), AX
32 VMOVDQU64 0(AX), Z3
33 VMOVDQU64 64(AX), Z4
34
35 // Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
36 VPANDQ Z1, Z3, Z1
37 VPANDQ Z2, Z4, Z2
38
39 // Now each bit of Z1+Z2 represents one word of the span.
40 // Thus, each byte covers 64 bytes of memory, which is also how
41 // much we can fit in a Z register.
42 //
43 // We do a load/compress for each 64 byte frame.
44 //
45 // Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
46 VPOPCNTB Z1, Z3 // Requires BITALG
47 VPOPCNTB Z2, Z4
48
49 // Store the scan mask and word counts at 0(SP) and 128(SP).
50 //
51 // TODO: Is it better to read directly from the registers?
52 VMOVDQU64 Z1, 0(SP)
53 VMOVDQU64 Z2, 64(SP)
54 VMOVDQU64 Z3, 128(SP)
55 VMOVDQU64 Z4, 192(SP)
56
57 // SI = Current address in span
58 MOVQ mem+0(FP), SI
59 // DI = Scan buffer base
60 MOVQ bufp+8(FP), DI
61 // DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
62 MOVQ $0, DX
63
64 // AX = address in scan mask, 128(AX) = address in popcount
65 LEAQ 0(SP), AX
66
67 // Loop over the 64 byte frames in this span.
68 // BX = 1 past the end of the scan mask
// The scan mask is 128 bytes, one byte per frame, so the loop visits
// 128 frames (128 * 64 bytes of memory starting at mem).
69 LEAQ 128(SP), BX
70
71 // Align loop to a cache line so that performance is less sensitive
72 // to how this function ends up laid out in memory. This is a hot
73 // function in the GC, and this is a tight loop. We don't want
74 // performance to waver wildly due to unrelated changes.
75 PCALIGN $64
76 loop:
77 // CX = Fetch the mask of words to load from this frame.
// One mask byte = 8 bits = the 8 words of the current 64 byte frame.
78 MOVBQZX 0(AX), CX
79 // Skip empty frames.
80 TESTQ CX, CX
81 JZ skip
82
83 // Load the 64 byte frame.
// Z1 is reused for frame data here; the scan mask it held was already
// stored to the stack above. VMOVDQA64 is the aligned load form, so this
// assumes SI (mem plus multiples of 64) is 64-byte aligned; TODO confirm
// against callers.
84 KMOVB CX, K1
85 VMOVDQA64 0(SI), Z1
86
87 // Collect just the pointers from the greyed objects into the scan buffer,
88 // i.e., copy the word indices in the mask from Z1 into contiguous memory.
89 VPCOMPRESSQ Z1, K1, (DI)(DX*8)
90 // Advance the scan buffer position by the number of pointers.
// 128(AX) is the popcount byte that corresponds to the mask byte at 0(AX).
91 MOVBQZX 128(AX), CX
92 ADDQ CX, DX
93
94 skip:
// Advance to the next frame: 64 bytes of memory, 1 byte of scan mask.
95 ADDQ $64, SI
96 ADDQ $1, AX
97 CMPQ AX, BX
98 JB loop // Unsigned compare: continue while AX is below the end of the scan mask
99
// No jump in this function targets end; it is reached by falling out of the loop.
100 end:
101 MOVL DX, count+40(FP) // Return the low 32 bits of the word count
102 VZEROUPPER // Clear the upper bits of the vector registers before returning
103 RET
104
View as plain text