Text file src/internal/runtime/gc/scan/filter_amd64.s

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20
     9  	// Load arguments
    10  	MOVQ bufp+0(FP), R8	// R8 = bufp (start of the uint64 array)
    11  	MOVL n+8(FP), R9	// R9 = n (total length)
    12  	XORL R10, R10		// R10 = 0 (scanned = 0)
    13  	XORL R11, R11		// R11 = 0 (cnt = 0)
    14  
    15  	MOVL R9, R12	// R12 = n
    16  	SUBL R10, R12	// R12 = n - scanned
    17  	CMPL R12, $8	// Compare (n - scanned) with 8
    18  	JLT scalar_loop	// If (n - scanned) < 8, jump to the scalar cleanup
    19  
    20  vector_loop:
    21  	LEAQ (R8)(R10*8), R13	// R13 = buf[scanned:] address
    22  	VMOVDQU64 (R13), Z1		// Z1 = v (Load 8 uint64s)
    23  	VPCMPUQ $4, Z1, Z15, K1	// Z15 is always 0, compare Z1 with 0, results in K1.
    24  
    25  	LEAQ (R8)(R11*8), R14	// R14 = buf[cnt:] address
    26  	VPCOMPRESSQ Z1, K1, Z1	// compress v
    27  	VMOVDQU64 Z1, (R14)		// store v to buf[cnt:]
    28  
    29  	KMOVW K1, R15
    30  	POPCNTL R15, R15	// R15 = popcount(K1)
    31  
    32  	ADDL R15, R11	// cnt += popcount(K1)
    33  	ADDL $8, R10	// scanned += 8
    34  
    35  	MOVL R9, R12	// R12 = n
    36  	SUBL R10, R12	// R12 = n - scanned
    37  	CMPL R12, $8	// Compare (n - scanned) with 8
    38  	JGE vector_loop	// If (n - scanned) >= 8, continue loop
    39  
    40  scalar_loop:
    41  	CMPL R10, R9	// Compare scanned with n
    42  	JGE end			// If scanned >= n, loop is done
    43  
    44  scalar_next_i:
    45  	LEAQ (R8)(R10*8), R13	// R13 = &buf[scanned]
    46  	MOVQ (R13), R14			// R14 = buf[scanned]
    47  
    48  	CMPQ R14, $0
    49  	JE scalar_increment_i	// If buf[i] == 0, skip to increment i
    50  
    51  	LEAQ (R8)(R11*8), R15	// R15 = &buf[cnt]
    52  	MOVQ R14, (R15)			// buf[cnt] = buf[scanned]
    53  
    54  	ADDL $1, R11	// cnt++
    55  
    56  scalar_increment_i:
    57  	ADDL $1, R10	// scanned++
    58  
    59  	CMPL R10, R9
    60  	JL scalar_next_i	// if scanned < n, continue
    61  
    62  end:
    63  	MOVL R11, ret+16(FP)
    64  	RET
    65  

View as plain text