1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// FilterNilAVX512 compacts the non-zero 64-bit words of the array at bufp
// toward the front of that same array, preserving their relative order,
// and returns the number of words kept.
//
// Frame layout, as established by the FP offsets used below. The Go
// declaration is not in this file; presumably
//	func FilterNilAVX512(bufp *uintptr, n int32) (cnt int32)
// TODO confirm against the Go-side declaration.
//	bufp+0(FP)	8 bytes	address of the first 64-bit word
//	n+8(FP)		4 bytes	number of words (compared as a signed value)
//	ret+16(FP)	4 bytes	number of non-zero words kept
//
// Words at indexes >= the returned count are not cleared and may hold
// stale values: the vector path always stores a full 8-word vector at
// buf[cnt:], of which only the first popcount(mask) words are meaningful.
// That store never runs past the 8-word chunk just loaded, because
// cnt <= scanned at all times.
//
// No CPU feature check is performed here; the caller must only invoke this
// routine when AVX-512F and POPCNT are available.
//
// Register roles:
//	R8	base address of the array
//	R9	n
//	R10	scanned: index of the next word to examine
//	R11	cnt: number of non-zero words written so far
//	R12	n - scanned (vector loop condition)
//	AX	lane mask / popcount (vector path), current word (scalar path)
//	Z1, K1	current 8-word chunk and its non-zero lane mask
TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20
	MOVQ	bufp+0(FP), R8			// R8 = base address of the array
	MOVL	n+8(FP), R9			// R9 = n
	XORL	R10, R10			// scanned = 0
	XORL	R11, R11			// cnt = 0

	// Fewer than 8 words in total (including n <= 0): skip the vector path.
	CMPL	R9, $8
	JLT	scalar_loop

vector_loop:
	VMOVDQU64	(R8)(R10*8), Z1		// Z1 = buf[scanned:scanned+8]

	// K1 bit i is set iff word i of Z1 is non-zero. Testing the vector
	// against itself avoids depending on any other register holding zero
	// across its full 512 bits.
	VPTESTMQ	Z1, Z1, K1

	// Pack the selected words into the low lanes of Z1 and store the whole
	// vector at buf[cnt:]. Lanes past popcount(K1) keep their previous
	// contents; they are overwritten by later stores or lie beyond the
	// returned count.
	VPCOMPRESSQ	Z1, K1, Z1
	VMOVDQU64	Z1, (R8)(R11*8)

	KMOVW	K1, AX
	POPCNTL	AX, AX				// AX = number of non-zero words in the chunk

	ADDL	AX, R11				// cnt += popcount(K1)
	ADDL	$8, R10				// scanned += 8

	MOVL	R9, R12
	SUBL	R10, R12			// R12 = n - scanned
	CMPL	R12, $8
	JGE	vector_loop			// at least one more full chunk remains

	// Z1 was written at 512-bit width; clear the upper vector state so the
	// SSE code that runs after we return does not pay transition penalties.
	VZEROUPPER

scalar_loop:
	CMPL	R10, R9
	JGE	end				// nothing left for the scalar tail

scalar_next_i:
	MOVQ	(R8)(R10*8), AX			// AX = buf[scanned]
	TESTQ	AX, AX
	JEQ	scalar_increment_i		// zero word: drop it

	MOVQ	AX, (R8)(R11*8)			// buf[cnt] = buf[scanned]
	ADDL	$1, R11				// cnt++

scalar_increment_i:
	ADDL	$1, R10				// scanned++
	CMPL	R10, R9
	JLT	scalar_next_i			// scanned < n: keep going

end:
	MOVL	R11, ret+16(FP)
	RET
65
View as plain text