1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "go_asm.h"
6 #include "textflag.h"
7
// Count: entry point for counting a byte in a byte slice.
//
// Register arguments on entry:
//   R4 = b_base
//   R5 = b_len
//   R6 = b_cap (unused)
//   R7 = byte to count
//
// The result is produced by countbody, which this routine tail-jumps to.
TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
	// Keep only the low 8 bits of the byte argument and move it into R6,
	// the register countbody reads it from. This overwrites b_cap, which
	// is not needed.
	AND	$0xff, R7, R6
	JMP	countbody<>(SB)
15
// CountString: entry point for counting a byte in a string.
//
// Register arguments on entry:
//   R4 = s_base
//   R5 = s_len
//   R6 = byte to count
//
// The result is produced by countbody, which this routine tail-jumps to.
TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
	// Keep only the low 8 bits of the byte argument; it is already in the
	// register countbody reads it from.
	AND	$0xff, R6
	JMP	countbody<>(SB)
22
// countbody counts how many of the R5 bytes starting at R4 equal the byte
// in R6. It is shared by Count and CountString, which jump here after
// masking the byte argument to 8 bits.
//
// input:
//   R4 = s_base
//   R5 = s_len
//   R6 = byte to count
// output:
//   R4 = number of matching bytes
// registers used internally:
//   R7       = running count
//   R8       = CPU feature flag, then the current block-size threshold
//   R9-R14   = scratch
//   X0 / V0  = byte to count, replicated into every byte lane
//   X1-X8 / V1-V8 = vector scratch
//
// Strategy: pick the widest available implementation (LASX, then LSX,
// then general-purpose registers), consume the input in progressively
// smaller blocks, and finish the remaining bytes one at a time in tail.
TEXT countbody<>(SB),NOSPLIT,$0
	MOVV	R0, R7	// count = 0

	// short path to handle 0-byte case
	BEQ	R5, done

	// jump directly to tail if length < 8
	MOVV	$8, R8
	BLT	R5, R8, tail

	// Implemented using 256-bit SIMD instructions
lasxCountBody:
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8
	BEQ	R8, lsxCountBody
	XVMOVQ	R6, X0.B32	// replicate the target byte into all 32 byte lanes

	// jump directly to lasx32 if length < 128
	MOVV	$128, R8
	BLT	R5, R8, lasx32
lasx128:
lasx128Loop:
	// Load 128 bytes as four 32-byte vectors.
	XVMOVQ	0(R4), X1
	XVMOVQ	32(R4), X2
	XVMOVQ	64(R4), X3
	XVMOVQ	96(R4), X4

	// Byte-wise compare against the target byte.
	XVSEQB	X0, X1, X5
	XVSEQB	X0, X2, X6
	XVSEQB	X0, X3, X7
	XVSEQB	X0, X4, X8

	// Reduce each byte lane of the compare result to its lowest bit, so a
	// matching byte contributes exactly one set bit.
	XVANDB	$1, X5, X5
	XVANDB	$1, X6, X6
	XVANDB	$1, X7, X7
	XVANDB	$1, X8, X8

	// Population count per 64-bit element = matches per 8-byte group.
	XVPCNTV	X5, X1
	XVPCNTV	X6, X2
	XVPCNTV	X7, X3
	XVPCNTV	X8, X4

	// Sum the four vectors element-wise into X1.
	XVADDV	X2, X1
	XVADDV	X4, X3
	XVADDV	X3, X1

	// Move the four 64-bit partial sums to general registers.
	XVMOVQ	X1.V[0], R9
	XVMOVQ	X1.V[1], R10
	XVMOVQ	X1.V[2], R11
	XVMOVQ	X1.V[3], R12

	// Fold the partial sums into the running count.
	ADDV	R9, R10
	ADDV	R11, R12
	ADDV	R10, R7
	ADDV	R12, R7

	// Advance by 128 bytes; loop while at least 128 bytes remain.
	ADDV	$-128, R5
	ADDV	$128, R4
	BGE	R5, R8, lasx128Loop

lasx32:
	// jump directly to lasx8 if length < 32
	MOVV	$32, R8
	BLT	R5, R8, lasx8
lasx32Loop:
	// Same compare / mask / popcount sequence on one 32-byte vector.
	XVMOVQ	0(R4), X1
	XVSEQB	X0, X1, X2
	XVANDB	$1, X2, X2
	XVPCNTV	X2, X1
	XVMOVQ	X1.V[0], R9
	XVMOVQ	X1.V[1], R10
	XVMOVQ	X1.V[2], R11
	XVMOVQ	X1.V[3], R12
	ADDV	R9, R10
	ADDV	R11, R12
	ADDV	R10, R7
	ADDV	R12, R7
	ADDV	$-32, R5
	ADDV	$32, R4
	BGE	R5, R8, lasx32Loop
lasx8:
	// jump directly to tail if length < 8
	MOVV	$8, R8
	BLT	R5, R8, tail
lasx8Loop:
	// Process 8 bytes through the 128-bit registers. Only element V[0] of
	// V1 is written here and only V[0] of the result is read back, so the
	// other half of V1 does not affect the count.
	// NOTE(review): V0 is not written on the LASX path; this relies on V0
	// being the low 128 bits of X0 (set by the XVMOVQ broadcast above).
	MOVV	0(R4), R9
	VMOVQ	R9, V1.V[0]
	VSEQB	V0, V1, V2
	VANDB	$1, V2, V2
	VPCNTV	V2, V1

	VMOVQ	V1.V[0], R9
	ADDV	R9, R7
	ADDV	$-8, R5
	ADDV	$8, R4
	BGE	R5, R8, lasx8Loop
	JMP	tail

	// Implemented using 128-bit SIMD instructions
lsxCountBody:
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8
	BEQ	R8, genericCountBody
	VMOVQ	R6, V0.B16	// replicate the target byte into all 16 byte lanes

	// jump directly to lsx16 if length < 64
	MOVV	$64, R8
	BLT	R5, R8, lsx16
lsx64:
lsx64Loop:
	// Load 64 bytes as four 16-byte vectors.
	VMOVQ	0(R4), V1
	VMOVQ	16(R4), V2
	VMOVQ	32(R4), V3
	VMOVQ	48(R4), V4

	// Byte-wise compare against the target byte.
	VSEQB	V0, V1, V5
	VSEQB	V0, V2, V6
	VSEQB	V0, V3, V7
	VSEQB	V0, V4, V8

	// Reduce each byte lane of the compare result to its lowest bit.
	VANDB	$1, V5, V5
	VANDB	$1, V6, V6
	VANDB	$1, V7, V7
	VANDB	$1, V8, V8

	// Population count per 64-bit element = matches per 8-byte group.
	VPCNTV	V5, V1
	VPCNTV	V6, V2
	VPCNTV	V7, V3
	VPCNTV	V8, V4

	// Sum the four vectors element-wise into V1.
	VADDV	V2, V1
	VADDV	V4, V3
	VADDV	V3, V1

	// Add both 64-bit partial sums to the running count.
	VMOVQ	V1.V[0], R9
	VMOVQ	V1.V[1], R10
	ADDV	R9, R7
	ADDV	R10, R7

	// Advance by 64 bytes; loop while at least 64 bytes remain.
	ADDV	$-64, R5
	ADDV	$64, R4
	BGE	R5, R8, lsx64Loop

lsx16:
	// jump directly to lsx8 if length < 16
	MOVV	$16, R8
	BLT	R5, R8, lsx8
lsx16Loop:
	// Same compare / mask / popcount sequence on one 16-byte vector.
	VMOVQ	0(R4), V1
	VSEQB	V0, V1, V2
	VANDB	$1, V2, V2
	VPCNTV	V2, V1
	VMOVQ	V1.V[0], R9
	VMOVQ	V1.V[1], R10
	ADDV	R9, R7
	ADDV	R10, R7
	ADDV	$-16, R5
	ADDV	$16, R4
	BGE	R5, R8, lsx16Loop
lsx8:
	// jump directly to tail if length < 8
	MOVV	$8, R8
	BLT	R5, R8, tail
lsx8Loop:
	// Process 8 bytes: only element V[0] of V1 is written and only V[0]
	// of the result is read back.
	MOVV	0(R4), R9
	VMOVQ	R9, V1.V[0]
	VSEQB	V0, V1, V2
	VANDB	$1, V2, V2
	VPCNTV	V2, V1

	VMOVQ	V1.V[0], R9
	ADDV	R9, R7
	ADDV	$-8, R5
	ADDV	$8, R4
	BGE	R5, R8, lsx8Loop
	JMP	tail

	// Implemented using general instructions
genericCountBody:
	MOVV	$4, R8	// block size
	MOVV	$1, R9	// constant 1, selected by MASKNEZ when a byte matches
genericLoop:
	// Consume 4 bytes per iteration from the END of the remaining range:
	// R5 is decremented first and then used as the offset, so R4 is left
	// unchanged on this path.
	BLT	R5, R8, tail
	ADDV	$-4, R5
	MOVWU	(R4)(R5), R10

	// Bytes 0 and 1 of the loaded word: XOR with the target yields zero on
	// a match, and MASKNEZ then yields R9 (1) for zero, 0 otherwise.
	BSTRPICKW	$7, R10, $0, R11
	BSTRPICKW	$15, R10, $8, R12
	XOR	R6, R11
	XOR	R6, R12
	MASKNEZ	R11, R9, R13
	MASKNEZ	R12, R9, R14
	ADDV	R13, R7
	ADDV	R14, R7

	// Bytes 2 and 3 of the loaded word, same treatment.
	BSTRPICKW	$23, R10, $16, R11
	BSTRPICKW	$31, R10, $24, R12
	XOR	R6, R11
	XOR	R6, R12
	MASKNEZ	R11, R9, R13
	MASKNEZ	R12, R9, R14
	ADDV	R13, R7
	ADDV	R14, R7
	JMP	genericLoop

	// Work with tail shorter than 8 bytes
tail:
	// Walk the remaining bytes from last to first: R5 is decremented and
	// then used as the offset from R4.
	BEQ	R5, done
	ADDV	$-1, R5
	MOVBU	(R4)(R5), R8
	BNE	R6, R8, tail
	ADDV	$1, R7
	JMP	tail
done:
	MOVV	R7, R4	// return the count in R4
	RET
239
View as plain text