// Copyright 2025 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "go_asm.h" #include "textflag.h" TEXT ·Count(SB),NOSPLIT,$0-40 // R4 = b_base // R5 = b_len // R6 = b_cap (unused) // R7 = byte to count AND $0xff, R7, R6 JMP countbody<>(SB) TEXT ·CountString(SB),NOSPLIT,$0-32 // R4 = s_base // R5 = s_len // R6 = byte to count AND $0xff, R6 JMP countbody<>(SB) // input: // R4 = s_base // R5 = s_len // R6 = byte to count TEXT countbody<>(SB),NOSPLIT,$0 MOVV R0, R7 // count // short path to handle 0-byte case BEQ R5, done // jump directly to tail length < 8 MOVV $8, R8 BLT R5, R8, tail // Implemented using 256-bit SMID instructions lasxCountBody: MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8 BEQ R8, lsxCountBody XVMOVQ R6, X0.B32 // jump directly to lasx32 if length < 128 MOVV $128, R8 BLT R5, R8, lasx32 lasx128: lasx128Loop: XVMOVQ 0(R4), X1 XVMOVQ 32(R4), X2 XVMOVQ 64(R4), X3 XVMOVQ 96(R4), X4 XVSEQB X0, X1, X5 XVSEQB X0, X2, X6 XVSEQB X0, X3, X7 XVSEQB X0, X4, X8 XVANDB $1, X5, X5 XVANDB $1, X6, X6 XVANDB $1, X7, X7 XVANDB $1, X8, X8 XVPCNTV X5, X1 XVPCNTV X6, X2 XVPCNTV X7, X3 XVPCNTV X8, X4 XVADDV X2, X1 XVADDV X4, X3 XVADDV X3, X1 XVMOVQ X1.V[0], R9 XVMOVQ X1.V[1], R10 XVMOVQ X1.V[2], R11 XVMOVQ X1.V[3], R12 ADDV R9, R10 ADDV R11, R12 ADDV R10, R7 ADDV R12, R7 ADDV $-128, R5 ADDV $128, R4 BGE R5, R8, lasx128Loop lasx32: // jump directly to lasx8 if length < 32 MOVV $32, R8 BLT R5, R8, lasx8 lasx32Loop: XVMOVQ 0(R4), X1 XVSEQB X0, X1, X2 XVANDB $1, X2, X2 XVPCNTV X2, X1 XVMOVQ X1.V[0], R9 XVMOVQ X1.V[1], R10 XVMOVQ X1.V[2], R11 XVMOVQ X1.V[3], R12 ADDV R9, R10 ADDV R11, R12 ADDV R10, R7 ADDV R12, R7 ADDV $-32, R5 ADDV $32, R4 BGE R5, R8, lasx32Loop lasx8: // jump directly to tail if length < 8 MOVV $8, R8 BLT R5, R8, tail lasx8Loop: MOVV 0(R4), R9 VMOVQ R9, V1.V[0] VSEQB V0, V1, V2 VANDB $1, V2, V2 VPCNTV V2, V1 VMOVQ V1.V[0], R9 ADDV R9, R7 ADDV $-8, R5 ADDV $8, R4 BGE R5, R8, lasx8Loop JMP tail // Implemented using 128-bit SMID instructions lsxCountBody: MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8 BEQ R8, genericCountBody VMOVQ R6, V0.B16 // jump directly to lsx16 if length < 64 MOVV $64, R8 BLT R5, R8, lsx16 lsx64: lsx64Loop: VMOVQ 0(R4), V1 VMOVQ 16(R4), V2 VMOVQ 32(R4), V3 VMOVQ 48(R4), V4 VSEQB V0, V1, V5 VSEQB V0, V2, V6 VSEQB V0, V3, V7 VSEQB V0, V4, V8 VANDB $1, V5, V5 VANDB $1, V6, V6 VANDB $1, V7, V7 VANDB $1, V8, V8 VPCNTV V5, V1 VPCNTV V6, V2 VPCNTV V7, V3 VPCNTV V8, V4 VADDV V2, V1 VADDV V4, V3 VADDV V3, V1 VMOVQ V1.V[0], R9 VMOVQ V1.V[1], R10 ADDV R9, R7 ADDV R10, R7 ADDV $-64, R5 ADDV $64, R4 BGE R5, R8, lsx64Loop lsx16: // jump directly to lsx8 if length < 16 MOVV $16, R8 BLT R5, R8, lsx8 lsx16Loop: VMOVQ 0(R4), V1 VSEQB V0, V1, V2 VANDB $1, V2, V2 VPCNTV V2, V1 VMOVQ V1.V[0], R9 VMOVQ V1.V[1], R10 ADDV R9, R7 ADDV R10, R7 ADDV $-16, R5 ADDV $16, R4 BGE R5, R8, lsx16Loop lsx8: // jump directly to tail if length < 8 MOVV $8, R8 BLT R5, R8, tail lsx8Loop: MOVV 0(R4), R9 VMOVQ R9, V1.V[0] VSEQB V0, V1, V2 VANDB $1, V2, V2 VPCNTV V2, V1 VMOVQ V1.V[0], R9 ADDV R9, R7 ADDV $-8, R5 ADDV $8, R4 BGE R5, R8, lsx8Loop JMP tail // Implemented using general instructions genericCountBody: MOVV $4, R8 MOVV $1, R9 genericLoop: BLT R5, R8, tail ADDV $-4, R5 MOVWU (R4)(R5), R10 BSTRPICKW $7, R10, $0, R11 BSTRPICKW $15, R10, $8, R12 XOR R6, R11 XOR R6, R12 MASKNEZ R11, R9, R13 MASKNEZ R12, R9, R14 ADDV R13, R7 ADDV R14, R7 BSTRPICKW $23, R10, $16, R11 BSTRPICKW $31, R10, $24, R12 XOR R6, R11 XOR R6, R12 MASKNEZ R11, R9, R13 MASKNEZ R12, R9, R14 ADDV R13, R7 ADDV R14, R7 JMP genericLoop // Work with tail shorter than 8 bytes tail: BEQ R5, done ADDV $-1, R5 MOVBU (R4)(R5), R8 BNE R6, R8, tail ADDV $1, R7 JMP tail done: MOVV R7, R4 RET