Text file src/internal/bytealg/count_loong64.s

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
     9  	// Register-ABI entry for a byte slice: R4 = b_base
    10  	// R5 = b_len
    11  	// R6 = b_cap (unused; overwritten below with the byte to count)
    12  	// R7 = byte to count
    13  	AND	$0xff, R7, R6	// R6 = low 8 bits of R7, the register countbody reads the byte from
    14  	JMP	countbody<>(SB)	// tail jump: countbody puts the count in R4 and executes the RET
    15  
    15  
    16  TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
    17  	// Register-ABI entry for a string: R4 = s_base
    18  	// R5 = s_len
    19  	// R6 = byte to count
    20  	AND	$0xff, R6	// keep only the low 8 bits; R6 is already the register countbody reads
    21  	JMP	countbody<>(SB)	// tail jump: countbody puts the count in R4 and executes the RET
    22  
    23  // countbody counts the bytes equal to R6 in the buffer at R4 and returns the count in R4.
    24  //   input:  R4 = s_base
    25  //           R5 = s_len
    26  //           R6 = byte to count (Count and CountString mask it to 8 bits before jumping here)
    27  TEXT countbody<>(SB),NOSPLIT,$0
    28  	MOVV	R0, R7	// R7 = running count, starts at 0; copied to R4 at done
    29  
    30  	// short path to handle 0-byte case
    31  	BEQ	R5, done	// s_len == 0: return the zero count
    32  
    33  	// jump directly to the byte-at-a-time tail if length < 8
    34  	MOVV	$8, R8
    35  	BLT	R5, R8, tail
    36  
    37  	// Implemented using 256-bit SIMD instructions (taken when the LASX flag below is non-zero)
    38  lasxCountBody:
    39  	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8	// R8 = HasLASX feature byte
    40  	BEQ	R8, lsxCountBody	// flag is zero: try the 128-bit path instead
    41  	XVMOVQ	R6, X0.B32	// X0 = byte to count replicated into all 32 byte lanes
    42  
    43  	// jump directly to lasx32 if length < 128
    44  	MOVV	$128, R8
    45  	BLT	R5, R8, lasx32
    46  lasx128:
    47  lasx128Loop:	// 128 bytes per iteration; R8 stays 128 for the loop test
    48  	XVMOVQ	0(R4), X1	// load four consecutive 32-byte chunks
    49  	XVMOVQ	32(R4), X2
    50  	XVMOVQ	64(R4), X3
    51  	XVMOVQ	96(R4), X4
    52  
    53  	XVSEQB  X0, X1, X5	// byte-wise compare with the target; a matching lane becomes all ones
    54  	XVSEQB  X0, X2, X6
    55  	XVSEQB  X0, X3, X7
    56  	XVSEQB  X0, X4, X8
    57  
    58  	XVANDB  $1, X5, X5	// keep a single bit per matching byte so a popcount counts matches
    59  	XVANDB  $1, X6, X6
    60  	XVANDB  $1, X7, X7
    61  	XVANDB  $1, X8, X8
    62  
    63  	XVPCNTV	X5, X1	// population count per 64-bit lane = matches in each 8-byte group
    64  	XVPCNTV	X6, X2
    65  	XVPCNTV	X7, X3
    66  	XVPCNTV	X8, X4
    67  
    68  	XVADDV	X2, X1	// lane-wise sum of the four count vectors, accumulated into X1
    69  	XVADDV	X4, X3
    70  	XVADDV	X3, X1
    71  
    72  	XVMOVQ	X1.V[0], R9	// move the four 64-bit lane totals to general registers
    73  	XVMOVQ	X1.V[1], R10
    74  	XVMOVQ	X1.V[2], R11
    75  	XVMOVQ	X1.V[3], R12
    76  
    77  	ADDV	R9, R10	// R10 = lane 0 + lane 1
    78  	ADDV	R11, R12	// R12 = lane 2 + lane 3
    79  	ADDV	R10, R7	// add both partial sums to the running count
    80  	ADDV	R12, R7
    81  
    82  	ADDV	$-128, R5	// 128 fewer bytes remain
    83  	ADDV	$128, R4	// advance the data pointer past them
    84  	BGE	R5, R8, lasx128Loop	// repeat while at least 128 bytes remain
    85  
    86  lasx32:
    87  	// jump directly to lasx8 if length < 32
    88  	MOVV	$32, R8
    89  	BLT	R5, R8, lasx8
    90  lasx32Loop:	// 32 bytes per iteration, same compare / mask / popcount sequence on one vector
    91  	XVMOVQ	0(R4), X1
    92  	XVSEQB  X0, X1, X2
    93  	XVANDB  $1, X2, X2
    94  	XVPCNTV	X2, X1
    95  	XVMOVQ	X1.V[0], R9	// extract and sum the four 64-bit lane counts
    96  	XVMOVQ	X1.V[1], R10
    97  	XVMOVQ	X1.V[2], R11
    98  	XVMOVQ	X1.V[3], R12
    99  	ADDV	R9, R10
   100  	ADDV	R11, R12
   101  	ADDV	R10, R7
   102  	ADDV	R12, R7
   103  	ADDV	$-32, R5
   104  	ADDV	$32, R4
   105  	BGE	R5, R8, lasx32Loop	// repeat while at least 32 bytes remain
   106  lasx8:
   107  	// jump directly to tail if length < 8
   108  	MOVV	$8, R8
   109  	BLT	R5, R8, tail
   110  lasx8Loop:	// 8 bytes per iteration using 128-bit instructions
   111  	MOVV	0(R4), R9	// load 8 bytes with a scalar load
   112  	VMOVQ	R9, V1.V[0]	// place them in 64-bit lane 0 of V1
   113  	VSEQB	V0, V1, V2	// V0 is never written on this path: relies on V0 being the low 128 bits of X0 set above
   114  	VANDB	$1, V2, V2
   115  	VPCNTV	V2, V1
   116  
   117  	VMOVQ	V1.V[0], R9	// only lane 0 is read, so the other lane of V1 does not affect the count
   118  	ADDV	R9, R7
   119  	ADDV	$-8, R5
   120  	ADDV	$8, R4
   121  	BGE	R5, R8, lasx8Loop	// repeat while at least 8 bytes remain
   122  	JMP	tail
   123  
   124  	// Implemented using 128-bit SIMD instructions (taken when the LSX flag below is non-zero)
   125  lsxCountBody:
   126  	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8	// R8 = HasLSX feature byte
   127  	BEQ	R8, genericCountBody	// flag is zero: fall back to general-purpose instructions
   128  	VMOVQ	R6, V0.B16	// V0 = byte to count replicated into all 16 byte lanes
   129  
   130  	// jump directly to lsx16 if length < 64
   131  	MOVV	$64, R8
   132  	BLT	R5, R8, lsx16
   133  lsx64:
   134  lsx64Loop:	// 64 bytes per iteration; R8 stays 64 for the loop test
   135  	VMOVQ	0(R4),  V1	// load four consecutive 16-byte chunks
   136  	VMOVQ	16(R4), V2
   137  	VMOVQ	32(R4), V3
   138  	VMOVQ	48(R4), V4
   139  
   140  	VSEQB  V0, V1, V5	// byte-wise compare with the target; a matching lane becomes all ones
   141  	VSEQB  V0, V2, V6
   142  	VSEQB  V0, V3, V7
   143  	VSEQB  V0, V4, V8
   144  
   145  	VANDB  $1, V5, V5	// keep a single bit per matching byte so a popcount counts matches
   146  	VANDB  $1, V6, V6
   147  	VANDB  $1, V7, V7
   148  	VANDB  $1, V8, V8
   149  
   150  	VPCNTV	V5, V1	// population count per 64-bit lane = matches in each 8-byte group
   151  	VPCNTV	V6, V2
   152  	VPCNTV	V7, V3
   153  	VPCNTV	V8, V4
   154  
   155  	VADDV	V2, V1	// lane-wise sum of the four count vectors, accumulated into V1
   156  	VADDV	V4, V3
   157  	VADDV	V3, V1
   158  
   159  	VMOVQ	V1.V[0], R9	// move the two 64-bit lane totals to general registers
   160  	VMOVQ	V1.V[1], R10
   161  	ADDV	R9, R7	// add both to the running count
   162  	ADDV	R10, R7
   163  
   164  	ADDV	$-64, R5	// 64 fewer bytes remain
   165  	ADDV	$64, R4	// advance the data pointer past them
   166  	BGE	R5, R8, lsx64Loop	// repeat while at least 64 bytes remain
   167  
   168  lsx16:
   169  	// jump directly to lsx8 if length < 16
   170  	MOVV	$16, R8
   171  	BLT	R5, R8, lsx8
   172  lsx16Loop:	// 16 bytes per iteration, same compare / mask / popcount sequence on one vector
   173  	VMOVQ	0(R4), V1
   174  	VSEQB	V0, V1, V2
   175  	VANDB  $1, V2, V2
   176  	VPCNTV	V2, V1
   177  	VMOVQ	V1.V[0], R9	// extract and add the two 64-bit lane counts
   178  	VMOVQ	V1.V[1], R10
   179  	ADDV	R9, R7
   180  	ADDV	R10, R7
   181  	ADDV	$-16, R5
   182  	ADDV	$16, R4
   183  	BGE	R5, R8, lsx16Loop	// repeat while at least 16 bytes remain
   184  lsx8:
   185  	// jump directly to tail if length < 8
   186  	MOVV	$8, R8
   187  	BLT	R5, R8, tail
   188  lsx8Loop:	// 8 bytes per iteration
   189  	MOVV	0(R4), R9	// load 8 bytes with a scalar load
   190  	VMOVQ	R9, V1.V[0]	// place them in 64-bit lane 0 of V1
   191  	VSEQB	V0, V1, V2
   192  	VANDB	$1, V2, V2
   193  	VPCNTV	V2, V1
   194  
   195  	VMOVQ	V1.V[0], R9	// only lane 0 is read, so the other lane of V1 does not affect the count
   196  	ADDV	R9, R7
   197  	ADDV	$-8, R5
   198  	ADDV	$8, R4
   199  	BGE	R5, R8, lsx8Loop	// repeat while at least 8 bytes remain
   200  	JMP	tail
   201  
   202  	// Implemented using general instructions: 4 bytes per iteration, taken from the end of the buffer
   203  genericCountBody:
   204  	MOVV	$4, R8	// R8 = bytes consumed per iteration
   205  	MOVV	$1, R9	// R9 = constant 1, the value MASKNEZ yields on a match
   206  genericLoop:
   207  	BLT	R5, R8, tail	// fewer than 4 bytes left: finish in tail
   208  	ADDV	$-4, R5	// R5 = offset of the last unprocessed 4 bytes; R4 is not advanced on this path
   209  	MOVWU	(R4)(R5), R10	// R10 = the 4 bytes at R4+R5, zero-extended
   210  	BSTRPICKW	$7, R10, $0, R11	// R11 = bits 7..0 of the word
   211  	BSTRPICKW	$15, R10, $8, R12	// R12 = bits 15..8 of the word
   212  	XOR	R6, R11	// result is zero exactly when the byte equals the target
   213  	XOR	R6, R12
   214  	MASKNEZ	R11, R9, R13	// R13 = 1 if R11 == 0 (match), otherwise 0
   215  	MASKNEZ	R12, R9, R14
   216  	ADDV	R13, R7
   217  	ADDV	R14, R7
   218  	BSTRPICKW	$23, R10, $16, R11	// same test for bits 23..16
   219  	BSTRPICKW	$31, R10, $24, R12	// and bits 31..24
   220  	XOR	R6, R11
   221  	XOR	R6, R12
   222  	MASKNEZ	R11, R9, R13
   223  	MASKNEZ	R12, R9, R14
   224  	ADDV	R13, R7
   225  	ADDV	R14, R7
   226  	JMP	genericLoop
   227  
   228  	// Byte-at-a-time loop for the remaining bytes (fewer than 8), walking from the end toward R4
   229  tail:
   230  	BEQ	R5, done	// nothing left to examine
   231  	ADDV	$-1, R5	// R5 = offset of the last unprocessed byte
   232  	MOVBU   (R4)(R5), R8	// R8 = that byte, zero-extended
   233  	BNE	R6, R8, tail	// no match: move on to the previous byte
   234  	ADDV	$1, R7	// match: bump the count
   235  	JMP	tail
   236  done:
   237  	MOVV	R7, R4	// return the count in R4
   238  	RET
   239  

View as plain text