Text file src/internal/bytealg/index_loong64.s

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
        	// Entry point for Index: move the separator arguments into the registers
        	// that indexbody documents as its inputs (R6 = pointer, R7 = length) and
        	// jump there; indexbody's RET then returns directly to our caller.
        	// R7 must be copied to R6 first, because the second move overwrites R7.
        	// NOTE(review): presumably R4/R5 already hold the searched data's
        	// pointer/length and R7/R8 the separator's pointer/length on entry
        	// (ABIInternal slice layout) — confirm against the Go declaration.
     9  	MOVV	R7, R6		// R6 = separator pointer
    10  	MOVV	R8, R7		// R7 = separator length
    11  	JMP	indexbody<>(SB)
    12  
    13  TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
        	// Entry point for IndexString: no register shuffling is done here,
        	// control goes straight to indexbody, whose RET returns to our caller.
        	// NOTE(review): presumably the two string arguments already arrive as
        	// R4/R5 = string pointer/length and R6/R7 = separator pointer/length,
        	// i.e. exactly the layout indexbody expects — confirm.
    14  	JMP	indexbody<>(SB)
    15  
    16  // indexbody scans the string for the first occurrence of the separator.
        //
        // input:
    17  //   R4 = string
    18  //   R5 = length
    19  //   R6 = separator pointer
    20  //   R7 = separator length (2 <= len <= 64)
        // output:
        //   R4 = offset of the first match from the start of the string, or -1
        // registers used internally:
        //   R8  = address of the last candidate start (string + length - separator length)
        //   R9  = string + 1; subtracted from R4 at 'found', because every loop has
        //         already advanced R4 by one by the time it compares
        //   R5  = scratch for the length dispatch
        //   R7  = on the >= 9 byte paths, rewritten into an index for the tail chunk
        //   R10-R17, V0-V7, X0-X5 = separator chunks and candidate chunks
        //
        // Every length class has the same loop shape:
        //   1. give up (not_found) once the candidate start R4 has moved past R8;
        //   2. load the first chunk of the candidate, then advance R4 by one;
        //   3. compare chunk by chunk against the preloaded separator chunks and
        //      go back to step 1 on the first difference;
        //   4. if all chunks agree, jump to found.
        // Because of step 2, every later candidate load uses an offset that is one
        // smaller than the matching separator offset.
        // Lengths that are not a multiple of the chunk size are covered by a final
        // chunk anchored at the end of the separator, which may overlap the previous one.
    21  TEXT indexbody<>(SB),NOSPLIT,$0
    22  	// The main idea is to load 'sep' into registers once, up front,
    23  	// so that it does not have to be re-loaded
    24  	// for every subsequent substring comparison.
    25  	SUBV	R7, R5, R8
    26  	ADDV	R4, R8		// R8 contains the start of last substring for comparison
    27  	ADDV	$1, R4, R9	// store base for later
    28  
        	// Dispatch on the separator length: 8 and above first, then 2..7 by bit tests.
    29  	MOVV	$8, R5
    30  	BGE	R7, R5, len_gt_or_eq_8
    31  len_2_7:
    32  	AND	$0x4, R7, R5	// bit 2 set => length is 4..7
    33  	BNE	R5, len_4_7
    34  
    35  len_2_3:
    36  	AND	$0x1, R7, R5	// bit 0 set => length is 3, otherwise 2
    37  	BNE	R5, len_3
    38  
        	// length 2: one 2-byte compare per candidate.
    39  len_2:
    40  	MOVHU	(R6), R10
    41  loop_2:
    42  	BLT	R8, R4, not_found
    43  	MOVHU	(R4), R11
    44  	ADDV	$1, R4
    45  	BNE	R10, R11, loop_2
    46  	JMP	found
    47  
        	// length 3: 2 bytes at offset 0, then the single byte at offset 2.
    48  len_3:
    49  	MOVHU	(R6), R10
    50  	MOVBU	2(R6), R11
    51  loop_3:
    52  	BLT	R8, R4, not_found
    53  	MOVHU	(R4), R12
    54  	ADDV	$1, R4
    55  	BNE	R10, R12, loop_3
    56  	MOVBU	1(R4), R13	// candidate byte 2 (R4 was already advanced by one)
    57  	BNE	R11, R13, loop_3
    58  	JMP	found
    59  
    60  len_4_7:
    61  	AND	$0x2, R7, R5	// bit 1 set => length is 6 or 7
    62  	BNE	R5, len_6_7
    63  	AND	$0x1, R7, R5	// bit 0 set => length is 5, otherwise 4
    64  	BNE	R5, len_5
        	// length 4: one 4-byte compare per candidate.
    65  len_4:
    66  	MOVWU	(R6), R10
    67  loop_4:
    68  	BLT	R8, R4, not_found
    69  	MOVWU	(R4), R11
    70  	ADDV	$1, R4
    71  	BNE	R10, R11, loop_4
    72  	JMP	found
    73  
        	// length 5: 4 bytes at offset 0, then the single byte at offset 4.
    74  len_5:
    75  	MOVWU	(R6), R10
    76  	MOVBU	4(R6), R11
    77  loop_5:
    78  	BLT	R8, R4, not_found
    79  	MOVWU	(R4), R12
    80  	ADDV	$1, R4
    81  	BNE	R10, R12, loop_5
    82  	MOVBU	3(R4), R13	// candidate byte 4
    83  	BNE	R11, R13, loop_5
    84  	JMP	found
    85  
    86  len_6_7:
    87  	AND	$0x1, R7, R5	// bit 0 set => length is 7, otherwise 6
    88  	BNE	R5, len_7
        	// length 6: 4 bytes at offset 0, then 2 bytes at offset 4.
    89  len_6:
    90  	MOVWU	(R6), R10
    91  	MOVHU	4(R6), R11
    92  loop_6:
    93  	BLT	R8, R4, not_found
    94  	MOVWU	(R4), R12
    95  	ADDV	$1, R4
    96  	BNE	R10, R12, loop_6
    97  	MOVHU	3(R4), R13	// candidate bytes 4..5
    98  	BNE	R11, R13, loop_6
    99  	JMP	found
   100  
        	// length 7: two 4-byte chunks at offsets 0 and 3; they overlap in byte 3.
   101  len_7:
   102  	MOVWU	(R6), R10
   103  	MOVWU	3(R6), R11
   104  loop_7:
   105  	BLT	R8, R4, not_found
   106  	MOVWU	(R4), R12
   107  	ADDV	$1, R4
   108  	BNE	R10, R12, loop_7
   109  	MOVWU	2(R4), R13	// candidate bytes 3..6
   110  	BNE	R11, R13, loop_7
   111  	JMP	found
   112  
   113  len_gt_or_eq_8:
   114  	BEQ	R5, R7, len_8	// R5 still holds 8 from the dispatch above
   115  	MOVV	$17, R5
   116  	BGE	R7, R5, len_gt_or_eq_17
   117  	JMP	len_9_16
        	// length 8: one 8-byte compare per candidate.
   118  len_8:
   119  	MOVV	(R6), R10
   120  loop_8:
   121  	BLT	R8, R4, not_found
   122  	MOVV	(R4), R11
   123  	ADDV	$1, R4
   124  	BNE	R10, R11, loop_8
   125  	JMP	found
   126  
        	// length 9..16: the first 8 bytes plus the last 8 bytes of the separator.
   127  len_9_16:
   128  	MOVV	(R6), R10
   129  	SUBV	$8, R7		// R7 = len-8: offset of the last 8 bytes
   130  	MOVV	(R6)(R7), R11
   131  	SUBV	$1, R7		// R7 = len-9: same offset relative to the already advanced R4
   132  loop_9_16:
   133  	BLT	R8, R4, not_found
   134  	MOVV	(R4), R12
   135  	ADDV	$1, R4
   136  	BNE	R10, R12, loop_9_16
   137  	MOVV	(R4)(R7), R13	// last 8 bytes of the candidate
   138  	BNE	R11, R13, loop_9_16
   139  	JMP	found
   140  
   141  len_gt_or_eq_17:
   142  	MOVV	$25, R5
   143  	BGE	R7, R5, len_gt_or_eq_25
        	// length 17..24: 8-byte chunks at offsets 0 and 8, plus the last 8 bytes.
   144  len_17_24:
   145  	MOVV	0(R6), R10
   146  	MOVV	8(R6), R11
   147  	SUBV	$8, R7		// R7 = len-8: offset of the last 8 bytes
   148  	MOVV	(R6)(R7), R12
   149  	SUBV	$1, R7		// R7 = len-9: compensates for the ADDV $1, R4 in the loop
   150  loop_17_24:
   151  	BLT	R8, R4, not_found
   152  	MOVV	(R4), R13
   153  	ADDV	$1, R4
   154  	BNE	R10, R13, loop_17_24
   155  	MOVV	7(R4), R14	// candidate bytes 8..15
   156  	BNE	R11, R14, loop_17_24
   157  	MOVV	(R4)(R7), R15	// last 8 bytes of the candidate
   158  	BNE	R12, R15, loop_17_24
   159  	JMP	found
   160  
   161  len_gt_or_eq_25:
   162  	MOVV	$33, R5
   163  	BGE	R7, R5, len_gt_or_eq_33
        	// length 25..32: take the LSX path when the HasLSX byte is non-zero,
        	// otherwise fall through to the general-purpose-register version.
   164  	MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10
   165  	BNE	R10, lsx_len_25_32
        	// scalar 25..32: 8-byte chunks at offsets 0, 8 and 16, plus the last 8 bytes.
   166  len_25_32:
   167  	MOVV	0(R6), R10
   168  	MOVV	8(R6), R11
   169  	MOVV	16(R6), R12
   170  	SUBV	$8, R7		// R7 = len-8: offset of the last 8 bytes
   171  	MOVV	(R6)(R7), R13
   172  	SUBV	$1, R7		// R7 = len-9: compensates for the ADDV $1, R4 in the loop
   173  loop_25_32:
   174  	BLT	R8, R4, not_found
   175  	MOVV	(R4), R14
   176  	ADDV	$1, R4
   177  	BNE	R10, R14, loop_25_32
   178  	MOVV	7(R4), R15	// candidate bytes 8..15
   179  	BNE	R11, R15, loop_25_32
   180  	MOVV	15(R4), R16	// candidate bytes 16..23
   181  	BNE	R12, R16, loop_25_32
   182  	MOVV	(R4)(R7), R17	// last 8 bytes of the candidate
   183  	BNE	R13, R17, loop_25_32
   184  	JMP	found
   185  
   186  	// On loong64, LSX is included if LASX is supported.
        	// LSX 25..32: the first 16 bytes plus the last 16 bytes of the separator.
        	// In all vector loops below, BFPT returns to the loop head when the flag
        	// is set, i.e. a set flag is treated as "this chunk differs".
        	// NOTE(review): presumably VSEQV yields all-ones for each equal 64-bit lane
        	// and VSETANYEQV sets the flag when any lane is zero — confirm against
        	// the LoongArch reference manual.
   187  lasx_len_25_32:
   188  lsx_len_25_32:
   189  	VMOVQ	0(R6), V0
   190  	SUBV	$16, R7		// R7 = len-16: offset of the last 16 bytes
   191  	VMOVQ	(R6)(R7), V1
   192  	SUBV	$1, R7		// R7 = len-17: compensates for the ADDV $1, R4 in the loop
   193  lsx_loop_25_32:
   194  	BLT	R8, R4, not_found
   195  	VMOVQ	(R4), V2
   196  	ADDV	$1, R4
   197  	VSEQV	V0, V2, V2
   198  	VSETANYEQV	V2, FCC0
   199  	BFPT	FCC0, lsx_loop_25_32
   200  
   201  	VMOVQ	(R4)(R7), V3	// last 16 bytes of the candidate
   202  	VSEQV	V1, V3, V3
   203  	VSETANYEQV	V3, FCC1
   204  	BFPT	FCC1, lsx_loop_25_32
   205  	JMP	found
   206  
        	// length >= 33: only vector versions exist. R10 carries the HasLASX byte
        	// into the two dispatch stubs below.
        	// NOTE(review): HasLSX is not tested on this path; presumably the caller
        	// only passes separators longer than 32 bytes when LSX or LASX is
        	// available — confirm against the Go-side length limit.
   207  len_gt_or_eq_33:
   208  	MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10
   209  	MOVV	$49, R5
   210  	BGE	R7, R5, len_gt_or_eq_49
   211  len_33_48:
   212  	BNE	R10, lasx_len_33_48
   213  	JMP	lsx_len_33_48
   214  
   215  len_gt_or_eq_49:
   216  len_49_64:
   217  	BNE	R10, lasx_len_49_64
   218  	JMP	lsx_len_49_64
   219  
        	// LSX 33..48: 16-byte chunks at offsets 0 and 16, plus the last 16 bytes.
   220  lsx_len_33_48:
   221  	VMOVQ	0(R6), V0
   222  	VMOVQ	16(R6), V1
   223  	SUBV	$16, R7		// R7 = len-16: offset of the last 16 bytes
   224  	VMOVQ	(R6)(R7), V2
   225  	SUBV	$1, R7		// R7 = len-17: compensates for the ADDV $1, R4 in the loop
   226  lsx_loop_33_48:
   227  	BLT	R8, R4, not_found
   228  	VMOVQ	0(R4), V3
   229  	ADDV	$1, R4
   230  	VSEQV	V0, V3, V3
   231  	VSETANYEQV	V3, FCC0
   232  	BFPT	FCC0, lsx_loop_33_48
   233  
   234  	VMOVQ	15(R4), V4	// candidate bytes 16..31
   235  	VSEQV	V1, V4, V4
   236  	VSETANYEQV	V4, FCC1
   237  	BFPT	FCC1, lsx_loop_33_48
   238  
   239  	VMOVQ	(R4)(R7), V5	// last 16 bytes of the candidate
   240  	VSEQV	V2, V5, V5
   241  	VSETANYEQV	V5, FCC2
   242  	BFPT	FCC2, lsx_loop_33_48
   243  	JMP	found
   244  
        	// LSX 49..64: 16-byte chunks at offsets 0, 16 and 32, plus the last 16 bytes.
   245  lsx_len_49_64:
   246  	VMOVQ	0(R6), V0
   247  	VMOVQ	16(R6), V1
   248  	VMOVQ	32(R6), V2
   249  	SUBV	$16, R7		// R7 = len-16: offset of the last 16 bytes
   250  	VMOVQ	(R6)(R7), V3
   251  	SUBV	$1, R7		// R7 = len-17: compensates for the ADDV $1, R4 in the loop
   252  lsx_loop_49_64:
   253  	BLT	R8, R4, not_found
   254  	VMOVQ	0(R4), V4
   255  	ADDV	$1, R4
   256  	VSEQV	V0, V4, V4
   257  	VSETANYEQV	V4, FCC0
   258  	BFPT	FCC0, lsx_loop_49_64
   259  
   260  	VMOVQ	15(R4), V5	// candidate bytes 16..31
   261  	VSEQV	V1, V5, V5
   262  	VSETANYEQV	V5, FCC1
   263  	BFPT	FCC1, lsx_loop_49_64
   264  
   265  	VMOVQ	31(R4), V6	// candidate bytes 32..47
   266  	VSEQV	V2, V6, V6
   267  	VSETANYEQV	V6, FCC2
   268  	BFPT	FCC2, lsx_loop_49_64
   269  
   270  	VMOVQ	(R4)(R7), V7	// last 16 bytes of the candidate
   271  	VSEQV	V3, V7, V7
   272  	VSETANYEQV	V7, FCC3
   273  	BFPT	FCC3, lsx_loop_49_64
   274  	JMP	found
   275  
        	// LASX 33..64: one loop serves both length classes, using the first
        	// 32 bytes plus the last 32 bytes of the separator.
   276  lasx_len_33_48:
   277  lasx_len_49_64:
   278  lasx_len_33_64:
   279  	XVMOVQ	(R6), X0
   280  	SUBV	$32, R7		// R7 = len-32: offset of the last 32 bytes
   281  	XVMOVQ	(R6)(R7), X1
   282  	SUBV	$1, R7		// R7 = len-33: compensates for the ADDV $1, R4 in the loop
   283  lasx_loop_33_64:
   284  	BLT	R8, R4, not_found
   285  	XVMOVQ	(R4), X2
   286  	ADDV	$1, R4
   287  	XVSEQV	X0, X2, X3
   288  	XVSETANYEQV	X3, FCC0
   289  	BFPT	FCC0, lasx_loop_33_64
   290  
   291  	XVMOVQ	(R4)(R7), X4	// last 32 bytes of the candidate
   292  	XVSEQV	X1, X4, X5
   293  	XVSETANYEQV	X5, FCC1
   294  	BFPT	FCC1, lasx_loop_33_64
   295  	JMP	found
   296  
        	// R4 is one past the matching candidate start and R9 is one past the
        	// string start, so the difference is the match offset.
   297  found:
   298  	SUBV	R9, R4
   299  	RET
   300  
        	// No candidate matched: report -1.
   301  not_found:
   302  	MOVV	$-1, R4
   303  	RET
   304  

View as plain text