Text file src/internal/bytealg/indexbyte_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // func IndexByte(b []byte, c byte) int
     8  // input:
     9  //   R0: b ptr
    10  //   R1: b len
    11  //   R2: b cap (unused)
    12  //   R3: c byte to search
    13  // return
    14  //   R0: result
    15  TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
    16  	MOVD	R3, R2
    17  	B	·IndexByteString<ABIInternal>(SB)
    18  
    19  // func IndexByteString(s string, c byte) int
    20  // input:
    21  //   R0: s ptr
    22  //   R1: s len
    23  //   R2: c byte to search
    24  // return
    25  //   R0: result
    26  TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
    27  	// Core algorithm:
    28  	// For each 32-byte chunk we calculate a 64-bit syndrome value,
    29  	// with two bits per byte. For each tuple, bit 0 is set if the
    30  	// relevant byte matched the requested character and bit 1 is
    31  	// not used (faster than using a 32bit syndrome). Since the bits
    32  	// in the syndrome reflect exactly the order in which things occur
    33  	// in the original string, counting trailing zeros allows to
    34  	// identify exactly which byte has matched.
    35  
    36  	CBZ	R1, fail
    37  #ifdef GOOS_android
    38  	ADD	R0, R1, R20   // R20 = end of data
    39  	BIC	$0xf, R0, R21 // R21 = earliest we can read
    40  #endif
    41  	MOVD	R0, R11
    42  	// Magic constant 0x40100401 allows us to identify
    43  	// which lane matches the requested byte.
    44  	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
    45  	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
    46  	MOVD	$0x40100401, R5
    47  	VMOV	R2, V0.B16
    48  	// Work with aligned 32-byte chunks
    49  	BIC	$0x1f, R0, R3
    50  	VMOV	R5, V5.S4
    51  	ANDS	$0x1f, R0, R9
    52  	AND	$0x1f, R1, R10
    53  	BEQ	loop
    54  
    55  	// Input string is not 32-byte aligned. We calculate the
    56  	// syndrome value for the aligned 32 bytes block containing
    57  	// the first bytes and mask off the irrelevant part.
    58  #ifdef GOOS_android
    59  	// Android requires us to not read outside an aligned 16-byte
    60  	// region because MTE might be enforced.
    61  	CMP	R21, R3
    62  	BLO	2(PC)
    63  	VLD1	(R3), [V1.B16]
    64  	ADD	$0x10, R3
    65  	CMP	R3, R20
    66  	BLS	2(PC)
    67  	VLD1	(R3), [V2.B16]
    68  	ADD	$0x10, R3
    69  #else
    70  	VLD1.P	(R3), [V1.B16, V2.B16]
    71  #endif
    72  	SUB	$0x20, R9, R4
    73  	ADDS	R4, R1, R1
    74  	VCMEQ	V0.B16, V1.B16, V3.B16
    75  	VCMEQ	V0.B16, V2.B16, V4.B16
    76  	VAND	V5.B16, V3.B16, V3.B16
    77  	VAND	V5.B16, V4.B16, V4.B16
    78  	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
    79  	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
    80  	VMOV	V6.D[0], R6
    81  	// Clear the irrelevant lower bits
    82  	LSL	$1, R9, R4
    83  	LSR	R4, R6, R6
    84  	LSL	R4, R6, R6
    85  	// The first block can also be the last
    86  	BLS	masklast
    87  	// Have we found something already?
    88  	CBNZ	R6, tail
    89  
    90  loop:
    91  #ifdef GOOS_android
    92  	CMP	R21, R3
    93  	BLO	2(PC)
    94  	VLD1	(R3), [V1.B16]
    95  	ADD	$0x10, R3
    96  	CMP	R3, R20
    97  	BLS	2(PC)
    98  	VLD1	(R3), [V2.B16]
    99  	ADD	$0x10, R3
   100  #else
   101  	VLD1.P	(R3), [V1.B16, V2.B16]
   102  #endif
   103  	SUBS	$0x20, R1, R1
   104  	VCMEQ	V0.B16, V1.B16, V3.B16
   105  	VCMEQ	V0.B16, V2.B16, V4.B16
   106  	// If we're out of data we finish regardless of the result
   107  	BLS	end
   108  	// Use a fast check for the termination condition
   109  	VORR	V4.B16, V3.B16, V6.B16
   110  	VADDP	V6.D2, V6.D2, V6.D2
   111  	VMOV	V6.D[0], R6
   112  	// We're not out of data, loop if we haven't found the character
   113  	CBZ	R6, loop
   114  
   115  end:
   116  	// Termination condition found, let's calculate the syndrome value
   117  	VAND	V5.B16, V3.B16, V3.B16
   118  	VAND	V5.B16, V4.B16, V4.B16
   119  	VADDP	V4.B16, V3.B16, V6.B16
   120  	VADDP	V6.B16, V6.B16, V6.B16
   121  	VMOV	V6.D[0], R6
   122  	// Only do the clear for the last possible block with less than 32 bytes
   123  	// Condition flags come from SUBS in the loop
   124  	BHS	tail
   125  
   126  masklast:
   127  	// Clear the irrelevant upper bits
   128  	ADD	R9, R10, R4
   129  	AND	$0x1f, R4, R4
   130  	SUB	$0x20, R4, R4
   131  	NEG	R4<<1, R4
   132  	LSL	R4, R6, R6
   133  	LSR	R4, R6, R6
   134  
   135  tail:
   136  	// Check that we have found a character
   137  	CBZ	R6, fail
   138  	// Count the trailing zeros using bit reversing
   139  	RBIT	R6, R6
   140  	// Compensate the last post-increment
   141  	SUB	$0x20, R3, R3
   142  	// And count the leading zeros
   143  	CLZ	R6, R6
   144  	// R6 is twice the offset into the fragment
   145  	ADD	R6>>1, R3, R0
   146  	// Compute the offset result
   147  	SUB	R11, R0, R0
   148  	RET
   149  
   150  fail:
   151  	MOVD	$-1, R0
   152  	RET
   153  

View as plain text