1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "textflag.h"
6
7 // func IndexByte(b []byte, c byte) int
8 // input:
9 // R0: b ptr
10 // R1: b len
11 // R2: b cap (unused)
12 // R3: c byte to search
13 // return
14 // R0: result
15 TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
16 MOVD R3, R2
17 B ·IndexByteString<ABIInternal>(SB)
18
19 // func IndexByteString(s string, c byte) int
20 // input:
21 // R0: s ptr
22 // R1: s len
23 // R2: c byte to search
24 // return
25 // R0: result
26 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
27 // Core algorithm:
28 // For each 32-byte chunk we calculate a 64-bit syndrome value,
29 // with two bits per byte. For each tuple, bit 0 is set if the
30 // relevant byte matched the requested character and bit 1 is
31 // not used (faster than using a 32bit syndrome). Since the bits
32 // in the syndrome reflect exactly the order in which things occur
33 // in the original string, counting trailing zeros allows to
34 // identify exactly which byte has matched.
35
36 CBZ R1, fail
37 #ifdef GOOS_android
38 ADD R0, R1, R20 // R20 = end of data
39 BIC $0xf, R0, R21 // R21 = earliest we can read
40 #endif
41 MOVD R0, R11
42 // Magic constant 0x40100401 allows us to identify
43 // which lane matches the requested byte.
44 // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
45 // Different bytes have different bit masks (i.e: 1, 4, 16, 64)
46 MOVD $0x40100401, R5
47 VMOV R2, V0.B16
48 // Work with aligned 32-byte chunks
49 BIC $0x1f, R0, R3
50 VMOV R5, V5.S4
51 ANDS $0x1f, R0, R9
52 AND $0x1f, R1, R10
53 BEQ loop
54
55 // Input string is not 32-byte aligned. We calculate the
56 // syndrome value for the aligned 32 bytes block containing
57 // the first bytes and mask off the irrelevant part.
58 #ifdef GOOS_android
59 // Android requires us to not read outside an aligned 16-byte
60 // region because MTE might be enforced.
61 CMP R21, R3
62 BLO 2(PC)
63 VLD1 (R3), [V1.B16]
64 ADD $0x10, R3
65 CMP R3, R20
66 BLS 2(PC)
67 VLD1 (R3), [V2.B16]
68 ADD $0x10, R3
69 #else
70 VLD1.P (R3), [V1.B16, V2.B16]
71 #endif
72 SUB $0x20, R9, R4
73 ADDS R4, R1, R1
74 VCMEQ V0.B16, V1.B16, V3.B16
75 VCMEQ V0.B16, V2.B16, V4.B16
76 VAND V5.B16, V3.B16, V3.B16
77 VAND V5.B16, V4.B16, V4.B16
78 VADDP V4.B16, V3.B16, V6.B16 // 256->128
79 VADDP V6.B16, V6.B16, V6.B16 // 128->64
80 VMOV V6.D[0], R6
81 // Clear the irrelevant lower bits
82 LSL $1, R9, R4
83 LSR R4, R6, R6
84 LSL R4, R6, R6
85 // The first block can also be the last
86 BLS masklast
87 // Have we found something already?
88 CBNZ R6, tail
89
90 loop:
91 #ifdef GOOS_android
92 CMP R21, R3
93 BLO 2(PC)
94 VLD1 (R3), [V1.B16]
95 ADD $0x10, R3
96 CMP R3, R20
97 BLS 2(PC)
98 VLD1 (R3), [V2.B16]
99 ADD $0x10, R3
100 #else
101 VLD1.P (R3), [V1.B16, V2.B16]
102 #endif
103 SUBS $0x20, R1, R1
104 VCMEQ V0.B16, V1.B16, V3.B16
105 VCMEQ V0.B16, V2.B16, V4.B16
106 // If we're out of data we finish regardless of the result
107 BLS end
108 // Use a fast check for the termination condition
109 VORR V4.B16, V3.B16, V6.B16
110 VADDP V6.D2, V6.D2, V6.D2
111 VMOV V6.D[0], R6
112 // We're not out of data, loop if we haven't found the character
113 CBZ R6, loop
114
115 end:
116 // Termination condition found, let's calculate the syndrome value
117 VAND V5.B16, V3.B16, V3.B16
118 VAND V5.B16, V4.B16, V4.B16
119 VADDP V4.B16, V3.B16, V6.B16
120 VADDP V6.B16, V6.B16, V6.B16
121 VMOV V6.D[0], R6
122 // Only do the clear for the last possible block with less than 32 bytes
123 // Condition flags come from SUBS in the loop
124 BHS tail
125
126 masklast:
127 // Clear the irrelevant upper bits
128 ADD R9, R10, R4
129 AND $0x1f, R4, R4
130 SUB $0x20, R4, R4
131 NEG R4<<1, R4
132 LSL R4, R6, R6
133 LSR R4, R6, R6
134
135 tail:
136 // Check that we have found a character
137 CBZ R6, fail
138 // Count the trailing zeros using bit reversing
139 RBIT R6, R6
140 // Compensate the last post-increment
141 SUB $0x20, R3, R3
142 // And count the leading zeros
143 CLZ R6, R6
144 // R6 is twice the offset into the fragment
145 ADD R6>>1, R3, R0
146 // Compute the offset result
147 SUB R11, R0, R0
148 RET
149
150 fail:
151 MOVD $-1, R0
152 RET
153
View as plain text