1 // Copyright 2022 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8
9 // func keccakF1600NEON(a *[200]byte)
//
// keccakF1600NEON runs 24 rounds of the Keccak-f[1600] permutation in place
// on the 200-byte state that a points to. The state is handled as 25 64-bit
// lanes: they are loaded in memory order into V0..V24 (lane i in Vi), every
// loop iteration starts and ends with that layout, and the lanes are stored
// back in the same order after the last round.
//
// Register use:
//   R0       pointer to the state
//   R1       cursor into round_consts<>, advanced by 8 bytes each round
//   R2       rounds remaining
//   V0-V24   state lanes
//   V25-V31  scratch: column parities, theta corrections, displaced lanes
//            and (V26) the round constant
//
// Operand conventions used in the comments below (Go operand order, each
// operation applied per 64-bit lane or bitwise):
//   VEOR3 Va, Vm, Vn, Vd      Vd = Vn ^ Vm ^ Va
//   VRAX1 Vm, Vn, Vd          Vd = Vn ^ rol64(Vm, 1)
//   VXAR  $n, Vm, Vn, Vd      Vd = ror64(Vn ^ Vm, n)
//   VBCAX Va, Vm, Vn, Vd      Vd = Vn ^ (Vm AND NOT Va)
//
// NOTE(review): VEOR3, VRAX1, VXAR and VBCAX look like the Armv8 SHA3
// extension instructions; presumably callers check for CPU support before
// calling this routine - confirm at the call site.
// NOTE(review): no instruction below addresses the stack, so the 200-byte
// frame declared on the TEXT line appears to be unused - confirm before
// changing it.
10 TEXT ·keccakF1600NEON(SB), $200-8
11 MOVD a+0(FP), R0 // R0 = a, pointer to the state
12 MOVD $round_consts<>(SB), R1 // R1 = address of the first round constant
13 MOVD $24, R2 // R2 = number of rounds still to run
14
// Load the state two lanes at a time; each post-indexed load advances R0 by 16.
15 VLD1.P 16(R0), [V0.D1, V1.D1]
16 VLD1.P 16(R0), [V2.D1, V3.D1]
17 VLD1.P 16(R0), [V4.D1, V5.D1]
18 VLD1.P 16(R0), [V6.D1, V7.D1]
19 VLD1.P 16(R0), [V8.D1, V9.D1]
20 VLD1.P 16(R0), [V10.D1, V11.D1]
21 VLD1.P 16(R0), [V12.D1, V13.D1]
22 VLD1.P 16(R0), [V14.D1, V15.D1]
23 VLD1.P 16(R0), [V16.D1, V17.D1]
24 VLD1.P 16(R0), [V18.D1, V19.D1]
25 VLD1.P 16(R0), [V20.D1, V21.D1]
26 VLD1.P 16(R0), [V22.D1, V23.D1]
27 VLD1 (R0), [V24.D1] // last lane, loaded without advancing R0
28
29 SUB $192, R0, R0 // undo the 12 x 16-byte post-increments so R0 points at the state again
30
31 loop:
32 // theta: column parities. The first five VEOR3 combine lanes x+10, x+15
// and x+20 of each column x; the next five fold in lanes x and x+5, leaving
// the parity of column x in V(25+x).
33 VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
34 VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
35 VEOR3 V22.B16, V17.B16, V12.B16, V27.B16
36 VEOR3 V23.B16, V18.B16, V13.B16, V28.B16
37 VEOR3 V24.B16, V19.B16, V14.B16, V29.B16
38 VEOR3 V25.B16, V5.B16, V0.B16, V25.B16
39 VEOR3 V26.B16, V6.B16, V1.B16, V26.B16
40 VEOR3 V27.B16, V7.B16, V2.B16, V27.B16
41 VEOR3 V28.B16, V8.B16, V3.B16, V28.B16
42 VEOR3 V29.B16, V9.B16, V4.B16, V29.B16
43
// theta: per-column corrections. V27, V28 and V29 are overwritten in place,
// so the order matters: each parity is read by every VRAX1 that needs it
// before its register is replaced.
44 VRAX1 V27.D2, V25.D2, V30.D2 // V30 = V25 ^ rol64(V27, 1): correction applied to lanes 1, 6, 11, 16, 21
45 VRAX1 V28.D2, V26.D2, V31.D2 // V31 = V26 ^ rol64(V28, 1): correction applied to lanes 2, 7, 12, 17, 22
46 VRAX1 V29.D2, V27.D2, V27.D2 // V27 = V27 ^ rol64(V29, 1): correction applied to lanes 3, 8, 13, 18, 23
47 VRAX1 V25.D2, V28.D2, V28.D2 // V28 = V28 ^ rol64(V25, 1): correction applied to lanes 4, 9, 14, 19, 24
48 VRAX1 V26.D2, V29.D2, V29.D2 // V29 = V29 ^ rol64(V26, 1): correction applied to lanes 0, 5, 10, 15, 20
49
50 // theta and rho and Pi: each VXAR XORs a lane with its column correction,
// rotates it right by the immediate and writes the result to a different
// register, so the lane move happens in the same instruction. The chains
// are ordered so that a register is read before it is overwritten; results
// whose target is still live are parked in V25-V31 until chi consumes them.
51 VEOR V29.B16, V0.B16, V0.B16 // lane 0 is only XORed with its correction: no rotation, no move
52
53 VXAR $63, V30.D2, V1.D2, V25.D2 // lane 1 parked in V25 (the column parity it held is no longer needed)
54
55 VXAR $20, V30.D2, V6.D2, V1.D2
56 VXAR $44, V28.D2, V9.D2, V6.D2
57 VXAR $3, V31.D2, V22.D2, V9.D2
58 VXAR $25, V28.D2, V14.D2, V22.D2
59 VXAR $46, V29.D2, V20.D2, V14.D2
60
61 VXAR $2, V31.D2, V2.D2, V26.D2 // lane 2 parked in V26
62
63 VXAR $21, V31.D2, V12.D2, V2.D2
64 VXAR $39, V27.D2, V13.D2, V12.D2
65 VXAR $56, V28.D2, V19.D2, V13.D2
66 VXAR $8, V27.D2, V23.D2, V19.D2
67 VXAR $23, V29.D2, V15.D2, V23.D2
68
69 VXAR $37, V28.D2, V4.D2, V15.D2
70
71 VXAR $50, V28.D2, V24.D2, V28.D2 // last use of the V28 correction; the result replaces it
72 VXAR $62, V30.D2, V21.D2, V24.D2
73 VXAR $9, V27.D2, V8.D2, V8.D2 // source and destination are the same register
74 VXAR $19, V30.D2, V16.D2, V4.D2
75 VXAR $28, V29.D2, V5.D2, V16.D2
76
77 VXAR $36, V27.D2, V3.D2, V5.D2
78
79 VXAR $43, V27.D2, V18.D2, V27.D2 // last use of the V27 correction; the result replaces it
80 VXAR $49, V31.D2, V17.D2, V3.D2
81 VXAR $54, V30.D2, V11.D2, V30.D2 // last use of the V30 correction; the result replaces it
82 VXAR $58, V31.D2, V7.D2, V31.D2 // last use of the V31 correction; the result replaces it
83 VXAR $61, V29.D2, V10.D2, V29.D2 // last use of the V29 correction; the result replaces it
84
85 // chi and iota: each group of five VBCAX produces one row of five lanes
// and writes it back to its home registers (V20-V24, V15-V19, V10-V14,
// V5-V9, V0-V4). Within a group the destinations that are also inputs are
// written only after their last read, so the order must be kept.
86 VBCAX V8.B16, V22.B16, V26.B16, V20.B16
87 VBCAX V22.B16, V23.B16, V8.B16, V21.B16
88 VBCAX V23.B16, V24.B16, V22.B16, V22.B16
89 VBCAX V24.B16, V26.B16, V23.B16, V23.B16
90 VBCAX V26.B16, V8.B16, V24.B16, V24.B16
91
// V26 is not read by any later VBCAX in this round, so it is reused here:
// load the next 8-byte round constant into both 64-bit lanes of V26 and
// advance R1 by 8.
92 VLD1R.P 8(R1), [V26.D2]
93
94 VBCAX V3.B16, V19.B16, V30.B16, V17.B16
95 VBCAX V19.B16, V15.B16, V3.B16, V18.B16
96 VBCAX V15.B16, V16.B16, V19.B16, V19.B16
97 VBCAX V16.B16, V30.B16, V15.B16, V15.B16
98 VBCAX V30.B16, V3.B16, V16.B16, V16.B16
99
100 VBCAX V31.B16, V12.B16, V25.B16, V10.B16
101 VBCAX V12.B16, V13.B16, V31.B16, V11.B16
102 VBCAX V13.B16, V14.B16, V12.B16, V12.B16
103 VBCAX V14.B16, V25.B16, V13.B16, V13.B16
104 VBCAX V25.B16, V31.B16, V14.B16, V14.B16
105
106 VBCAX V4.B16, V9.B16, V29.B16, V7.B16
107 VBCAX V9.B16, V5.B16, V4.B16, V8.B16
108 VBCAX V5.B16, V6.B16, V9.B16, V9.B16
109 VBCAX V6.B16, V29.B16, V5.B16, V5.B16
110 VBCAX V29.B16, V4.B16, V6.B16, V6.B16
111
112 VBCAX V28.B16, V0.B16, V27.B16, V3.B16
113 VBCAX V0.B16, V1.B16, V28.B16, V4.B16
114
115 VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part): chi result for lane 0, the lane iota modifies
116
117 VBCAX V2.B16, V27.B16, V1.B16, V1.B16
118 VBCAX V27.B16, V28.B16, V2.B16, V2.B16
119
120 VEOR V26.B16, V0.B16, V0.B16 // iota: XOR this round's constant into lane 0
121
122 SUB $1, R2, R2 // one round done
123 CBNZ R2, loop // repeat until R2 reaches zero (24 iterations in total)
124
// Store the lanes back over the input state, mirroring the load sequence.
125 VST1.P [V0.D1, V1.D1], 16(R0)
126 VST1.P [V2.D1, V3.D1], 16(R0)
127 VST1.P [V4.D1, V5.D1], 16(R0)
128 VST1.P [V6.D1, V7.D1], 16(R0)
129 VST1.P [V8.D1, V9.D1], 16(R0)
130 VST1.P [V10.D1, V11.D1], 16(R0)
131 VST1.P [V12.D1, V13.D1], 16(R0)
132 VST1.P [V14.D1, V15.D1], 16(R0)
133 VST1.P [V16.D1, V17.D1], 16(R0)
134 VST1.P [V18.D1, V19.D1], 16(R0)
135 VST1.P [V20.D1, V21.D1], 16(R0)
136 VST1.P [V22.D1, V23.D1], 16(R0)
137 VST1 [V24.D1], (R0) // last lane, stored without advancing R0
138
139 RET
140
// round_consts<> holds the 24 iota round constants, one 64-bit word per
// round. keccakF1600NEON reads them sequentially from offset 0 with a
// post-incremented 8-byte load, one per loop iteration, so the entries must
// stay in round order. 24 entries x 8 bytes = 192 bytes, which is the size
// declared by the GLOBL directive below.
141 DATA round_consts<>+0x00(SB)/8, $0x0000000000000001 // round 0
142 DATA round_consts<>+0x08(SB)/8, $0x0000000000008082 // round 1
143 DATA round_consts<>+0x10(SB)/8, $0x800000000000808a // round 2
144 DATA round_consts<>+0x18(SB)/8, $0x8000000080008000 // round 3
145 DATA round_consts<>+0x20(SB)/8, $0x000000000000808b // round 4
146 DATA round_consts<>+0x28(SB)/8, $0x0000000080000001 // round 5
147 DATA round_consts<>+0x30(SB)/8, $0x8000000080008081 // round 6
148 DATA round_consts<>+0x38(SB)/8, $0x8000000000008009 // round 7
149 DATA round_consts<>+0x40(SB)/8, $0x000000000000008a // round 8
150 DATA round_consts<>+0x48(SB)/8, $0x0000000000000088 // round 9
151 DATA round_consts<>+0x50(SB)/8, $0x0000000080008009 // round 10
152 DATA round_consts<>+0x58(SB)/8, $0x000000008000000a // round 11
153 DATA round_consts<>+0x60(SB)/8, $0x000000008000808b // round 12
154 DATA round_consts<>+0x68(SB)/8, $0x800000000000008b // round 13
155 DATA round_consts<>+0x70(SB)/8, $0x8000000000008089 // round 14
156 DATA round_consts<>+0x78(SB)/8, $0x8000000000008003 // round 15
157 DATA round_consts<>+0x80(SB)/8, $0x8000000000008002 // round 16
158 DATA round_consts<>+0x88(SB)/8, $0x8000000000000080 // round 17
159 DATA round_consts<>+0x90(SB)/8, $0x000000000000800a // round 18
160 DATA round_consts<>+0x98(SB)/8, $0x800000008000000a // round 19
161 DATA round_consts<>+0xA0(SB)/8, $0x8000000080008081 // round 20
162 DATA round_consts<>+0xA8(SB)/8, $0x8000000000008080 // round 21
163 DATA round_consts<>+0xB0(SB)/8, $0x0000000080000001 // round 22
164 DATA round_consts<>+0xB8(SB)/8, $0x8000000080008008 // round 23
// The <> suffix makes the symbol local to this file; it is declared as
// pointer-free (NOPTR) read-only data (RODATA), 192 bytes long.
165 GLOBL round_consts<>(SB), NOPTR|RODATA, $192
166
View as plain text