// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl
// Some function names were changed to be consistent with Go function
// names. For instance, the function aes_p8_set_{en,de}crypt_key becomes
// set{En,De}cryptKeyAsm. setEncryptKeyAsm was also split in two, and a
// new routine (doEncryptKeyAsm) was created. This was necessary to
// avoid overwriting arguments when setDecryptKeyAsm calls setEncryptKeyAsm.
// There were other modifications as well, but the functionality is unchanged.

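// As a reading aid, a sketch of the Go declarations assumed to back the
// routines in this file, reconstructed from the signature comments below
// (the //go:noescape annotations and the surrounding package are
// assumptions, not taken from this file):
//
//	//go:noescape
//	func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
//
//	//go:noescape
//	func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
//	//go:noescape
//	func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
//	//go:noescape
//	func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
//
//	//go:noescape
//	func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
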
#include "textflag.h"

// For expandKeyAsm
#define INP R3
#define BITS R4
#define OUTENC R5 // Pointer to next expanded encrypt key
#define PTR R6
#define CNT R7
#define ROUNDS R8
#define OUTDEC R9 // Pointer to next expanded decrypt key
#define TEMP R19
#define ZERO V0
#define IN0 V1
#define IN1 V2
#define KEY V3
#define RCON V4
#define MASK V5
#define TMP V6
#define STAGE V7
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11

// For P9 instruction emulation
#define ESPERM V21 // Endian swapping permute into BE
#define TMP2 V22 // Temporary for P8_STXVB16X

// For {en,de}cryptBlockAsm
#define BLK_INP R3
#define BLK_OUT R4
#define BLK_KEY R5
#define BLK_ROUNDS R6
#define BLK_IDX R7

DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80
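
// The 0x01... and 0x1b... rows above seed the AES round-constant
// schedule: the key loops below double RCON once per iteration with
// VADDUWM (for these values, adding a word to itself is a left shift by
// one), producing 0x01,0x02,...,0x80 in the top byte of each word, and
// then reload the 0x1b row, 0x1b being the successor of 0x80 under
// GF(2^8) doubling. A minimal Go model of that sequence (an
// illustrative sketch, not code used by this file):
//
//	package main
//
//	import "fmt"
//
//	// xtime doubles b in GF(2^8) modulo the AES polynomial x^8+x^4+x^3+x+1.
//	func xtime(b byte) byte {
//		if b&0x80 != 0 {
//			return b<<1 ^ 0x1b
//		}
//		return b << 1
//	}
//
//	func main() {
//		rcon := byte(0x01)
//		for i := 1; i <= 10; i++ {
//			fmt.Printf("rcon[%d] = %#02x\n", i, rcon)
//			rcon = xtime(rcon) // 0x80 doubles to 0x1b, matching the table
//		}
//	}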

#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT) XXBRD VA, VT
#define SETUP_ESPERM(rtmp)
# else
// On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned
// doublewords and byte-swapping each doubleword to emulate BE load/stores.
// (A commented Go model of this emulation follows these macros.)
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT \
	VPERM VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM VS, VS, ESPERM, TMP2 \
	STXVD2X TMP2, (RA+RB)

#define XXBRD_ON_LE(VA,VT) \
	VPERM VA, VA, ESPERM, VT

// Setup byte-swapping permute value in ESPERM for POWER9 instruction
// emulation macros.
#define SETUP_ESPERM(rtmp) \
	MOVD $·rcon(SB), rtmp \
	LVX (rtmp), ESPERM
# endif // defined(GOPPC64_power9)
#else
#define P8_LXVB16X(RA,RB,VT) LXVD2X (RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#define SETUP_ESPERM(rtmp)
#endif // defined(GOARCH_ppc64le)
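
// A minimal Go model of the POWER8/LE emulation above (an illustrative
// sketch of the lane semantics only): LXVD2X on a little-endian host
// loads each 8-byte doubleword as an LE integer, so its bytes appear
// reversed in the (big-endian numbered) vector lanes; the ESPERM permute
// (the first row of ·rcon) reverses the bytes of each doubleword again,
// restoring memory order, which is what POWER9's LXVB16X gives directly.
//
//	package main
//
//	import "fmt"
//
//	// p8LXVB16X models P8_LXVB16X: a 16-byte big-endian element load
//	// built from an LE doubleword load plus a per-doubleword byte swap.
//	func p8LXVB16X(mem []byte) [16]byte {
//		var v [16]byte
//		// LXVD2X on LE: each doubleword's bytes are reversed in lanes.
//		for i := 0; i < 8; i++ {
//			v[i] = mem[7-i]
//			v[8+i] = mem[15-i]
//		}
//		// VPERM with ESPERM: reverse the bytes of each doubleword,
//		// yielding lane i == mem[i], the LXVB16X result.
//		var out [16]byte
//		for i := 0; i < 8; i++ {
//			out[i] = v[7-i]
//			out[8+i] = v[15-i]
//		}
//		return out
//	}
//
//	func main() {
//		src := []byte("0123456789abcdef")
//		out := p8LXVB16X(src)
//		fmt.Printf("%s\n", out[:]) // prints 0123456789abcdef: lanes match memory order
//	}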

// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
	// Load the arguments into registers.
	MOVD nr+0(FP), ROUNDS
	MOVD key+8(FP), INP
	MOVD enc+16(FP), OUTENC
	MOVD dec+24(FP), OUTDEC

#ifdef NEEDS_ESPERM
	MOVD $·rcon(SB), PTR // PTR points to rcon addr
	LVX (PTR), ESPERM
	ADD $0x10, PTR
#else
	MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	// Get key from memory and write aligned into VR
	P8_LXVB16X(INP, R0, IN0)
	ADD $0x10, INP, INP
	MOVD $0x20, TEMP

	CMPW ROUNDS, $12
	LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
	LVX (PTR)(TEMP), MASK
	ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
	MOVD $8, CNT // li 7,8 CNT = 8
	VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
	MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)

	// The expanded decrypt key is the expanded encrypt key stored in reverse order.
	// Move OUTDEC to the last key location, and store in descending order.
	ADD $160, OUTDEC, OUTDEC
	BLT loop128
	ADD $32, OUTDEC, OUTDEC
	BEQ l192
	ADD $32, OUTDEC, OUTDEC
	JMP l256

loop128:
	// Key schedule (Round 1 to 8)
	VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
	VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC

	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
	VXOR IN0, KEY, IN0 // vxor 1,1,3
	BDNZ loop128

	LVX (PTR)(R0), RCON // lvx 4,0,6 Last two round keys

	// Key schedule (Round 9)
	VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
	VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC

	// Key schedule (Round 10)
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
	VXOR IN0, KEY, IN0 // vxor 1,1,3

	VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
	VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC

	// Key schedule (Round 11)
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VXOR IN0, KEY, IN0 // vxor 1,1,3
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)

	RET

l192:
	LXSDX (INP+R0), IN1 // Load next 8 bytes into upper half of VSR.
	XXBRD_ON_LE(IN1, IN1) // and convert to BE ordering on LE hosts.
	MOVD $4, CNT // li 7,4
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC
	VSPLTISB $8, KEY // vspltisb 3,8
	MOVD CNT, CTR // mtctr 7
	VSUBUBM MASK, KEY, MASK // vsububm 5,5,3

loop192:
	VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4

	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6

	VSLDOI $8, ZERO, IN1, STAGE // vsldoi 7,0,2,8
	VSPLTW $3, IN0, TMP // vspltw 6,1,3
	VXOR TMP, IN1, TMP // vxor 6,6,2
	VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12
	VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
	VXOR IN1, TMP, IN1 // vxor 2,2,6
	VXOR IN0, KEY, IN0 // vxor 1,1,3
	VXOR IN1, KEY, IN1 // vxor 2,2,3
	VSLDOI $8, STAGE, IN0, STAGE // vsldoi 7,7,1,8

	VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	STXVD2X STAGE, (R0+OUTENC)
	STXVD2X STAGE, (R0+OUTDEC)
	VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC

	VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	STXVD2X STAGE, (R0+OUTENC)
	STXVD2X STAGE, (R0+OUTDEC)
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC

	VSPLTW $3, IN0, TMP // vspltw 6,1,3
	VXOR TMP, IN1, TMP // vxor 6,6,2
	VSLDOI $12, ZERO, IN1, IN1 // vsldoi 2,0,2,12
	VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
	VXOR IN1, TMP, IN1 // vxor 2,2,6
	VXOR IN0, KEY, IN0 // vxor 1,1,3
	VXOR IN1, KEY, IN1 // vxor 2,2,3
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC
	BDNZ loop192

	RET

l256:
	P8_LXVB16X(INP, R0, IN1)
	MOVD $7, CNT // li 7,7
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC
	MOVD CNT, CTR // mtctr 7

loop256:
	VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	STXVD2X IN1, (R0+OUTENC)
	STXVD2X IN1, (R0+OUTDEC)
	VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC

	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN0, TMP, IN0 // vxor 1,1,6
	VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
	VXOR IN0, KEY, IN0 // vxor 1,1,3
	STXVD2X IN0, (R0+OUTENC)
	STXVD2X IN0, (R0+OUTDEC)
	ADD $16, OUTENC, OUTENC
	ADD $-16, OUTDEC, OUTDEC
	BDZ done

	VSPLTW $3, IN0, KEY // vspltw 3,1,3
	VSLDOI $12, ZERO, IN1, TMP // vsldoi 6,0,2,12
	VSBOX KEY, KEY // vsbox 3,3

	VXOR IN1, TMP, IN1 // vxor 2,2,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN1, TMP, IN1 // vxor 2,2,6
	VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR IN1, TMP, IN1 // vxor 2,2,6

	VXOR IN1, KEY, IN1 // vxor 2,2,3
	JMP loop256 // b .Loop256

done:
	RET
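
// The loops above write each 16-byte round key twice: forward through
// OUTENC and backward through OUTDEC, so the decrypt schedule is the
// encrypt schedule with whole round keys in reverse order. A minimal Go
// model of that relationship (an illustrative sketch; enc and dec stand
// for the word-slice views of the two expanded-key buffers):
//
//	// reverseKeySchedule derives dec from enc by mirroring whole
//	// 4-word round keys: dec round i is enc round nr-i.
//	func reverseKeySchedule(enc []uint32, nr int) []uint32 {
//		dec := make([]uint32, len(enc))
//		for i := 0; i <= nr; i++ {
//			copy(dec[4*i:4*i+4], enc[4*(nr-i):4*(nr-i)+4])
//		}
//		return dec
//	}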

// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD nr+0(FP), R6 // Round count/Key size
	MOVD xk+8(FP), R5 // Key pointer
	MOVD dst+16(FP), R3 // Dest pointer
	MOVD src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU R6, $10, CR1
	CMPU R6, $12, CR2
	CMPU R6, $14, CR3

	MOVD $16, R6
	MOVD $32, R7
	MOVD $48, R8
	MOVD $64, R9
	MOVD $80, R10
	MOVD $96, R11
	MOVD $112, R12

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the ciphertext.
	// Load xk[0:3] and xor with text
	LXVD2X (R0+R5), V1
	VXOR V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X (R6+R5), V1
	LXVD2X (R7+R5), V2
	VCIPHER V0, V1, V0
	VCIPHER V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X (R8+R5), V1
	LXVD2X (R9+R5), V2
	VCIPHER V0, V1, V0
	VCIPHER V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X (R10+R5), V1
	LXVD2X (R11+R5), V2
	VCIPHER V0, V1, V0
	VCIPHER V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD $112, R5

	// Load xk[28:35] and cipher
	LXVD2X (R0+R5), V1
	LXVD2X (R6+R5), V2
	VCIPHER V0, V1, V0
	VCIPHER V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X (R7+R5), V1
	LXVD2X (R8+R5), V2
	BEQ CR1, Lenc_tail // Key size 10?
	VCIPHER V0, V1, V0
	VCIPHER V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X (R9+R5), V1
	LXVD2X (R10+R5), V2
	BEQ CR2, Lenc_tail // Key size 12?
	VCIPHER V0, V1, V0
	VCIPHER V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X (R11+R5), V1
	LXVD2X (R12+R5), V2
	BNE CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Lenc_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VCIPHER V0, V1, V1
	VCIPHERLAST V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Segfault, this should never happen. Only 3 key sizes are created/used.
	MOVD R0, 0(R0)
	RET

// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD nr+0(FP), R6 // Round count/Key size
	MOVD xk+8(FP), R5 // Key pointer
	MOVD dst+16(FP), R3 // Dest pointer
	MOVD src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU R6, $10, CR1
	CMPU R6, $12, CR2
	CMPU R6, $14, CR3

	MOVD $16, R6
	MOVD $32, R7
	MOVD $48, R8
	MOVD $64, R9
	MOVD $80, R10
	MOVD $96, R11
	MOVD $112, R12

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the text.
	// Load xk[0:3] and xor with ciphertext
	LXVD2X (R0+R5), V1
	VXOR V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X (R6+R5), V1
	LXVD2X (R7+R5), V2
	VNCIPHER V0, V1, V0
	VNCIPHER V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X (R8+R5), V1
	LXVD2X (R9+R5), V2
	VNCIPHER V0, V1, V0
	VNCIPHER V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X (R10+R5), V1
	LXVD2X (R11+R5), V2
	VNCIPHER V0, V1, V0
	VNCIPHER V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD $112, R5

	// Load xk[28:35] and cipher
	LXVD2X (R0+R5), V1
	LXVD2X (R6+R5), V2
	VNCIPHER V0, V1, V0
	VNCIPHER V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X (R7+R5), V1
	LXVD2X (R8+R5), V2
	BEQ CR1, Ldec_tail // Key size 10?
	VNCIPHER V0, V1, V0
	VNCIPHER V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X (R9+R5), V1
	LXVD2X (R10+R5), V2
	BEQ CR2, Ldec_tail // Key size 12?
	VNCIPHER V0, V1, V0
	VNCIPHER V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X (R11+R5), V1
	LXVD2X (R12+R5), V2
	BNE CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Ldec_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VNCIPHER V0, V1, V1
	VNCIPHERLAST V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Segfault, this should never happen. Only 3 key sizes are created/used.
	MOVD R0, 0(R0)
	RET

// Remove defines from above so they can be redefined here.
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP

#define INP R3
#define OUTP R4
#define LEN R5
#define KEYP R6
#define ROUNDS R7
#define IVP R8
#define ENC R9

#define INOUT V2
#define TMP V3
#define IVEC V4

// Load the crypt key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// R12, R14-R21 are scratch registers.
// For a key size of 10 rounds, V6 and V11-V20 hold the expanded key.
// For a key size of 12 rounds, V6 and V9-V20 hold the expanded key.
// For a key size of 14 rounds, V6 and V7-V20 hold the expanded key.
#define LOAD_KEY(Rkeyp) \
	MOVD $16, R12 \
	MOVD $32, R14 \
	MOVD $48, R15 \
	MOVD $64, R16 \
	MOVD $80, R17 \
	MOVD $96, R18 \
	MOVD $112, R19 \
	MOVD $128, R20 \
	MOVD $144, R21 \
	LXVD2X (R0+Rkeyp), V6 \
	ADD $16, Rkeyp \
	BEQ CR1, L_start10 \
	BEQ CR2, L_start12 \
	LXVD2X (R0+Rkeyp), V7 \
	LXVD2X (R12+Rkeyp), V8 \
	ADD $32, Rkeyp \
L_start12: \
	LXVD2X (R0+Rkeyp), V9 \
	LXVD2X (R12+Rkeyp), V10 \
	ADD $32, Rkeyp \
L_start10: \
	LXVD2X (R0+Rkeyp), V11 \
	LXVD2X (R12+Rkeyp), V12 \
	LXVD2X (R14+Rkeyp), V13 \
	LXVD2X (R15+Rkeyp), V14 \
	LXVD2X (R16+Rkeyp), V15 \
	LXVD2X (R17+Rkeyp), V16 \
	LXVD2X (R18+Rkeyp), V17 \
	LXVD2X (R19+Rkeyp), V18 \
	LXVD2X (R20+Rkeyp), V19 \
	LXVD2X (R21+Rkeyp), V20

// Perform the AES cipher operation for key sizes 10/12/14 using the keys
// loaded by LOAD_KEY, and the key size information held in CR1EQ/CR2EQ.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly improved encryption
// performance V6 and IVEC can be swapped (xor is both associative and
// commutative) during encryption, turning
//
//	VXOR INOUT, IVEC, INOUT
//	VXOR INOUT, V6, INOUT
//
// into
//
//	VXOR INOUT, V6, INOUT
//	VXOR INOUT, IVEC, INOUT
//
// (A commented Go sketch of why this is legal follows this macro.)
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
	VXOR Vin, Vxor, Vout \
	BEQ CR1, label10 \
	BEQ CR2, label12 \
	vcipher Vout, V7, Vout \
	vcipher Vout, V8, Vout \
label12: \
	vcipher Vout, V9, Vout \
	vcipher Vout, V10, Vout \
label10: \
	vcipher Vout, V11, Vout \
	vcipher Vout, V12, Vout \
	vcipher Vout, V13, Vout \
	vcipher Vout, V14, Vout \
	vcipher Vout, V15, Vout \
	vcipher Vout, V16, Vout \
	vcipher Vout, V17, Vout \
	vcipher Vout, V18, Vout \
	vcipher Vout, V19, Vout \
	vciphel Vout, V20, Vout

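// Why the swap described above is legal, as a minimal Go sketch
// (illustrative only; p, iv, and k0 stand for one 64-bit lane of the
// plaintext, chaining value, and Key[0-3]):
//
//	// xorOrder shows both operand orders reach the same round-0 state,
//	// since XOR is associative and commutative.
//	func xorOrder(p, iv, k0 uint64) bool {
//		return (p^iv)^k0 == (p^k0)^iv // always true
//	}
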
#define CLEAR_KEYS() \
	VXOR V6, V6, V6 \
	VXOR V7, V7, V7 \
	VXOR V8, V8, V8 \
	VXOR V9, V9, V9 \
	VXOR V10, V10, V10 \
	VXOR V11, V11, V11 \
	VXOR V12, V12, V12 \
	VXOR V13, V13, V13 \
	VXOR V14, V14, V14 \
	VXOR V15, V15, V15 \
	VXOR V16, V16, V16 \
	VXOR V17, V17, V17 \
	VXOR V18, V18, V18 \
	VXOR V19, V19, V19 \
	VXOR V20, V20, V20

// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
	MOVD src+0(FP), INP
	MOVD dst+8(FP), OUTP
	MOVD length+16(FP), LEN
	MOVD key+24(FP), KEYP
	MOVD iv+32(FP), IVP
	MOVD enc+40(FP), ENC
	MOVD nr+48(FP), ROUNDS

	SETUP_ESPERM(R11)

	// Assume len > 0 && len % blockSize == 0.
	CMPW ENC, $0
	P8_LXVB16X(IVP, R0, IVEC)
	CMPU ROUNDS, $10, CR1
	CMPU ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.

	// Setup key in VSRs, and set loop count in CTR.
	LOAD_KEY(KEYP)
	SRD $4, LEN
	MOVD LEN, CTR

	BEQ Lcbc_dec

	PCALIGN $16
Lcbc_enc:
	P8_LXVB16X(INP, R0, INOUT)
	ADD $16, INP
	VXOR INOUT, V6, INOUT
	CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
	VOR INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD $16, OUTP
	BDNZ Lcbc_enc

	P8_STXVB16X(INOUT, IVP, R0)
	CLEAR_KEYS()
	RET

	PCALIGN $16
Lcbc_dec:
	P8_LXVB16X(INP, R0, TMP)
	ADD $16, INP
	CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
	VXOR INOUT, IVEC, INOUT
	VOR TMP, TMP, IVEC // TMP is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD $16, OUTP
	BDNZ Lcbc_dec

	P8_STXVB16X(IVEC, IVP, R0)
	CLEAR_KEYS()
	RET
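
// A minimal Go model of the two loops above (an illustrative sketch;
// encrypt and decrypt stand for the per-block cipher applied by
// CIPHER_BLOCK, not any real API here): CBC encryption chains the
// previous ciphertext into each block before enciphering, while CBC
// decryption deciphers first and then XORs the previous ciphertext in.
//
//	// cbcEncrypt: c[i] = E(p[i] ^ c[i-1]), with c[-1] = iv.
//	func cbcEncrypt(encrypt func([16]byte) [16]byte, iv [16]byte, blocks [][16]byte) {
//		prev := iv
//		for i, p := range blocks {
//			for j := range p {
//				p[j] ^= prev[j]
//			}
//			prev = encrypt(p) // prev is IVEC for the next block
//			blocks[i] = prev
//		}
//	}
//
//	// cbcDecrypt: p[i] = D(c[i]) ^ c[i-1], with c[-1] = iv.
//	func cbcDecrypt(decrypt func([16]byte) [16]byte, iv [16]byte, blocks [][16]byte) {
//		prev := iv
//		for i, c := range blocks {
//			p := decrypt(c)
//			for j := range p {
//				p[j] ^= prev[j]
//			}
//			prev = c // the saved ciphertext (TMP) is IVEC for the next block
//			blocks[i] = p
//		}
//	}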

#define DO1_CIPHER(iv0, keyv, key, op) \
	LXVD2X (key), keyv \
	ADD $16, key \
	op iv0, keyv, iv0

#define DO2_CIPHER(iv0, iv1, keyv, key, op) \
	DO1_CIPHER(iv0, keyv, key, op) \
	op iv1, keyv, iv1

#define DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	DO2_CIPHER(iv0, iv1, keyv, key, op) \
	op iv2, keyv, iv2 \
	op iv3, keyv, iv3

#define DO8_CIPHER(iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7, keyv, key, op) \
	DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	op iv4, keyv, iv4 \
	op iv5, keyv, iv5 \
	op iv6, keyv, iv6 \
	op iv7, keyv, iv7

#define XOR_STORE(src, iv, dstp, dstpoff) \
	XXLXOR src, iv, V8 \
	P8_STXVB16X(V8,dstp,dstpoff)

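// The ctrBlocks{1,2,4,8}Asm functions below all follow the same CTR-mode
// shape, modeled here in Go (an illustrative sketch; encrypt stands for
// the DO*_CIPHER round cascade): each output block is the input XORed
// with the encryption of a per-block counter value IV+i.
//
//	import "encoding/binary"
//
//	// ctrBlocks: dst[i] = src[i] ^ E(iv + i), for n 16-byte blocks.
//	func ctrBlocks(encrypt func([16]byte) [16]byte, ivlo, ivhi uint64, dst, src []byte, n int) {
//		for i := 0; i < n; i++ {
//			var ctr [16]byte
//			// The 128-bit counter is big-endian in the block,
//			// matching the XXPERMDI layout in CTRBLOCK_PROLOGUE.
//			binary.BigEndian.PutUint64(ctr[0:8], ivhi)
//			binary.BigEndian.PutUint64(ctr[8:16], ivlo)
//			ks := encrypt(ctr)
//			for j := 0; j < 16; j++ {
//				dst[16*i+j] = src[16*i+j] ^ ks[j]
//			}
//			// 128-bit increment with carry into the high doubleword.
//			ivlo++
//			if ivlo == 0 {
//				ivhi++
//			}
//		}
//	}
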
// func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks1Asm(SB), NOSPLIT|NOFRAME, $0

#define CTRBLOCK_PROLOGUE \
	MOVD nr+0(FP), R3 \
	MOVD xk+8(FP), R4 \
	MOVD dst+16(FP), R5 \
	MOVD src+24(FP), R6 \
	MOVD ivlo+32(FP), R8 \
	MOVD ivhi+40(FP), R9 \
	CMP R3, $12, CR1 \
	MTVSRD R8, V0 \
	MTVSRD R9, V1 \
	XXPERMDI V1, V0, $0, V0 \
	SETUP_ESPERM(R8)

	CTRBLOCK_PROLOGUE

	DO1_CIPHER(V0,V8,R4,VXOR)

	BEQ CR1, key_12
	BLT CR1, key_10
key_14:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_12:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	RET

// func ctrBlocks2Asm(nr int, xk *[60]uint32, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks2Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV V8, V8, V8 // V8 is -1
	VSUBUQM V0, V8, V1 // Vi = IV + i (as IV - (-1))

	DO2_CIPHER(V0,V1,V8,R4,VXOR)

	BEQ CR1, key_12
	BLT CR1, key_10
key_14:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_12:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	MOVD $16, R8
	P8_LXVB16X(R6,R8,V10)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)

	RET
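
// The XXLEQV/VSUBUQM idiom above computes IV+1 without a vector "add
// immediate": XXLEQV of a register with itself yields all-ones, which is
// -1 as a 128-bit integer, and subtracting -1 adds 1. A minimal Go model
// using the math/bits carry helpers (an illustrative sketch):
//
//	package main
//
//	import (
//		"fmt"
//		"math/bits"
//	)
//
//	// add1 models VSUBUQM V0, V8, V1 with V8 = -1: a full 128-bit
//	// increment, with the carry propagating from low to high doubleword.
//	func add1(lo, hi uint64) (uint64, uint64) {
//		newLo, carry := bits.Add64(lo, 1, 0)
//		newHi, _ := bits.Add64(hi, 0, carry)
//		return newLo, newHi
//	}
//
//	func main() {
//		lo, hi := add1(^uint64(0), 0) // wraps the low doubleword
//		fmt.Printf("%#x %#x\n", lo, hi) // prints 0x0 0x1
//	}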

// func ctrBlocks4Asm(nr int, xk *[60]uint32, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks4Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV V8, V8, V8 // V8 is -1
	VSUBUQM V0, V8, V1 // Vi = IV + i (as IV - (-1))
	VSUBUQM V1, V8, V2
	VSUBUQM V2, V8, V3

	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VXOR)

	BEQ CR1, key_12
	BLT CR1, key_10
key_14:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_12:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD $16, R8
	P8_LXVB16X(R6,R8,V10)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD $32, R9
	P8_LXVB16X(R6,R9,V11)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD $48, R10
	P8_LXVB16X(R6,R10,V12)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)

	RET

// func ctrBlocks8Asm(nr int, xk *[60]uint32, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks8Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV V8, V8, V8 // V8 is -1
	VSUBUQM V0, V8, V1 // Vi = IV + i (as IV - (-1))
	VADDUQM V8, V8, V9 // V9 is -2

	VSUBUQM V0, V9, V2
	VSUBUQM V1, V9, V3
	VSUBUQM V2, V9, V4
	VSUBUQM V3, V9, V5
	VSUBUQM V4, V9, V6
	VSUBUQM V5, V9, V7

	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VXOR)

	BEQ CR1, key_12
	BLT CR1, key_10
key_14:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_12:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $16, R8
	P8_LXVB16X(R6,R8,V10)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $32, R9
	P8_LXVB16X(R6,R9,V11)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $48, R10
	P8_LXVB16X(R6,R10,V12)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $64, R11
	P8_LXVB16X(R6,R11,V13)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $80, R12
	P8_LXVB16X(R6,R12,V14)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $96, R14
	P8_LXVB16X(R6,R14,V15)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD $112, R15
	P8_LXVB16X(R6,R15,V16)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)
	XOR_STORE(V13,V4,R5,R11)
	XOR_STORE(V14,V5,R5,R12)
	XOR_STORE(V15,V6,R5,R14)
	XOR_STORE(V16,V7,R5,R15)

	RET