// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Portions based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The implementations for gcmHash and gcmInit are based on the generated asm
// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// from commit d47afb3c.

// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.

// Portions that use the stitched AES-GCM approach in counterCryptASM
// are based on code found in
// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s

#include "textflag.h"

#define XIP    R3
#define HTBL   R4
#define INP    R5
#define LEN    R6

#define XL     V0
#define XM     V1
#define XH     V2
#define IN     V3
#define ZERO   V4
#define T0     V5
#define T1     V6
#define T2     V7
#define XC2    V8
#define H      V9
#define HH     V10
#define HL     V11
#define LEMASK V12
#define XL1    V13
#define XM1    V14
#define XH1    V15
#define IN1    V16
#define H2     V17
#define H2H    V18
#define H2L    V19
#define XL3    V20
#define XM2    V21
#define IN2    V22
#define H3L    V23
#define H3     V24
#define H3H    V25
#define XH3    V26
#define XM3    V27
#define IN3    V28
#define H4L    V29
#define H4     V30
#define H4H    V31

#define IN0    IN
#define H21L   HL
#define H21H   HH
#define LOPERM H2L
#define HIPERM H2H

#define VXL    VS32
#define VIN    VS35
#define VXC2   VS40
#define VH     VS41
#define VHH    VS42
#define VHL    VS43
#define VIN1   VS48
#define VH2    VS49
#define VH2H   VS50
#define VH2L   VS51

#define VIN2   VS54
#define VH3L   VS55
#define VH3    VS56
#define VH3H   VS57
#define VIN3   VS60
#define VH4L   VS61
#define VH4    VS62
#define VH4H   VS63

#define VIN0   VIN

#define ESPERM V10
#define TMP2   V11

DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80

// The following macros provide load/store implementations
// appropriate for the target endianness and ISA level
// (POWER8 vs. POWER9). On little endian POWER8, LXVB16X and
// STXVB16X are not available, so the load or store is done
// with LXVD2X/STXVD2X and a VPERM through ESPERM to swap the
// bytes.
#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X (RA)(RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
# else
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT; \
	VPERM VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM VS, VS, ESPERM, TMP2; \
	STXVD2X TMP2, (RA+RB)

# endif
#else
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT

#define P8_STXVB16X(VS,RA,RB) \
	STXVD2X VS, (RA+RB)

#endif

#define MASK_PTR R8

#define MASKV V0
#define INV   V1

// The following macros are used for
// the stitched implementation within
// counterCryptASM.

// Load the initial GCM counter value
// in V30 and set up the counter increment
// in V31.
#define SETUP_COUNTER \
	P8_LXVB16X(COUNTER, R0, V30); \
	VSPLTISB $1, V28; \
	VXOR V31, V31, V31; \
	VSLDOI $1, V31, V28, V31
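
// V31 ends up holding {0, 0, 0, 1} as four 32-bit words, so the
// VADDUWM in the GEN_VCIPHER_* macros below increments only the
// rightmost word of the counter block, wrapping modulo 2^32. This
// matches GCM's inc32 operation. A minimal plain-Go sketch of the
// same update, assuming the counter is kept as a big-endian 16-byte
// block (incCounter32 is a hypothetical helper, not part of this
// file):
//
//	import "encoding/binary"
//
//	func incCounter32(ctr *[16]byte) {
//		n := binary.BigEndian.Uint32(ctr[12:])
//		binary.BigEndian.PutUint32(ctr[12:], n+1) // wraps mod 2^32
//	}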

// These macros set up the initial values
// for a single encryption, or for 4 or 8
// stitched encryptions implemented with
// interleaved vciphers.
//
// The input value for each encryption
// is generated by XORing the counter
// from V30 with the first key in VS0
// and incrementing the counter.
//
// Single encryption in V15
#define GEN_VCIPHER_INPUT \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30

// 4 encryptions in V15 - V18
#define GEN_VCIPHER_4_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30

// 8 encryptions in V15 - V22
#define GEN_VCIPHER_8_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V19; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V20; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V21; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V22; \
	VADDUWM V30, V31, V30

// Load the keys to be used for
// encryption based on key_len.
// Keys are loaded into VS0 - VS14,
// depending on key_len.
// Valid key sizes are verified
// here. CR2 is set and used
// throughout to check key_len.
#define LOAD_KEYS(blk_key, key_len) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	LXVD2X (blk_key)(R0), VS0; \
	LXVD2X (blk_key)(R16), VS1; \
	LXVD2X (blk_key)(R17), VS2; \
	LXVD2X (blk_key)(R18), VS3; \
	LXVD2X (blk_key)(R19), VS4; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS5; \
	LXVD2X (blk_key)(R17), VS6; \
	LXVD2X (blk_key)(R18), VS7; \
	LXVD2X (blk_key)(R19), VS8; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS9; \
	LXVD2X (blk_key)(R17), VS10; \
	CMP key_len, $12, CR2; \
	CMP key_len, $10; \
	BEQ keysLoaded; \
	LXVD2X (blk_key)(R18), VS11; \
	LXVD2X (blk_key)(R19), VS12; \
	BEQ CR2, keysLoaded; \
	ADD $64, R16; \
	ADD $64, R17; \
	LXVD2X (blk_key)(R16), VS13; \
	LXVD2X (blk_key)(R17), VS14; \
	CMP key_len, $14; \
	BEQ keysLoaded; \
	MOVD R0, 0(R0); \
keysLoaded:
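
// key_len here is the AES round count: 10, 12, or 14 for 128-, 192-,
// and 256-bit keys, giving key_len+1 expanded round keys of 16 bytes
// each (VS0 - VS10/VS12/VS14). The trailing MOVD R0, 0(R0) stores to
// address 0 and deliberately faults if key_len is anything else.
// A small plain-Go sketch of the same relationship (roundKeyCount is
// a hypothetical helper, not part of this file):
//
//	func roundKeyCount(rounds int) int {
//		switch rounds {
//		case 10, 12, 14: // AES-128, AES-192, AES-256
//			return rounds + 1
//		}
//		panic("invalid AES round count")
//	}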

// Encrypt 1 (vin) with first 9
// keys from VS1 - VS9.
#define VCIPHER_1X9_KEYS(vin) \
	XXLOR VS1, VS1, V23; \
	XXLOR VS2, VS2, V24; \
	XXLOR VS3, VS3, V25; \
	XXLOR VS4, VS4, V26; \
	XXLOR VS5, VS5, V27; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin; \
	VCIPHER vin, V27, vin; \
	XXLOR VS6, VS6, V23; \
	XXLOR VS7, VS7, V24; \
	XXLOR VS8, VS8, V25; \
	XXLOR VS9, VS9, V26; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin
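
// Note that the initial AddRoundKey step is already folded into the
// GEN_VCIPHER_* macros (the counter value is XORed with the first
// round key in VS0), so a complete encryption is VCIPHER with round
// keys VS1 through VS(key_len-1) followed by VCIPHERLAST with the
// final round key (VS10, VS12, or VS14).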

// Encrypt 1 value (vin) with
// 2 specified keys
#define VCIPHER_1X2_KEYS(vin, key1, key2) \
	XXLOR key1, key1, V25; \
	XXLOR key2, key2, V26; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt 4 values in V15 - V18
// with the specified key from
// VS1 - VS9.
#define VCIPHER_4X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18

// Encrypt 8 values in V15 - V22
// with the specified key,
// assuming it is a VSreg
#define VCIPHER_8X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18; \
	VCIPHER V19, V23, V19; \
	VCIPHER V20, V23, V20; \
	VCIPHER V21, V23, V21; \
	VCIPHER V22, V23, V22

// Load input block into V1-V4
// in big endian order and
// update blk_inp by 64.
#define LOAD_INPUT_BLOCK64(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	ADD $64, blk_inp

// Load input block into V1-V8
// in big endian order and
// update blk_inp by 128.
#define LOAD_INPUT_BLOCK128(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	MOVD $80, R20; \
	MOVD $96, R21; \
	MOVD $112, R22; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	P8_LXVB16X(blk_inp,R19,V5); \
	P8_LXVB16X(blk_inp,R20,V6); \
	P8_LXVB16X(blk_inp,R21,V7); \
	P8_LXVB16X(blk_inp,R22,V8); \
	ADD $128, blk_inp

// Finish encryption on 8 streams and
// XOR with input block
#define VCIPHERLAST8_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	VCIPHERLAST V19, V23, V19; \
	VCIPHERLAST V20, V23, V20; \
	VCIPHERLAST V21, V23, V21; \
	VCIPHERLAST V22, V23, V22; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4; \
	XXLXOR V5, V19, V5; \
	XXLXOR V6, V20, V6; \
	XXLXOR V7, V21, V7; \
	XXLXOR V8, V22, V8

// Finish encryption on 4 streams and
// XOR with input block
#define VCIPHERLAST4_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4

// Store output block from V1-V8
// in big endian order and
// update blk_out by 128.
#define STORE_OUTPUT_BLOCK128(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	P8_STXVB16X(V5,blk_out,R19); \
	P8_STXVB16X(V6,blk_out,R20); \
	P8_STXVB16X(V7,blk_out,R21); \
	P8_STXVB16X(V8,blk_out,R22); \
	ADD $128, blk_out

// Store output block from V1-V4
// in big endian order and
// update blk_out by 64.
#define STORE_OUTPUT_BLOCK64(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	ADD $64, blk_out

// func gcmInit(productTable *[256]byte, h []byte)
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	VSPLTISB $-16, XC2         // 0xf0
	VSPLTISB $1, T0            // one
	VADDUBM XC2, XC2, XC2      // 0xe0
	VXOR ZERO, ZERO, ZERO
	VOR XC2, T0, XC2           // 0xe1
	VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI $1, ZERO, T0, T1    // ...1
	VADDUBM XC2, XC2, XC2      // 0xc2...
	VSPLTISB $7, T2
	VOR XC2, T1, XC2           // 0xc2....01
	VSPLTB $0, H, T1           // most significant byte
	VSL H, T0, H               // H<<=1
	VSRAB T1, T2, T1           // broadcast carry bit
	VAND T1, XC2, T1
	VXOR H, T1, IN             // twisted H

	VSLDOI $8, IN, IN, H       // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2  // 0xc2.0
	VSLDOI $8, ZERO, H, HL     // ... and split
	VSLDOI $8, H, ZERO, HH

	STXVD2X VXC2, (XIP+R0)     // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0x60, R10

	VPMSUMD IN, HL, XL         // H.lo·H.lo
	VPMSUMD IN, H, XM          // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH         // H.hi·H.hi

	VPMSUMD XL, XC2, T2        // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1      // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, IN1

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VH2L, (XIP+R8)     // save H^2
	MOVD $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD $0x90, R10

	VPMSUMD IN, H2L, XL        // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1      // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM         // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1       // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH        // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1      // H^2.hi·H^2.hi

	VPMSUMD XL, XC2, T2        // 1st reduction phase
	VPMSUMD XL1, XC2, HH       // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR XL, T0, XL
	VXOR XH, T1, XH
	VXOR XL1, HL, XL1
	VXOR XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR XL, T2, XL
	VXOR XL1, HH, XL1

	VSLDOI $8, XL, XL, T1      // 2nd reduction phase
	VSLDOI $8, XL1, XL1, H     // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR T1, XH, T1
	VXOR H, XH1, H
	VXOR XL, T1, XL
	VXOR XL1, H, XL1

	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VHL, (XIP+R8)      // save H^3
	MOVD $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0xc0, R10
	STXVD2X VH2L, (XIP+R8)     // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET
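
// The table written above holds XC2 (the reduction constant) and the
// hash key powers H, H^2, H^3, and H^4, each pre-split into low,
// middle, and high parts so the multiplications in gcmHash can be done
// with VPMSUMD (carry-less multiply) and a two-phase reduction. For
// reference, a minimal plain-Go sketch of the underlying GF(2^128)
// arithmetic, using the bit-by-bit method from the GCM specification
// rather than the carry-less-multiply approach used here (mulGF128 and
// ghash are hypothetical helpers, not part of this file):
//
//	// mulGF128 returns x*y in GF(2^128) using the GCM bit ordering
//	// (bit 0 is the most significant bit of byte 0) and the reduction
//	// polynomial x^128 + x^7 + x^2 + x + 1.
//	func mulGF128(x, y [16]byte) [16]byte {
//		var z [16]byte
//		v := y
//		for i := 0; i < 128; i++ {
//			if x[i/8]>>(7-i%8)&1 == 1 {
//				for j := range z {
//					z[j] ^= v[j]
//				}
//			}
//			lsb := v[15] & 1
//			for j := 15; j > 0; j-- {
//				v[j] = v[j]>>1 | v[j-1]<<7
//			}
//			v[0] >>= 1
//			if lsb == 1 {
//				v[0] ^= 0xe1 // reduce
//			}
//		}
//		return z
//	}
//
//	// ghash folds full 16-byte blocks into the digest y; this is what
//	// gcmHash below computes, four blocks at a time using H^4..H.
//	func ghash(y *[16]byte, h [16]byte, blocks []byte) {
//		for ; len(blocks) >= 16; blocks = blocks[16:] {
//			for i := 0; i < 16; i++ {
//				y[i] ^= blocks[i]
//			}
//			*y = mulGF128(*y, h)
//		}
//	}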

// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	MOVD $0x40, R8
	LXVD2X (HTBL)(R9), VH
	MOVD $0x50, R9
	LXVD2X (HTBL)(R10), VHH
	MOVD $0x60, R10
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM XL, XL, LEMASK, XL
#endif
	VXOR ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE gcm_ghash_p8_4x

	LXVD2X (INP)(R0), VIN
	ADD $16, INP, INP
	SUBCCC $16, LEN, LEN
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR IN, XL, IN
	BEQ short

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD LEN, INP, R9        // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	LXVD2X (INP)(R0), VIN1
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
#endif

	SUBC $32, LEN, LEN
	VPMSUMD IN, H2L, XL  // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
	SUBE R11, R11, R11   // borrow?-1:0
	VPMSUMD IN, H2, XM   // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1  // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND LEN, R11, R11
	VPMSUMD IN, H2H, XH  // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
	ADD R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2  // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH1, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL
	LXVD2X (INP)(R8), VIN
	ADD $32, INP, INP

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR T1, XH, T1
	VXOR IN, T1, IN
	VXOR IN, XL, IN
	CMP R9, INP
	BGT loop_2x // done yet?

	CMPWU LEN, $0
	BNE even

short:
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1

even:
	VXOR XL, T1, XL
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0)

	OR R12, R12, R12 // write out Xi
	RET

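// The 4x path below folds four blocks per iteration using the
// precomputed powers of H, based on the identity
//	Yi+4 = ((((Yi^Xi)·H ^ Xi+1)·H ^ Xi+2)·H ^ Xi+3)·H
//	     = (Yi^Xi)·H^4 ^ Xi+1·H^3 ^ Xi+2·H^2 ^ Xi+3·H
// so the four multiplications are independent and can be summed
// before a single reduction.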
gcm_ghash_p8_4x:
	LVSL (R8)(R0), T0       // 0x0001..0e0f
	MOVD $0x70, R8
	LXVD2X (HTBL)(R9), VH2
	MOVD $0x80, R9
	VSPLTISB $8, T1         // 0x0808..0808
	MOVD $0x90, R10
	LXVD2X (HTBL)(R8), VH3L // load H^3
	MOVD $0xa0, R8
	LXVD2X (HTBL)(R9), VH3
	MOVD $0xb0, R9
	LXVD2X (HTBL)(R10), VH3H
	MOVD $0xc0, R10
	LXVD2X (HTBL)(R8), VH4L // load H^4
	MOVD $0x10, R8
	LXVD2X (HTBL)(R9), VH4
	MOVD $0x20, R9
	LXVD2X (HTBL)(R10), VH4H
	MOVD $0x30, R10

	VSLDOI $8, ZERO, T1, T2    // 0x0000..0808
	VADDUBM T0, T2, HIPERM     // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // this lets us use the sign bit as a carry

	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
#endif

	VXOR IN0, XL, XH

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VPERM H2, H, HIPERM, H21L
	VPERM IN2, IN3, LOPERM, T0
	VPERM H2, H, LOPERM, H21H
	VPERM IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2  // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x

loop_4x:
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM
	VXOR XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2   // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2  // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo +H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH
	VXOR XM2, XM1, XM2
	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE loop_4x

tail_4x:
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH3, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ done_4x

	LXVD2X (INP)(R0), VIN0
	CMPU LEN, $2
	MOVD $-4, LEN
	BLT one
	LXVD2X (INP)(R8), VIN1
	BEQ two

three:
	LXVD2X (INP)(R9), VIN2
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
#endif

	VXOR IN0, XL, XH
	VOR H3L, H3L, H4L
	VOR H3, H3, H4
	VOR H3H, H3H, H4H

	VPERM IN1, IN2, LOPERM, T0
	VPERM IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2  // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3   // H.hi·Xi+2.lo +H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP tail_4x

two:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
#endif

	VXOR IN, XL, XH
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	VSLDOI $8, ZERO, H2, H4L
	VOR H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+2.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VSLDOI $8, ZERO, H, H4L
	VOR H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

#define BLK_INP R3
#define BLK_OUT R4
#define BLK_KEY R5
#define KEY_LEN R6
#define BLK_IDX R7
#define IDX     R8
#define IN_LEN  R9
#define COUNTER R10
#define CONPTR  R14
#define MASK    V5

// Implementation of the counterCrypt function in assembler.
// The original loop is unrolled so that multiple encryption
// streams can be processed in parallel, which is achieved by
// interleaving the vcipher instructions from each stream. This
// is also referred to as stitching, and it provides significant
// performance improvements.
// Some macros are defined which enable execution for big or
// little endian as well as different ISA targets.
//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key [gcmBlockSize]uint32)
//func counterCryptASM(xr, out, in, counter, key)
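//
// For reference, a minimal plain-Go sketch of the CTR walk this
// function performs, assuming encryptBlock applies the full expanded
// key to one block (ctrCrypt and encryptBlock are hypothetical names,
// not part of this file). The tail handling for a final partial block
// mirrors the final_block path below.
//
//	import "encoding/binary"
//
//	func ctrCrypt(out, in []byte, ctr *[16]byte, encryptBlock func([16]byte) [16]byte) {
//		for len(in) > 0 {
//			ks := encryptBlock(*ctr)
//			n := binary.BigEndian.Uint32(ctr[12:]) // inc32, as done by VADDUWM above
//			binary.BigEndian.PutUint32(ctr[12:], n+1)
//			m := len(in)
//			if m > 16 {
//				m = 16
//			}
//			for i := 0; i < m; i++ {
//				out[i] = in[i] ^ ks[i]
//			}
//			in, out = in[m:], out[m:]
//		}
//	}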
TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
	MOVD xr(FP), KEY_LEN
	MOVD out+8(FP), BLK_OUT
	MOVD out_len+16(FP), R8
	MOVD in+32(FP), BLK_INP
	MOVD in_len+40(FP), IN_LEN
	MOVD counter+56(FP), COUNTER
	MOVD key+64(FP), BLK_KEY

// Set up permute string when needed.
#ifdef NEEDS_ESPERM
	MOVD $·rcon(SB), R14
	LVX (R14), ESPERM           // Permute value for P8_ macros.
#endif
	SETUP_COUNTER               // V30 Counter V31 BE {0, 0, 0, 1}
	LOAD_KEYS(BLK_KEY, KEY_LEN) // VS1 - VS10/12/14 based on keysize
	CMP IN_LEN, $128
	BLT block64
block128_loop:
	// Do 8 encryptions in parallel by setting
	// input values in V15-V22 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_8_INPUTS
	VCIPHER_8X1_KEY(VS1)
	VCIPHER_8X1_KEY(VS2)
	VCIPHER_8X1_KEY(VS3)
	VCIPHER_8X1_KEY(VS4)
	VCIPHER_8X1_KEY(VS5)
	VCIPHER_8X1_KEY(VS6)
	VCIPHER_8X1_KEY(VS7)
	VCIPHER_8X1_KEY(VS8)
	VCIPHER_8X1_KEY(VS9)
	// Additional encryptions are done based on
	// the key length, with the last key moved
	// to V23 for use with VCIPHERLAST.
	// CR2 = CMP key_len, $12
	XXLOR VS10, VS10, V23
	BLT CR2, block128_last // key_len = 10
	VCIPHER_8X1_KEY(VS10)
	VCIPHER_8X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block128_last // key_len = 12
	VCIPHER_8X1_KEY(VS12)
	VCIPHER_8X1_KEY(VS13)
	XXLOR VS14, VS14, V23  // key_len = 14
block128_last:
	// vcipher encryptions are in V15-V22 at this
	// point with vcipherlast remaining to be done.
	// Load input block into V1-V8, setting index offsets
	// in R16-R22 to use with the STORE.
	LOAD_INPUT_BLOCK128(BLK_INP)
	// Do VCIPHERLAST on the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST8_XOR_INPUT
	// Store the results (8*16) and update BLK_OUT by 128.
	STORE_OUTPUT_BLOCK128(BLK_OUT)
	ADD $-128, IN_LEN // input size
	CMP IN_LEN, $128  // check if >= blocksize
	BGE block128_loop // next input block
	CMP IN_LEN, $0
	BEQ done
block64:
	CMP IN_LEN, $64 // Check if >= 64
	BLT block16_loop
	// Do 4 encryptions in parallel by setting
	// input values in V15-V18 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_4_INPUTS
	VCIPHER_4X1_KEY(VS1)
	VCIPHER_4X1_KEY(VS2)
	VCIPHER_4X1_KEY(VS3)
	VCIPHER_4X1_KEY(VS4)
	VCIPHER_4X1_KEY(VS5)
	VCIPHER_4X1_KEY(VS6)
	VCIPHER_4X1_KEY(VS7)
	VCIPHER_4X1_KEY(VS8)
	VCIPHER_4X1_KEY(VS9)
	// Check key length based on CR2.
	// Move last key to V23 for use with later vcipherlast.
	XXLOR VS10, VS10, V23
	BLT CR2, block64_last // size = 10
	VCIPHER_4X1_KEY(VS10) // Encrypt with the next 2 keys
	VCIPHER_4X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block64_last // size = 12
	VCIPHER_4X1_KEY(VS12) // Encrypt with the last 2 keys
	VCIPHER_4X1_KEY(VS13)
	XXLOR VS14, VS14, V23 // size = 14
block64_last:
	LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input
	// Do VCIPHERLAST on the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST4_XOR_INPUT
	// Store the results (4*16) and update BLK_OUT by 64.
	STORE_OUTPUT_BLOCK64(BLK_OUT)
	ADD $-64, IN_LEN // decrement input block length
	CMP IN_LEN, $0   // check for remaining length
	BEQ done
block16_loop:
	CMP IN_LEN, $16 // More input?
	BLT final_block // If not, then handle partial block
	// Single encryption, no stitching
	GEN_VCIPHER_INPUT                 // Generate input value for single encryption
	VCIPHER_1X9_KEYS(V15)             // Encrypt V15 value with 9 keys
	XXLOR VS10, VS10, V23             // Last key -> V23 for later vcipherlast
	// Key length based on CR2. (LT=10, EQ=12, GT=14)
	BLT CR2, block16_last             // Finish for key size 10
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
	XXLOR VS12, VS12, V23             // Last key -> V23 for later vcipherlast
	BEQ CR2, block16_last             // Finish for key size 12
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
	XXLOR VS14, VS14, V23             // Last key -> V23 for vcipherlast with key size 14
block16_last:
	P8_LXVB16X(BLK_INP, R0, V1)  // Load input
	VCIPHERLAST V15, V23, V15    // Finish the encryption with the last key in V23
	XXLXOR V15, V1, V1           // XOR with input
	P8_STXVB16X(V1, R0, BLK_OUT) // Store final encryption value to output
	ADD $16, BLK_INP             // Increment input pointer
	ADD $16, BLK_OUT             // Increment output pointer
	ADD $-16, IN_LEN             // Decrement input length
	BR block16_loop              // Check for next
final_block:
	CMP IN_LEN, $0
	BEQ done
	GEN_VCIPHER_INPUT                 // Generate input value for partial encryption
	VCIPHER_1X9_KEYS(V15)             // Encrypt V15 with 9 keys
	XXLOR VS10, VS10, V23             // Save possible last key
	BLT CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with next 2 keys
	XXLOR VS12, VS12, V23             // Save possible last key
	BEQ CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
	XXLOR VS14, VS14, V23             // Save last key
final_block_last:
	VCIPHERLAST V15, V23, V15 // Finish encryption
#ifdef GOPPC64_power10
	// LXVLL/STXVLL take the byte count from the high-order byte
	// of the length register, hence the shift left by 56.
	SLD $56, IN_LEN, R17
	LXVLL BLK_INP, R17, V25
	VXOR V25, V15, V25
	STXVLL V25, BLK_OUT, R17
#else
	ADD $32, R1, MASK_PTR
	MOVD $0, R16
	P8_STXVB16X(V15, MASK_PTR, R0)
	CMP IN_LEN, $8
	BLT next4
	MOVD 0(MASK_PTR), R14
	MOVD 0(BLK_INP), R15
	XOR R14, R15, R14
	MOVD R14, 0(BLK_OUT)
	ADD $8, R16
	ADD $-8, IN_LEN
next4:
	CMP IN_LEN, $4
	BLT next2
	MOVWZ (BLK_INP)(R16), R15
	MOVWZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVW R14, (R16)(BLK_OUT)
	ADD $4, R16
	ADD $-4, IN_LEN
next2:
	CMP IN_LEN, $2
	BLT next1
	MOVHZ (BLK_INP)(R16), R15
	MOVHZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVH R14, (R16)(BLK_OUT)
	ADD $2, R16
	ADD $-2, IN_LEN
next1:
	CMP IN_LEN, $1
	BLT done
	MOVBZ (MASK_PTR)(R16), R14
	MOVBZ (BLK_INP)(R16), R15
	XOR R14, R15, R14
	MOVB R14, (R16)(BLK_OUT)
#endif
done:
	// Save the updated counter value
	P8_STXVB16X(V30, COUNTER, R0)
	// Clear the keys
	XXLXOR VS0, VS0, VS0
	XXLXOR VS1, VS1, VS1
	XXLXOR VS2, VS2, VS2
	XXLXOR VS3, VS3, VS3
	XXLXOR VS4, VS4, VS4
	XXLXOR VS5, VS5, VS5
	XXLXOR VS6, VS6, VS6
	XXLXOR VS7, VS7, VS7
	XXLXOR VS8, VS8, VS8
	XXLXOR VS9, VS9, VS9
	XXLXOR VS10, VS10, VS10
	XXLXOR VS11, VS11, VS11
	XXLXOR VS12, VS12, VS12
	XXLXOR VS13, VS13, VS13
	XXLXOR VS14, VS14, VS14
	RET