1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8
// This is a port of the s390x asm implementation
// to ppc64le.
11
12 // Some changes were needed due to differences in
13 // the Go opcodes and/or available instructions
14 // between s390x and ppc64le.
15
16 // 1. There were operand order differences in the
17 // VSUBUQM, VSUBCUQ, and VSEL instructions.
18
19 // 2. ppc64 does not have a multiply high and low
20 // like s390x, so those were implemented using
21 // macros to compute the equivalent values.
22
23 // 3. The LVX, STVX instructions on ppc64 require
24 // 16 byte alignment of the data. To avoid that
25 // requirement, data is loaded using LXVD2X and
26 // STXVD2X with VPERM to reorder bytes correctly.
27
28 // I have identified some areas where I believe
29 // changes would be needed to make this work for big
30 // endian; however additional changes beyond what I
31 // have noted are most likely needed to make it work.
32 // - The string used with VPERM to swap the byte order
33 // for loads and stores.
34 // - The constants that are loaded from CPOOL.
35 //
36
// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256

// External declarations for constants.
// RODATA (defined in textflag.h, included above) replaces the bare
// flag value 8 used previously; the meaning is identical.
// NOTE(review): no DATA directives for p256ord<> are visible in this
// chunk; confirm its contents are defined elsewhere in the file.
GLOBL p256ord<>(SB), RODATA, $32
GLOBL p256<>(SB), RODATA, $80
GLOBL p256mul<>(SB), RODATA, $160
75
76 // The following macros are used to implement the ppc64le
77 // equivalent function from the corresponding s390x
78 // instruction for vector multiply high, low, and add,
79 // since there aren't exact equivalent instructions.
80 // The corresponding s390x instructions appear in the
81 // comments.
82 // Implementation for big endian would have to be
83 // investigated, I think it would be different.
84 //
85 //
86 // Vector multiply word
87 //
88 // VMLF x0, x1, out_low
89 // VMLHF x0, x1, out_hi
90 #define VMULT(x1, x2, out_low, out_hi) \
91 VMULEUW x1, x2, TMP1; \
92 VMULOUW x1, x2, TMP2; \
93 VMRGEW TMP1, TMP2, out_hi; \
94 VMRGOW TMP1, TMP2, out_low
95
96 //
97 // Vector multiply add word
98 //
99 // VMALF x0, x1, y, out_low
100 // VMALHF x0, x1, y, out_hi
101 #define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
102 VMULEUW y, one, TMP2; \
103 VMULOUW y, one, TMP1; \
104 VMULEUW x1, x2, out_low; \
105 VMULOUW x1, x2, out_hi; \
106 VADDUDM TMP2, out_low, TMP2; \
107 VADDUDM TMP1, out_hi, TMP1; \
108 VMRGOW TMP2, TMP1, out_low; \
109 VMRGEW TMP2, TMP1, out_hi
110
111 #define res_ptr R3
112 #define a_ptr R4
113
114 #undef res_ptr
115 #undef a_ptr
116
117 #define P1ptr R3
118 #define CPOOL R7
119
120 #define Y1L V0
121 #define Y1H V1
122 #define T1L V2
123 #define T1H V3
124
125 #define PL V30
126 #define PH V31
127
128 #define CAR1 V6
// p256NegCond conditionally negates the 32 bytes at val modulo P256:
// when cond != 0 it stores P256 - val back to val; when cond == 0 it
// returns immediately and val is untouched. Only 32 bytes (two 16-byte
// vectors) are read and written.
// NOTE(review): the signature comment says *p256Point, but only one
// 32-byte field element is touched — confirm against the Go
// declaration. Also note this branches on cond (see the BC below),
// matching the original implementation.
// func p256NegCond(val *p256Point, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16

	MOVD cond+8(FP), R6
	CMP $0, R6
	BC 12, 2, LR // just return if cond == 0

	// p256mul<> starts with P256 laid out for LXVD2X on little endian.
	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	// Swap doublewords into the internal (true little endian) order.
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	// 256-bit subtraction against P, carry chained across the halves.
	VSUBCUQ PL, Y1L, CAR1 // subtract part2 giving carry
	VSUBUQM PL, Y1L, T1L // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	// Swap back into the byte order STXVD2X expects.
	XXPERMDI T1H, T1H, $2, T1H
	XXPERMDI T1L, T1L, $2, T1L

	STXVD2X T1L, (R0+P1ptr)
	STXVD2X T1H, (R16+P1ptr)
	RET
159
160 #undef P1ptr
161 #undef CPOOL
162 #undef Y1L
163 #undef Y1H
164 #undef T1L
165 #undef T1H
166 #undef PL
167 #undef PH
168 #undef CAR1
169
170 #define P3ptr R3
171 #define P1ptr R4
172 #define P2ptr R5
173
174 #define X1L V0
175 #define X1H V1
176 #define Y1L V2
177 #define Y1H V3
178 #define Z1L V4
179 #define Z1H V5
180 #define X2L V6
181 #define X2H V7
182 #define Y2L V8
183 #define Y2H V9
184 #define Z2L V10
185 #define Z2H V11
186 #define SEL V12
187 #define ZER V13
188
// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// order of the bytes doesn't matter.
//
// p256MovCond sets res = a when cond != 0 and res = b when cond == 0.
// The choice is made with branch-free vector selects (VCMPEQUD builds
// an all-ones/all-zeros mask from cond), so the memory access pattern
// and control flow do not depend on cond. Each point is 96 bytes,
// moved as six 16-byte vectors.
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20

	// cond is R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL // splat cond's doubleword across SEL
	VSPLTISB $0, ZER

	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL // SEL = all ones iff cond == 0 (selects b)

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET
240
241 #undef P3ptr
242 #undef P1ptr
243 #undef P2ptr
244 #undef X1L
245 #undef X1H
246 #undef Y1L
247 #undef Y1H
248 #undef Z1L
249 #undef Z1H
250 #undef X2L
251 #undef X2H
252 #undef Y2L
253 #undef Y2H
254 #undef Z2L
255 #undef Z2H
256 #undef SEL
257 #undef ZER
258
259 #define P3ptr R3
260 #define P1ptr R4
261 #define COUNT R5
262
263 #define X1L V0
264 #define X1H V1
265 #define Y1L V2
266 #define Y1H V3
267 #define Z1L V4
268 #define Z1H V5
269 #define X2L V6
270 #define X2H V7
271 #define Y2L V8
272 #define Y2H V9
273 #define Z2L V10
274 #define Z2H V11
275
276 #define ONE V18
277 #define IDX V19
278 #define SEL1 V20
279 #define SEL2 V21
// p256Select copies entry number idx (1-based; idx == 0 matches no
// entry and leaves the result zero) from a table of 16 p256Points
// (96 bytes each) into res. Every entry is read and folded in with
// branch-free vector selects, so the access pattern is independent
// of idx (constant time).
// NOTE(review): the FP symbol below is res+0(FP), so the Go
// declaration's first parameter must be named res, not point.
// func p256Select(res *p256Point, table *p256Table, idx int)
TEXT ·p256Select(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	// idx lives at R1+48 (idx offset 16 + 32-byte fixed frame).
	LXVDSX (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB $7, SEL1, IDX // splat byte
	VSPLTISB $1, ONE // VREPIB $1, ONE
	VSPLTISB $1, SEL2 // VREPIB $1, SEL2; running entry counter 1..16
	MOVD $16, COUNT // len(p256Table)
	MOVD COUNT, CTR // set up ctr

	// Accumulators start at zero; only the matching entry survives.
	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L
	VSPLTISB $0, Z1H // VZERO Z1H
	VSPLTISB $0, Z1L // VZERO Z1L

loop_select:

	// LVXD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB SEL2, ONE, SEL2 OK
	ADD $96, P1ptr // advance to the next table entry
	BDNZ loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET
344
345 #undef P3ptr
346 #undef P1ptr
347 #undef COUNT
348 #undef X1L
349 #undef X1H
350 #undef Y1L
351 #undef Y1H
352 #undef Z1L
353 #undef Z1H
354 #undef X2L
355 #undef X2H
356 #undef Y2L
357 #undef Y2H
358 #undef Z2L
359 #undef Z2H
360 #undef ONE
361 #undef IDX
362 #undef SEL1
363 #undef SEL2
364
365 #define P3ptr R3
366 #define P1ptr R4
367 #define COUNT R5
368
369 #define X1L V0
370 #define X1H V1
371 #define Y1L V2
372 #define Y1H V3
373 #define Z1L V4
374 #define Z1H V5
375 #define X2L V6
376 #define X2H V7
377 #define Y2L V8
378 #define Y2H V9
379 #define Z2L V10
380 #define Z2H V11
381
382 #define ONE V18
383 #define IDX V19
384 #define SEL1 V20
385 #define SEL2 V21
386
// p256SelectAffine copies entry number idx (1-based; idx == 0 matches
// nothing and leaves the result zero) from a table of 32 affine
// points (64 bytes each: X and Y only) into res. Like p256Select,
// every entry is scanned and merged with branch-free vector selects,
// so the access pattern is independent of idx (constant time).
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	// idx lives at R1+48 (idx offset 16 + 32-byte fixed frame).
	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s; running entry counter
	MOVD $32, COUNT // len(p256AffineTable)
	MOVD COUNT, CTR // loop count

	// Accumulators start at zero; only the matching entry survives.
	VSPLTISB $0, X1H // VZERO X1H
	VSPLTISB $0, X1L // VZERO X1L
	VSPLTISB $0, Y1H // VZERO Y1H
	VSPLTISB $0, Y1L // VZERO Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD $64, P1ptr // Next chunk
	BDNZ loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET
430
431 #undef P3ptr
432 #undef P1ptr
433 #undef COUNT
434 #undef X1L
435 #undef X1H
436 #undef Y1L
437 #undef Y1H
438 #undef Z1L
439 #undef Z1H
440 #undef X2L
441 #undef X2H
442 #undef Y2L
443 #undef Y2H
444 #undef Z2L
445 #undef Z2H
446 #undef ONE
447 #undef IDX
448 #undef SEL1
449 #undef SEL2
450
451 #define res_ptr R3
452 #define x_ptr R4
453 #define CPOOL R7
454
455 #define T0 V0
456 #define T1 V1
457 #define T2 V2
458 #define TT0 V3
459 #define TT1 V4
460
461 #define ZER V6
462 #define SEL1 V7
463 #define SEL2 V8
464 #define CAR1 V9
465 #define CAR2 V10
466 #define RED1 V11
467 #define RED2 V12
468 #define PL V13
469 #define PH V14
470
// p256FromMont converts in out of the Montgomery domain: it performs
// four identical word-wise reduction rounds (one per pair of 32-bit
// digits shifted out by the VSLDOI pair) followed by a final
// conditional subtraction of P, and stores the result to res.
// NOTE(review): "res = in * R^-1 mod P256" is inferred from the name
// and the reduction-only structure (no multiply) — confirm against
// the Go declaration.
// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2 // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R18), SEL2
	LXVD2X (CPOOL+R19), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	// Shift T2||T1||T0 down by one 64-bit doubleword.
	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	// Fold the reduction terms back in, carries chained upward.
	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSUBUQM RED2, RED1, RED2 // VSQ RED1, RED2, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ T0, RED1, CAR1 // VACCQ T0, RED1, CAR1
	VADDUQM T0, RED1, T0 // VAQ T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1 // VACQ T1, RED2, CAR1, T1
	VADDUQM T2, CAR2, T2 // VAQ T2, CAR2, T2

	// ---------------------------------------------------
	// Final conditional subtraction of P: compute T - P and use the
	// resulting borrow (in T2) to select the reduced value.

	VSUBCUQ T0, PL, CAR1 // VSCBIQ PL, T0, CAR1
	VSUBUQM T0, PL, TT0 // VSQ PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1 // VSBIQ T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2 // VSBIQ T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for VPERM result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET
576
577 #undef res_ptr
578 #undef x_ptr
579 #undef CPOOL
580 #undef T0
581 #undef T1
582 #undef T2
583 #undef TT0
584 #undef TT1
585 #undef ZER
586 #undef SEL1
587 #undef SEL2
588 #undef CAR1
589 #undef CAR2
590 #undef RED1
591 #undef RED2
592 #undef PL
593 #undef PH
594
595 // ---------------------------------------
596 // p256MulInternal
597 // V0-V3 V30,V31 - Not Modified
598 // V4-V15 V27-V29 - Volatile
599
600 #define CPOOL R7
601
602 // Parameters
603 #define X0 V0 // Not modified
604 #define X1 V1 // Not modified
605 #define Y0 V2 // Not modified
606 #define Y1 V3 // Not modified
607 #define T0 V4 // Result
608 #define T1 V5 // Result
609 #define P0 V30 // Not modified
610 #define P1 V31 // Not modified
611
612 // Temporaries: lots of reused vector regs
613 #define YDIG V6 // Overloaded with CAR2
614 #define ADD1H V7 // Overloaded with ADD3H
615 #define ADD2H V8 // Overloaded with ADD4H
616 #define ADD3 V9 // Overloaded with SEL2,SEL5
617 #define ADD4 V10 // Overloaded with SEL3,SEL6
618 #define RED1 V11 // Overloaded with CAR2
619 #define RED2 V12
620 #define RED3 V13 // Overloaded with SEL1
621 #define T2 V14
622 // Overloaded temporaries
623 #define ADD1 V4 // Overloaded with T0
624 #define ADD2 V5 // Overloaded with T1
625 #define ADD3H V7 // Overloaded with ADD1H
626 #define ADD4H V8 // Overloaded with ADD2H
627 #define ZER V28 // Overloaded with TMP1
628 #define CAR1 V6 // Overloaded with YDIG
629 #define CAR2 V11 // Overloaded with RED1
630 // Constant Selects
631 #define SEL1 V13 // Overloaded with RED3
632 #define SEL2 V9 // Overloaded with ADD3,SEL5
633 #define SEL3 V10 // Overloaded with ADD4,SEL6
634 #define SEL4 V6 // Overloaded with YDIG,CAR1
635 #define SEL5 V9 // Overloaded with ADD3,SEL2
636 #define SEL6 V10 // Overloaded with ADD4,SEL3
637
638 // TMP1, TMP2 used in
639 // VMULT macros
640 #define TMP1 V13 // Overloaded with RED3
641 #define TMP2 V27
642 #define ONE V29 // 1s splatted by word
643
644 /* *
645 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
646 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
647 * With you, SIMD be...
648 *
649 * +--------+--------+
650 * +--------| RED2 | RED1 |
651 * | +--------+--------+
652 * | ---+--------+--------+
653 * | +---- T2| T1 | T0 |--+
654 * | | ---+--------+--------+ |
655 * | | |
656 * | | ======================= |
657 * | | |
658 * | | +--------+--------+<-+
659 * | +-------| ADD2 | ADD1 |--|-----+
660 * | | +--------+--------+ | |
661 * | | +--------+--------+<---+ |
662 * | | | ADD2H | ADD1H |--+ |
663 * | | +--------+--------+ | |
664 * | | +--------+--------+<-+ |
665 * | | | ADD4 | ADD3 |--|-+ |
666 * | | +--------+--------+ | | |
667 * | | +--------+--------+<---+ | |
668 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
669 * | | +--------+--------+ | | V
670 * | | ------------------------ | | +--------+
671 * | | | | | RED3 | [d0 0 0 d0]
672 * | | | | +--------+
673 * | +---->+--------+--------+ | | |
674 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
675 * | +--------+--------+ | | |
676 * +---->---+--------+--------+ | | |
677 * T2| T1 | T0 |----+ | |
678 * ---+--------+--------+ | | |
679 * ---+--------+--------+<---+ | |
680 * +--- T2| T1 | T0 |----------+
681 * | ---+--------+--------+ | |
682 * | +--------+--------+<-------------+
683 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
684 * | +--------+--------+ | | |
685 * | +--------+<----------------------+
686 * | | RED3 |--------------+ | [0 0 d1 d0]
687 * | +--------+ | |
688 * +--->+--------+--------+ | |
689 * | T1 | T0 |--------+
690 * +--------+--------+ | |
691 * --------------------------- | |
692 * | |
693 * +--------+--------+<----+ |
694 * | RED2 | RED1 | |
695 * +--------+--------+ |
696 * ---+--------+--------+<-------+
697 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
698 * ---+--------+--------+
699 *
700 * *Mi obra de arte de siglo XXI @vpaprots
701 *
702 *
703 * First group is special, doesn't get the two inputs:
704 * +--------+--------+<-+
705 * +-------| ADD2 | ADD1 |--|-----+
706 * | +--------+--------+ | |
707 * | +--------+--------+<---+ |
708 * | | ADD2H | ADD1H |--+ |
709 * | +--------+--------+ | |
710 * | +--------+--------+<-+ |
711 * | | ADD4 | ADD3 |--|-+ |
712 * | +--------+--------+ | | |
713 * | +--------+--------+<---+ | |
714 * | | ADD4H | ADD3H |------|-+ |(+vzero)
715 * | +--------+--------+ | | V
716 * | ------------------------ | | +--------+
717 * | | | | RED3 | [d0 0 0 d0]
718 * | | | +--------+
719 * +---->+--------+--------+ | | |
720 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
721 * +--------+--------+ | | |
722 * ---+--------+--------+<---+ | |
723 * +--- T2| T1 | T0 |----------+
724 * | ---+--------+--------+ | |
725 * | +--------+--------+<-------------+
726 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
727 * | +--------+--------+ | | |
728 * | +--------+<----------------------+
729 * | | RED3 |--------------+ | [0 0 d1 d0]
730 * | +--------+ | |
731 * +--->+--------+--------+ | |
732 * | T1 | T0 |--------+
733 * +--------+--------+ | |
734 * --------------------------- | |
735 * | |
736 * +--------+--------+<----+ |
737 * | RED2 | RED1 | |
738 * +--------+--------+ |
739 * ---+--------+--------+<-------+
740 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
741 * ---+--------+--------+
742 *
743 * Last 'group' needs to RED2||RED1 shifted less
744 */
// p256MulInternal computes T1:T0 = X1:X0 * Y1:Y0 with interleaved
// Montgomery reduction modulo P1:P0 (P256). Y is consumed one 32-bit
// digit at a time (eight digits in four groups of two), following the
// diagram in the comment block above.
//
// Inputs:   X0, X1, Y0, Y1, P0, P1 — not modified.
//           CPOOL must point at p256mul<> (set up by the caller).
// Outputs:  T0, T1.
// Clobbers: V4-V15, V27-V29 (see the register map above) and R16-R22.
//
// Fix vs. previous revision: the third digit group issued
// LXVD2X (R17)(CPOOL), SEL1 twice back to back with nothing modifying
// SEL1 in between; the redundant load has been removed so the group
// matches the second and fourth groups.
TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22

	// ---------------------------------------------------
	// First group (digits 3 and 2 of Y0); no accumulated input yet.

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	// VMLHF X0, YDIG, ADD1H
	// VMLHF X1, YDIG, ADD2H
	// VMLF X0, YDIG, ADD1
	// VMLF X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW $2, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER // VZERO ZER
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, ZER, ADD2, T1 // ADD2 Free // VSLDB

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // ADD3 Free // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free // VACQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow -->? // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------
	// Second group (digits 1 and 0 of Y0); folds in RED1/RED2 from
	// the previous reduction.

	VSPLTW $1, Y0, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
	// VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------
	// Third group (digits 3 and 2 of Y1).

	VSPLTW $3, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	// VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free // VSLDB
	VSLDOI $12, T2, ADD2, T1 // ADD2 Free // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R18)(CPOOL), SEL2
	LXVD2X (R19)(CPOOL), SEL3
	LXVD2X (R20)(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSUBUQM RED2, RED3, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	// ---------------------------------------------------
	// Fourth group (digits 1 and 0 of Y1); the final reduction uses
	// the SEL5/SEL6 permutations (shifted less — see diagram note).

	VSPLTW $1, Y1, YDIG // VREPF

	// VMALHF X0, YDIG, T0, ADD1H
	// VMALHF X1, YDIG, T1, ADD2H
	// VMALF X0, YDIG, T0, ADD1
	// VMALF X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	// VMALF X0, YDIG, ADD1H, ADD3
	// VMALF X1, YDIG, ADD2H, ADD4
	// VMALHF X0, YDIG, ADD1H, ADD3H
	// VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER // VZERO ZER
	LXVD2X (R17)(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1 // VSLDB

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ

	VADDCUQ T0, ADD3, CAR1 // VACCQ
	VADDUQM T0, ADD3, T0 // VAQ
	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	LXVD2X (R21)(CPOOL), SEL5
	LXVD2X (R22)(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
	VSUBUQM RED2, RED1, RED2 // Guaranteed not to underflow // VSQ

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ T0, ADD3H, CAR1 // VACCQ
	VADDUQM T0, ADD3H, T0 // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ T0, RED1, CAR1 // VACCQ
	VADDUQM T0, RED1, T0 // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1 // VACQ
	VADDUQM T2, CAR2, T2 // VAQ

	// ---------------------------------------------------
	// Final conditional subtraction of P; the borrow propagated into
	// T2 drives the selects below.

	VSPLTISB $0, RED3 // VZERO RED3
	VSUBCUQ T0, P0, CAR1 // VSCBIQ
	VSUBUQM T0, P0, ADD1H // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2 // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2 // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET
985
986 #undef CPOOL
987
988 #undef X0
989 #undef X1
990 #undef Y0
991 #undef Y1
992 #undef T0
993 #undef T1
994 #undef P0
995 #undef P1
996
997 #undef SEL1
998 #undef SEL2
999 #undef SEL3
1000 #undef SEL4
1001 #undef SEL5
1002 #undef SEL6
1003
1004 #undef YDIG
1005 #undef ADD1H
1006 #undef ADD2H
1007 #undef ADD3
1008 #undef ADD4
1009 #undef RED1
1010 #undef RED2
1011 #undef RED3
1012 #undef T2
1013 #undef ADD1
1014 #undef ADD2
1015 #undef ADD3H
1016 #undef ADD4H
1017 #undef ZER
1018 #undef CAR1
1019 #undef CAR2
1020
1021 #undef TMP1
1022 #undef TMP2
1023
// p256SubInternal computes T1:T0 = (X1:X0 - Y1:Y0) mod P256.
// It performs the 256-bit subtraction, turns the final borrow into an
// all-ones/all-zeros mask in SEL1, adds P (PH:PL) back into a spare
// copy, and selects the reduced or unreduced difference.
// Clobbers ZER, CAR1, SEL1, TT0 and TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER \ // VZERO
	VSUBCUQ X0, Y0, CAR1 \
	VSUBUQM X0, Y0, T0 \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1 \
	VSUBUQM ZER, SEL1, SEL1 \ // VSQ
	\
	VADDCUQ T0, PL, CAR1 \ // VACCQ
	VADDUQM T0, PL, TT0 \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1 \ // VACQ
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1 \
1038
// p256AddInternal computes T1:T0 = (X1:X0 + Y1:Y0) mod P256.
// The 256-bit sum's carry-out lands in T2; a trial subtraction of
// P (PH:PL) produces a mask in SEL1 that selects either the raw sum
// or the reduced one.
// Clobbers ZER, CAR1, CAR2, SEL1, T2, TT0 and TT1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ X0, Y0, CAR1 \
	VADDUQM X0, Y0, T0 \
	VADDECUQ X1, Y1, CAR1, T2 \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1 \
	\
	VSPLTISB $0, ZER \
	VSUBCUQ T0, PL, CAR1 \ // VSCBIQ
	VSUBUQM T0, PL, TT0 \
	VSUBECUQ T1, PH, CAR1, CAR2 \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1 \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	\
	VSEL TT0, T0, SEL1, T0 \
	VSEL TT1, T1, SEL1, T1
1054
// p256HalfInternal computes T1:T0 = (X1:X0 / 2) mod P256.
// SEL1 is built from the extended subtract seeded by X0 (a mask
// derived from X0's low bit — presumably selecting X+P when X is odd
// so the halving stays exact; confirm against the s390x original).
// The 257-bit value (top bit in T2) is then shifted right by one:
// VSLDOI moves the bits that cross each 128-bit boundary, VSR/VSL
// shift within the halves, and VOR recombines them.
// Clobbers ZER, CAR1, SEL1, T2, TT0 and TT1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	\
	VADDCUQ X0, PL, CAR1 \
	VADDUQM X0, PL, T0 \
	VADDECUQ X1, PH, CAR1, T2 \
	VADDEUQM X1, PH, CAR1, T1 \
	\
	VSEL T0, X0, SEL1, T0 \
	VSEL T1, X1, SEL1, T1 \
	VSEL T2, ZER, SEL1, T2 \
	\
	VSLDOI $15, T2, ZER, TT1 \
	VSLDOI $15, T1, ZER, TT0 \
	VSPLTISB $1, SEL1 \
	VSR T0, SEL1, T0 \ // VSRL
	VSR T1, SEL1, T1 \
	VSPLTISB $7, SEL1 \ // VREPIB
	VSL TT0, SEL1, TT0 \
	VSL TT1, SEL1, TT1 \
	VOR T0, TT0, T0 \
	VOR T1, TT1, T1
1078
1079 #define res_ptr R3
1080 #define x_ptr R4
1081 #define y_ptr R5
1082 #define CPOOL R7
1083 #define TEMP R8
1084 #define N R9
1085
1086 // Parameters
1087 #define X0 V0
1088 #define X1 V1
1089 #define Y0 V2
1090 #define Y1 V3
1091 #define T0 V4
1092 #define T1 V5
1093
1094 // Constants
1095 #define P0 V30
1096 #define P1 V31
// p256Mul computes res = in1 * in2 using p256MulInternal (Montgomery
// multiplication modulo P256; see that function for details). It
// loads the operands and the prime from p256mul<>, swaps them into
// the internal doubleword order, and stores the product back.
// (Comment previously named this p256MulAsm; the symbol is p256Mul.)
// func p256Mul(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	// Swap doublewords into the internal representation.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	// P256 into P0/P1 for p256MulInternal.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	// Reload CPOOL: R7 is not preserved across the call.
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap back into the byte order STXVD2X expects.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1132
// func p256Sqr(res, in *p256Element, n int)
// Repeated field squaring: res = in^(2^n), computed by n passes
// through p256MulInternal with both multiplier inputs set to the
// running value.  The loop counter is kept in the argument slot
// n+16(FP) because p256MulInternal does not preserve N (R9).
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both
	VOR X0, X0, Y0
	VOR X1, X1, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD n+16(FP), N
	ADD $-1, N
	CMP $0, N
	BEQ done
	MOVD N, n+16(FP) // Save counter to avoid clobber
	VOR T0, T0, X0   // feed the square back in as the next input
	VOR T1, T1, X1
	BR sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
1176
1177 #undef res_ptr
1178 #undef x_ptr
1179 #undef y_ptr
1180 #undef CPOOL
1181
1182 #undef X0
1183 #undef X1
1184 #undef Y0
1185 #undef Y1
1186 #undef T0
1187 #undef T1
1188 #undef P0
1189 #undef P1
1190
// Register aliases for p256PointAddAffineAsm.
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7

// Temporaries in REGs
#define Y2L V15
#define Y2H V16
#define T1L V17
#define T1H V18
#define T2L V19
#define T2H V20
#define T3L V21
#define T3H V22
#define T4L V23
#define T4H V24

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Names for zero/sel selects.
// Note these alias the registers above: the same vector register is
// reused under different roles at different points in the routine.
#define X1L V0
#define X1H V1
#define Y1L V2 // p256MulAsmParmY
#define Y1H V3 // p256MulAsmParmY
#define Z1L V4
#define Z1H V5
#define X2L V0
#define X2H V1
#define Z2L V4
#define Z2H V5
#define X3L V17 // T1L
#define X3H V18 // T1H
#define Y3L V21 // T3L
#define Y3H V22 // T3H
#define Z3L V25
#define Z3H V26

#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
1246 /* *
1247 * Three operand formula:
1248 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1249 * T1 = Z1²
1250 * T2 = T1*Z1
1251 * T1 = T1*X2
1252 * T2 = T2*Y2
1253 * T1 = T1-X1
1254 * T2 = T2-Y1
1255 * Z3 = Z1*T1
1256 * T3 = T1²
1257 * T4 = T3*T1
1258 * T3 = T3*X1
1259 * T1 = 2*T3
1260 * X3 = T2²
1261 * X3 = X3-T1
1262 * X3 = X3-T4
1263 * T3 = T3-X3
1264 * T3 = T3*T2
1265 * T4 = T4*Y1
1266 * Y3 = T3-T4
1267
1268 * Three operand formulas, but with MulInternal X,Y used to store temps
1269 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1270 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1271 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1272 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1273 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1274 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1275 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1276 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1277 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1278 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1279 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1280 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1281 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1282 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1283 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1284 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1285 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1286 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1287
1288 */
1289 //
1290 // V27 is clobbered by p256MulInternal so must be
1291 // saved in a temp.
1292 //
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Mixed Jacobian + affine point addition (three-operand formula in
// the comment block above).  sign chooses between Y2 and P-Y2
// (conditional point negation); sel chooses between the computed sum
// and in1; zero chooses between that result and in2.  All choices
// are made with VSEL masks, keeping the control flow constant-time.
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Index registers: byte offsets 16..160 for LXVD2X/STXVD2X.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP)

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	// Conditional negation: compute P - Y2, then select it or the
	// original Y2 with the sign-derived mask.
	VSUBCUQ PL, Y2L, CAR1
	VSUBUQM PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

	/* *
	 * Three operand formula:
	 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
	 */
	// X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
	VOR T0, T0, X0
	VOR T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
	MOVD in2+16(FP), P2ptr // reload pointer after CALLs
	LXVD2X (R0)(P2ptr), Y0 // X2H
	LXVD2X (R16)(P2ptr), Y1 // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T1L
	VOR T1, T1, T1H

	// X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR Y2L, Y2L, Y0
	VOR Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
	VOR Y0, Y0, X0
	VOR Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, X0
	VOR T1, T1, X1

	// X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T4L
	VOR T1, T1, T4H

	// X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0 // X1H
	LXVD2X (R16)(P1ptr), Y1 // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
	VOR T4L, T4L, X0
	VOR T4H, T4H, X1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0 // Y1H
	LXVD2X (R18)(P1ptr), Y1 // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	// if (sel == 0) {
	//     copy(P3.x[:], X1)
	//     copy(P3.y[:], Y1)
	//     copy(P3.z[:], Z1)
	// }

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	MOVD $112, R26 // Get offset to sel+32
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	// Same select pattern for zero: result vs. in2 (in2's Z limbs
	// come from the constant pool at offsets 0x80/0x90).
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	LXVD2X (R23)(CPOOL), Z2L
	LXVD2X (R24)(CPOOL), Z2H

	MOVD $120, R26 // Get the value from zero+40(FP)
	LXVDSX (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET
1523
1524 #undef P3ptr
1525 #undef P1ptr
1526 #undef P2ptr
1527 #undef CPOOL
1528
1529 #undef Y2L
1530 #undef Y2H
1531 #undef T1L
1532 #undef T1H
1533 #undef T2L
1534 #undef T2H
1535 #undef T3L
1536 #undef T3H
1537 #undef T4L
1538 #undef T4H
1539
1540 #undef TT0
1541 #undef TT1
1542 #undef T2
1543
1544 #undef X0
1545 #undef X1
1546 #undef Y0
1547 #undef Y1
1548 #undef T0
1549 #undef T1
1550
1551 #undef PL
1552 #undef PH
1553
1554 #undef X1L
1555 #undef X1H
1556 #undef Y1L
1557 #undef Y1H
1558 #undef Z1L
1559 #undef Z1H
1560 #undef X2L
1561 #undef X2H
1562 #undef Z2L
1563 #undef Z2H
1564 #undef X3L
1565 #undef X3H
1566 #undef Y3L
1567 #undef Y3H
1568 #undef Z3L
1569 #undef Z3H
1570
1571 #undef ZER
1572 #undef SEL1
1573 #undef CAR1
1574 #undef CAR2
1575
1576 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1577 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1578 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
// Register aliases for p256PointDoubleAsm.
#define P3ptr R3
#define P1ptr R4
#define CPOOL R7

// Temporaries in REGs
#define X3L V15
#define X3H V16
#define Y3L V17
#define Y3H V18
#define T1L V19
#define T1H V20
#define T2L V21
#define T2H V22
#define T3L V23
#define T3H V24

#define X1L V6
#define X1H V7
#define Y1L V8
#define Y1H V9
#define Z1L V10
#define Z1H V11

// Temps for Sub and Add
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31

// Z3 aliases T3 (V23/V24); Z3 is stored before T3 is produced.
#define Z3L V23
#define Z3H V24

#define ZER V26
#define SEL1 V27
#define CAR1 V28
#define CAR2 V29
1625 /*
1626 * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1627 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1628 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1629 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1630 * B = 2Y₁
1631 * Z₃ = B×Z₁
1632 * C = B²
1633 * D = C×X₁
1634 * X₃ = A²-2D
1635 * Y₃ = (D-X₃)×A-C²/2
1636 *
1637 * Three-operand formula:
1638 * T1 = Z1²
1639 * T2 = X1-T1
1640 * T1 = X1+T1
1641 * T2 = T2*T1
1642 * T2 = 3*T2
1643 * Y3 = 2*Y1
1644 * Z3 = Y3*Z1
1645 * Y3 = Y3²
1646 * T3 = Y3*X1
1647 * Y3 = Y3²
1648 * Y3 = half*Y3
1649 * X3 = T2²
1650 * T1 = 2*T3
1651 * X3 = X3-T1
1652 * T1 = T3-X3
1653 * T1 = T1*T2
1654 * Y3 = T1-Y3
1655 */
// p256PointDoubleAsm(res, in *p256Point)
// Jacobian point doubling using the dbl-2004-hmv three-operand
// formulas listed above.  Z3 and X3 are stored as soon as they are
// final, so only Y3 remains at the end.
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	// Index registers: byte offsets 16..80.
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T) // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1L, X1L, $2, X1L
	XXPERMDI X1H, X1H, $2, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T) // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X- ; Y- ; MUL; T- // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1) // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	XXPERMDI Y1H, Y1H, $2, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0
	LXVD2X (R20)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	CALL p256MulInternal<>(SB)

	// Leave T0, T1 as is; store Z3 through byte-swapped temporaries.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
	VOR T0, T0, X0
	VOR T1, T1, X1
	LXVD2X (R0)(P1ptr), Y0
	LXVD2X (R16)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T3L
	VOR T1, T1, T3H

	// X- ; Y=X ; MUL; T- // Y3 = Y3²
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T) // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T- // X3 = T2²
	VOR T2L, T2L, X0
	VOR T2H, T2H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3) // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	// Store X3; TT0/TT1 hold the byte-swapped copies.
	XXPERMDI X3L, X3L, $2, TT0
	XXPERMDI X3H, X3H, $2, TT1
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(X<T3-X3) // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X- ; Y- ; MUL; T- // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3) // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	RET
1779
1780 #undef P3ptr
1781 #undef P1ptr
1782 #undef CPOOL
1783 #undef X3L
1784 #undef X3H
1785 #undef Y3L
1786 #undef Y3H
1787 #undef T1L
1788 #undef T1H
1789 #undef T2L
1790 #undef T2H
1791 #undef T3L
1792 #undef T3H
1793 #undef X1L
1794 #undef X1H
1795 #undef Y1L
1796 #undef Y1H
1797 #undef Z1L
1798 #undef Z1H
1799 #undef TT0
1800 #undef TT1
1801 #undef T2
1802 #undef X0
1803 #undef X1
1804 #undef Y0
1805 #undef Y1
1806 #undef T0
1807 #undef T1
1808 #undef PL
1809 #undef PH
1810 #undef Z3L
1811 #undef Z3H
1812 #undef ZER
1813 #undef SEL1
1814 #undef CAR1
1815 #undef CAR2
1816
// Register aliases for p256PointAddAsm.
#define P3ptr R3
#define P1ptr R4
#define P2ptr R5
#define CPOOL R7
#define TRUE R14
#define RES1 R9
#define RES2 R10

// Temporaries in REGs
#define T1L V16
#define T1H V17
#define T2L V18
#define T2H V19
#define U1L V20
#define U1H V21
#define S1L V22
#define S1H V23
#define HL V24
#define HH V25
#define RL V26
#define RH V27

// Temps for Sub and Add
#define ZER V6
#define SEL1 V7
#define CAR1 V8
#define CAR2 V9
#define TT0 V11
#define TT1 V12
#define T2 V13

// p256MulAsm Parameters
#define X0 V0
#define X1 V1
#define Y0 V2
#define Y1 V3
#define T0 V4
#define T1 V5

#define PL V30
#define PH V31
1858 /*
1859 * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
1860 *
1861 * A = X₁×Z₂²
1862 * B = Y₁×Z₂³
1863 * C = X₂×Z₁²-A
1864 * D = Y₂×Z₁³-B
1865 * X₃ = D² - 2A×C² - C³
1866 * Y₃ = D×(A×C² - X₃) - B×C³
1867 * Z₃ = Z₁×Z₂×C
1868 *
1869 * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
1870 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
1871 *
1872 * T1 = Z1*Z1
1873 * T2 = Z2*Z2
1874 * U1 = X1*T2
1875 * H = X2*T1
1876 * H = H-U1
1877 * Z3 = Z1*Z2
1878 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1879 *
1880 * S1 = Z2*T2
1881 * S1 = Y1*S1
1882 * R = Z1*T1
1883 * R = Y2*R
1884 * R = R-S1
1885 *
1886 * T1 = H*H
1887 * T2 = H*T1
1888 * U1 = U1*T1
1889 *
1890 * X3 = R*R
1891 * X3 = X3-T2
1892 * T1 = 2*U1
1893 * X3 = X3-T1 << store-out X3 result reg
1894 *
1895 * T2 = S1*T2
1896 * Y3 = U1-X3
1897 * Y3 = R*Y3
1898 * Y3 = Y3-T2 << store-out Y3 result reg
1899
1900 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
1901 // X- ; Y=T ; MUL; R=T // R = Z1*T1
1902 // X=X2; Y- ; MUL; H=T // H = X2*T1
1903 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
1904 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
1905 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
1906 // SUB(H<H-T) // H = H-U1
1907 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
1908 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1909 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1910 // X=Y2; Y=R ; MUL; T- // R = Y2*R
1911 // SUB(R<T-S1) // R = R-S1
1912 // X=H ; Y=H ; MUL; T- // T1 = H*H
1913 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
1914 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
1915 // X=R ; Y=R ; MUL; T- // X3 = R*R
1916 // SUB(T<T-T2) // X3 = X3-T2
1917 // ADD(X<U1+U1) // T1 = 2*U1
1918 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
1919 // SUB(Y<U1-T) // Y3 = U1-X3
1920 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
1921 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
1922 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
1923 */
// p256PointAddAsm(res, in1, in2 *p256Point) int
// Jacobian point addition (add-1998-cmo-2 three-operand formulas in
// the comment block above).  ret+24(FP) is set to 1 when H and R are
// both congruent to 0 mod P (the inputs have the same x and y after
// scaling), in which case the caller must use doubling instead.
// RH lives in V27, which p256MulInternal clobbers, so it is spilled
// to the local stack frame (R1+32) across the later calls.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; R=T // R = Z1*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, RL // SAVE: RL
	VOR T1, T1, RH // SAVE: RH

	STXVD2X RH, (R1)(R17) // V27 has to be saved

	// X=X2; Y- ; MUL; H=T // H = X2*T1
	MOVD in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0 // X2L
	LXVD2X (R16)(P2ptr), X1 // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, HL // SAVE: HL
	VOR T1, T1, HH // SAVE: HH

	// X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0 // Z2L
	LXVD2X (R20)(P2ptr), X1 // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR X0, X0, Y0
	VOR X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L // SAVE: S1L
	VOR T1, T1, S1H // SAVE: S1H

	// X=X1; Y- ; MUL; U1=T // U1 = X1*T2
	MOVD in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0 // X1L
	LXVD2X (R16)(P1ptr), X1 // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L // SAVE: U1L
	VOR T1, T1, U1H // SAVE: U1H

	// SUB(H<H-T) // H = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR HL, PL, T1L // SAVE: T1L
	VXOR HH, PH, T1H // SAVE: T1H
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0 // Z1L
	LXVD2X (R20)(P1ptr), X1 // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0 // Z2L
	LXVD2X (R20)(P2ptr), Y1 // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR T0, T0, X0
	VOR T1, T1, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0
	LXVD2X (R18)(P1ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR S1L, S1L, Y0
	VOR S1H, S1H, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, S1L
	VOR T1, T1, S1H

	// X=Y2; Y=R ; MUL; T- // R = Y2*R
	MOVD in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0
	LXVD2X (R18)(P2ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR RL, RL, Y0

	// VOR RH, RH, Y1 RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1
	CALL p256MulInternal<>(SB)

	// SUB(R<T-S1) // R = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	STXVD2X RH, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD $1, TRUE
	VSPLTISB $0, ZER
	VOR RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES1
	VXOR RL, PL, T1L
	VXOR RH, PH, T1H // SAVE: T1L
	VOR T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR RES2, RES1, RES1
	MOVD ret+24(FP), RES2
	AND RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T- // T1 = H*H
	VOR HL, HL, X0
	VOR HH, HH, X1
	VOR HL, HL, Y0
	VOR HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X- ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR T0, T0, Y0
	VOR T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, T2L
	VOR T1, T1, T2H

	// X=U1; Y- ; MUL; U1=T // U1 = U1*T1
	VOR U1L, U1L, X0
	VOR U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=R ; Y=R ; MUL; T- // X3 = R*R
	VOR RL, RL, X0

	// VOR RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1
	VOR X1, X1, Y1

	// VOR RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2) // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1) // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T) // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR RH, RH, X1
	LXVD2X (R1)(R17), X1
	CALL p256MulInternal<>(SB)
	VOR T0, T0, U1L
	VOR T1, T1, U1H

	// X=S1; Y=T2; MUL; T- // T2 = S1*T2
	VOR S1L, S1L, X0
	VOR S1H, S1H, X1
	VOR T2L, T2L, Y0
	VOR T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET
2167