1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8 #include "go_asm.h"
9
10 DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
11 DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
12 DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
13 DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
14 DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
15 DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
16 DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
17 DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
18 DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
19 DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
20 DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
21 DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
22 DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
23 DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
24 DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
25 DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
26 DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
27 DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
28 DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
29 DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
30 DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
31 DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
32 DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
33 DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
34 DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
35 DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
36 DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
37 DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
38 DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
39 DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
40 DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
41 DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
42 DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
43 DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
44 DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
45 DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
46 DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
47 GLOBL p256ordK0<>(SB), 8, $4
48 GLOBL p256ord<>(SB), 8, $32
49 GLOBL p256<>(SB), 8, $96
50 GLOBL p256mul<>(SB), 8, $160
51
52 // ---------------------------------------
53 // iff cond == 1 val <- -val
54 // func p256NegCond(val *p256Element, cond int)
55 #define P1ptr R1
56 #define CPOOL R4
57
58 #define Y1L V0
59 #define Y1H V1
60 #define T1L V2
61 #define T1H V3
62
63 #define PL V30
64 #define PH V31
65
66 #define ZER V4
67 #define SEL1 V5
68 #define CAR1 V6
69 TEXT ·p256NegCond(SB), NOSPLIT, $0
70 MOVD val+0(FP), P1ptr
71
72 MOVD $p256mul<>+0x00(SB), CPOOL
73 VL 16(CPOOL), PL
74 VL 0(CPOOL), PH
75
76 VL 16(P1ptr), Y1H
77 VPDI $0x4, Y1H, Y1H, Y1H
78 VL 0(P1ptr), Y1L
79 VPDI $0x4, Y1L, Y1L, Y1L
80
81 VLREPG cond+8(FP), SEL1
82 VZERO ZER
83 VCEQG SEL1, ZER, SEL1
84
85 VSCBIQ Y1L, PL, CAR1
86 VSQ Y1L, PL, T1L
87 VSBIQ PH, Y1H, CAR1, T1H
88
89 VSEL Y1L, T1L, SEL1, Y1L
90 VSEL Y1H, T1H, SEL1, Y1H
91
92 VPDI $0x4, Y1H, Y1H, Y1H
93 VST Y1H, 16(P1ptr)
94 VPDI $0x4, Y1L, Y1L, Y1L
95 VST Y1L, 0(P1ptr)
96 RET
97
98 #undef P1ptr
99 #undef CPOOL
100 #undef Y1L
101 #undef Y1H
102 #undef T1L
103 #undef T1H
104 #undef PL
105 #undef PH
106 #undef ZER
107 #undef SEL1
108 #undef CAR1
109
110 // ---------------------------------------
111 // if cond == 0 res <- b; else res <- a
112 // func p256MovCond(res, a, b *P256Point, cond int)
113 #define P3ptr R1
114 #define P1ptr R2
115 #define P2ptr R3
116
117 #define X1L V0
118 #define X1H V1
119 #define Y1L V2
120 #define Y1H V3
121 #define Z1L V4
122 #define Z1H V5
123 #define X2L V6
124 #define X2H V7
125 #define Y2L V8
126 #define Y2H V9
127 #define Z2L V10
128 #define Z2H V11
129
130 #define ZER V18
131 #define SEL1 V19
132 TEXT ·p256MovCond(SB), NOSPLIT, $0
133 MOVD res+0(FP), P3ptr
134 MOVD a+8(FP), P1ptr
135 MOVD b+16(FP), P2ptr
136 VLREPG cond+24(FP), SEL1
137 VZERO ZER
138 VCEQG SEL1, ZER, SEL1
139
140 VL 0(P1ptr), X1H
141 VL 16(P1ptr), X1L
142 VL 32(P1ptr), Y1H
143 VL 48(P1ptr), Y1L
144 VL 64(P1ptr), Z1H
145 VL 80(P1ptr), Z1L
146
147 VL 0(P2ptr), X2H
148 VL 16(P2ptr), X2L
149 VL 32(P2ptr), Y2H
150 VL 48(P2ptr), Y2L
151 VL 64(P2ptr), Z2H
152 VL 80(P2ptr), Z2L
153
154 VSEL X2L, X1L, SEL1, X1L
155 VSEL X2H, X1H, SEL1, X1H
156 VSEL Y2L, Y1L, SEL1, Y1L
157 VSEL Y2H, Y1H, SEL1, Y1H
158 VSEL Z2L, Z1L, SEL1, Z1L
159 VSEL Z2H, Z1H, SEL1, Z1H
160
161 VST X1H, 0(P3ptr)
162 VST X1L, 16(P3ptr)
163 VST Y1H, 32(P3ptr)
164 VST Y1L, 48(P3ptr)
165 VST Z1H, 64(P3ptr)
166 VST Z1L, 80(P3ptr)
167
168 RET
169
170 #undef P3ptr
171 #undef P1ptr
172 #undef P2ptr
173 #undef X1L
174 #undef X1H
175 #undef Y1L
176 #undef Y1H
177 #undef Z1L
178 #undef Z1H
179 #undef X2L
180 #undef X2H
181 #undef Y2L
182 #undef Y2H
183 #undef Z2L
184 #undef Z2H
185 #undef ZER
186 #undef SEL1
187
188 // ---------------------------------------
189 // Constant time table access
190 // Indexed from 1 to 15, with -1 offset
191 // (index 0 is implicitly point at infinity)
192 // func p256Select(res *P256Point, table *p256Table, idx int)
193 #define P3ptr R1
194 #define P1ptr R2
195 #define COUNT R4
196
197 #define X1L V0
198 #define X1H V1
199 #define Y1L V2
200 #define Y1H V3
201 #define Z1L V4
202 #define Z1H V5
203 #define X2L V6
204 #define X2H V7
205 #define Y2L V8
206 #define Y2H V9
207 #define Z2L V10
208 #define Z2H V11
209
210 #define ONE V18
211 #define IDX V19
212 #define SEL1 V20
213 #define SEL2 V21
214 TEXT ·p256Select(SB), NOSPLIT, $0
215 MOVD res+0(FP), P3ptr
216 MOVD table+8(FP), P1ptr
217 VLREPB idx+(16+7)(FP), IDX
218 VREPIB $1, ONE
219 VREPIB $1, SEL2
220 MOVD $1, COUNT
221
222 VZERO X1H
223 VZERO X1L
224 VZERO Y1H
225 VZERO Y1L
226 VZERO Z1H
227 VZERO Z1L
228
229 loop_select:
230 VL 0(P1ptr), X2H
231 VL 16(P1ptr), X2L
232 VL 32(P1ptr), Y2H
233 VL 48(P1ptr), Y2L
234 VL 64(P1ptr), Z2H
235 VL 80(P1ptr), Z2L
236
237 VCEQG SEL2, IDX, SEL1
238
239 VSEL X2L, X1L, SEL1, X1L
240 VSEL X2H, X1H, SEL1, X1H
241 VSEL Y2L, Y1L, SEL1, Y1L
242 VSEL Y2H, Y1H, SEL1, Y1H
243 VSEL Z2L, Z1L, SEL1, Z1L
244 VSEL Z2H, Z1H, SEL1, Z1H
245
246 VAB SEL2, ONE, SEL2
247 ADDW $1, COUNT
248 ADD $96, P1ptr
249 CMPW COUNT, $17
250 BLT loop_select
251
252 VST X1H, 0(P3ptr)
253 VST X1L, 16(P3ptr)
254 VST Y1H, 32(P3ptr)
255 VST Y1L, 48(P3ptr)
256 VST Z1H, 64(P3ptr)
257 VST Z1L, 80(P3ptr)
258 RET
259
260 #undef P3ptr
261 #undef P1ptr
262 #undef COUNT
263 #undef X1L
264 #undef X1H
265 #undef Y1L
266 #undef Y1H
267 #undef Z1L
268 #undef Z1H
269 #undef X2L
270 #undef X2H
271 #undef Y2L
272 #undef Y2H
273 #undef Z2L
274 #undef Z2H
275 #undef ONE
276 #undef IDX
277 #undef SEL1
278 #undef SEL2
279
280 // ---------------------------------------
281
282 // func p256FromMont(res, in *p256Element)
283 #define res_ptr R1
284 #define x_ptr R2
285 #define CPOOL R4
286
287 #define T0 V0
288 #define T1 V1
289 #define T2 V2
290 #define TT0 V3
291 #define TT1 V4
292
293 #define ZER V6
294 #define SEL1 V7
295 #define SEL2 V8
296 #define CAR1 V9
297 #define CAR2 V10
298 #define RED1 V11
299 #define RED2 V12
300 #define PL V13
301 #define PH V14
302
303 TEXT ·p256FromMont(SB), NOSPLIT, $0
304 MOVD res+0(FP), res_ptr
305 MOVD in+8(FP), x_ptr
306
307 VZERO T2
308 VZERO ZER
309 MOVD $p256<>+0x00(SB), CPOOL
310 VL 16(CPOOL), PL
311 VL 0(CPOOL), PH
312 VL 48(CPOOL), SEL2
313 VL 64(CPOOL), SEL1
314
315 VL (0*16)(x_ptr), T0
316 VPDI $0x4, T0, T0, T0
317 VL (1*16)(x_ptr), T1
318 VPDI $0x4, T1, T1, T1
319
320 // First round
321 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
322 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
323 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
324
325 VSLDB $8, T1, T0, T0
326 VSLDB $8, T2, T1, T1
327
328 VACCQ T0, RED1, CAR1
329 VAQ T0, RED1, T0
330 VACCCQ T1, RED2, CAR1, CAR2
331 VACQ T1, RED2, CAR1, T1
332 VAQ T2, CAR2, T2
333
334 // Second round
335 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
336 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
337 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
338
339 VSLDB $8, T1, T0, T0
340 VSLDB $8, T2, T1, T1
341
342 VACCQ T0, RED1, CAR1
343 VAQ T0, RED1, T0
344 VACCCQ T1, RED2, CAR1, CAR2
345 VACQ T1, RED2, CAR1, T1
346 VAQ T2, CAR2, T2
347
348 // Third round
349 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
350 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
351 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
352
353 VSLDB $8, T1, T0, T0
354 VSLDB $8, T2, T1, T1
355
356 VACCQ T0, RED1, CAR1
357 VAQ T0, RED1, T0
358 VACCCQ T1, RED2, CAR1, CAR2
359 VACQ T1, RED2, CAR1, T1
360 VAQ T2, CAR2, T2
361
362 // Last round
363 VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
364 VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
365 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
366
367 VSLDB $8, T1, T0, T0
368 VSLDB $8, T2, T1, T1
369
370 VACCQ T0, RED1, CAR1
371 VAQ T0, RED1, T0
372 VACCCQ T1, RED2, CAR1, CAR2
373 VACQ T1, RED2, CAR1, T1
374 VAQ T2, CAR2, T2
375
376 // ---------------------------------------------------
377
378 VSCBIQ PL, T0, CAR1
379 VSQ PL, T0, TT0
380 VSBCBIQ T1, PH, CAR1, CAR2
381 VSBIQ T1, PH, CAR1, TT1
382 VSBIQ T2, ZER, CAR2, T2
383
384 // what output to use, TT1||TT0 or T1||T0?
385 VSEL T0, TT0, T2, T0
386 VSEL T1, TT1, T2, T1
387
388 VPDI $0x4, T0, T0, TT0
389 VST TT0, (0*16)(res_ptr)
390 VPDI $0x4, T1, T1, TT1
391 VST TT1, (1*16)(res_ptr)
392 RET
393
394 #undef res_ptr
395 #undef x_ptr
396 #undef CPOOL
397 #undef T0
398 #undef T1
399 #undef T2
400 #undef TT0
401 #undef TT1
402 #undef ZER
403 #undef SEL1
404 #undef SEL2
405 #undef CAR1
406 #undef CAR2
407 #undef RED1
408 #undef RED2
409 #undef PL
410 #undef PH
411
412 // Constant time table access
413 // Indexed from 1 to 15, with -1 offset
414 // (index 0 is implicitly point at infinity)
415 // func p256SelectBase(point *p256Point, table []p256Point, idx int)
416 // new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
417
418 #define P3ptr R1
419 #define P1ptr R2
420 #define COUNT R4
421 #define CPOOL R5
422
423 #define X1L V0
424 #define X1H V1
425 #define Y1L V2
426 #define Y1H V3
427 #define Z1L V4
428 #define Z1H V5
429 #define X2L V6
430 #define X2H V7
431 #define Y2L V8
432 #define Y2H V9
433 #define Z2L V10
434 #define Z2H V11
435 #define LE2BE V12
436
437 #define ONE V18
438 #define IDX V19
439 #define SEL1 V20
440 #define SEL2 V21
441
442 TEXT ·p256SelectAffine(SB), NOSPLIT, $0
443 MOVD res+0(FP), P3ptr
444 MOVD table+8(FP), P1ptr
445 MOVD $p256<>+0x00(SB), CPOOL
446 VLREPB idx+(16+7)(FP), IDX
447 VREPIB $1, ONE
448 VREPIB $1, SEL2
449 MOVD $1, COUNT
450 VL 80(CPOOL), LE2BE
451
452 VZERO X1H
453 VZERO X1L
454 VZERO Y1H
455 VZERO Y1L
456
457 loop_select:
458 VL 0(P1ptr), X2H
459 VL 16(P1ptr), X2L
460 VL 32(P1ptr), Y2H
461 VL 48(P1ptr), Y2L
462
463 VCEQG SEL2, IDX, SEL1
464
465 VSEL X2L, X1L, SEL1, X1L
466 VSEL X2H, X1H, SEL1, X1H
467 VSEL Y2L, Y1L, SEL1, Y1L
468 VSEL Y2H, Y1H, SEL1, Y1H
469
470 VAB SEL2, ONE, SEL2
471 ADDW $1, COUNT
472 ADD $64, P1ptr
473 CMPW COUNT, $33 // len(p256AffineTable) + 1
474 BLT loop_select
475 VST X1H, 0(P3ptr)
476 VST X1L, 16(P3ptr)
477 VST Y1H, 32(P3ptr)
478 VST Y1L, 48(P3ptr)
479
480 RET
481
482 #undef P3ptr
483 #undef P1ptr
484 #undef COUNT
485 #undef X1L
486 #undef X1H
487 #undef Y1L
488 #undef Y1H
489 #undef Z1L
490 #undef Z1H
491 #undef X2L
492 #undef X2H
493 #undef Y2L
494 #undef Y2H
495 #undef Z2L
496 #undef Z2H
497 #undef ONE
498 #undef IDX
499 #undef SEL1
500 #undef SEL2
501 #undef CPOOL
502
503 // ---------------------------------------
504
505 // func p256OrdMul(res, in1, in2 *p256OrdElement)
506 #define res_ptr R1
507 #define x_ptr R2
508 #define y_ptr R3
509 #define X0 V0
510 #define X1 V1
511 #define Y0 V2
512 #define Y1 V3
513 #define M0 V4
514 #define M1 V5
515 #define T0 V6
516 #define T1 V7
517 #define T2 V8
518 #define YDIG V9
519
520 #define ADD1 V16
521 #define ADD1H V17
522 #define ADD2 V18
523 #define ADD2H V19
524 #define RED1 V20
525 #define RED1H V21
526 #define RED2 V22
527 #define RED2H V23
528 #define CAR1 V24
529 #define CAR1M V25
530
531 #define MK0 V30
532 #define K0 V31
533 TEXT ·p256OrdMul<>(SB), NOSPLIT, $0
534 MOVD res+0(FP), res_ptr
535 MOVD in1+8(FP), x_ptr
536 MOVD in2+16(FP), y_ptr
537
538 VZERO T2
539 MOVD $p256ordK0<>+0x00(SB), R4
540
541 // VLEF $3, 0(R4), K0
542 WORD $0xE7F40000
543 BYTE $0x38
544 BYTE $0x03
545 MOVD $p256ord<>+0x00(SB), R4
546 VL 16(R4), M0
547 VL 0(R4), M1
548
549 VL (0*16)(x_ptr), X0
550 VPDI $0x4, X0, X0, X0
551 VL (1*16)(x_ptr), X1
552 VPDI $0x4, X1, X1, X1
553 VL (0*16)(y_ptr), Y0
554 VPDI $0x4, Y0, Y0, Y0
555 VL (1*16)(y_ptr), Y1
556 VPDI $0x4, Y1, Y1, Y1
557
558 // ---------------------------------------------------------------------------/
559 VREPF $3, Y0, YDIG
560 VMLF X0, YDIG, ADD1
561 VMLF ADD1, K0, MK0
562 VREPF $3, MK0, MK0
563
564 VMLF X1, YDIG, ADD2
565 VMLHF X0, YDIG, ADD1H
566 VMLHF X1, YDIG, ADD2H
567
568 VMALF M0, MK0, ADD1, RED1
569 VMALHF M0, MK0, ADD1, RED1H
570 VMALF M1, MK0, ADD2, RED2
571 VMALHF M1, MK0, ADD2, RED2H
572
573 VSLDB $12, RED2, RED1, RED1
574 VSLDB $12, T2, RED2, RED2
575
576 VACCQ RED1, ADD1H, CAR1
577 VAQ RED1, ADD1H, T0
578 VACCQ RED1H, T0, CAR1M
579 VAQ RED1H, T0, T0
580
581 // << ready for next MK0
582
583 VACQ RED2, ADD2H, CAR1, T1
584 VACCCQ RED2, ADD2H, CAR1, CAR1
585 VACCCQ RED2H, T1, CAR1M, T2
586 VACQ RED2H, T1, CAR1M, T1
587 VAQ CAR1, T2, T2
588
589 // ---------------------------------------------------
590 /* *
591 * ---+--------+--------+
592 * T2| T1 | T0 |
593 * ---+--------+--------+
594 * *(add)*
595 * +--------+--------+
596 * | X1 | X0 |
597 * +--------+--------+
598 * *(mul)*
599 * +--------+--------+
600 * | YDIG | YDIG |
601 * +--------+--------+
602 * *(add)*
603 * +--------+--------+
604 * | M1 | M0 |
605 * +--------+--------+
606 * *(mul)*
607 * +--------+--------+
608 * | MK0 | MK0 |
609 * +--------+--------+
610 *
611 * ---------------------
612 *
613 * +--------+--------+
614 * | ADD2 | ADD1 |
615 * +--------+--------+
616 * +--------+--------+
617 * | ADD2H | ADD1H |
618 * +--------+--------+
619 * +--------+--------+
620 * | RED2 | RED1 |
621 * +--------+--------+
622 * +--------+--------+
623 * | RED2H | RED1H |
624 * +--------+--------+
625 */
626 VREPF $2, Y0, YDIG
627 VMALF X0, YDIG, T0, ADD1
628 VMLF ADD1, K0, MK0
629 VREPF $3, MK0, MK0
630
631 VMALF X1, YDIG, T1, ADD2
632 VMALHF X0, YDIG, T0, ADD1H
633 VMALHF X1, YDIG, T1, ADD2H
634
635 VMALF M0, MK0, ADD1, RED1
636 VMALHF M0, MK0, ADD1, RED1H
637 VMALF M1, MK0, ADD2, RED2
638 VMALHF M1, MK0, ADD2, RED2H
639
640 VSLDB $12, RED2, RED1, RED1
641 VSLDB $12, T2, RED2, RED2
642
643 VACCQ RED1, ADD1H, CAR1
644 VAQ RED1, ADD1H, T0
645 VACCQ RED1H, T0, CAR1M
646 VAQ RED1H, T0, T0
647
648 // << ready for next MK0
649
650 VACQ RED2, ADD2H, CAR1, T1
651 VACCCQ RED2, ADD2H, CAR1, CAR1
652 VACCCQ RED2H, T1, CAR1M, T2
653 VACQ RED2H, T1, CAR1M, T1
654 VAQ CAR1, T2, T2
655
656 // ---------------------------------------------------
657 VREPF $1, Y0, YDIG
658 VMALF X0, YDIG, T0, ADD1
659 VMLF ADD1, K0, MK0
660 VREPF $3, MK0, MK0
661
662 VMALF X1, YDIG, T1, ADD2
663 VMALHF X0, YDIG, T0, ADD1H
664 VMALHF X1, YDIG, T1, ADD2H
665
666 VMALF M0, MK0, ADD1, RED1
667 VMALHF M0, MK0, ADD1, RED1H
668 VMALF M1, MK0, ADD2, RED2
669 VMALHF M1, MK0, ADD2, RED2H
670
671 VSLDB $12, RED2, RED1, RED1
672 VSLDB $12, T2, RED2, RED2
673
674 VACCQ RED1, ADD1H, CAR1
675 VAQ RED1, ADD1H, T0
676 VACCQ RED1H, T0, CAR1M
677 VAQ RED1H, T0, T0
678
679 // << ready for next MK0
680
681 VACQ RED2, ADD2H, CAR1, T1
682 VACCCQ RED2, ADD2H, CAR1, CAR1
683 VACCCQ RED2H, T1, CAR1M, T2
684 VACQ RED2H, T1, CAR1M, T1
685 VAQ CAR1, T2, T2
686
687 // ---------------------------------------------------
688 VREPF $0, Y0, YDIG
689 VMALF X0, YDIG, T0, ADD1
690 VMLF ADD1, K0, MK0
691 VREPF $3, MK0, MK0
692
693 VMALF X1, YDIG, T1, ADD2
694 VMALHF X0, YDIG, T0, ADD1H
695 VMALHF X1, YDIG, T1, ADD2H
696
697 VMALF M0, MK0, ADD1, RED1
698 VMALHF M0, MK0, ADD1, RED1H
699 VMALF M1, MK0, ADD2, RED2
700 VMALHF M1, MK0, ADD2, RED2H
701
702 VSLDB $12, RED2, RED1, RED1
703 VSLDB $12, T2, RED2, RED2
704
705 VACCQ RED1, ADD1H, CAR1
706 VAQ RED1, ADD1H, T0
707 VACCQ RED1H, T0, CAR1M
708 VAQ RED1H, T0, T0
709
710 // << ready for next MK0
711
712 VACQ RED2, ADD2H, CAR1, T1
713 VACCCQ RED2, ADD2H, CAR1, CAR1
714 VACCCQ RED2H, T1, CAR1M, T2
715 VACQ RED2H, T1, CAR1M, T1
716 VAQ CAR1, T2, T2
717
718 // ---------------------------------------------------
719 VREPF $3, Y1, YDIG
720 VMALF X0, YDIG, T0, ADD1
721 VMLF ADD1, K0, MK0
722 VREPF $3, MK0, MK0
723
724 VMALF X1, YDIG, T1, ADD2
725 VMALHF X0, YDIG, T0, ADD1H
726 VMALHF X1, YDIG, T1, ADD2H
727
728 VMALF M0, MK0, ADD1, RED1
729 VMALHF M0, MK0, ADD1, RED1H
730 VMALF M1, MK0, ADD2, RED2
731 VMALHF M1, MK0, ADD2, RED2H
732
733 VSLDB $12, RED2, RED1, RED1
734 VSLDB $12, T2, RED2, RED2
735
736 VACCQ RED1, ADD1H, CAR1
737 VAQ RED1, ADD1H, T0
738 VACCQ RED1H, T0, CAR1M
739 VAQ RED1H, T0, T0
740
741 // << ready for next MK0
742
743 VACQ RED2, ADD2H, CAR1, T1
744 VACCCQ RED2, ADD2H, CAR1, CAR1
745 VACCCQ RED2H, T1, CAR1M, T2
746 VACQ RED2H, T1, CAR1M, T1
747 VAQ CAR1, T2, T2
748
749 // ---------------------------------------------------
750 VREPF $2, Y1, YDIG
751 VMALF X0, YDIG, T0, ADD1
752 VMLF ADD1, K0, MK0
753 VREPF $3, MK0, MK0
754
755 VMALF X1, YDIG, T1, ADD2
756 VMALHF X0, YDIG, T0, ADD1H
757 VMALHF X1, YDIG, T1, ADD2H
758
759 VMALF M0, MK0, ADD1, RED1
760 VMALHF M0, MK0, ADD1, RED1H
761 VMALF M1, MK0, ADD2, RED2
762 VMALHF M1, MK0, ADD2, RED2H
763
764 VSLDB $12, RED2, RED1, RED1
765 VSLDB $12, T2, RED2, RED2
766
767 VACCQ RED1, ADD1H, CAR1
768 VAQ RED1, ADD1H, T0
769 VACCQ RED1H, T0, CAR1M
770 VAQ RED1H, T0, T0
771
772 // << ready for next MK0
773
774 VACQ RED2, ADD2H, CAR1, T1
775 VACCCQ RED2, ADD2H, CAR1, CAR1
776 VACCCQ RED2H, T1, CAR1M, T2
777 VACQ RED2H, T1, CAR1M, T1
778 VAQ CAR1, T2, T2
779
780 // ---------------------------------------------------
781 VREPF $1, Y1, YDIG
782 VMALF X0, YDIG, T0, ADD1
783 VMLF ADD1, K0, MK0
784 VREPF $3, MK0, MK0
785
786 VMALF X1, YDIG, T1, ADD2
787 VMALHF X0, YDIG, T0, ADD1H
788 VMALHF X1, YDIG, T1, ADD2H
789
790 VMALF M0, MK0, ADD1, RED1
791 VMALHF M0, MK0, ADD1, RED1H
792 VMALF M1, MK0, ADD2, RED2
793 VMALHF M1, MK0, ADD2, RED2H
794
795 VSLDB $12, RED2, RED1, RED1
796 VSLDB $12, T2, RED2, RED2
797
798 VACCQ RED1, ADD1H, CAR1
799 VAQ RED1, ADD1H, T0
800 VACCQ RED1H, T0, CAR1M
801 VAQ RED1H, T0, T0
802
803 // << ready for next MK0
804
805 VACQ RED2, ADD2H, CAR1, T1
806 VACCCQ RED2, ADD2H, CAR1, CAR1
807 VACCCQ RED2H, T1, CAR1M, T2
808 VACQ RED2H, T1, CAR1M, T1
809 VAQ CAR1, T2, T2
810
811 // ---------------------------------------------------
812 VREPF $0, Y1, YDIG
813 VMALF X0, YDIG, T0, ADD1
814 VMLF ADD1, K0, MK0
815 VREPF $3, MK0, MK0
816
817 VMALF X1, YDIG, T1, ADD2
818 VMALHF X0, YDIG, T0, ADD1H
819 VMALHF X1, YDIG, T1, ADD2H
820
821 VMALF M0, MK0, ADD1, RED1
822 VMALHF M0, MK0, ADD1, RED1H
823 VMALF M1, MK0, ADD2, RED2
824 VMALHF M1, MK0, ADD2, RED2H
825
826 VSLDB $12, RED2, RED1, RED1
827 VSLDB $12, T2, RED2, RED2
828
829 VACCQ RED1, ADD1H, CAR1
830 VAQ RED1, ADD1H, T0
831 VACCQ RED1H, T0, CAR1M
832 VAQ RED1H, T0, T0
833
834 // << ready for next MK0
835
836 VACQ RED2, ADD2H, CAR1, T1
837 VACCCQ RED2, ADD2H, CAR1, CAR1
838 VACCCQ RED2H, T1, CAR1M, T2
839 VACQ RED2H, T1, CAR1M, T1
840 VAQ CAR1, T2, T2
841
842 // ---------------------------------------------------
843
844 VZERO RED1
845 VSCBIQ M0, T0, CAR1
846 VSQ M0, T0, ADD1
847 VSBCBIQ T1, M1, CAR1, CAR1M
848 VSBIQ T1, M1, CAR1, ADD2
849 VSBIQ T2, RED1, CAR1M, T2
850
851 // what output to use, ADD2||ADD1 or T1||T0?
852 VSEL T0, ADD1, T2, T0
853 VSEL T1, ADD2, T2, T1
854
855 VPDI $0x4, T0, T0, T0
856 VST T0, (0*16)(res_ptr)
857 VPDI $0x4, T1, T1, T1
858 VST T1, (1*16)(res_ptr)
859 RET
860
861 #undef res_ptr
862 #undef x_ptr
863 #undef y_ptr
864 #undef X0
865 #undef X1
866 #undef Y0
867 #undef Y1
868 #undef M0
869 #undef M1
870 #undef T0
871 #undef T1
872 #undef T2
873 #undef YDIG
874
875 #undef ADD1
876 #undef ADD1H
877 #undef ADD2
878 #undef ADD2H
879 #undef RED1
880 #undef RED1H
881 #undef RED2
882 #undef RED2H
883 #undef CAR1
884 #undef CAR1M
885
886 #undef MK0
887 #undef K0
888
889 // ---------------------------------------
890 // p256MulInternal
891 // V0-V3,V30,V31 - Not Modified
892 // V4-V15 - Volatile
893
894 #define CPOOL R4
895
896 // Parameters
897 #define X0 V0 // Not modified
898 #define X1 V1 // Not modified
899 #define Y0 V2 // Not modified
900 #define Y1 V3 // Not modified
901 #define T0 V4
902 #define T1 V5
903 #define P0 V30 // Not modified
904 #define P1 V31 // Not modified
905
906 // Temporaries
907 #define YDIG V6 // Overloaded with CAR2, ZER
908 #define ADD1H V7 // Overloaded with ADD3H
909 #define ADD2H V8 // Overloaded with ADD4H
910 #define ADD3 V9 // Overloaded with SEL2,SEL5
911 #define ADD4 V10 // Overloaded with SEL3,SEL6
912 #define RED1 V11 // Overloaded with CAR2
913 #define RED2 V12
914 #define RED3 V13 // Overloaded with SEL1
915 #define T2 V14
916 // Overloaded temporaries
917 #define ADD1 V4 // Overloaded with T0
918 #define ADD2 V5 // Overloaded with T1
919 #define ADD3H V7 // Overloaded with ADD1H
920 #define ADD4H V8 // Overloaded with ADD2H
921 #define ZER V6 // Overloaded with YDIG, CAR2
922 #define CAR1 V6 // Overloaded with YDIG, ZER
923 #define CAR2 V11 // Overloaded with RED1
924 // Constant Selects
925 #define SEL1 V13 // Overloaded with RED3
926 #define SEL2 V9 // Overloaded with ADD3,SEL5
927 #define SEL3 V10 // Overloaded with ADD4,SEL6
928 #define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
929 #define SEL5 V9 // Overloaded with ADD3,SEL2
930 #define SEL6 V10 // Overloaded with ADD4,SEL3
931
932 /* *
933 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
934 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
935 * With you, SIMD be...
936 *
937 * +--------+--------+
938 * +--------| RED2 | RED1 |
939 * | +--------+--------+
940 * | ---+--------+--------+
941 * | +---- T2| T1 | T0 |--+
942 * | | ---+--------+--------+ |
943 * | | |
944 * | | ======================= |
945 * | | |
946 * | | +--------+--------+<-+
947 * | +-------| ADD2 | ADD1 |--|-----+
948 * | | +--------+--------+ | |
949 * | | +--------+--------+<---+ |
950 * | | | ADD2H | ADD1H |--+ |
951 * | | +--------+--------+ | |
952 * | | +--------+--------+<-+ |
953 * | | | ADD4 | ADD3 |--|-+ |
954 * | | +--------+--------+ | | |
955 * | | +--------+--------+<---+ | |
956 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
957 * | | +--------+--------+ | | V
958 * | | ------------------------ | | +--------+
959 * | | | | | RED3 | [d0 0 0 d0]
960 * | | | | +--------+
961 * | +---->+--------+--------+ | | |
962 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
963 * | +--------+--------+ | | |
964 * +---->---+--------+--------+ | | |
965 * T2| T1 | T0 |----+ | |
966 * ---+--------+--------+ | | |
967 * ---+--------+--------+<---+ | |
968 * +--- T2| T1 | T0 |----------+
969 * | ---+--------+--------+ | |
970 * | +--------+--------+<-------------+
971 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
972 * | +--------+--------+ | | |
973 * | +--------+<----------------------+
974 * | | RED3 |--------------+ | [0 0 d1 d0]
975 * | +--------+ | |
976 * +--->+--------+--------+ | |
977 * | T1 | T0 |--------+
978 * +--------+--------+ | |
979 * --------------------------- | |
980 * | |
981 * +--------+--------+<----+ |
982 * | RED2 | RED1 | |
983 * +--------+--------+ |
984 * ---+--------+--------+<-------+
985 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
986 * ---+--------+--------+
987 *
988 * *Mi obra de arte de siglo XXI @vpaprots
989 *
990 *
991 * First group is special, doesn't get the two inputs:
992 * +--------+--------+<-+
993 * +-------| ADD2 | ADD1 |--|-----+
994 * | +--------+--------+ | |
995 * | +--------+--------+<---+ |
996 * | | ADD2H | ADD1H |--+ |
997 * | +--------+--------+ | |
998 * | +--------+--------+<-+ |
999 * | | ADD4 | ADD3 |--|-+ |
1000 * | +--------+--------+ | | |
1001 * | +--------+--------+<---+ | |
1002 * | | ADD4H | ADD3H |------|-+ |(+vzero)
1003 * | +--------+--------+ | | V
1004 * | ------------------------ | | +--------+
1005 * | | | | RED3 | [d0 0 0 d0]
1006 * | | | +--------+
1007 * +---->+--------+--------+ | | |
1008 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
1009 * +--------+--------+ | | |
1010 * ---+--------+--------+<---+ | |
1011 * +--- T2| T1 | T0 |----------+
1012 * | ---+--------+--------+ | |
1013 * | +--------+--------+<-------------+
1014 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
1015 * | +--------+--------+ | | |
1016 * | +--------+<----------------------+
1017 * | | RED3 |--------------+ | [0 0 d1 d0]
1018 * | +--------+ | |
1019 * +--->+--------+--------+ | |
1020 * | T1 | T0 |--------+
1021 * +--------+--------+ | |
1022 * --------------------------- | |
1023 * | |
1024 * +--------+--------+<----+ |
1025 * | RED2 | RED1 | |
1026 * +--------+--------+ |
1027 * ---+--------+--------+<-------+
1028 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
1029 * ---+--------+--------+
1030 *
1031 * Last 'group' needs to RED2||RED1 shifted less
1032 */
1033 TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
1034 VL 32(CPOOL), SEL1
1035 VL 48(CPOOL), SEL2
1036 VL 64(CPOOL), SEL3
1037 VL 80(CPOOL), SEL4
1038
1039 // ---------------------------------------------------
1040
1041 VREPF $3, Y0, YDIG
1042 VMLHF X0, YDIG, ADD1H
1043 VMLHF X1, YDIG, ADD2H
1044 VMLF X0, YDIG, ADD1
1045 VMLF X1, YDIG, ADD2
1046
1047 VREPF $2, Y0, YDIG
1048 VMALF X0, YDIG, ADD1H, ADD3
1049 VMALF X1, YDIG, ADD2H, ADD4
1050 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
1051 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
1052
1053 VZERO ZER
1054 VL 32(CPOOL), SEL1
1055 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1056
1057 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
1058 VSLDB $12, ZER, ADD2, T1 // ADD2 Free
1059
1060 VACCQ T0, ADD3, CAR1
1061 VAQ T0, ADD3, T0 // ADD3 Free
1062 VACCCQ T1, ADD4, CAR1, T2
1063 VACQ T1, ADD4, CAR1, T1 // ADD4 Free
1064
1065 VL 48(CPOOL), SEL2
1066 VL 64(CPOOL), SEL3
1067 VL 80(CPOOL), SEL4
1068 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1069 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1070 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1071 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1072
1073 VSLDB $12, T1, T0, T0
1074 VSLDB $12, T2, T1, T1
1075
1076 VACCQ T0, ADD3H, CAR1
1077 VAQ T0, ADD3H, T0
1078 VACCCQ T1, ADD4H, CAR1, T2
1079 VACQ T1, ADD4H, CAR1, T1
1080
1081 // ---------------------------------------------------
1082
1083 VREPF $1, Y0, YDIG
1084 VMALHF X0, YDIG, T0, ADD1H
1085 VMALHF X1, YDIG, T1, ADD2H
1086 VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
1087 VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
1088
1089 VREPF $0, Y0, YDIG
1090 VMALF X0, YDIG, ADD1H, ADD3
1091 VMALF X1, YDIG, ADD2H, ADD4
1092 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
1093 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
1094
1095 VZERO ZER
1096 VL 32(CPOOL), SEL1
1097 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1098
1099 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
1100 VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free
1101
1102 VACCQ T0, RED1, CAR1
1103 VAQ T0, RED1, T0
1104 VACCCQ T1, RED2, CAR1, T2
1105 VACQ T1, RED2, CAR1, T1
1106
1107 VACCQ T0, ADD3, CAR1
1108 VAQ T0, ADD3, T0
1109 VACCCQ T1, ADD4, CAR1, CAR2
1110 VACQ T1, ADD4, CAR1, T1
1111 VAQ T2, CAR2, T2
1112
1113 VL 48(CPOOL), SEL2
1114 VL 64(CPOOL), SEL3
1115 VL 80(CPOOL), SEL4
1116 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1117 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1118 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1119 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1120
1121 VSLDB $12, T1, T0, T0
1122 VSLDB $12, T2, T1, T1
1123
1124 VACCQ T0, ADD3H, CAR1
1125 VAQ T0, ADD3H, T0
1126 VACCCQ T1, ADD4H, CAR1, T2
1127 VACQ T1, ADD4H, CAR1, T1
1128
1129 // ---------------------------------------------------
1130
1131 VREPF $3, Y1, YDIG
1132 VMALHF X0, YDIG, T0, ADD1H
1133 VMALHF X1, YDIG, T1, ADD2H
1134 VMALF X0, YDIG, T0, ADD1
1135 VMALF X1, YDIG, T1, ADD2
1136
1137 VREPF $2, Y1, YDIG
1138 VMALF X0, YDIG, ADD1H, ADD3
1139 VMALF X1, YDIG, ADD2H, ADD4
1140 VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
1141 VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
1142
1143 VZERO ZER
1144 VL 32(CPOOL), SEL1
1145 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1146
1147 VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
1148 VSLDB $12, T2, ADD2, T1 // ADD2 Free
1149
1150 VACCQ T0, RED1, CAR1
1151 VAQ T0, RED1, T0
1152 VACCCQ T1, RED2, CAR1, T2
1153 VACQ T1, RED2, CAR1, T1
1154
1155 VACCQ T0, ADD3, CAR1
1156 VAQ T0, ADD3, T0
1157 VACCCQ T1, ADD4, CAR1, CAR2
1158 VACQ T1, ADD4, CAR1, T1
1159 VAQ T2, CAR2, T2
1160
1161 VL 48(CPOOL), SEL2
1162 VL 64(CPOOL), SEL3
1163 VL 80(CPOOL), SEL4
1164 VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
1165 VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
1166 VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
1167 VSQ RED3, RED2, RED2 // Guaranteed not to underflow
1168
1169 VSLDB $12, T1, T0, T0
1170 VSLDB $12, T2, T1, T1
1171
1172 VACCQ T0, ADD3H, CAR1
1173 VAQ T0, ADD3H, T0
1174 VACCCQ T1, ADD4H, CAR1, T2
1175 VACQ T1, ADD4H, CAR1, T1
1176
1177 // ---------------------------------------------------
1178
1179 VREPF $1, Y1, YDIG
1180 VMALHF X0, YDIG, T0, ADD1H
1181 VMALHF X1, YDIG, T1, ADD2H
1182 VMALF X0, YDIG, T0, ADD1
1183 VMALF X1, YDIG, T1, ADD2
1184
1185 VREPF $0, Y1, YDIG
1186 VMALF X0, YDIG, ADD1H, ADD3
1187 VMALF X1, YDIG, ADD2H, ADD4
1188 VMALHF X0, YDIG, ADD1H, ADD3H
1189 VMALHF X1, YDIG, ADD2H, ADD4H
1190
1191 VZERO ZER
1192 VL 32(CPOOL), SEL1
1193 VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
1194
1195 VSLDB $12, ADD2, ADD1, T0
1196 VSLDB $12, T2, ADD2, T1
1197
1198 VACCQ T0, RED1, CAR1
1199 VAQ T0, RED1, T0
1200 VACCCQ T1, RED2, CAR1, T2
1201 VACQ T1, RED2, CAR1, T1
1202
1203 VACCQ T0, ADD3, CAR1
1204 VAQ T0, ADD3, T0
1205 VACCCQ T1, ADD4, CAR1, CAR2
1206 VACQ T1, ADD4, CAR1, T1
1207 VAQ T2, CAR2, T2
1208
1209 VL 96(CPOOL), SEL5
1210 VL 112(CPOOL), SEL6
1211 VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
1212 VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
1213 VSQ RED1, RED2, RED2 // Guaranteed not to underflow
1214
1215 VSLDB $12, T1, T0, T0
1216 VSLDB $12, T2, T1, T1
1217
1218 VACCQ T0, ADD3H, CAR1
1219 VAQ T0, ADD3H, T0
1220 VACCCQ T1, ADD4H, CAR1, T2
1221 VACQ T1, ADD4H, CAR1, T1
1222
1223 VACCQ T0, RED1, CAR1
1224 VAQ T0, RED1, T0
1225 VACCCQ T1, RED2, CAR1, CAR2
1226 VACQ T1, RED2, CAR1, T1
1227 VAQ T2, CAR2, T2
1228
1229 // ---------------------------------------------------
1230
1231 VZERO RED3
1232 VSCBIQ P0, T0, CAR1
1233 VSQ P0, T0, ADD1H
1234 VSBCBIQ T1, P1, CAR1, CAR2
1235 VSBIQ T1, P1, CAR1, ADD2H
1236 VSBIQ T2, RED3, CAR2, T2
1237
1238 // what output to use, ADD2H||ADD1H or T1||T0?
1239 VSEL T0, ADD1H, T2, T0
1240 VSEL T1, ADD2H, T2, T1
1241 RET
1242
1243 #undef CPOOL
1244
1245 #undef X0
1246 #undef X1
1247 #undef Y0
1248 #undef Y1
1249 #undef T0
1250 #undef T1
1251 #undef P0
1252 #undef P1
1253
1254 #undef SEL1
1255 #undef SEL2
1256 #undef SEL3
1257 #undef SEL4
1258 #undef SEL5
1259 #undef SEL6
1260
1261 #undef YDIG
1262 #undef ADD1H
1263 #undef ADD2H
1264 #undef ADD3
1265 #undef ADD4
1266 #undef RED1
1267 #undef RED2
1268 #undef RED3
1269 #undef T2
1270 #undef ADD1
1271 #undef ADD2
1272 #undef ADD3H
1273 #undef ADD4H
1274 #undef ZER
1275 #undef CAR1
1276 #undef CAR2
1277
1278 // ---------------------------------------
1279
1280 // Parameters
1281 #define X0 V0
1282 #define X1 V1
1283 #define Y0 V2
1284 #define Y1 V3
1285
1286 TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
1287 VLR X0, Y0
1288 VLR X1, Y1
1289 BR p256MulInternal<>(SB)
1290
1291 #undef X0
1292 #undef X1
1293 #undef Y0
1294 #undef Y1
1295
1296 #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
1297 VZERO ZER \
1298 VSCBIQ Y0, X0, CAR1 \
1299 VSQ Y0, X0, T0 \
1300 VSBCBIQ X1, Y1, CAR1, SEL1 \
1301 VSBIQ X1, Y1, CAR1, T1 \
1302 VSQ SEL1, ZER, SEL1 \
1303 \
1304 VACCQ T0, PL, CAR1 \
1305 VAQ T0, PL, TT0 \
1306 VACQ T1, PH, CAR1, TT1 \
1307 \
1308 VSEL T0, TT0, SEL1, T0 \
1309 VSEL T1, TT1, SEL1, T1 \
1310
1311 #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
1312 VACCQ X0, Y0, CAR1 \
1313 VAQ X0, Y0, T0 \
1314 VACCCQ X1, Y1, CAR1, T2 \
1315 VACQ X1, Y1, CAR1, T1 \
1316 \
1317 VZERO ZER \
1318 VSCBIQ PL, T0, CAR1 \
1319 VSQ PL, T0, TT0 \
1320 VSBCBIQ T1, PH, CAR1, CAR2 \
1321 VSBIQ T1, PH, CAR1, TT1 \
1322 VSBIQ T2, ZER, CAR2, SEL1 \
1323 \
1324 VSEL T0, TT0, SEL1, T0 \
1325 VSEL T1, TT1, SEL1, T1
1326
1327 #define p256HalfInternal(T1, T0, X1, X0) \
1328 VZERO ZER \
1329 VSBIQ ZER, ZER, X0, SEL1 \
1330 \
1331 VACCQ X0, PL, CAR1 \
1332 VAQ X0, PL, T0 \
1333 VACCCQ X1, PH, CAR1, T2 \
1334 VACQ X1, PH, CAR1, T1 \
1335 \
1336 VSEL X0, T0, SEL1, T0 \
1337 VSEL X1, T1, SEL1, T1 \
1338 VSEL ZER, T2, SEL1, T2 \
1339 \
1340 VSLDB $15, T2, ZER, TT1 \
1341 VSLDB $15, T1, ZER, TT0 \
1342 VREPIB $1, SEL1 \
1343 VSRL SEL1, T0, T0 \
1344 VSRL SEL1, T1, T1 \
1345 VREPIB $7, SEL1 \
1346 VSL SEL1, TT0, TT0 \
1347 VSL SEL1, TT1, TT1 \
1348 VO T0, TT0, T0 \
1349 VO T1, TT1, T1
1350
1351 // ---------------------------------------
1352 // func p256Mul(res, in1, in2 *p256Element)
1353 #define res_ptr R1
1354 #define x_ptr R2
1355 #define y_ptr R3
1356 #define CPOOL R4
1357
1358 // Parameters
1359 #define X0 V0
1360 #define X1 V1
1361 #define Y0 V2
1362 #define Y1 V3
1363 #define T0 V4
1364 #define T1 V5
1365
1366 // Constants
1367 #define P0 V30
1368 #define P1 V31
1369 TEXT ·p256Mul(SB), NOSPLIT, $0
1370 MOVD res+0(FP), res_ptr
1371 MOVD in1+8(FP), x_ptr
1372 MOVD in2+16(FP), y_ptr
1373
1374 VL (0*16)(x_ptr), X0
1375 VPDI $0x4, X0, X0, X0
1376 VL (1*16)(x_ptr), X1
1377 VPDI $0x4, X1, X1, X1
1378 VL (0*16)(y_ptr), Y0
1379 VPDI $0x4, Y0, Y0, Y0
1380 VL (1*16)(y_ptr), Y1
1381 VPDI $0x4, Y1, Y1, Y1
1382
1383 MOVD $p256mul<>+0x00(SB), CPOOL
1384 VL 16(CPOOL), P0
1385 VL 0(CPOOL), P1
1386
1387 CALL p256MulInternal<>(SB)
1388
1389 VPDI $0x4, T0, T0, T0
1390 VST T0, (0*16)(res_ptr)
1391 VPDI $0x4, T1, T1, T1
1392 VST T1, (1*16)(res_ptr)
1393 RET
1394
1395 #undef res_ptr
1396 #undef x_ptr
1397 #undef y_ptr
1398 #undef CPOOL
1399
1400 #undef X0
1401 #undef X1
1402 #undef Y0
1403 #undef Y1
1404 #undef T0
1405 #undef T1
1406 #undef P0
1407 #undef P1
1408
1409 // ---------------------------------------
1410 // func p256Sqr(res, in *p256Element, n int)
1411 #define res_ptr R1
1412 #define x_ptr R2
1413 #define y_ptr R3
1414 #define CPOOL R4
1415 #define COUNT R5
1416 #define N R6
1417
1418 // Parameters
1419 #define X0 V0
1420 #define X1 V1
1421 #define T0 V4
1422 #define T1 V5
1423
1424 // Constants
1425 #define P0 V30
1426 #define P1 V31
1427 TEXT ·p256Sqr(SB), NOSPLIT, $0
1428 MOVD res+0(FP), res_ptr
1429 MOVD in+8(FP), x_ptr
1430
1431 VL (0*16)(x_ptr), X0
1432 VPDI $0x4, X0, X0, X0
1433 VL (1*16)(x_ptr), X1
1434 VPDI $0x4, X1, X1, X1
1435
1436 MOVD $p256mul<>+0x00(SB), CPOOL
1437 MOVD $0, COUNT
1438 MOVD n+16(FP), N
1439 VL 16(CPOOL), P0
1440 VL 0(CPOOL), P1
1441
1442 loop:
1443 CALL p256SqrInternal<>(SB)
1444 VLR T0, X0
1445 VLR T1, X1
1446 ADDW $1, COUNT
1447 CMPW COUNT, N
1448 BLT loop
1449
1450 VPDI $0x4, T0, T0, T0
1451 VST T0, (0*16)(res_ptr)
1452 VPDI $0x4, T1, T1, T1
1453 VST T1, (1*16)(res_ptr)
1454 RET
1455
1456 #undef res_ptr
1457 #undef x_ptr
1458 #undef y_ptr
1459 #undef CPOOL
1460 #undef COUNT
1461 #undef N
1462
1463 #undef X0
1464 #undef X1
1465 #undef T0
1466 #undef T1
1467 #undef P0
1468 #undef P1
1469
1470 // Point add with P2 being affine point
1471 // If sign == 1 -> P2 = -P2
1472 // If sel == 0 -> P3 = P1
1473 // if zero == 0 -> P3 = P2
1474 // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
1475 #define P3ptr R1
1476 #define P1ptr R2
1477 #define P2ptr R3
1478 #define CPOOL R4
1479
1480 // Temporaries in REGs
1481 #define Y2L V15
1482 #define Y2H V16
1483 #define T1L V17
1484 #define T1H V18
1485 #define T2L V19
1486 #define T2H V20
1487 #define T3L V21
1488 #define T3H V22
1489 #define T4L V23
1490 #define T4H V24
1491
1492 // Temps for Sub and Add
1493 #define TT0 V11
1494 #define TT1 V12
1495 #define T2 V13
1496
1497 // p256MulAsm Parameters
1498 #define X0 V0
1499 #define X1 V1
1500 #define Y0 V2
1501 #define Y1 V3
1502 #define T0 V4
1503 #define T1 V5
1504
1505 #define PL V30
1506 #define PH V31
1507
1508 // Names for zero/sel selects
1509 #define X1L V0
1510 #define X1H V1
1511 #define Y1L V2 // p256MulAsmParmY
1512 #define Y1H V3 // p256MulAsmParmY
1513 #define Z1L V4
1514 #define Z1H V5
1515 #define X2L V0
1516 #define X2H V1
1517 #define Z2L V4
1518 #define Z2H V5
1519 #define X3L V17 // T1L
1520 #define X3H V18 // T1H
1521 #define Y3L V21 // T3L
1522 #define Y3H V22 // T3H
1523 #define Z3L V28
1524 #define Z3H V29
1525
1526 #define ZER V6
1527 #define SEL1 V7
1528 #define CAR1 V8
1529 #define CAR2 V9
1530 /* *
1531 * Three operand formula:
1532 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1533 * T1 = Z1²
1534 * T2 = T1*Z1
1535 * T1 = T1*X2
1536 * T2 = T2*Y2
1537 * T1 = T1-X1
1538 * T2 = T2-Y1
1539 * Z3 = Z1*T1
1540 * T3 = T1²
1541 * T4 = T3*T1
1542 * T3 = T3*X1
1543 * T1 = 2*T3
1544 * X3 = T2²
1545 * X3 = X3-T1
1546 * X3 = X3-T4
1547 * T3 = T3-X3
1548 * T3 = T3*T2
1549 * T4 = T4*Y1
1550 * Y3 = T3-T4
1551
1552 * Three operand formulas, but with MulInternal X,Y used to store temps
1553 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1554 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1555 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1556 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1557 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1558 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1559 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1560 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1561 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1562 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1563 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1564 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1565 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1566 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1567 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1568 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1569 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1570 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1571
1572 */
1573 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
1574 MOVD res+0(FP), P3ptr
1575 MOVD in1+8(FP), P1ptr
1576 MOVD in2+16(FP), P2ptr
1577
1578 MOVD $p256mul<>+0x00(SB), CPOOL
1579 VL 16(CPOOL), PL
1580 VL 0(CPOOL), PH
1581
1582 // if (sign == 1) {
1583 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
1584 // }
1585
1586 VL 48(P2ptr), Y2H
1587 VPDI $0x4, Y2H, Y2H, Y2H
1588 VL 32(P2ptr), Y2L
1589 VPDI $0x4, Y2L, Y2L, Y2L
1590
1591 VLREPG sign+24(FP), SEL1
1592 VZERO ZER
1593 VCEQG SEL1, ZER, SEL1
1594
1595 VSCBIQ Y2L, PL, CAR1
1596 VSQ Y2L, PL, T1L
1597 VSBIQ PH, Y2H, CAR1, T1H
1598
1599 VSEL Y2L, T1L, SEL1, Y2L
1600 VSEL Y2H, T1H, SEL1, Y2H
1601
1602 /* *
1603 * Three operand formula:
1604 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1605 */
1606 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
1607 VL 80(P1ptr), X1 // Z1H
1608 VPDI $0x4, X1, X1, X1
1609 VL 64(P1ptr), X0 // Z1L
1610 VPDI $0x4, X0, X0, X0
1611 VLR X0, Y0
1612 VLR X1, Y1
1613 CALL p256SqrInternal<>(SB)
1614
1615 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
1616 VLR T0, X0
1617 VLR T1, X1
1618 CALL p256MulInternal<>(SB)
1619 VLR T0, T2L
1620 VLR T1, T2H
1621
1622 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
1623 VL 16(P2ptr), Y1 // X2H
1624 VPDI $0x4, Y1, Y1, Y1
1625 VL 0(P2ptr), Y0 // X2L
1626 VPDI $0x4, Y0, Y0, Y0
1627 CALL p256MulInternal<>(SB)
1628 VLR T0, T1L
1629 VLR T1, T1H
1630
1631 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
1632 VLR T2L, X0
1633 VLR T2H, X1
1634 VLR Y2L, Y0
1635 VLR Y2H, Y1
1636 CALL p256MulInternal<>(SB)
1637
1638 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1639 VL 48(P1ptr), Y1H
1640 VPDI $0x4, Y1H, Y1H, Y1H
1641 VL 32(P1ptr), Y1L
1642 VPDI $0x4, Y1L, Y1L, Y1L
1643 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
1644
1645 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1646 VL 16(P1ptr), X1H
1647 VPDI $0x4, X1H, X1H, X1H
1648 VL 0(P1ptr), X1L
1649 VPDI $0x4, X1L, X1L, X1L
1650 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
1651
1652 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
1653 VL 80(P1ptr), X1 // Z1H
1654 VPDI $0x4, X1, X1, X1
1655 VL 64(P1ptr), X0 // Z1L
1656 VPDI $0x4, X0, X0, X0
1657 CALL p256MulInternal<>(SB)
1658
1659 // VST T1, 64(P3ptr)
1660 // VST T0, 80(P3ptr)
1661 VLR T0, Z3L
1662 VLR T1, Z3H
1663
1664 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
1665 VLR Y0, X0
1666 VLR Y1, X1
1667 CALL p256SqrInternal<>(SB)
1668 VLR T0, X0
1669 VLR T1, X1
1670
1671 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
1672 CALL p256MulInternal<>(SB)
1673 VLR T0, T4L
1674 VLR T1, T4H
1675
1676 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
1677 VL 16(P1ptr), Y1 // X1H
1678 VPDI $0x4, Y1, Y1, Y1
1679 VL 0(P1ptr), Y0 // X1L
1680 VPDI $0x4, Y0, Y0, Y0
1681 CALL p256MulInternal<>(SB)
1682 VLR T0, T3L
1683 VLR T1, T3H
1684
1685 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1686 p256AddInternal(T1H,T1L, T1,T0,T1,T0)
1687
1688 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
1689 VLR T2L, X0
1690 VLR T2H, X1
1691 VLR T2L, Y0
1692 VLR T2H, Y1
1693 CALL p256SqrInternal<>(SB)
1694
1695 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
1696 p256SubInternal(T1,T0,T1,T0,T1H,T1L)
1697
1698 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1699 p256SubInternal(T1,T0,T1,T0,T4H,T4L)
1700 VLR T0, X3L
1701 VLR T1, X3H
1702
1703 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1704 p256SubInternal(X1,X0,T3H,T3L,T1,T0)
1705
1706 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
1707 CALL p256MulInternal<>(SB)
1708 VLR T0, T3L
1709 VLR T1, T3H
1710
1711 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
1712 VLR T4L, X0
1713 VLR T4H, X1
1714 VL 48(P1ptr), Y1 // Y1H
1715 VPDI $0x4, Y1, Y1, Y1
1716 VL 32(P1ptr), Y0 // Y1L
1717 VPDI $0x4, Y0, Y0, Y0
1718 CALL p256MulInternal<>(SB)
1719
1720 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
1721 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
1722
1723 // if (sel == 0) {
1724 // copy(P3.x[:], X1)
1725 // copy(P3.y[:], Y1)
1726 // copy(P3.z[:], Z1)
1727 // }
1728
1729 VL 16(P1ptr), X1H
1730 VPDI $0x4, X1H, X1H, X1H
1731 VL 0(P1ptr), X1L
1732 VPDI $0x4, X1L, X1L, X1L
1733
1734 // Y1 already loaded, left over from addition
1735 VL 80(P1ptr), Z1H
1736 VPDI $0x4, Z1H, Z1H, Z1H
1737 VL 64(P1ptr), Z1L
1738 VPDI $0x4, Z1L, Z1L, Z1L
1739
1740 VLREPG sel+32(FP), SEL1
1741 VZERO ZER
1742 VCEQG SEL1, ZER, SEL1
1743
1744 VSEL X1L, X3L, SEL1, X3L
1745 VSEL X1H, X3H, SEL1, X3H
1746 VSEL Y1L, Y3L, SEL1, Y3L
1747 VSEL Y1H, Y3H, SEL1, Y3H
1748 VSEL Z1L, Z3L, SEL1, Z3L
1749 VSEL Z1H, Z3H, SEL1, Z3H
1750
1751 // if (zero == 0) {
1752 // copy(P3.x[:], X2)
1753 // copy(P3.y[:], Y2)
1754 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1755 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
1756 // }
1757 VL 16(P2ptr), X2H
1758 VPDI $0x4, X2H, X2H, X2H
1759 VL 0(P2ptr), X2L
1760 VPDI $0x4, X2L, X2L, X2L
1761
1762 // Y2 already loaded
1763 VL 128(CPOOL), Z2H
1764 VL 144(CPOOL), Z2L
1765
1766 VLREPG zero+40(FP), SEL1
1767 VZERO ZER
1768 VCEQG SEL1, ZER, SEL1
1769
1770 VSEL X2L, X3L, SEL1, X3L
1771 VSEL X2H, X3H, SEL1, X3H
1772 VSEL Y2L, Y3L, SEL1, Y3L
1773 VSEL Y2H, Y3H, SEL1, Y3H
1774 VSEL Z2L, Z3L, SEL1, Z3L
1775 VSEL Z2H, Z3H, SEL1, Z3H
1776
1777 // All done, store out the result!!!
1778 VPDI $0x4, X3H, X3H, X3H
1779 VST X3H, 16(P3ptr)
1780 VPDI $0x4, X3L, X3L, X3L
1781 VST X3L, 0(P3ptr)
1782 VPDI $0x4, Y3H, Y3H, Y3H
1783 VST Y3H, 48(P3ptr)
1784 VPDI $0x4, Y3L, Y3L, Y3L
1785 VST Y3L, 32(P3ptr)
1786 VPDI $0x4, Z3H, Z3H, Z3H
1787 VST Z3H, 80(P3ptr)
1788 VPDI $0x4, Z3L, Z3L, Z3L
1789 VST Z3L, 64(P3ptr)
1790
1791 RET
1792
1793 #undef P3ptr
1794 #undef P1ptr
1795 #undef P2ptr
1796 #undef CPOOL
1797
1798 #undef Y2L
1799 #undef Y2H
1800 #undef T1L
1801 #undef T1H
1802 #undef T2L
1803 #undef T2H
1804 #undef T3L
1805 #undef T3H
1806 #undef T4L
1807 #undef T4H
1808
1809 #undef TT0
1810 #undef TT1
1811 #undef T2
1812
1813 #undef X0
1814 #undef X1
1815 #undef Y0
1816 #undef Y1
1817 #undef T0
1818 #undef T1
1819
1820 #undef PL
1821 #undef PH
1822
1823 #undef X1L
1824 #undef X1H
1825 #undef Y1L
1826 #undef Y1H
1827 #undef Z1L
1828 #undef Z1H
1829 #undef X2L
1830 #undef X2H
1831 #undef Z2L
1832 #undef Z2H
1833 #undef X3L
1834 #undef X3H
1835 #undef Y3L
1836 #undef Y3H
1837 #undef Z3L
1838 #undef Z3H
1839
1840 #undef ZER
1841 #undef SEL1
1842 #undef CAR1
1843 #undef CAR2
1844
1845 // func p256PointDoubleAsm(res, in *P256Point)
1846 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1847 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1848 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
1849 #define P3ptr R1
1850 #define P1ptr R2
1851 #define CPOOL R4
1852
1853 // Temporaries in REGs
1854 #define X3L V15
1855 #define X3H V16
1856 #define Y3L V17
1857 #define Y3H V18
1858 #define T1L V19
1859 #define T1H V20
1860 #define T2L V21
1861 #define T2H V22
1862 #define T3L V23
1863 #define T3H V24
1864
1865 #define X1L V6
1866 #define X1H V7
1867 #define Y1L V8
1868 #define Y1H V9
1869 #define Z1L V10
1870 #define Z1H V11
1871
1872 // Temps for Sub and Add
1873 #define TT0 V11
1874 #define TT1 V12
1875 #define T2 V13
1876
1877 // p256MulAsm Parameters
1878 #define X0 V0
1879 #define X1 V1
1880 #define Y0 V2
1881 #define Y1 V3
1882 #define T0 V4
1883 #define T1 V5
1884
1885 #define PL V30
1886 #define PH V31
1887
1888 #define Z3L V23
1889 #define Z3H V24
1890
1891 #define ZER V26
1892 #define SEL1 V27
1893 #define CAR1 V28
1894 #define CAR2 V29
1895 /*
1896 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1897 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1898 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1899 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1900 * B = 2Y₁
1901 * Z₃ = B×Z₁
1902 * C = B²
1903 * D = C×X₁
1904 * X₃ = A²-2D
1905 * Y₃ = (D-X₃)×A-C²/2
1906 *
1907 * Three-operand formula:
1908 * T1 = Z1²
1909 * T2 = X1-T1
1910 * T1 = X1+T1
1911 * T2 = T2*T1
1912 * T2 = 3*T2
1913 * Y3 = 2*Y1
1914 * Z3 = Y3*Z1
1915 * Y3 = Y3²
1916 * T3 = Y3*X1
1917 * Y3 = Y3²
1918 * Y3 = half*Y3
1919 * X3 = T2²
1920 * T1 = 2*T3
1921 * X3 = X3-T1
1922 * T1 = T3-X3
1923 * T1 = T1*T2
1924 * Y3 = T1-Y3
1925 */
1926
1927 TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
1928 MOVD res+0(FP), P3ptr
1929 MOVD in+8(FP), P1ptr
1930
1931 MOVD $p256mul<>+0x00(SB), CPOOL
1932 VL 16(CPOOL), PL
1933 VL 0(CPOOL), PH
1934
1935 // X=Z1; Y=Z1; MUL; T- // T1 = Z1²
1936 VL 80(P1ptr), X1 // Z1H
1937 VPDI $0x4, X1, X1, X1
1938 VL 64(P1ptr), X0 // Z1L
1939 VPDI $0x4, X0, X0, X0
1940 VLR X0, Y0
1941 VLR X1, Y1
1942 CALL p256SqrInternal<>(SB)
1943
1944 // SUB(X<X1-T) // T2 = X1-T1
1945 VL 16(P1ptr), X1H
1946 VPDI $0x4, X1H, X1H, X1H
1947 VL 0(P1ptr), X1L
1948 VPDI $0x4, X1L, X1L, X1L
1949 p256SubInternal(X1,X0,X1H,X1L,T1,T0)
1950
1951 // ADD(Y<X1+T) // T1 = X1+T1
1952 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
1953
1954 // X- ; Y- ; MUL; T- // T2 = T2*T1
1955 CALL p256MulInternal<>(SB)
1956
1957 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
1958 p256AddInternal(T2H,T2L,T1,T0,T1,T0)
1959 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
1960
1961 // ADD(X<Y1+Y1) // Y3 = 2*Y1
1962 VL 48(P1ptr), Y1H
1963 VPDI $0x4, Y1H, Y1H, Y1H
1964 VL 32(P1ptr), Y1L
1965 VPDI $0x4, Y1L, Y1L, Y1L
1966 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
1967
1968 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
1969 VL 80(P1ptr), Y1 // Z1H
1970 VPDI $0x4, Y1, Y1, Y1
1971 VL 64(P1ptr), Y0 // Z1L
1972 VPDI $0x4, Y0, Y0, Y0
1973 CALL p256MulInternal<>(SB)
1974 VPDI $0x4, T1, T1, TT1
1975 VST TT1, 80(P3ptr)
1976 VPDI $0x4, T0, T0, TT0
1977 VST TT0, 64(P3ptr)
1978
1979 // X- ; Y=X ; MUL; T- // Y3 = Y3²
1980 VLR X0, Y0
1981 VLR X1, Y1
1982 CALL p256SqrInternal<>(SB)
1983
1984 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
1985 VLR T0, X0
1986 VLR T1, X1
1987 VL 16(P1ptr), Y1
1988 VPDI $0x4, Y1, Y1, Y1
1989 VL 0(P1ptr), Y0
1990 VPDI $0x4, Y0, Y0, Y0
1991 CALL p256MulInternal<>(SB)
1992 VLR T0, T3L
1993 VLR T1, T3H
1994
1995 // X- ; Y=X ; MUL; T- // Y3 = Y3²
1996 VLR X0, Y0
1997 VLR X1, Y1
1998 CALL p256SqrInternal<>(SB)
1999
2000 // HAL(Y3<T) // Y3 = half*Y3
2001 p256HalfInternal(Y3H,Y3L, T1,T0)
2002
2003 // X=T2; Y=T2; MUL; T- // X3 = T2²
2004 VLR T2L, X0
2005 VLR T2H, X1
2006 VLR T2L, Y0
2007 VLR T2H, Y1
2008 CALL p256SqrInternal<>(SB)
2009
2010 // ADD(T1<T3+T3) // T1 = 2*T3
2011 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
2012
2013 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
2014 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
2015 VPDI $0x4, X3H, X3H, TT1
2016 VST TT1, 16(P3ptr)
2017 VPDI $0x4, X3L, X3L, TT0
2018 VST TT0, 0(P3ptr)
2019
2020 // SUB(X<T3-X3) // T1 = T3-X3
2021 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
2022
2023 // X- ; Y- ; MUL; T- // T1 = T1*T2
2024 CALL p256MulInternal<>(SB)
2025
2026 // SUB(Y3<T-Y3) // Y3 = T1-Y3
2027 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
2028
2029 VPDI $0x4, Y3H, Y3H, Y3H
2030 VST Y3H, 48(P3ptr)
2031 VPDI $0x4, Y3L, Y3L, Y3L
2032 VST Y3L, 32(P3ptr)
2033 RET
2034
2035 #undef P3ptr
2036 #undef P1ptr
2037 #undef CPOOL
2038 #undef X3L
2039 #undef X3H
2040 #undef Y3L
2041 #undef Y3H
2042 #undef T1L
2043 #undef T1H
2044 #undef T2L
2045 #undef T2H
2046 #undef T3L
2047 #undef T3H
2048 #undef X1L
2049 #undef X1H
2050 #undef Y1L
2051 #undef Y1H
2052 #undef Z1L
2053 #undef Z1H
2054 #undef TT0
2055 #undef TT1
2056 #undef T2
2057 #undef X0
2058 #undef X1
2059 #undef Y0
2060 #undef Y1
2061 #undef T0
2062 #undef T1
2063 #undef PL
2064 #undef PH
2065 #undef Z3L
2066 #undef Z3H
2067 #undef ZER
2068 #undef SEL1
2069 #undef CAR1
2070 #undef CAR2
2071
2072 // func p256PointAddAsm(res, in1, in2 *P256Point) int
2073 #define P3ptr R1
2074 #define P1ptr R2
2075 #define P2ptr R3
2076 #define CPOOL R4
2077 #define ISZERO R5
2078 #define TRUE R6
2079
2080 // Temporaries in REGs
2081 #define T1L V16
2082 #define T1H V17
2083 #define T2L V18
2084 #define T2H V19
2085 #define U1L V20
2086 #define U1H V21
2087 #define S1L V22
2088 #define S1H V23
2089 #define HL V24
2090 #define HH V25
2091 #define RL V26
2092 #define RH V27
2093
2094 // Temps for Sub and Add
2095 #define ZER V6
2096 #define SEL1 V7
2097 #define CAR1 V8
2098 #define CAR2 V9
2099 #define TT0 V11
2100 #define TT1 V12
2101 #define T2 V13
2102
2103 // p256MulAsm Parameters
2104 #define X0 V0
2105 #define X1 V1
2106 #define Y0 V2
2107 #define Y1 V3
2108 #define T0 V4
2109 #define T1 V5
2110
2111 #define PL V30
2112 #define PH V31
2113 /*
2114 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
2115 *
2116 * A = X₁×Z₂²
2117 * B = Y₁×Z₂³
2118 * C = X₂×Z₁²-A
2119 * D = Y₂×Z₁³-B
2120 * X₃ = D² - 2A×C² - C³
2121 * Y₃ = D×(A×C² - X₃) - B×C³
2122 * Z₃ = Z₁×Z₂×C
2123 *
2124 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
2125 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
2126 *
2127 * T1 = Z1*Z1
2128 * T2 = Z2*Z2
2129 * U1 = X1*T2
2130 * H = X2*T1
2131 * H = H-U1
2132 * Z3 = Z1*Z2
2133 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2134 *
2135 * S1 = Z2*T2
2136 * S1 = Y1*S1
2137 * R = Z1*T1
2138 * R = Y2*R
2139 * R = R-S1
2140 *
2141 * T1 = H*H
2142 * T2 = H*T1
2143 * U1 = U1*T1
2144 *
2145 * X3 = R*R
2146 * X3 = X3-T2
2147 * T1 = 2*U1
2148 * X3 = X3-T1 << store-out X3 result reg
2149 *
2150 * T2 = S1*T2
2151 * Y3 = U1-X3
2152 * Y3 = R*Y3
2153 * Y3 = Y3-T2 << store-out Y3 result reg
2154
2155 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2156 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2157 // X=X2; Y- ; MUL; H=T // H = X2*T1
2158 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2159 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2160 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2161 // SUB(H<H-T) // H = H-U1
2162 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2163 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
2164 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2165 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2166 // SUB(R<T-S1) // R = R-S1
2167 // X=H ; Y=H ; MUL; T- // T1 = H*H
2168 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2169 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2170 // X=R ; Y=R ; MUL; T- // X3 = R*R
2171 // SUB(T<T-T2) // X3 = X3-T2
2172 // ADD(X<U1+U1) // T1 = 2*U1
2173 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2174 // SUB(Y<U1-T) // Y3 = U1-X3
2175 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2176 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2177 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2178 */
2179 TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
2180 MOVD res+0(FP), P3ptr
2181 MOVD in1+8(FP), P1ptr
2182 MOVD in2+16(FP), P2ptr
2183
2184 MOVD $p256mul<>+0x00(SB), CPOOL
2185 VL 16(CPOOL), PL
2186 VL 0(CPOOL), PH
2187
2188 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
2189 VL 80(P1ptr), X1 // Z1H
2190 VPDI $0x4, X1, X1, X1
2191 VL 64(P1ptr), X0 // Z1L
2192 VPDI $0x4, X0, X0, X0
2193 VLR X0, Y0
2194 VLR X1, Y1
2195 CALL p256SqrInternal<>(SB)
2196
2197 // X- ; Y=T ; MUL; R=T // R = Z1*T1
2198 VLR T0, Y0
2199 VLR T1, Y1
2200 CALL p256MulInternal<>(SB)
2201 VLR T0, RL
2202 VLR T1, RH
2203
2204 // X=X2; Y- ; MUL; H=T // H = X2*T1
2205 VL 16(P2ptr), X1 // X2H
2206 VPDI $0x4, X1, X1, X1
2207 VL 0(P2ptr), X0 // X2L
2208 VPDI $0x4, X0, X0, X0
2209 CALL p256MulInternal<>(SB)
2210 VLR T0, HL
2211 VLR T1, HH
2212
2213 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
2214 VL 80(P2ptr), X1 // Z2H
2215 VPDI $0x4, X1, X1, X1
2216 VL 64(P2ptr), X0 // Z2L
2217 VPDI $0x4, X0, X0, X0
2218 VLR X0, Y0
2219 VLR X1, Y1
2220 CALL p256SqrInternal<>(SB)
2221
2222 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
2223 VLR T0, Y0
2224 VLR T1, Y1
2225 CALL p256MulInternal<>(SB)
2226 VLR T0, S1L
2227 VLR T1, S1H
2228
2229 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
2230 VL 16(P1ptr), X1 // X1H
2231 VPDI $0x4, X1, X1, X1
2232 VL 0(P1ptr), X0 // X1L
2233 VPDI $0x4, X0, X0, X0
2234 CALL p256MulInternal<>(SB)
2235 VLR T0, U1L
2236 VLR T1, U1H
2237
2238 // SUB(H<H-T) // H = H-U1
2239 p256SubInternal(HH,HL,HH,HL,T1,T0)
2240
2241 // if H == 0 or H^P == 0 then ret=1 else ret=0
2242 // clobbers T1H and T1L
2243 MOVD $0, ISZERO
2244 MOVD $1, TRUE
2245 VZERO ZER
2246 VO HL, HH, T1H
2247 VCEQGS ZER, T1H, T1H
2248 MOVDEQ TRUE, ISZERO
2249 VX HL, PL, T1L
2250 VX HH, PH, T1H
2251 VO T1L, T1H, T1H
2252 VCEQGS ZER, T1H, T1H
2253 MOVDEQ TRUE, ISZERO
2254 MOVD ISZERO, ret+24(FP)
2255
2256 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
2257 VL 80(P1ptr), X1 // Z1H
2258 VPDI $0x4, X1, X1, X1
2259 VL 64(P1ptr), X0 // Z1L
2260 VPDI $0x4, X0, X0, X0
2261 VL 80(P2ptr), Y1 // Z2H
2262 VPDI $0x4, Y1, Y1, Y1
2263 VL 64(P2ptr), Y0 // Z2L
2264 VPDI $0x4, Y0, Y0, Y0
2265 CALL p256MulInternal<>(SB)
2266
2267 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
2268 VLR T0, X0
2269 VLR T1, X1
2270 VLR HL, Y0
2271 VLR HH, Y1
2272 CALL p256MulInternal<>(SB)
2273 VPDI $0x4, T1, T1, TT1
2274 VST TT1, 80(P3ptr)
2275 VPDI $0x4, T0, T0, TT0
2276 VST TT0, 64(P3ptr)
2277
2278 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
2279 VL 48(P1ptr), X1
2280 VPDI $0x4, X1, X1, X1
2281 VL 32(P1ptr), X0
2282 VPDI $0x4, X0, X0, X0
2283 VLR S1L, Y0
2284 VLR S1H, Y1
2285 CALL p256MulInternal<>(SB)
2286 VLR T0, S1L
2287 VLR T1, S1H
2288
2289 // X=Y2; Y=R ; MUL; T- // R = Y2*R
2290 VL 48(P2ptr), X1
2291 VPDI $0x4, X1, X1, X1
2292 VL 32(P2ptr), X0
2293 VPDI $0x4, X0, X0, X0
2294 VLR RL, Y0
2295 VLR RH, Y1
2296 CALL p256MulInternal<>(SB)
2297
2298 // SUB(R<T-S1) // R = T-S1
2299 p256SubInternal(RH,RL,T1,T0,S1H,S1L)
2300
2301 // if R == 0 or R^P == 0 then ret=ret else ret=0
2302 // clobbers T1H and T1L
2303 MOVD $0, ISZERO
2304 MOVD $1, TRUE
2305 VZERO ZER
2306 VO RL, RH, T1H
2307 VCEQGS ZER, T1H, T1H
2308 MOVDEQ TRUE, ISZERO
2309 VX RL, PL, T1L
2310 VX RH, PH, T1H
2311 VO T1L, T1H, T1H
2312 VCEQGS ZER, T1H, T1H
2313 MOVDEQ TRUE, ISZERO
2314 AND ret+24(FP), ISZERO
2315 MOVD ISZERO, ret+24(FP)
2316
2317 // X=H ; Y=H ; MUL; T- // T1 = H*H
2318 VLR HL, X0
2319 VLR HH, X1
2320 VLR HL, Y0
2321 VLR HH, Y1
2322 CALL p256SqrInternal<>(SB)
2323
2324 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
2325 VLR T0, Y0
2326 VLR T1, Y1
2327 CALL p256MulInternal<>(SB)
2328 VLR T0, T2L
2329 VLR T1, T2H
2330
2331 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
2332 VLR U1L, X0
2333 VLR U1H, X1
2334 CALL p256MulInternal<>(SB)
2335 VLR T0, U1L
2336 VLR T1, U1H
2337
2338 // X=R ; Y=R ; MUL; T- // X3 = R*R
2339 VLR RL, X0
2340 VLR RH, X1
2341 VLR RL, Y0
2342 VLR RH, Y1
2343 CALL p256SqrInternal<>(SB)
2344
2345 // SUB(T<T-T2) // X3 = X3-T2
2346 p256SubInternal(T1,T0,T1,T0,T2H,T2L)
2347
2348 // ADD(X<U1+U1) // T1 = 2*U1
2349 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
2350
2351 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
2352 p256SubInternal(T1,T0,T1,T0,X1,X0)
2353 VPDI $0x4, T1, T1, TT1
2354 VST TT1, 16(P3ptr)
2355 VPDI $0x4, T0, T0, TT0
2356 VST TT0, 0(P3ptr)
2357
2358 // SUB(Y<U1-T) // Y3 = U1-X3
2359 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
2360
2361 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
2362 VLR RL, X0
2363 VLR RH, X1
2364 CALL p256MulInternal<>(SB)
2365 VLR T0, U1L
2366 VLR T1, U1H
2367
2368 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
2369 VLR S1L, X0
2370 VLR S1H, X1
2371 VLR T2L, Y0
2372 VLR T2H, Y1
2373 CALL p256MulInternal<>(SB)
2374
2375 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
2376 p256SubInternal(T1,T0,U1H,U1L,T1,T0)
2377 VPDI $0x4, T1, T1, T1
2378 VST T1, 48(P3ptr)
2379 VPDI $0x4, T0, T0, T0
2380 VST T0, 32(P3ptr)
2381
2382 RET
2383
View as plain text