1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 #include "textflag.h"
8 #include "go_asm.h"
9
// Read-only constant pools.
// p256<>:    the prime P256 (high 128 bits at +0x00, low 128 bits at
//            +0x10), VPERM byte-selection masks used by p256FromMont,
//            and an LE<->BE byte-permute mask.
// p256mul<>: P256 again, the VPERM masks used by the Montgomery
//            reduction in p256MulInternal, and (1*2^256) mod P256
//            (one in Montgomery form).
DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL  0 d1 d0  0
DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL  0 d1 d0  0
DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
// RODATA (from textflag.h, included above) replaces the previous bare
// flag value 8 — identical bits, but self-documenting.
GLOBL p256<>(SB), RODATA, $96
GLOBL p256mul<>(SB), RODATA, $160
44
45 // ---------------------------------------
46 // iff cond == 1 val <- -val
47 // func p256NegCond(val *p256Element, cond int)
48 #define P1ptr R1
49 #define CPOOL R4
50
51 #define Y1L V0
52 #define Y1H V1
53 #define T1L V2
54 #define T1H V3
55
56 #define PL V30
57 #define PH V31
58
59 #define ZER V4
60 #define SEL1 V5
61 #define CAR1 V6
// p256NegCond: if cond != 0, val <- P256 - val (negation mod P); else
// val is left unchanged. Constant time: the negated value is always
// computed and the result is chosen with a bitwise VSEL, so neither
// branches nor memory accesses depend on cond.
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVD val+0(FP), P1ptr

	// PL/PH = low/high 128 bits of P256 from the constant pool.
	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL
	VL   0(CPOOL), PH

	// Load val. VPDI $0x4 swaps the two 64-bit doublewords of each
	// 128-bit half (memory <-> register limb-order fixup).
	VL   16(P1ptr), Y1H
	VPDI $0x4, Y1H, Y1H, Y1H
	VL   0(P1ptr), Y1L
	VPDI $0x4, Y1L, Y1L, Y1L

	// SEL1 = all-ones if cond == 0, all-zeros otherwise.
	VLREPG cond+8(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// T1H||T1L = P - val (256-bit subtraction with borrow chain).
	VSCBIQ Y1L, PL, CAR1
	VSQ    Y1L, PL, T1L
	VSBIQ  PH, Y1H, CAR1, T1H

	// Keep val when SEL1 is all-ones (cond == 0), else take P - val.
	VSEL Y1L, T1L, SEL1, Y1L
	VSEL Y1H, T1H, SEL1, Y1H

	// Swap the doublewords back and store the result in place.
	VPDI $0x4, Y1H, Y1H, Y1H
	VST  Y1H, 16(P1ptr)
	VPDI $0x4, Y1L, Y1L, Y1L
	VST  Y1L, 0(P1ptr)
	RET
90
91 #undef P1ptr
92 #undef CPOOL
93 #undef Y1L
94 #undef Y1H
95 #undef T1L
96 #undef T1H
97 #undef PL
98 #undef PH
99 #undef ZER
100 #undef SEL1
101 #undef CAR1
102
103 // ---------------------------------------
104 // if cond == 0 res <- b; else res <- a
105 // func p256MovCond(res, a, b *P256Point, cond int)
106 #define P3ptr R1
107 #define P1ptr R2
108 #define P2ptr R3
109
110 #define X1L V0
111 #define X1H V1
112 #define Y1L V2
113 #define Y1H V3
114 #define Z1L V4
115 #define Z1H V5
116 #define X2L V6
117 #define X2H V7
118 #define Y2L V8
119 #define Y2H V9
120 #define Z2L V10
121 #define Z2H V11
122
123 #define ZER V18
124 #define SEL1 V19
// p256MovCond: constant-time point move.
// res <- b when cond == 0, res <- a otherwise. All 96 bytes of both
// inputs are always read and blended with VSEL, so the memory access
// pattern and instruction flow are independent of cond. No VPDI limb
// fixup is needed: input and output share the same in-memory layout.
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr

	// SEL1 = all-ones if cond == 0, all-zeros otherwise.
	VLREPG cond+24(FP), SEL1
	VZERO  ZER
	VCEQG  SEL1, ZER, SEL1

	// Load all three coordinates of a...
	VL 0(P1ptr), X1H
	VL 16(P1ptr), X1L
	VL 32(P1ptr), Y1H
	VL 48(P1ptr), Y1L
	VL 64(P1ptr), Z1H
	VL 80(P1ptr), Z1L

	// ...and of b.
	VL 0(P2ptr), X2H
	VL 16(P2ptr), X2L
	VL 32(P2ptr), Y2H
	VL 48(P2ptr), Y2L
	VL 64(P2ptr), Z2H
	VL 80(P2ptr), Z2L

	// Bitwise select: b where SEL1 is ones (cond == 0), else a.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)

	RET
162
163 #undef P3ptr
164 #undef P1ptr
165 #undef P2ptr
166 #undef X1L
167 #undef X1H
168 #undef Y1L
169 #undef Y1H
170 #undef Z1L
171 #undef Z1H
172 #undef X2L
173 #undef X2H
174 #undef Y2L
175 #undef Y2H
176 #undef Z2L
177 #undef Z2H
178 #undef ZER
179 #undef SEL1
180
181 // ---------------------------------------
182 // Constant time table access
183 // Indexed from 1 to 15, with -1 offset
184 // (index 0 is implicitly point at infinity)
185 // func p256Select(res *P256Point, table *p256Table, idx int)
186 #define P3ptr R1
187 #define P1ptr R2
188 #define COUNT R4
189
190 #define X1L V0
191 #define X1H V1
192 #define Y1L V2
193 #define Y1H V3
194 #define Z1L V4
195 #define Z1H V5
196 #define X2L V6
197 #define X2H V7
198 #define Y2L V8
199 #define Y2H V9
200 #define Z2L V10
201 #define Z2H V11
202
203 #define ONE V18
204 #define IDX V19
205 #define SEL1 V20
206 #define SEL2 V21
// p256Select: constant-time table lookup of a projective point.
// Every table slot is read; the one whose 1-based position equals idx
// is blended into the accumulator with VSEL. idx == 0 matches no slot
// and leaves the all-zero accumulator (point at infinity) as result.
// NOTE(review): the loop scans 16 slots (COUNT 1..16, 96-byte stride)
// although the header comment above mentions indices 1..15 — confirm
// the table length against the Go-side declaration.
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	// Replicate the low byte of idx (offset 16+7: last byte of the
	// 64-bit argument on this big-endian target) across IDX.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // per-byte running slot counter, starts at 1
	MOVD   $1, COUNT

	// Accumulator starts at zero (implicit point at infinity).
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L
	VZERO Z1H
	VZERO Z1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L
	VL 64(P1ptr), Z2H
	VL 80(P1ptr), Z2L

	// SEL1 = all-ones iff the running counter equals idx.
	VCEQG SEL2, IDX, SEL1

	// Blend the matching entry into the accumulator.
	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H
	VSEL Z2L, Z1L, SEL1, Z1L
	VSEL Z2H, Z1H, SEL1, Z1H

	VAB  SEL2, ONE, SEL2 // bump the vector counter
	ADDW $1, COUNT
	ADD  $96, P1ptr      // next entry (6 x 16 bytes per point)
	CMPW COUNT, $17
	BLT  loop_select

	VST X1H, 0(P3ptr)
	VST X1L, 16(P3ptr)
	VST Y1H, 32(P3ptr)
	VST Y1L, 48(P3ptr)
	VST Z1H, 64(P3ptr)
	VST Z1L, 80(P3ptr)
	RET
252
253 #undef P3ptr
254 #undef P1ptr
255 #undef COUNT
256 #undef X1L
257 #undef X1H
258 #undef Y1L
259 #undef Y1H
260 #undef Z1L
261 #undef Z1H
262 #undef X2L
263 #undef X2H
264 #undef Y2L
265 #undef Y2H
266 #undef Z2L
267 #undef Z2H
268 #undef ONE
269 #undef IDX
270 #undef SEL1
271 #undef SEL2
272
273 // ---------------------------------------
274
275 // func p256FromMont(res, in *p256Element)
276 #define res_ptr R1
277 #define x_ptr R2
278 #define CPOOL R4
279
280 #define T0 V0
281 #define T1 V1
282 #define T2 V2
283 #define TT0 V3
284 #define TT1 V4
285
286 #define ZER V6
287 #define SEL1 V7
288 #define SEL2 V8
289 #define CAR1 V9
290 #define CAR2 V10
291 #define RED1 V11
292 #define RED2 V12
293 #define PL V13
294 #define PH V14
295
// p256FromMont: res = in * 2^-256 mod P256 — converts a field element
// out of the Montgomery domain. Four identical reduction rounds each
// build a multiple of P from the low limb (via the VPERM masks), add
// it so the low 64 bits cancel, and shift the accumulator right one
// doubleword (VSLDB $8). A final constant-time conditional subtraction
// of P brings the result into [0, P).
TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	VZERO T2
	VZERO ZER
	// PL/PH = low/high halves of P256; SEL1/SEL2 = VPERM masks that
	// assemble the per-round reduction terms from the low limb.
	MOVD $p256<>+0x00(SB), CPOOL
	VL   16(CPOOL), PL
	VL   0(CPOOL), PH
	VL   48(CPOOL), SEL2
	VL   64(CPOOL), SEL1

	// Load the operand; VPDI $0x4 swaps doublewords (limb-order fixup).
	VL   (0*16)(x_ptr), T0
	VPDI $0x4, T0, T0, T0
	VL   (1*16)(x_ptr), T1
	VPDI $0x4, T1, T1, T1

	// First round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0 // shift accumulator right one doubleword
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Second round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Third round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// Last round
	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
	VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow

	VSLDB $8, T1, T0, T0
	VSLDB $8, T2, T1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------
	// Final conditional subtraction: compute T - P; the borrow chain
	// leaves T2 as the select mask for reduced vs. unreduced value.

	VSCBIQ  PL, T0, CAR1
	VSQ     PL, T0, TT0
	VSBCBIQ T1, PH, CAR1, CAR2
	VSBIQ   T1, PH, CAR1, TT1
	VSBIQ   T2, ZER, CAR2, T2

	// what output to use, TT1||TT0 or T1||T0?
	VSEL T0, TT0, T2, T0
	VSEL T1, TT1, T2, T1

	// Swap doublewords back to memory order and store.
	VPDI $0x4, T0, T0, TT0
	VST  TT0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, TT1
	VST  TT1, (1*16)(res_ptr)
	RET
386
387 #undef res_ptr
388 #undef x_ptr
389 #undef CPOOL
390 #undef T0
391 #undef T1
392 #undef T2
393 #undef TT0
394 #undef TT1
395 #undef ZER
396 #undef SEL1
397 #undef SEL2
398 #undef CAR1
399 #undef CAR2
400 #undef RED1
401 #undef RED2
402 #undef PL
403 #undef PH
404
405 // Constant time table access
406 // Indexed from 1 to 15, with -1 offset
407 // (index 0 is implicitly point at infinity)
408 // func p256SelectBase(point *p256Point, table []p256Point, idx int)
409 // new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
410
411 #define P3ptr R1
412 #define P1ptr R2
413 #define COUNT R4
414 #define CPOOL R5
415
416 #define X1L V0
417 #define X1H V1
418 #define Y1L V2
419 #define Y1H V3
420 #define Z1L V4
421 #define Z1H V5
422 #define X2L V6
423 #define X2H V7
424 #define Y2L V8
425 #define Y2H V9
426 #define Z2L V10
427 #define Z2H V11
428 #define LE2BE V12
429
430 #define ONE V18
431 #define IDX V19
432 #define SEL1 V20
433 #define SEL2 V21
434
// p256SelectAffine: constant-time table lookup of an affine point
// (x, y only — 64 bytes per entry). Scans all 32 slots (COUNT 1..32),
// blending in the slot whose 1-based position equals idx; idx == 0
// matches nothing and yields the all-zero accumulator.
// NOTE(review): LE2BE is loaded from the pool but never used in this
// body — it appears to be leftover; confirm before removing.
TEXT ·p256SelectAffine(SB), NOSPLIT, $0
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $p256<>+0x00(SB), CPOOL
	// Replicate the low byte of idx (offset 16+7: last byte of the
	// 64-bit argument on this big-endian target) across IDX.
	VLREPB idx+(16+7)(FP), IDX
	VREPIB $1, ONE
	VREPIB $1, SEL2 // per-byte running slot counter, starts at 1
	MOVD   $1, COUNT
	VL     80(CPOOL), LE2BE

	// Accumulator starts at zero.
	VZERO X1H
	VZERO X1L
	VZERO Y1H
	VZERO Y1L

loop_select:
	VL 0(P1ptr), X2H
	VL 16(P1ptr), X2L
	VL 32(P1ptr), Y2H
	VL 48(P1ptr), Y2L

	// SEL1 = all-ones iff the running counter equals idx.
	VCEQG SEL2, IDX, SEL1

	VSEL X2L, X1L, SEL1, X1L
	VSEL X2H, X1H, SEL1, X1H
	VSEL Y2L, Y1L, SEL1, Y1L
	VSEL Y2H, Y1H, SEL1, Y1H

	VAB  SEL2, ONE, SEL2 // bump the vector counter
	ADDW $1, COUNT
	ADD  $64, P1ptr      // next entry (4 x 16 bytes per affine point)
	CMPW COUNT, $33      // len(p256AffineTable) + 1
	BLT  loop_select
	VST  X1H, 0(P3ptr)
	VST  X1L, 16(P3ptr)
	VST  Y1H, 32(P3ptr)
	VST  Y1L, 48(P3ptr)

	RET
474
475 #undef P3ptr
476 #undef P1ptr
477 #undef COUNT
478 #undef X1L
479 #undef X1H
480 #undef Y1L
481 #undef Y1H
482 #undef Z1L
483 #undef Z1H
484 #undef X2L
485 #undef X2H
486 #undef Y2L
487 #undef Y2H
488 #undef Z2L
489 #undef Z2H
490 #undef ONE
491 #undef IDX
492 #undef SEL1
493 #undef SEL2
494 #undef CPOOL
495
496 // ---------------------------------------
497 // p256MulInternal
498 // V0-V3,V30,V31 - Not Modified
499 // V4-V15 - Volatile
500
501 #define CPOOL R4
502
503 // Parameters
504 #define X0 V0 // Not modified
505 #define X1 V1 // Not modified
506 #define Y0 V2 // Not modified
507 #define Y1 V3 // Not modified
508 #define T0 V4
509 #define T1 V5
510 #define P0 V30 // Not modified
511 #define P1 V31 // Not modified
512
513 // Temporaries
514 #define YDIG V6 // Overloaded with CAR2, ZER
515 #define ADD1H V7 // Overloaded with ADD3H
516 #define ADD2H V8 // Overloaded with ADD4H
517 #define ADD3 V9 // Overloaded with SEL2,SEL5
518 #define ADD4 V10 // Overloaded with SEL3,SEL6
519 #define RED1 V11 // Overloaded with CAR2
520 #define RED2 V12
521 #define RED3 V13 // Overloaded with SEL1
522 #define T2 V14
523 // Overloaded temporaries
524 #define ADD1 V4 // Overloaded with T0
525 #define ADD2 V5 // Overloaded with T1
526 #define ADD3H V7 // Overloaded with ADD1H
527 #define ADD4H V8 // Overloaded with ADD2H
528 #define ZER V6 // Overloaded with YDIG, CAR2
529 #define CAR1 V6 // Overloaded with YDIG, ZER
530 #define CAR2 V11 // Overloaded with RED1
531 // Constant Selects
532 #define SEL1 V13 // Overloaded with RED3
533 #define SEL2 V9 // Overloaded with ADD3,SEL5
534 #define SEL3 V10 // Overloaded with ADD4,SEL6
535 #define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
536 #define SEL5 V9 // Overloaded with ADD3,SEL2
537 #define SEL6 V10 // Overloaded with ADD4,SEL3
538
539 /* *
540 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
541 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
542 * With you, SIMD be...
543 *
544 * +--------+--------+
545 * +--------| RED2 | RED1 |
546 * | +--------+--------+
547 * | ---+--------+--------+
548 * | +---- T2| T1 | T0 |--+
549 * | | ---+--------+--------+ |
550 * | | |
551 * | | ======================= |
552 * | | |
553 * | | +--------+--------+<-+
554 * | +-------| ADD2 | ADD1 |--|-----+
555 * | | +--------+--------+ | |
556 * | | +--------+--------+<---+ |
557 * | | | ADD2H | ADD1H |--+ |
558 * | | +--------+--------+ | |
559 * | | +--------+--------+<-+ |
560 * | | | ADD4 | ADD3 |--|-+ |
561 * | | +--------+--------+ | | |
562 * | | +--------+--------+<---+ | |
563 * | | | ADD4H | ADD3H |------|-+ |(+vzero)
564 * | | +--------+--------+ | | V
565 * | | ------------------------ | | +--------+
566 * | | | | | RED3 | [d0 0 0 d0]
567 * | | | | +--------+
568 * | +---->+--------+--------+ | | |
569 * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
570 * | +--------+--------+ | | |
571 * +---->---+--------+--------+ | | |
572 * T2| T1 | T0 |----+ | |
573 * ---+--------+--------+ | | |
574 * ---+--------+--------+<---+ | |
575 * +--- T2| T1 | T0 |----------+
576 * | ---+--------+--------+ | |
577 * | +--------+--------+<-------------+
578 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
579 * | +--------+--------+ | | |
580 * | +--------+<----------------------+
581 * | | RED3 |--------------+ | [0 0 d1 d0]
582 * | +--------+ | |
583 * +--->+--------+--------+ | |
584 * | T1 | T0 |--------+
585 * +--------+--------+ | |
586 * --------------------------- | |
587 * | |
588 * +--------+--------+<----+ |
589 * | RED2 | RED1 | |
590 * +--------+--------+ |
591 * ---+--------+--------+<-------+
592 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
593 * ---+--------+--------+
594 *
595 * *Mi obra de arte de siglo XXI @vpaprots
596 *
597 *
598 * First group is special, doesn't get the two inputs:
599 * +--------+--------+<-+
600 * +-------| ADD2 | ADD1 |--|-----+
601 * | +--------+--------+ | |
602 * | +--------+--------+<---+ |
603 * | | ADD2H | ADD1H |--+ |
604 * | +--------+--------+ | |
605 * | +--------+--------+<-+ |
606 * | | ADD4 | ADD3 |--|-+ |
607 * | +--------+--------+ | | |
608 * | +--------+--------+<---+ | |
609 * | | ADD4H | ADD3H |------|-+ |(+vzero)
610 * | +--------+--------+ | | V
611 * | ------------------------ | | +--------+
612 * | | | | RED3 | [d0 0 0 d0]
613 * | | | +--------+
614 * +---->+--------+--------+ | | |
615 * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
616 * +--------+--------+ | | |
617 * ---+--------+--------+<---+ | |
618 * +--- T2| T1 | T0 |----------+
619 * | ---+--------+--------+ | |
620 * | +--------+--------+<-------------+
621 * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
622 * | +--------+--------+ | | |
623 * | +--------+<----------------------+
624 * | | RED3 |--------------+ | [0 0 d1 d0]
625 * | +--------+ | |
626 * +--->+--------+--------+ | |
627 * | T1 | T0 |--------+
628 * +--------+--------+ | |
629 * --------------------------- | |
630 * | |
631 * +--------+--------+<----+ |
632 * | RED2 | RED1 | |
633 * +--------+--------+ |
634 * ---+--------+--------+<-------+
635 * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
636 * ---+--------+--------+
637 *
638 * Last 'group' needs to RED2||RED1 shifted less
639 */
// p256MulInternal: Montgomery multiplication in the P-256 field.
// Internal register-based calling convention (not the Go ABI):
//   In:  X1||X0 and Y1||Y0 = 256-bit operands (eight 32-bit words each),
//        P1||P0 = P256 (high||low), CPOOL -> p256mul<> constant pool.
//   Out: T1||T0 = X * Y * 2^-256 mod P256.
//   V0-V3, V30, V31 are preserved; V4-V15 are clobbered.
// Four double-rounds each consume two 32-bit digits of Y (multiply/
// accumulate via VMLF/VMALF/VMALHF), fold in reduction terms built
// with the pool's VPERM masks, and shift by 32 bits (VSLDB $12); a
// final constant-time conditional subtraction of P yields the result.
// See the diagram above for the data flow of one round.
//
// NOTE(review): the four `VL n(CPOOL), SELn` loads that previously sat
// at the top of this function were dead and have been removed: each
// SELn register aliases a temporary (SEL4=YDIG, SEL2=ADD3, SEL3=ADD4,
// SEL1=RED3) that is overwritten before SELn's first use, and every
// round reloads the masks it needs from CPOOL.
TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
	// ---------------------------------------------------
	// Round 1: digits 3 and 2 of Y0 (no incoming RED1/RED2).

	VREPF $3, Y0, YDIG
	VMLHF X0, YDIG, ADD1H
	VMLHF X1, YDIG, ADD2H
	VMLF  X0, YDIG, ADD1
	VMLF  X1, YDIG, ADD2

	VREPF  $2, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, ZER, ADD2, T1  // ADD2 Free

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0 // ADD3 Free
	VACCCQ T1, ADD4, CAR1, T2
	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Round 2: digits 1 and 0 of Y0; folds in RED1/RED2 from round 1.

	VREPF  $1, Y0, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1 // T0 Free->ADD1
	VMALF  X1, YDIG, T1, ADD2 // T1 Free->ADD2

	VREPF  $0, Y0, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Round 3: digits 3 and 2 of Y1.

	VREPF  $3, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $2, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
	VSLDB $12, T2, ADD2, T1   // ADD2 Free

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    48(CPOOL), SEL2
	VL    64(CPOOL), SEL3
	VL    80(CPOOL), SEL4
	VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
	VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	// ---------------------------------------------------
	// Round 4 (last): digits 1 and 0 of Y1; uses the SEL5/SEL6 masks
	// because the final RED2||RED1 is shifted less (see diagram note).

	VREPF  $1, Y1, YDIG
	VMALHF X0, YDIG, T0, ADD1H
	VMALHF X1, YDIG, T1, ADD2H
	VMALF  X0, YDIG, T0, ADD1
	VMALF  X1, YDIG, T1, ADD2

	VREPF  $0, Y1, YDIG
	VMALF  X0, YDIG, ADD1H, ADD3
	VMALF  X1, YDIG, ADD2H, ADD4
	VMALHF X0, YDIG, ADD1H, ADD3H
	VMALHF X1, YDIG, ADD2H, ADD4H

	VZERO ZER
	VL    32(CPOOL), SEL1
	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDB $12, ADD2, ADD1, T0
	VSLDB $12, T2, ADD2, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, T2
	VACQ   T1, RED2, CAR1, T1

	VACCQ  T0, ADD3, CAR1
	VAQ    T0, ADD3, T0
	VACCCQ T1, ADD4, CAR1, CAR2
	VACQ   T1, ADD4, CAR1, T1
	VAQ    T2, CAR2, T2

	VL    96(CPOOL), SEL5
	VL    112(CPOOL), SEL6
	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow

	VSLDB $12, T1, T0, T0
	VSLDB $12, T2, T1, T1

	VACCQ  T0, ADD3H, CAR1
	VAQ    T0, ADD3H, T0
	VACCCQ T1, ADD4H, CAR1, T2
	VACQ   T1, ADD4H, CAR1, T1

	VACCQ  T0, RED1, CAR1
	VAQ    T0, RED1, T0
	VACCCQ T1, RED2, CAR1, CAR2
	VACQ   T1, RED2, CAR1, T1
	VAQ    T2, CAR2, T2

	// ---------------------------------------------------
	// Final conditional subtraction of P (constant time): compute
	// T - P; the borrow chain leaves T2 as the VSEL mask.

	VZERO   RED3
	VSCBIQ  P0, T0, CAR1
	VSQ     P0, T0, ADD1H
	VSBCBIQ T1, P1, CAR1, CAR2
	VSBIQ   T1, P1, CAR1, ADD2H
	VSBIQ   T2, RED3, CAR2, T2

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL T0, ADD1H, T2, T0
	VSEL T1, ADD2H, T2, T1
	RET
849
850 #undef CPOOL
851
852 #undef X0
853 #undef X1
854 #undef Y0
855 #undef Y1
856 #undef T0
857 #undef T1
858 #undef P0
859 #undef P1
860
861 #undef SEL1
862 #undef SEL2
863 #undef SEL3
864 #undef SEL4
865 #undef SEL5
866 #undef SEL6
867
868 #undef YDIG
869 #undef ADD1H
870 #undef ADD2H
871 #undef ADD3
872 #undef ADD4
873 #undef RED1
874 #undef RED2
875 #undef RED3
876 #undef T2
877 #undef ADD1
878 #undef ADD2
879 #undef ADD3H
880 #undef ADD4H
881 #undef ZER
882 #undef CAR1
883 #undef CAR2
884
885 // ---------------------------------------
886
887 // Parameters
888 #define X0 V0
889 #define X1 V1
890 #define Y0 V2
891 #define Y1 V3
892
// p256SqrInternal: T1||T0 = (X1||X0)^2 in the Montgomery domain.
// Copies X into Y and tail-branches into p256MulInternal, whose RET
// returns directly to this function's caller (hence NOFRAME). Same
// register contract as p256MulInternal: CPOOL -> p256mul<>,
// P1||P0 = P256; result left in T1||T0.
TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
	VLR X0, Y0
	VLR X1, Y1
	BR  p256MulInternal<>(SB)
897
898 #undef X0
899 #undef X1
900 #undef Y0
901 #undef Y1
902
// p256SubInternal: T1||T0 = (X1||X0 - Y1||Y0) mod P256.
// Computes the raw 256-bit difference, then conditionally adds P back:
// SEL1 is expanded to all-ones when the subtraction borrowed, and VSEL
// picks the corrected value — constant time, no branches.
// Clobbers ZER, CAR1, SEL1, TT0, TT1; requires PL/PH = P256.
// (Comments must stay outside the macro: a // inside a backslash-
// continued #define would break the continuation.)
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VZERO ZER \
	VSCBIQ Y0, X0, CAR1 \
	VSQ Y0, X0, T0 \
	VSBCBIQ X1, Y1, CAR1, SEL1 \
	VSBIQ X1, Y1, CAR1, T1 \
	VSQ SEL1, ZER, SEL1 \
	\
	VACCQ T0, PL, CAR1 \
	VAQ T0, PL, TT0 \
	VACQ T1, PH, CAR1, TT1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1 \

// p256AddInternal: T1||T0 = (X1||X0 + Y1||Y0) mod P256.
// Adds with full carry propagation (overflow lands in T2), computes
// sum - P, and lets the combined borrow chain in SEL1 select (constant
// time) between the reduced and unreduced sum.
// Clobbers T2, ZER, CAR1, CAR2, SEL1, TT0, TT1; requires PL/PH = P256.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VACCQ X0, Y0, CAR1 \
	VAQ X0, Y0, T0 \
	VACCCQ X1, Y1, CAR1, T2 \
	VACQ X1, Y1, CAR1, T1 \
	\
	VZERO ZER \
	VSCBIQ PL, T0, CAR1 \
	VSQ PL, T0, TT0 \
	VSBCBIQ T1, PH, CAR1, CAR2 \
	VSBIQ T1, PH, CAR1, TT1 \
	VSBIQ T2, ZER, CAR2, SEL1 \
	\
	VSEL T0, TT0, SEL1, T0 \
	VSEL T1, TT1, SEL1, T1

// p256HalfInternal: T1||T0 = (X1||X0) / 2 mod P256.
// The borrow of 0 - 0 - X0 turns X's low bit into the SEL1 mask: when
// X is odd, P is added first to make the value even; the (up to
// 257-bit) result is then shifted right one bit, with the cross-limb
// bits carried over via VSLDB/VSL and merged with VO.
// Clobbers T2, ZER, CAR1, SEL1, TT0, TT1; requires PL/PH = P256.
#define p256HalfInternal(T1, T0, X1, X0) \
	VZERO ZER \
	VSBIQ ZER, ZER, X0, SEL1 \
	\
	VACCQ X0, PL, CAR1 \
	VAQ X0, PL, T0 \
	VACCCQ X1, PH, CAR1, T2 \
	VACQ X1, PH, CAR1, T1 \
	\
	VSEL X0, T0, SEL1, T0 \
	VSEL X1, T1, SEL1, T1 \
	VSEL ZER, T2, SEL1, T2 \
	\
	VSLDB $15, T2, ZER, TT1 \
	VSLDB $15, T1, ZER, TT0 \
	VREPIB $1, SEL1 \
	VSRL SEL1, T0, T0 \
	VSRL SEL1, T1, T1 \
	VREPIB $7, SEL1 \
	VSL SEL1, TT0, TT0 \
	VSL SEL1, TT1, TT1 \
	VO T0, TT0, T0 \
	VO T1, TT1, T1
957
958 // ---------------------------------------
959 // func p256Mul(res, in1, in2 *p256Element)
960 #define res_ptr R1
961 #define x_ptr R2
962 #define y_ptr R3
963 #define CPOOL R4
964
965 // Parameters
966 #define X0 V0
967 #define X1 V1
968 #define Y0 V2
969 #define Y1 V3
970 #define T0 V4
971 #define T1 V5
972
973 // Constants
974 #define P0 V30
975 #define P1 V31
// p256Mul: res = in1 * in2 * 2^-256 mod P256 (Montgomery product).
// Go-ABI wrapper around p256MulInternal: loads both operands (VPDI
// $0x4 swaps doublewords to register limb order), points CPOOL at the
// p256mul<> pool, sets P1||P0 = P256, calls the internal routine, and
// stores T1||T0 back in memory order.
TEXT ·p256Mul(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr

	VL   (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL   (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1
	VL   (0*16)(y_ptr), Y0
	VPDI $0x4, Y0, Y0, Y0
	VL   (1*16)(y_ptr), Y1
	VPDI $0x4, Y1, Y1, Y1

	MOVD $p256mul<>+0x00(SB), CPOOL
	VL   16(CPOOL), P0 // low 128 bits of P256
	VL   0(CPOOL), P1  // high 128 bits of P256

	CALL p256MulInternal<>(SB)

	// Result comes back in T1||T0; swap doublewords and store.
	VPDI $0x4, T0, T0, T0
	VST  T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST  T1, (1*16)(res_ptr)
	RET
1001
1002 #undef res_ptr
1003 #undef x_ptr
1004 #undef y_ptr
1005 #undef CPOOL
1006
1007 #undef X0
1008 #undef X1
1009 #undef Y0
1010 #undef Y1
1011 #undef T0
1012 #undef T1
1013 #undef P0
1014 #undef P1
1015
1016 // ---------------------------------------
1017 // func p256Sqr(res, in *p256Element, n int)
1018 #define res_ptr R1
1019 #define x_ptr R2
1020 #define y_ptr R3
1021 #define CPOOL R4
1022 #define COUNT R5
1023 #define N R6
1024
1025 // Parameters
1026 #define X0 V0
1027 #define X1 V1
1028 #define T0 V4
1029 #define T1 V5
1030
1031 // Constants
1032 #define P0 V30
1033 #define P1 V31
// p256Sqr: res = in^(2^n) in the Montgomery domain — squares in, n
// times. The loop is do-while shaped (test after the body), so one
// squaring is always performed even when n <= 1; callers are expected
// to pass n >= 1.
TEXT ·p256Sqr(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	// Load the operand; VPDI $0x4 swaps doublewords (limb-order fixup).
	VL   (0*16)(x_ptr), X0
	VPDI $0x4, X0, X0, X0
	VL   (1*16)(x_ptr), X1
	VPDI $0x4, X1, X1, X1

	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $0, COUNT
	MOVD n+16(FP), N
	VL   16(CPOOL), P0 // low 128 bits of P256
	VL   0(CPOOL), P1  // high 128 bits of P256

loop:
	CALL p256SqrInternal<>(SB)
	VLR  T0, X0 // feed the square back in as the next input
	VLR  T1, X1
	ADDW $1, COUNT
	CMPW COUNT, N
	BLT  loop

	// Store the final T1||T0 in memory order.
	VPDI $0x4, T0, T0, T0
	VST  T0, (0*16)(res_ptr)
	VPDI $0x4, T1, T1, T1
	VST  T1, (1*16)(res_ptr)
	RET
1062
1063 #undef res_ptr
1064 #undef x_ptr
1065 #undef y_ptr
1066 #undef CPOOL
1067 #undef COUNT
1068 #undef N
1069
1070 #undef X0
1071 #undef X1
1072 #undef T0
1073 #undef T1
1074 #undef P0
1075 #undef P1
1076
1077 // Point add with P2 being affine point
1078 // If sign == 1 -> P2 = -P2
1079 // If sel == 0 -> P3 = P1
1080 // if zero == 0 -> P3 = P2
1081 // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
1082 #define P3ptr R1
1083 #define P1ptr R2
1084 #define P2ptr R3
1085 #define CPOOL R4
1086
1087 // Temporaries in REGs
1088 #define Y2L V15
1089 #define Y2H V16
1090 #define T1L V17
1091 #define T1H V18
1092 #define T2L V19
1093 #define T2H V20
1094 #define T3L V21
1095 #define T3H V22
1096 #define T4L V23
1097 #define T4H V24
1098
1099 // Temps for Sub and Add
1100 #define TT0 V11
1101 #define TT1 V12
1102 #define T2 V13
1103
1104 // p256MulAsm Parameters
1105 #define X0 V0
1106 #define X1 V1
1107 #define Y0 V2
1108 #define Y1 V3
1109 #define T0 V4
1110 #define T1 V5
1111
1112 #define PL V30
1113 #define PH V31
1114
1115 // Names for zero/sel selects
1116 #define X1L V0
1117 #define X1H V1
1118 #define Y1L V2 // p256MulAsmParmY
1119 #define Y1H V3 // p256MulAsmParmY
1120 #define Z1L V4
1121 #define Z1H V5
1122 #define X2L V0
1123 #define X2H V1
1124 #define Z2L V4
1125 #define Z2H V5
1126 #define X3L V17 // T1L
1127 #define X3H V18 // T1H
1128 #define Y3L V21 // T3L
1129 #define Y3H V22 // T3H
1130 #define Z3L V28
1131 #define Z3H V29
1132
1133 #define ZER V6
1134 #define SEL1 V7
1135 #define CAR1 V8
1136 #define CAR2 V9
1137 /* *
1138 * Three operand formula:
1139 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1140 * T1 = Z1²
1141 * T2 = T1*Z1
1142 * T1 = T1*X2
1143 * T2 = T2*Y2
1144 * T1 = T1-X1
1145 * T2 = T2-Y1
1146 * Z3 = Z1*T1
1147 * T3 = T1²
1148 * T4 = T3*T1
1149 * T3 = T3*X1
1150 * T1 = 2*T3
1151 * X3 = T2²
1152 * X3 = X3-T1
1153 * X3 = X3-T4
1154 * T3 = T3-X3
1155 * T3 = T3*T2
1156 * T4 = T4*Y1
1157 * Y3 = T3-T4
1158
1159 * Three operand formulas, but with MulInternal X,Y used to store temps
1160 X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
1161 X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
1162 X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
1163 X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
1164 SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1165 SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1166 X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
1167 X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
1168 X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
1169 X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
1170 ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1171 X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
1172 SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
1173 SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1174 SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1175 X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
1176 X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
1177 SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
1178
1179 */
1180 TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
1181 MOVD res+0(FP), P3ptr
1182 MOVD in1+8(FP), P1ptr
1183 MOVD in2+16(FP), P2ptr
1184
1185 MOVD $p256mul<>+0x00(SB), CPOOL
1186 VL 16(CPOOL), PL
1187 VL 0(CPOOL), PH
1188
1189 // if (sign == 1) {
1190 // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
1191 // }
1192
1193 VL 48(P2ptr), Y2H
1194 VPDI $0x4, Y2H, Y2H, Y2H
1195 VL 32(P2ptr), Y2L
1196 VPDI $0x4, Y2L, Y2L, Y2L
1197
1198 VLREPG sign+24(FP), SEL1
1199 VZERO ZER
1200 VCEQG SEL1, ZER, SEL1
1201
1202 VSCBIQ Y2L, PL, CAR1
1203 VSQ Y2L, PL, T1L
1204 VSBIQ PH, Y2H, CAR1, T1H
1205
1206 VSEL Y2L, T1L, SEL1, Y2L
1207 VSEL Y2H, T1H, SEL1, Y2H
1208
1209 /* *
1210 * Three operand formula:
1211 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1212 */
1213 // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
1214 VL 80(P1ptr), X1 // Z1H
1215 VPDI $0x4, X1, X1, X1
1216 VL 64(P1ptr), X0 // Z1L
1217 VPDI $0x4, X0, X0, X0
1218 VLR X0, Y0
1219 VLR X1, Y1
1220 CALL p256SqrInternal<>(SB)
1221
1222 // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
1223 VLR T0, X0
1224 VLR T1, X1
1225 CALL p256MulInternal<>(SB)
1226 VLR T0, T2L
1227 VLR T1, T2H
1228
1229 // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
1230 VL 16(P2ptr), Y1 // X2H
1231 VPDI $0x4, Y1, Y1, Y1
1232 VL 0(P2ptr), Y0 // X2L
1233 VPDI $0x4, Y0, Y0, Y0
1234 CALL p256MulInternal<>(SB)
1235 VLR T0, T1L
1236 VLR T1, T1H
1237
1238 // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
1239 VLR T2L, X0
1240 VLR T2H, X1
1241 VLR Y2L, Y0
1242 VLR Y2H, Y1
1243 CALL p256MulInternal<>(SB)
1244
1245 // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
1246 VL 48(P1ptr), Y1H
1247 VPDI $0x4, Y1H, Y1H, Y1H
1248 VL 32(P1ptr), Y1L
1249 VPDI $0x4, Y1L, Y1L, Y1L
1250 p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
1251
1252 // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
1253 VL 16(P1ptr), X1H
1254 VPDI $0x4, X1H, X1H, X1H
1255 VL 0(P1ptr), X1L
1256 VPDI $0x4, X1L, X1L, X1L
1257 p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
1258
1259 // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
1260 VL 80(P1ptr), X1 // Z1H
1261 VPDI $0x4, X1, X1, X1
1262 VL 64(P1ptr), X0 // Z1L
1263 VPDI $0x4, X0, X0, X0
1264 CALL p256MulInternal<>(SB)
1265
1266 // VST T1, 64(P3ptr)
1267 // VST T0, 80(P3ptr)
1268 VLR T0, Z3L
1269 VLR T1, Z3H
1270
1271 // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
1272 VLR Y0, X0
1273 VLR Y1, X1
1274 CALL p256SqrInternal<>(SB)
1275 VLR T0, X0
1276 VLR T1, X1
1277
1278 // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
1279 CALL p256MulInternal<>(SB)
1280 VLR T0, T4L
1281 VLR T1, T4H
1282
1283 // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
1284 VL 16(P1ptr), Y1 // X1H
1285 VPDI $0x4, Y1, Y1, Y1
1286 VL 0(P1ptr), Y0 // X1L
1287 VPDI $0x4, Y0, Y0, Y0
1288 CALL p256MulInternal<>(SB)
1289 VLR T0, T3L
1290 VLR T1, T3H
1291
1292 // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
1293 p256AddInternal(T1H,T1L, T1,T0,T1,T0)
1294
1295 // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
1296 VLR T2L, X0
1297 VLR T2H, X1
1298 VLR T2L, Y0
1299 VLR T2H, Y1
1300 CALL p256SqrInternal<>(SB)
1301
1302 // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
1303 p256SubInternal(T1,T0,T1,T0,T1H,T1L)
1304
1305 // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
1306 p256SubInternal(T1,T0,T1,T0,T4H,T4L)
1307 VLR T0, X3L
1308 VLR T1, X3H
1309
1310 // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
1311 p256SubInternal(X1,X0,T3H,T3L,T1,T0)
1312
1313 // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
1314 CALL p256MulInternal<>(SB)
1315 VLR T0, T3L
1316 VLR T1, T3H
1317
1318 // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
1319 VLR T4L, X0
1320 VLR T4H, X1
1321 VL 48(P1ptr), Y1 // Y1H
1322 VPDI $0x4, Y1, Y1, Y1
1323 VL 32(P1ptr), Y0 // Y1L
1324 VPDI $0x4, Y0, Y0, Y0
1325 CALL p256MulInternal<>(SB)
1326
1327 // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
1328 p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
1329
1330 // if (sel == 0) {
1331 // copy(P3.x[:], X1)
1332 // copy(P3.y[:], Y1)
1333 // copy(P3.z[:], Z1)
1334 // }
1335
1336 VL 16(P1ptr), X1H
1337 VPDI $0x4, X1H, X1H, X1H
1338 VL 0(P1ptr), X1L
1339 VPDI $0x4, X1L, X1L, X1L
1340
1341 // Y1 already loaded, left over from addition
1342 VL 80(P1ptr), Z1H
1343 VPDI $0x4, Z1H, Z1H, Z1H
1344 VL 64(P1ptr), Z1L
1345 VPDI $0x4, Z1L, Z1L, Z1L
1346
1347 VLREPG sel+32(FP), SEL1
1348 VZERO ZER
1349 VCEQG SEL1, ZER, SEL1
1350
1351 VSEL X1L, X3L, SEL1, X3L
1352 VSEL X1H, X3H, SEL1, X3H
1353 VSEL Y1L, Y3L, SEL1, Y3L
1354 VSEL Y1H, Y3H, SEL1, Y3H
1355 VSEL Z1L, Z3L, SEL1, Z3L
1356 VSEL Z1H, Z3H, SEL1, Z3H
1357
1358 // if (zero == 0) {
1359 // copy(P3.x[:], X2)
1360 // copy(P3.y[:], Y2)
1361 // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1362 // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
1363 // }
1364 VL 16(P2ptr), X2H
1365 VPDI $0x4, X2H, X2H, X2H
1366 VL 0(P2ptr), X2L
1367 VPDI $0x4, X2L, X2L, X2L
1368
1369 // Y2 already loaded
1370 VL 128(CPOOL), Z2H
1371 VL 144(CPOOL), Z2L
1372
1373 VLREPG zero+40(FP), SEL1
1374 VZERO ZER
1375 VCEQG SEL1, ZER, SEL1
1376
1377 VSEL X2L, X3L, SEL1, X3L
1378 VSEL X2H, X3H, SEL1, X3H
1379 VSEL Y2L, Y3L, SEL1, Y3L
1380 VSEL Y2H, Y3H, SEL1, Y3H
1381 VSEL Z2L, Z3L, SEL1, Z3L
1382 VSEL Z2H, Z3H, SEL1, Z3H
1383
1384 // All done, store out the result!!!
1385 VPDI $0x4, X3H, X3H, X3H
1386 VST X3H, 16(P3ptr)
1387 VPDI $0x4, X3L, X3L, X3L
1388 VST X3L, 0(P3ptr)
1389 VPDI $0x4, Y3H, Y3H, Y3H
1390 VST Y3H, 48(P3ptr)
1391 VPDI $0x4, Y3L, Y3L, Y3L
1392 VST Y3L, 32(P3ptr)
1393 VPDI $0x4, Z3H, Z3H, Z3H
1394 VST Z3H, 80(P3ptr)
1395 VPDI $0x4, Z3L, Z3L, Z3L
1396 VST Z3L, 64(P3ptr)
1397
1398 RET
1399
1400 #undef P3ptr
1401 #undef P1ptr
1402 #undef P2ptr
1403 #undef CPOOL
1404
1405 #undef Y2L
1406 #undef Y2H
1407 #undef T1L
1408 #undef T1H
1409 #undef T2L
1410 #undef T2H
1411 #undef T3L
1412 #undef T3H
1413 #undef T4L
1414 #undef T4H
1415
1416 #undef TT0
1417 #undef TT1
1418 #undef T2
1419
1420 #undef X0
1421 #undef X1
1422 #undef Y0
1423 #undef Y1
1424 #undef T0
1425 #undef T1
1426
1427 #undef PL
1428 #undef PH
1429
1430 #undef X1L
1431 #undef X1H
1432 #undef Y1L
1433 #undef Y1H
1434 #undef Z1L
1435 #undef Z1H
1436 #undef X2L
1437 #undef X2H
1438 #undef Z2L
1439 #undef Z2H
1440 #undef X3L
1441 #undef X3H
1442 #undef Y3L
1443 #undef Y3H
1444 #undef Z3L
1445 #undef Z3H
1446
1447 #undef ZER
1448 #undef SEL1
1449 #undef CAR1
1450 #undef CAR2
1451
1452 // func p256PointDoubleAsm(res, in *P256Point)
1453 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
1454 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
1455 // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
1456 #define P3ptr R1
1457 #define P1ptr R2
1458 #define CPOOL R4
1459
1460 // Temporaries in REGs
1461 #define X3L V15
1462 #define X3H V16
1463 #define Y3L V17
1464 #define Y3H V18
1465 #define T1L V19
1466 #define T1H V20
1467 #define T2L V21
1468 #define T2H V22
1469 #define T3L V23
1470 #define T3H V24
1471
1472 #define X1L V6
1473 #define X1H V7
1474 #define Y1L V8
1475 #define Y1H V9
1476 #define Z1L V10
1477 #define Z1H V11
1478
1479 // Temps for Sub and Add
1480 #define TT0 V11
1481 #define TT1 V12
1482 #define T2 V13
1483
1484 // p256MulAsm Parameters
1485 #define X0 V0
1486 #define X1 V1
1487 #define Y0 V2
1488 #define Y1 V3
1489 #define T0 V4
1490 #define T1 V5
1491
1492 #define PL V30
1493 #define PH V31
1494
1495 #define Z3L V23
1496 #define Z3H V24
1497
1498 #define ZER V26
1499 #define SEL1 V27
1500 #define CAR1 V28
1501 #define CAR2 V29
1502 /*
1503 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
1504 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
1505 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
1506 * A = 3(X₁-Z₁²)×(X₁+Z₁²)
1507 * B = 2Y₁
1508 * Z₃ = B×Z₁
1509 * C = B²
1510 * D = C×X₁
1511 * X₃ = A²-2D
1512 * Y₃ = (D-X₃)×A-C²/2
1513 *
1514 * Three-operand formula:
1515 * T1 = Z1²
1516 * T2 = X1-T1
1517 * T1 = X1+T1
1518 * T2 = T2*T1
1519 * T2 = 3*T2
1520 * Y3 = 2*Y1
1521 * Z3 = Y3*Z1
1522 * Y3 = Y3²
1523 * T3 = Y3*X1
1524 * Y3 = Y3²
1525 * Y3 = half*Y3
1526 * X3 = T2²
1527 * T1 = 2*T3
1528 * X3 = X3-T1
1529 * T1 = T3-X3
1530 * T1 = T1*T2
1531 * Y3 = T1-Y3
1532 */
1533
1534 TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
1535 MOVD res+0(FP), P3ptr
1536 MOVD in+8(FP), P1ptr
1537
1538 MOVD $p256mul<>+0x00(SB), CPOOL
1539 VL 16(CPOOL), PL
1540 VL 0(CPOOL), PH
1541
1542 // X=Z1; Y=Z1; MUL; T- // T1 = Z1²
1543 VL 80(P1ptr), X1 // Z1H
1544 VPDI $0x4, X1, X1, X1
1545 VL 64(P1ptr), X0 // Z1L
1546 VPDI $0x4, X0, X0, X0
1547 VLR X0, Y0
1548 VLR X1, Y1
1549 CALL p256SqrInternal<>(SB)
1550
1551 // SUB(X<X1-T) // T2 = X1-T1
1552 VL 16(P1ptr), X1H
1553 VPDI $0x4, X1H, X1H, X1H
1554 VL 0(P1ptr), X1L
1555 VPDI $0x4, X1L, X1L, X1L
1556 p256SubInternal(X1,X0,X1H,X1L,T1,T0)
1557
1558 // ADD(Y<X1+T) // T1 = X1+T1
1559 p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
1560
1561 // X- ; Y- ; MUL; T- // T2 = T2*T1
1562 CALL p256MulInternal<>(SB)
1563
1564 // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
1565 p256AddInternal(T2H,T2L,T1,T0,T1,T0)
1566 p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
1567
1568 // ADD(X<Y1+Y1) // Y3 = 2*Y1
1569 VL 48(P1ptr), Y1H
1570 VPDI $0x4, Y1H, Y1H, Y1H
1571 VL 32(P1ptr), Y1L
1572 VPDI $0x4, Y1L, Y1L, Y1L
1573 p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
1574
1575 // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
1576 VL 80(P1ptr), Y1 // Z1H
1577 VPDI $0x4, Y1, Y1, Y1
1578 VL 64(P1ptr), Y0 // Z1L
1579 VPDI $0x4, Y0, Y0, Y0
1580 CALL p256MulInternal<>(SB)
1581 VPDI $0x4, T1, T1, TT1
1582 VST TT1, 80(P3ptr)
1583 VPDI $0x4, T0, T0, TT0
1584 VST TT0, 64(P3ptr)
1585
1586 // X- ; Y=X ; MUL; T- // Y3 = Y3²
1587 VLR X0, Y0
1588 VLR X1, Y1
1589 CALL p256SqrInternal<>(SB)
1590
1591 // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
1592 VLR T0, X0
1593 VLR T1, X1
1594 VL 16(P1ptr), Y1
1595 VPDI $0x4, Y1, Y1, Y1
1596 VL 0(P1ptr), Y0
1597 VPDI $0x4, Y0, Y0, Y0
1598 CALL p256MulInternal<>(SB)
1599 VLR T0, T3L
1600 VLR T1, T3H
1601
1602 // X- ; Y=X ; MUL; T- // Y3 = Y3²
1603 VLR X0, Y0
1604 VLR X1, Y1
1605 CALL p256SqrInternal<>(SB)
1606
1607 // HAL(Y3<T) // Y3 = half*Y3
1608 p256HalfInternal(Y3H,Y3L, T1,T0)
1609
1610 // X=T2; Y=T2; MUL; T- // X3 = T2²
1611 VLR T2L, X0
1612 VLR T2H, X1
1613 VLR T2L, Y0
1614 VLR T2H, Y1
1615 CALL p256SqrInternal<>(SB)
1616
1617 // ADD(T1<T3+T3) // T1 = 2*T3
1618 p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
1619
1620 // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
1621 p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
1622 VPDI $0x4, X3H, X3H, TT1
1623 VST TT1, 16(P3ptr)
1624 VPDI $0x4, X3L, X3L, TT0
1625 VST TT0, 0(P3ptr)
1626
1627 // SUB(X<T3-X3) // T1 = T3-X3
1628 p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
1629
1630 // X- ; Y- ; MUL; T- // T1 = T1*T2
1631 CALL p256MulInternal<>(SB)
1632
1633 // SUB(Y3<T-Y3) // Y3 = T1-Y3
1634 p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
1635
1636 VPDI $0x4, Y3H, Y3H, Y3H
1637 VST Y3H, 48(P3ptr)
1638 VPDI $0x4, Y3L, Y3L, Y3L
1639 VST Y3L, 32(P3ptr)
1640 RET
1641
1642 #undef P3ptr
1643 #undef P1ptr
1644 #undef CPOOL
1645 #undef X3L
1646 #undef X3H
1647 #undef Y3L
1648 #undef Y3H
1649 #undef T1L
1650 #undef T1H
1651 #undef T2L
1652 #undef T2H
1653 #undef T3L
1654 #undef T3H
1655 #undef X1L
1656 #undef X1H
1657 #undef Y1L
1658 #undef Y1H
1659 #undef Z1L
1660 #undef Z1H
1661 #undef TT0
1662 #undef TT1
1663 #undef T2
1664 #undef X0
1665 #undef X1
1666 #undef Y0
1667 #undef Y1
1668 #undef T0
1669 #undef T1
1670 #undef PL
1671 #undef PH
1672 #undef Z3L
1673 #undef Z3H
1674 #undef ZER
1675 #undef SEL1
1676 #undef CAR1
1677 #undef CAR2
1678
1679 // func p256PointAddAsm(res, in1, in2 *P256Point) int
1680 #define P3ptr R1
1681 #define P1ptr R2
1682 #define P2ptr R3
1683 #define CPOOL R4
1684 #define ISZERO R5
1685 #define TRUE R6
1686
1687 // Temporaries in REGs
1688 #define T1L V16
1689 #define T1H V17
1690 #define T2L V18
1691 #define T2H V19
1692 #define U1L V20
1693 #define U1H V21
1694 #define S1L V22
1695 #define S1H V23
1696 #define HL V24
1697 #define HH V25
1698 #define RL V26
1699 #define RH V27
1700
1701 // Temps for Sub and Add
1702 #define ZER V6
1703 #define SEL1 V7
1704 #define CAR1 V8
1705 #define CAR2 V9
1706 #define TT0 V11
1707 #define TT1 V12
1708 #define T2 V13
1709
1710 // p256MulAsm Parameters
1711 #define X0 V0
1712 #define X1 V1
1713 #define Y0 V2
1714 #define Y1 V3
1715 #define T0 V4
1716 #define T1 V5
1717
1718 #define PL V30
1719 #define PH V31
1720 /*
1721 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
1722 *
1723 * A = X₁×Z₂²
1724 * B = Y₁×Z₂³
1725 * C = X₂×Z₁²-A
1726 * D = Y₂×Z₁³-B
1727 * X₃ = D² - 2A×C² - C³
1728 * Y₃ = D×(A×C² - X₃) - B×C³
1729 * Z₃ = Z₁×Z₂×C
1730 *
1731 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
1732 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
1733 *
1734 * T1 = Z1*Z1
1735 * T2 = Z2*Z2
1736 * U1 = X1*T2
1737 * H = X2*T1
1738 * H = H-U1
1739 * Z3 = Z1*Z2
1740 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1741 *
1742 * S1 = Z2*T2
1743 * S1 = Y1*S1
1744 * R = Z1*T1
1745 * R = Y2*R
1746 * R = R-S1
1747 *
1748 * T1 = H*H
1749 * T2 = H*T1
1750 * U1 = U1*T1
1751 *
1752 * X3 = R*R
1753 * X3 = X3-T2
1754 * T1 = 2*U1
1755 * X3 = X3-T1 << store-out X3 result reg
1756 *
1757 * T2 = S1*T2
1758 * Y3 = U1-X3
1759 * Y3 = R*Y3
1760 * Y3 = Y3-T2 << store-out Y3 result reg
1761
1762 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
1763 // X- ; Y=T ; MUL; R=T // R = Z1*T1
1764 // X=X2; Y- ; MUL; H=T // H = X2*T1
1765 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
1766 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
1767 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
1768 // SUB(H<H-T) // H = H-U1
1769 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
1770 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
1771 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1772 // X=Y2; Y=R ; MUL; T- // R = Y2*R
1773 // SUB(R<T-S1) // R = R-S1
1774 // X=H ; Y=H ; MUL; T- // T1 = H*H
1775 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
1776 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
1777 // X=R ; Y=R ; MUL; T- // X3 = R*R
1778 // SUB(T<T-T2) // X3 = X3-T2
1779 // ADD(X<U1+U1) // T1 = 2*U1
1780 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
1781 // SUB(Y<U1-T) // Y3 = U1-X3
1782 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
1783 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
1784 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
1785 */
1786 TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
1787 MOVD res+0(FP), P3ptr
1788 MOVD in1+8(FP), P1ptr
1789 MOVD in2+16(FP), P2ptr
1790
1791 MOVD $p256mul<>+0x00(SB), CPOOL
1792 VL 16(CPOOL), PL
1793 VL 0(CPOOL), PH
1794
1795 // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
1796 VL 80(P1ptr), X1 // Z1H
1797 VPDI $0x4, X1, X1, X1
1798 VL 64(P1ptr), X0 // Z1L
1799 VPDI $0x4, X0, X0, X0
1800 VLR X0, Y0
1801 VLR X1, Y1
1802 CALL p256SqrInternal<>(SB)
1803
1804 // X- ; Y=T ; MUL; R=T // R = Z1*T1
1805 VLR T0, Y0
1806 VLR T1, Y1
1807 CALL p256MulInternal<>(SB)
1808 VLR T0, RL
1809 VLR T1, RH
1810
1811 // X=X2; Y- ; MUL; H=T // H = X2*T1
1812 VL 16(P2ptr), X1 // X2H
1813 VPDI $0x4, X1, X1, X1
1814 VL 0(P2ptr), X0 // X2L
1815 VPDI $0x4, X0, X0, X0
1816 CALL p256MulInternal<>(SB)
1817 VLR T0, HL
1818 VLR T1, HH
1819
1820 // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
1821 VL 80(P2ptr), X1 // Z2H
1822 VPDI $0x4, X1, X1, X1
1823 VL 64(P2ptr), X0 // Z2L
1824 VPDI $0x4, X0, X0, X0
1825 VLR X0, Y0
1826 VLR X1, Y1
1827 CALL p256SqrInternal<>(SB)
1828
1829 // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
1830 VLR T0, Y0
1831 VLR T1, Y1
1832 CALL p256MulInternal<>(SB)
1833 VLR T0, S1L
1834 VLR T1, S1H
1835
1836 // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
1837 VL 16(P1ptr), X1 // X1H
1838 VPDI $0x4, X1, X1, X1
1839 VL 0(P1ptr), X0 // X1L
1840 VPDI $0x4, X0, X0, X0
1841 CALL p256MulInternal<>(SB)
1842 VLR T0, U1L
1843 VLR T1, U1H
1844
1845 // SUB(H<H-T) // H = H-U1
1846 p256SubInternal(HH,HL,HH,HL,T1,T0)
1847
1848 // if H == 0 or H^P == 0 then ret=1 else ret=0
1849 // clobbers T1H and T1L
1850 MOVD $0, ISZERO
1851 MOVD $1, TRUE
1852 VZERO ZER
1853 VO HL, HH, T1H
1854 VCEQGS ZER, T1H, T1H
1855 MOVDEQ TRUE, ISZERO
1856 VX HL, PL, T1L
1857 VX HH, PH, T1H
1858 VO T1L, T1H, T1H
1859 VCEQGS ZER, T1H, T1H
1860 MOVDEQ TRUE, ISZERO
1861 MOVD ISZERO, ret+24(FP)
1862
1863 // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
1864 VL 80(P1ptr), X1 // Z1H
1865 VPDI $0x4, X1, X1, X1
1866 VL 64(P1ptr), X0 // Z1L
1867 VPDI $0x4, X0, X0, X0
1868 VL 80(P2ptr), Y1 // Z2H
1869 VPDI $0x4, Y1, Y1, Y1
1870 VL 64(P2ptr), Y0 // Z2L
1871 VPDI $0x4, Y0, Y0, Y0
1872 CALL p256MulInternal<>(SB)
1873
1874 // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
1875 VLR T0, X0
1876 VLR T1, X1
1877 VLR HL, Y0
1878 VLR HH, Y1
1879 CALL p256MulInternal<>(SB)
1880 VPDI $0x4, T1, T1, TT1
1881 VST TT1, 80(P3ptr)
1882 VPDI $0x4, T0, T0, TT0
1883 VST TT0, 64(P3ptr)
1884
1885 // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
1886 VL 48(P1ptr), X1
1887 VPDI $0x4, X1, X1, X1
1888 VL 32(P1ptr), X0
1889 VPDI $0x4, X0, X0, X0
1890 VLR S1L, Y0
1891 VLR S1H, Y1
1892 CALL p256MulInternal<>(SB)
1893 VLR T0, S1L
1894 VLR T1, S1H
1895
1896 // X=Y2; Y=R ; MUL; T- // R = Y2*R
1897 VL 48(P2ptr), X1
1898 VPDI $0x4, X1, X1, X1
1899 VL 32(P2ptr), X0
1900 VPDI $0x4, X0, X0, X0
1901 VLR RL, Y0
1902 VLR RH, Y1
1903 CALL p256MulInternal<>(SB)
1904
1905 // SUB(R<T-S1) // R = T-S1
1906 p256SubInternal(RH,RL,T1,T0,S1H,S1L)
1907
1908 // if R == 0 or R^P == 0 then ret=ret else ret=0
1909 // clobbers T1H and T1L
1910 MOVD $0, ISZERO
1911 MOVD $1, TRUE
1912 VZERO ZER
1913 VO RL, RH, T1H
1914 VCEQGS ZER, T1H, T1H
1915 MOVDEQ TRUE, ISZERO
1916 VX RL, PL, T1L
1917 VX RH, PH, T1H
1918 VO T1L, T1H, T1H
1919 VCEQGS ZER, T1H, T1H
1920 MOVDEQ TRUE, ISZERO
1921 AND ret+24(FP), ISZERO
1922 MOVD ISZERO, ret+24(FP)
1923
1924 // X=H ; Y=H ; MUL; T- // T1 = H*H
1925 VLR HL, X0
1926 VLR HH, X1
1927 VLR HL, Y0
1928 VLR HH, Y1
1929 CALL p256SqrInternal<>(SB)
1930
1931 // X- ; Y=T ; MUL; T2=T // T2 = H*T1
1932 VLR T0, Y0
1933 VLR T1, Y1
1934 CALL p256MulInternal<>(SB)
1935 VLR T0, T2L
1936 VLR T1, T2H
1937
1938 // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
1939 VLR U1L, X0
1940 VLR U1H, X1
1941 CALL p256MulInternal<>(SB)
1942 VLR T0, U1L
1943 VLR T1, U1H
1944
1945 // X=R ; Y=R ; MUL; T- // X3 = R*R
1946 VLR RL, X0
1947 VLR RH, X1
1948 VLR RL, Y0
1949 VLR RH, Y1
1950 CALL p256SqrInternal<>(SB)
1951
1952 // SUB(T<T-T2) // X3 = X3-T2
1953 p256SubInternal(T1,T0,T1,T0,T2H,T2L)
1954
1955 // ADD(X<U1+U1) // T1 = 2*U1
1956 p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
1957
1958 // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
1959 p256SubInternal(T1,T0,T1,T0,X1,X0)
1960 VPDI $0x4, T1, T1, TT1
1961 VST TT1, 16(P3ptr)
1962 VPDI $0x4, T0, T0, TT0
1963 VST TT0, 0(P3ptr)
1964
1965 // SUB(Y<U1-T) // Y3 = U1-X3
1966 p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
1967
1968 // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
1969 VLR RL, X0
1970 VLR RH, X1
1971 CALL p256MulInternal<>(SB)
1972 VLR T0, U1L
1973 VLR T1, U1H
1974
1975 // X=S1; Y=T2; MUL; T- // T2 = S1*T2
1976 VLR S1L, X0
1977 VLR S1H, X1
1978 VLR T2L, Y0
1979 VLR T2H, Y1
1980 CALL p256MulInternal<>(SB)
1981
1982 // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
1983 p256SubInternal(T1,T0,U1H,U1L,T1,T0)
1984 VPDI $0x4, T1, T1, T1
1985 VST T1, 48(P3ptr)
1986 VPDI $0x4, T0, T0, T0
1987 VST T0, 32(P3ptr)
1988
1989 RET
1990