1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !purego
6
7 // This file contains constant-time, 64-bit assembly implementation of
8 // P256. The optimizations performed here are described in detail in:
9 // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
10 // 256-bit primes"
11 // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
12 // https://eprint.iacr.org/2013/816.pdf
13
14 #include "textflag.h"
15
// Register aliases shared by every routine in this file.

// Pointer arguments of the exported functions.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0-acc7: accumulator limbs for the intermediate products and sums.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0-t3: short-lived scratch.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
// const0/const1: modulus limbs kept live across the internal routines
// (loaded from p256const0/p256const1, or from p256ord+0x00 in the p256Ord* code).
#define const0 R15
#define const1 R16

// hlp0 is additional scratch. hlp1 is the SAME register as res_ptr (R0):
// a routine that uses hlp1 has to fetch res from its frame when it needs it.
#define hlp0 R17
#define hlp1 res_ptr

// x0-x3 / y0-y3: 256-bit operands and results of p256SubInternal,
// p256SqrInternal and p256MulInternal (least-significant limb first).
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 are the SAME registers as t2/t3. They hold the upper limbs of
// p256ord in p256OrdMul/p256OrdSqr, so t2/t3 are not free scratch there.
#define const2 t2
#define const3 t3
50
// Read-only constants. RODATA is provided by textflag.h.

// Limbs 1 and 3 of the field prime p. The code supplies limb 0 ($-1) and
// limb 2 ($0) as immediates wherever p is added or subtracted.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
GLOBL p256const0<>(SB), RODATA, $8
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
GLOBL p256const1<>(SB), RODATA, $8

// Per-limb multiplier used by the reduction steps of p256OrdMul / p256OrdSqr.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8

// Modulus used by p256OrdMul / p256OrdSqr, least-significant limb first.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32

// Value written as z3 by p256PointAddAffineAsm when it forces z3 = 1.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256one<>(SB), RODATA, $32
67
68 /* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
//
// Branch-free conditional copy of 96 bytes: res = b when cond == 0,
// otherwise res = a. Both sources are always read in full. The data is moved
// in two 48-byte halves; cond is compared once because LDP, CSEL and STP do
// not modify the flags.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	cond+24(FP), R3
	MOVD	b+16(FP), b_ptr
	MOVD	a+8(FP), a_ptr
	MOVD	res+0(FP), res_ptr

	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors

	// Bytes 0..47
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	CMP	$0, R3
	CSEL	NE, R4, R16, R4		// cond != 0 ? a : b
	CSEL	NE, R5, R17, R5
	CSEL	NE, R6, R19, R6
	CSEL	NE, R7, R20, R7
	CSEL	NE, R8, R21, R8
	CSEL	NE, R9, R22, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Bytes 48..95, flags still hold the cond comparison
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	CSEL	NE, R4, R16, R4
	CSEL	NE, R5, R17, R5
	CSEL	NE, R6, R19, R6
	CSEL	NE, R7, R20, R7
	CSEL	NE, R8, R21, R8
	CSEL	NE, R9, R22, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
114 /* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
//
// In-place conditional negation: when cond != 0, *val is replaced by p - *val;
// when cond == 0 the original limbs are written back unchanged. The
// subtraction is always performed and the result chosen with CSEL.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+8(FP), hlp0
	// t = *val
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// acc = p
	MOVD	$-1, acc0
	MOVD	p256const0<>(SB), acc1
	MOVD	$0, acc2
	MOVD	p256const1<>(SB), acc3
	// acc = p - t (computed regardless of cond)
	SUBS	t0, acc0, acc0
	SBCS	t1, acc1, acc1
	SBCS	t2, acc2, acc2
	SBC	t3, acc3, acc3
	// cond != 0 ? negated : original
	CMP	$0, hlp0
	CSEL	NE, acc0, t0, acc0
	CSEL	NE, acc1, t1, acc1
	CSEL	NE, acc2, t2, acc2
	CSEL	NE, acc3, t3, acc3
	// Write back over the input
	STP	(acc0, acc1), 0*16(a_ptr)
	STP	(acc2, acc3), 1*16(a_ptr)

	RET
144 /* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
//
// Applies p256SqrInternal n times, starting from *in, and stores the final
// value in *res.
//
// n <= 0 performs no squaring and stores *in unchanged. Previously the counter
// was decremented before it was first tested, so n == 0 wrapped around to
// 2^64-1 iterations and a negative n did not terminate in any practical time.
// For n >= 1 the behavior is unchanged.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	// The loop below already branches on n, so this guard introduces no new
	// kind of data-dependent control flow.
	CMP	$0, b_ptr
	BLE	sqrDone

sqrLoop:
	SUB	$1, b_ptr
	CALL	p256SqrInternal<>(SB)	// y = x^2
	MOVD	y0, x0			// feed the result back as the next input
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CBNZ	b_ptr, sqrLoop

sqrDone:
	// x equals the last result (x was just copied from y), or the untouched
	// input when no iteration ran.
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	RET
169 /* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
//
// Thin wrapper around p256MulInternal: loads *in1 into x and *in2 into y,
// sets up const0/const1, and stores the product (returned in y) to *res.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	in2+16(FP), b_ptr
	MOVD	in1+8(FP), a_ptr
	MOVD	res+0(FP), res_ptr

	// y = *in2
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)
	// x = *in1
	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	// Modulus limbs expected by p256MulInternal
	MOVD	p256const1<>(SB), const1
	MOVD	p256const0<>(SB), const0

	CALL	p256MulInternal<>(SB)

	// *res = y
	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
190 /* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
//
// Runs the four word-wise reduction steps on *in with no multiplication,
// followed by one conditional subtraction of p, and stores the result in *res.
// Each step folds the lowest live limb into the three limbs above it and
// rotates the roles of acc0..acc3 by one position.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	res+0(FP), res_ptr

	MOVD	p256const1<>(SB), const1
	MOVD	p256const0<>(SB), const0

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	// Step 1: fold acc0; live limbs become (acc1, acc2, acc3, acc0)
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	ADDS	acc0<<32, acc1, acc1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0

	// Step 2: fold acc1; live limbs become (acc2, acc3, acc0, acc1)
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	ADDS	acc1<<32, acc2, acc2
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1

	// Step 3: fold acc2; live limbs become (acc3, acc0, acc1, acc2)
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	ADDS	acc2<<32, acc3, acc3
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2

	// Step 4: fold acc3; live limbs are back to (acc0, acc1, acc2, acc3)
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	ADDS	acc3<<32, acc0, acc0
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3

	// t = acc - p; carry clear afterwards means the subtraction borrowed
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	// Keep acc if the subtraction borrowed, otherwise take t
	CSEL	CC, acc0, t0, acc0
	CSEL	CC, acc1, t1, acc1
	CSEL	CC, acc2, t2, acc2
	CSEL	CC, acc3, t3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
249 /* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
//
// Table lookup without idx-dependent addressing: all 16 entries of 96 bytes
// are read in order and entry number idx (counting from 1) is kept via CSEL.
// If idx matches no entry (e.g. idx == 0) the result is 96 zero bytes.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	table+8(FP), b_ptr
	MOVD	idx+16(FP), const0

	// Result registers start at zero
	MOVD	$0, x0
	MOVD	$0, x1
	MOVD	$0, x2
	MOVD	$0, x3
	MOVD	$0, y0
	MOVD	$0, y1
	MOVD	$0, y2
	MOVD	$0, y3
	MOVD	$0, t0
	MOVD	$0, t1
	MOVD	$0, t2
	MOVD	$0, t3

	// const1 = number of the entry being examined (1-based once incremented)
	MOVD	$0, const1

selectLoop:
	ADD	$1, const1, const1
	CMP	const0, const1		// EQ when this is the requested entry
	// Bytes 0..63 of the entry (loads and CSEL leave the flags alone)
	LDP.P	16(b_ptr), (acc0, acc1)
	LDP.P	16(b_ptr), (acc2, acc3)
	LDP.P	16(b_ptr), (acc4, acc5)
	LDP.P	16(b_ptr), (acc6, acc7)
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	CSEL	EQ, acc4, y0, y0
	CSEL	EQ, acc5, y1, y1
	CSEL	EQ, acc6, y2, y2
	CSEL	EQ, acc7, y3, y3
	// Bytes 64..95 of the entry
	LDP.P	16(b_ptr), (acc0, acc1)
	LDP.P	16(b_ptr), (acc2, acc3)
	CSEL	EQ, acc0, t0, t0
	CSEL	EQ, acc1, t1, t1
	CSEL	EQ, acc2, t2, t2
	CSEL	EQ, acc3, t3, t3

	CMP	$16, const1
	BNE	selectLoop

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
303 /* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
//
// Same scheme as p256Select for 64-byte entries: all 32 entries are read in
// order and entry number idx (counting from 1) is kept via CSEL. If idx
// matches no entry (e.g. idx == 0) the result is 64 zero bytes.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	table+8(FP), t1
	MOVD	idx+16(FP), t0

	// Result registers start at zero
	MOVD	$0, x0
	MOVD	$0, x1
	MOVD	$0, x2
	MOVD	$0, x3
	MOVD	$0, y0
	MOVD	$0, y1
	MOVD	$0, y2
	MOVD	$0, y3

	// t2 = number of the entry being examined (1-based once incremented)
	MOVD	$0, t2

selectAffineLoop:
	ADD	$1, t2, t2
	CMP	t0, t2			// EQ when this is the requested entry
	LDP.P	16(t1), (acc0, acc1)
	LDP.P	16(t1), (acc2, acc3)
	LDP.P	16(t1), (acc4, acc5)
	LDP.P	16(t1), (acc6, acc7)
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	CSEL	EQ, acc4, y0, y0
	CSEL	EQ, acc5, y1, y1
	CSEL	EQ, acc6, y2, y2
	CSEL	EQ, acc7, y3, y3

	CMP	$32, t2
	BNE	selectAffineLoop

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
345 /* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Squares *in n times with word-wise reduction by p256ord after every
// squaring, and stores the final value in *res.
//
// n <= 0 performs no squaring and stores *in unchanged. Previously the counter
// was decremented before it was first tested, so n == 0 wrapped around to
// 2^64-1 iterations and a negative n did not terminate in any practical time.
// For n >= 1 the behavior is unchanged.
//
// Register notes:
//   - hlp1 (= R0 = res_ptr) holds p256ordK0 during the loop, which is why res
//     is only fetched from the frame at the very end.
//   - const2/const3 alias t2/t3, so only t0/t1 are used as scratch and the
//     final subtraction writes to y0-y3 instead of t0-t3.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	// The loop below already branches on n, so this guard introduces no new
	// kind of data-dependent control flow.
	CMP	$0, b_ptr
	BLE	ordSqrDone

ordSqrLoop:
	SUB	$1, b_ptr

	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2 (each cross product occurs twice in the square)
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products (the diagonal terms x[i]*x[i])
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// First reduction step
	// hlp0 = acc0 * K0; hlp0 * ord is then added so that the low limb cancels.
	MUL	acc0, hlp1, hlp0

	// NOTE(review): the low-word product below uses hlp1 (K0), not hlp0.
	// Only the carry out of the following ADDS is consumed (acc0 is
	// overwritten before being read again). Assuming ord[0]*K0 == -1
	// (mod 2^64), that carry is the same for either multiplier, so this is
	// not a functional bug -- confirm before "fixing" it. The same pattern
	// appears in every reduction step below.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the earlier steps the carry goes to acc7; acc7 is added into
	// the same limb position as hlp0 just below.
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the square
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// y = acc - ord; carry clear afterwards means the subtraction borrowed
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	// x = (no borrow) ? y : acc -- written straight to x for the next round
	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

ordSqrDone:
	// R0 held K0 until now; fetch the real result pointer.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
544 /* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Multiplies *in1 by *in2 with word-wise reduction by p256ord interleaved
// between the partial products, then performs one conditional subtraction of
// p256ord and stores the result in *res.
//
// Register notes:
//   - hlp1 (= R0 = res_ptr) holds p256ordK0, so res is fetched from the frame
//     only at the end.
//   - const2/const3 alias t2/t3; only t0/t1, hlp0 and y0/y1 serve as scratch
//     until the final subtraction, which reads const2/const3 in the same
//     instructions that overwrite t2/t3.
//   - Two partial sums are kept: the reduction limbs rotate through acc0-acc3
//     while the product limbs grow into acc4-acc7; they are added at the end.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	// hlp0 = acc0 * K0; hlp0 * ord is then added so that the low limb cancels.
	// y0 is free from here on (y[0] has been consumed) and is used as scratch.
	MUL	acc0, hlp1, hlp0

	// NOTE(review): the low-word product below uses hlp1 (K0), not hlp0.
	// Only the carry out of the following ADDS is consumed (acc0 is
	// overwritten before being read again). Assuming ord[0]*K0 == -1
	// (mod 2^64), that carry is the same for either multiplier -- confirm
	// before changing. The same pattern appears in every reduction step.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1	// last use of y[1]; y1 becomes scratch
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Combine the reduction limbs (acc0-acc3) with the product limbs (acc4-acc7)
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - ord. const2/const3 are t2/t3: each is read as the subtrahend
	// by the same instruction that overwrites it, which is safe here because
	// the modulus is not needed afterwards.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	// Take t unless the subtraction borrowed
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// R0 held K0 until now; fetch the real result pointer.
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
764 /* ---------------------------------------*/
// p256SubInternal: x = y - x, with p added back if the subtraction borrowed.
//
// In:       x0-x3, y0-y3, const0, const1
// Out:      x0-x3
// Clobbers: acc0-acc7, t0, flags (y is preserved)
//
// Both candidates (difference, difference + p) are always computed and the
// result is chosen with CSEL, so there is no data-dependent branch.
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	// acc[0..3] = y - x; t0 = 0 when there was no borrow, all ones otherwise
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	SBC	$0, ZR, t0

	// acc[4..7] = acc[0..3] + p
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// Borrowed (bit 0 of t0 set) ? corrected value : plain difference
	ANDS	$1, t0, t0
	CSEL	NE, acc4, acc0, x0
	CSEL	NE, acc5, acc1, x1
	CSEL	NE, acc6, acc2, x2
	CSEL	NE, acc7, acc3, x3

	RET
784 /* ---------------------------------------*/
// p256SqrInternal: y = x * x, followed by four word-wise reduction steps
// modulo p and one conditional subtraction of p.
//
// In:       x0-x3, const0, const1
// Out:      y0-y3
// Clobbers: acc0-acc7, t0-t3, flags (x is preserved; hlp0/hlp1 are untouched)
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2 (each cross product occurs twice in the square)
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products (the diagonal terms x[i]*x[i])
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	// Folds acc0 into the limbs above it; the live low half becomes
	// (acc1, acc2, acc3, acc0). The following steps rotate the same way.
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p (five limbs, acc4 is the overflow limb); carry clear
	// afterwards means the subtraction borrowed
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// y = (no borrow) ? t : acc
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
893 /* ---------------------------------------*/
// p256MulInternal: y = x * y, with a word-wise reduction step modulo p after
// each row of partial products and one conditional subtraction of p at the end.
//
// In:       x0-x3, y0-y3, const0, const1
// Out:      y0-y3
// Clobbers: acc0-acc7, t0-t3, hlp0, flags (x is preserved; hlp1 is untouched)
//
// Two partial sums are kept: the reduction limbs rotate through acc0-acc3
// while the product limbs grow into acc4-acc7; they are added at the end.
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	// Add the high halves one limb further up
	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p (five limbs, acc4 is the overflow limb); carry clear
	// afterwards means the subtraction borrowed
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// y = (no borrow) ? t : acc
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
1027 /* ---------------------------------------*/
// p256MulBy2Inline: x = 2*y, with p subtracted once if the doubled value
// (257 bits, top bit kept in hlp0) is not smaller than p.
// In:       y0-y3, const0, const1
// Out:      x0-x3
// Clobbers: t0-t3, hlp0, flags (y is preserved)
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CS, t0, x0, x0;\
	CSEL	CS, t1, x1, x1;\
	CSEL	CS, t2, x2, x2;\
	CSEL	CS, t3, x3, x3;
1043 /* ---------------------------------------*/
// Coordinate accessors. A point stores x, y, z as three consecutive 32-byte
// values; off is a byte offset inside the selected coordinate.
// in1 is addressed through a_ptr, in2 through b_ptr, the result through res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// Load/store a whole 256-bit value between memory and the x or y register
// group. src is the NAME of one of the accessor macros (e.g. LDx(z1in)).
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// 32-byte temporaries in the stack frame, starting at 8(RSP).
// Slots 0-7 are used by p256PointAddAffineAsm (frame $264 = 8*32 + 8).
#define y2in(off) (32*0 + 8 + off)(RSP)
#define s2(off) (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off) (32*3 + 8 + off)(RSP)
#define r(off) (32*4 + 8 + off)(RSP)
#define hsqr(off) (32*5 + 8 + off)(RSP)
#define rsqr(off) (32*6 + 8 + off)(RSP)
#define hcub(off) (32*7 + 8 + off)(RSP)

// Slots 8-11 are only used by p256PointAddAsm (frame $392 = 12*32 + 8).
#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)
1070
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Adds the affine point in2 to in1 and stores the result in res.
//   sign != 0: the y coordinate of in2 is replaced by p - y before the addition.
//   sel  == 0: the computed sum is discarded and res receives in1.
//   zero == 0: the computed sum is discarded and res receives (x2, y2, p256one),
//              with y2 being the possibly negated value. This selection is
//              applied after the sel one, so it wins when both are 0.
// The full addition is always computed; the choices are made with CSEL.
//
// Register notes:
//   - hlp1 (= R0) keeps the two selection bits for the whole function
//     (bit 0 = sel != 0, bit 1 = zero != 0), so res is re-read from the frame
//     into t0 before every store.
//   - hlp0 holds sign only until y2 has been fixed up; p256MulInternal
//     clobbers it later.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr
	MOVD	sign+24(FP), hlp0
	MOVD	sel+32(FP), hlp1
	MOVD	zero+40(FP), t2

	// Normalize sel and zero to 0/1
	MOVD	$1, t0
	CMP	$0, t2
	CSEL	EQ, ZR, t0, t2
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1
	// hlp1 = (sel != 0) | (zero != 0) << 1
	EOR	t2<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)
	MOVD	$-1, acc0

	// acc[0..3] = p - y2, t0 = 0 or all ones (borrow)
	SUBS	y0, acc0, acc0
	SBCS	y1, const0, acc1
	SBCS	y2, ZR, acc2
	SBCS	y3, const1, acc3
	SBC	$0, ZR, t0

	// acc[4..7] = acc[0..3] + p; t0 returns to 0 if the borrow is cancelled
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADCS	const1, acc3, acc7
	ADC	$0, t0, t0

	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	p256MulInternal<>(SB)    // x2 * z1^2

	LDx(x1in)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	p256MulInternal<>(SB)    // z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	// z1 must be read before z3 is stored, since res may be the same point as in1
	// NOTE(review): aliasing of res and in1 is an assumption about callers -- confirm
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	p256MulInternal<>(SB)    // z1 ^ 3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy	(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // y1 * h^3
	STy(s2)                          // the s2 slot now holds y1 * h^3

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	p256MulInternal<>(SB)    // u1 * h^2
	STP	(y0, y1), h(0*8)         // the h slot now holds u1 * h^2
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline               // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3 (x3 after selection)

	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - y1 * h^3
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET
1246
// p256AddInline: x = x + y, with p subtracted once if the sum (257 bits, top
// bit kept in hlp0) is not smaller than p.
// In:       x0-x3, y0-y3, const0, const1
// Out:      x0-x3
// Clobbers: t0-t3, hlp0, flags (y is preserved)
#define p256AddInline          \
	ADDS	y0, x0, x0;    \
	ADCS	y1, x1, x1;    \
	ADCS	y2, x2, x2;    \
	ADCS	y3, x3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CS, t0, x0, x0;\
	CSEL	CS, t1, x1, x1;\
	CSEL	CS, t2, x2, x2;\
	CSEL	CS, t3, x3, x3;
1262
// 32-byte temporaries of p256PointDoubleAsm (frame $136 = 4*32 + 8).
// They occupy the same frame offsets as slots 0-3 of the point-add routines
// (y2in, s2, z1sqr, h); each function has its own frame, so nothing is shared.
#define s(off)	(32*0 + 8 + off)(RSP)
#define m(off)	(32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off)  (32*3 + 8 + off)(RSP)
1267
// func p256PointDoubleAsm(res, in *P256Point)
//
// Doubles the point *in (coordinates x1, y1, z1) and stores the result in *res.
// Values computed, in order:
//   zsqr = z1^2
//   m    = 3 * (x1 - zsqr) * (x1 + zsqr)
//   z3   = 2 * y1 * z1
//   s    = 4 * x1 * y1^2
//   x3   = m^2 - 2*s
//   y3   = m * (s - x3) - 8 * y1^4
// z3 and y3out are written to *res while *in is still being read.
// NOTE(review): whether callers pass res == in is not visible here -- confirm
// before reordering any load/store.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point double
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	p256SqrInternal<>(SB)    // y = z1^2
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline                  // x = x1 + z1^2
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL	p256MulInternal<>(SB)    // y = z1 * y1
	p256MulBy2Inline               // x = 2 * z1 * y1
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL	p256SubInternal<>(SB)    // x = x1 - z1^2
	LDy(m)
	CALL	p256MulInternal<>(SB)    // y = (x1 - z1^2) * (x1 + z1^2)

	// Multiply by 3
	p256MulBy2Inline               // x = 2*y
	p256AddInline                  // x = 2*y + y
	STx(m)

	LDy(y1in)
	p256MulBy2Inline               // x = 2 * y1
	CALL	p256SqrInternal<>(SB)    // y = 4 * y1^2
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CALL	p256SqrInternal<>(SB)    // y = 16 * y1^4

	// Divide by 2
	// t = y + p with the carry in hlp0; an odd y becomes even after adding
	// the (odd) modulus, so the value can be shifted right by one bit.
	ADDS	$-1, y0, t0
	ADCS	const0, y1, t1
	ADCS	$0, y2, t2
	ADCS	const1, y3, t3
	ADC	$0, ZR, hlp0

	// y already even: shift y itself instead
	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, t2, t2
	CSEL	EQ, y3, t3, t3
	// Bit 0 of hlp0 (the only bit shifted in below) is kept only when y was odd
	AND	y0, hlp0, hlp0

	// 257-bit right shift by one across (hlp0, t3, t2, t1, t0)
	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, t2, y1
	EXTR	$1, t2, t3, y2
	EXTR	$1, t3, hlp0, y3
	STy(y3out)                     // y3out temporarily holds 8 * y1^4

	LDx(x1in)
	LDy(s)
	CALL	p256MulInternal<>(SB)    // y = s = 4 * x1 * y1^2
	STy(s)
	p256MulBy2Inline               // x = 2 * s
	STx(tmp)

	LDx(m)
	CALL	p256SqrInternal<>(SB)    // y = m^2
	LDx(tmp)
	CALL	p256SubInternal<>(SB)    // x = m^2 - 2*s

	STx(x3out)

	LDy(s)
	CALL	p256SubInternal<>(SB)    // x = s - x3

	LDy(m)
	CALL	p256MulInternal<>(SB)    // y = m * (s - x3)

	LDx(y3out)
	CALL	p256SubInternal<>(SB)    // x = m * (s - x3) - 8 * y1^4
	STx(y3out)
	RET
1359 /* ---------------------------------------*/
// Re-target four accessors for p256PointAddAsm, which follows:
//   - y2in now reads the y coordinate of in2 through b_ptr (it was a stack
//     slot in p256PointAddAffineAsm);
//   - x3out/y3out/z3out address the result through b_ptr instead of res_ptr,
//     because R0 (res_ptr/hlp1) carries a flag in that function. b_ptr must
//     therefore be loaded with res before any *3out macro is used.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Adds the points *in1 and *in2 and stores the sum in *res. The sum is always
// computed and stored. The return value is 1 when both r = s2 - s1 and
// h = u2 - u1 are congruent to zero (each is compared against 0 and against
// p), and 0 otherwise.
// NOTE(review): a return of 1 presumably means the inputs were the same point
// and the stored sum must be replaced by a doubling -- confirm against the caller.
//
// Register notes:
//   - hlp1 (= R0) carries the "r is zero" flag and later the return value, so
//     res is loaded into b_ptr (after the last read of in2) rather than R0.
//   - hlp0 carries the "h is zero" flag only until it is merged into hlp1;
//     p256MulInternal clobbers hlp0.
TEXT ·p256PointAddAsm(SB),0,$392-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Inputs are read directly through a_ptr/b_ptr; only intermediates go to the stack.
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL	p256SqrInternal<>(SB)    // z2^2
	STy(z2sqr)

	CALL	p256MulInternal<>(SB)    // z2^3

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	CALL	p256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2

	LDx(s1)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	// hlp1 = 1 if r == 0 ...
	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp1

	// ... or if r == p (XOR with the limbs of p; limb 2 of p is zero)
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	// hlp0 = 1 if h == 0 ...
	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp0

	// ... or if h == p
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp0, hlp0

	// Return flag: r and h both zero
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	p256MulInternal<>(SB)    // s1 * h^3
	STy(s2)                          // the s2 slot now holds s1 * h^3

	LDx(z1in)
	LDy(z2in)
	CALL	p256MulInternal<>(SB)    // z1 * z2
	LDx(h)
	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
	// in2 is not read again: b_ptr now addresses the result for the *3out macros
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	p256MulInternal<>(SB)    // h^2 * u1
	STy(u2)                          // the u2 slot now holds u1 * h^2

	p256MulBy2Inline               // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3

	LDy(r)
	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDx(s2)
	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	MOVD	hlp1, R0
	MOVD	R0, ret+24(FP)

	RET
1507
View as plain text