1 // Code generated by command: go run p256_asm.go -out ../p256_asm_amd64.s. DO NOT EDIT.
2
3 //go:build !purego
4
5 #include "textflag.h"
6
7 // func p256MovCond(res *P256Point, a *P256Point, b *P256Point, cond int)
8 // Requires: SSE2
//
// p256MovCond writes 96 bytes to res: the 96 bytes at a when cond is non-zero,
// the 96 bytes at b when cond is zero. Both inputs are always read in full and
// the choice is made with a bit mask (PANDN/PAND/PXOR), not with a branch.
//
// NOTE(review): only the low 32 bits of cond take part in the test, because
// PSHUFD $0x00 broadcasts dword 0 before the compare. A cond whose low dword is
// zero selects b even if higher bits are set.
9 TEXT ·p256MovCond(SB), NOSPLIT, $0-32
10 MOVQ res+0(FP), DI // DI = res
11 MOVQ a+8(FP), SI // SI = a
12 MOVQ b+16(FP), CX // CX = b
13 MOVQ cond+24(FP), X12
14 PXOR X13, X13 // X13 = 0
15 PSHUFD $0x00, X12, X12 // copy the low dword of cond into all four lanes
16 PCMPEQL X13, X12 // X12 = mask: all ones if that dword == 0, else all zeros
17 MOVOU X12, X0
18 MOVOU (SI), X6
19 PANDN X6, X0 // X0 = ^mask & a[0:16]
20 MOVOU X12, X1
21 MOVOU 16(SI), X7
22 PANDN X7, X1 // X1 = ^mask & a[16:32]
23 MOVOU X12, X2
24 MOVOU 32(SI), X8
25 PANDN X8, X2 // X2 = ^mask & a[32:48]
26 MOVOU X12, X3
27 MOVOU 48(SI), X9
28 PANDN X9, X3 // X3 = ^mask & a[48:64]
29 MOVOU X12, X4
30 MOVOU 64(SI), X10
31 PANDN X10, X4 // X4 = ^mask & a[64:80]
32 MOVOU X12, X5
33 MOVOU 80(SI), X11
34 PANDN X11, X5 // X5 = ^mask & a[80:96]
35 MOVOU (CX), X6 // X6..X11 are reused for the six 16-byte chunks of b
36 MOVOU 16(CX), X7
37 MOVOU 32(CX), X8
38 MOVOU 48(CX), X9
39 MOVOU 64(CX), X10
40 MOVOU 80(CX), X11
41 PAND X12, X6 // keep b only where the mask is set
42 PAND X12, X7
43 PAND X12, X8
44 PAND X12, X9
45 PAND X12, X10
46 PAND X12, X11
47 PXOR X6, X0 // combine the masked a chunk with the masked b chunk
48 PXOR X7, X1
49 PXOR X8, X2
50 PXOR X9, X3
51 PXOR X10, X4
52 PXOR X11, X5
53 MOVOU X0, (DI) // store the 96-byte result at res
54 MOVOU X1, 16(DI)
55 MOVOU X2, 32(DI)
56 MOVOU X3, 48(DI)
57 MOVOU X4, 64(DI)
58 MOVOU X5, 80(DI)
59 RET
60
61 // func p256NegCond(val *p256Element, cond int)
62 // Requires: CMOV
//
// p256NegCond overwrites the four 64-bit limbs at val (least significant
// first) with poly - val when cond is non-zero, and leaves them unchanged when
// cond is zero. poly is the 256-bit constant whose limbs are
// (-1, p256const0, 0, p256const1). The subtraction is always performed; CMOV
// then picks between the difference and the original limbs.
//
// NOTE(review): the borrow out of the top limb is never examined, so the code
// relies on val <= poly — confirm with callers. For val == 0 the stored result
// is poly itself.
63 TEXT ·p256NegCond(SB), NOSPLIT, $0-16
64 MOVQ val+0(FP), DI // DI = val
65 MOVQ cond+8(FP), R14 // R14 = cond
66
67 // acc = poly
68 MOVQ $-1, R8
69 MOVQ p256const0<>+0(SB), R9
70 MOVQ $+0, R10
71 MOVQ p256const1<>+0(SB), R11
72
73 // Load the original value
74 MOVQ (DI), R13
75 MOVQ 8(DI), SI
76 MOVQ 16(DI), CX
77 MOVQ 24(DI), R15
78
79 // Speculatively subtract: (R8, R9, R10, R11) = poly - val
80 SUBQ R13, R8
81 SBBQ SI, R9
82 SBBQ CX, R10
83 SBBQ R15, R11
84
85 // If condition is 0, keep original value
86 TESTQ R14, R14 // ZF = (cond == 0), tested on all 64 bits
87 CMOVQEQ R13, R8
88 CMOVQEQ SI, R9
89 CMOVQEQ CX, R10
90 CMOVQEQ R15, R11
91
92 // Store result
93 MOVQ R8, (DI)
94 MOVQ R9, 8(DI)
95 MOVQ R10, 16(DI)
96 MOVQ R11, 24(DI)
97 RET
98
99 DATA p256const0<>+0(SB)/8, $0x00000000ffffffff // used in this file as limb 1 of the modulus (-1, p256const0, 0, p256const1)
100 GLOBL p256const0<>(SB), RODATA, $8 // file-local, read-only, 8 bytes
101
102 DATA p256const1<>+0(SB)/8, $0xffffffff00000001 // used in this file as limb 3 of the modulus and as the MULQ operand of each field reduction step
103 GLOBL p256const1<>(SB), RODATA, $8 // file-local, read-only, 8 bytes
104
105 // func p256Sqr(res *p256Element, in *p256Element, n int)
106 // Requires: CMOV
//
// p256Sqr squares the four-limb value at in, reduces it and stores the
// four-limb result at res. It repeats this n times; after the first pass the
// input pointer is replaced by res, so each pass squares the previous result.
//
// One pass:
//   1. The 512-bit square is built in R8, R9, R10, R11, R12, R13, CX, SI
//      (least significant limb first): cross products y[i]*y[j], i < j, are
//      summed, doubled, and the four squares y[i]*y[i] are added in.
//   2. Four reduction steps fold the current lowest limb into the limbs above
//      it, using limb<<32, limb>>32 and limb*p256const1.
//   3. The upper 256 bits are added, then the modulus
//      (-1, p256const0, 0, p256const1) is subtracted; the difference is kept
//      unless the subtraction borrowed.
//
// NOTE(review): the counter is decremented and tested after the body
// (DECQ/JNE), so n must be at least 1; n == 0 would wrap the counter.
107 TEXT ·p256Sqr(SB), NOSPLIT, $0-24
108 MOVQ res+0(FP), DI // DI = res
109 MOVQ in+8(FP), SI // SI = in (re-pointed at res at the end of every pass)
110 MOVQ n+16(FP), BX // BX = remaining passes
111
112 sqrLoop:
113 // y[1:] * y[0]
114 MOVQ (SI), R14
115 MOVQ 8(SI), AX
116 MULQ R14
117 MOVQ AX, R9
118 MOVQ DX, R10
119 MOVQ 16(SI), AX
120 MULQ R14
121 ADDQ AX, R10
122 ADCQ $0x00, DX
123 MOVQ DX, R11
124 MOVQ 24(SI), AX
125 MULQ R14
126 ADDQ AX, R11
127 ADCQ $0x00, DX
128 MOVQ DX, R12
129
130 // y[2:] * y[1]
131 MOVQ 8(SI), R14
132 MOVQ 16(SI), AX
133 MULQ R14
134 ADDQ AX, R11
135 ADCQ $0x00, DX
136 MOVQ DX, R15
137 MOVQ 24(SI), AX
138 MULQ R14
139 ADDQ R15, R12
140 ADCQ $0x00, DX
141 ADDQ AX, R12
142 ADCQ $0x00, DX
143 MOVQ DX, R13
144
145 // y[3] * y[2]
146 MOVQ 16(SI), R14
147 MOVQ 24(SI), AX
148 MULQ R14
149 ADDQ AX, R13
150 ADCQ $0x00, DX
151 MOVQ DX, CX
152 XORQ R15, R15 // R15 = 0, receives the bit shifted out by the doubling below
153
154 // *2 (double the sum of cross products held in R9, R10, R11, R12, R13, CX)
155 ADDQ R9, R9
156 ADCQ R10, R10
157 ADCQ R11, R11
158 ADCQ R12, R12
159 ADCQ R13, R13
160 ADCQ CX, CX
161 ADCQ $0x00, R15
162
163 // Missing products (the squares y[i] * y[i])
164 MOVQ (SI), AX
165 MULQ AX
166 MOVQ AX, R8
167 MOVQ DX, R14
168 MOVQ 8(SI), AX
169 MULQ AX
170 ADDQ R14, R9
171 ADCQ AX, R10
172 ADCQ $0x00, DX
173 MOVQ DX, R14
174 MOVQ 16(SI), AX
175 MULQ AX
176 ADDQ R14, R11
177 ADCQ AX, R12
178 ADCQ $0x00, DX
179 MOVQ DX, R14
180 MOVQ 24(SI), AX
181 MULQ AX
182 ADDQ R14, R13
183 ADCQ AX, CX
184 ADCQ DX, R15
185 MOVQ R15, SI // the input pointer is no longer needed; SI now holds the top limb of the square
186
187 // First reduction step (folds R8 into R9, R10, R11; R8 becomes the new top limb)
188 MOVQ R8, AX
189 MOVQ R8, R15
190 SHLQ $0x20, R8 // R8 = limb << 32
191 MULQ p256const1<>+0(SB) // DX:AX = limb * p256const1
192 SHRQ $0x20, R15 // R15 = limb >> 32
193 ADDQ R8, R9
194 ADCQ R15, R10
195 ADCQ AX, R11
196 ADCQ $0x00, DX
197 MOVQ DX, R8
198
199 // Second reduction step (folds R9 into R10, R11, R8; R9 becomes the new top limb)
200 MOVQ R9, AX
201 MOVQ R9, R15
202 SHLQ $0x20, R9
203 MULQ p256const1<>+0(SB)
204 SHRQ $0x20, R15
205 ADDQ R9, R10
206 ADCQ R15, R11
207 ADCQ AX, R8
208 ADCQ $0x00, DX
209 MOVQ DX, R9
210
211 // Third reduction step (folds R10 into R11, R8, R9; R10 becomes the new top limb)
212 MOVQ R10, AX
213 MOVQ R10, R15
214 SHLQ $0x20, R10
215 MULQ p256const1<>+0(SB)
216 SHRQ $0x20, R15
217 ADDQ R10, R11
218 ADCQ R15, R8
219 ADCQ AX, R9
220 ADCQ $0x00, DX
221 MOVQ DX, R10
222
223 // Last reduction step (folds R11 into R8, R9, R10; R11 becomes the new top limb)
224 XORQ R14, R14 // R14 = 0, used as the carry limb of the addition below
225 MOVQ R11, AX
226 MOVQ R11, R15
227 SHLQ $0x20, R11
228 MULQ p256const1<>+0(SB)
229 SHRQ $0x20, R15
230 ADDQ R11, R8
231 ADCQ R15, R9
232 ADCQ AX, R10
233 ADCQ $0x00, DX
234 MOVQ DX, R11
235
236 // Add bits [511:256] of the sqr result
237 ADCQ R12, R8 // an ADCQ, so it also consumes the carry flag left by the ADCQ into DX above (MOVQ leaves flags alone); NOTE(review): presumably always clear here — confirm
238 ADCQ R13, R9
239 ADCQ CX, R10
240 ADCQ SI, R11
241 ADCQ $0x00, R14
242 MOVQ R8, R12 // keep a copy of the sum in R12, R13, CX, R15
243 MOVQ R9, R13
244 MOVQ R10, CX
245 MOVQ R11, R15
246
247 // Subtract p256
248 SUBQ $-1, R8
249 SBBQ p256const0<>+0(SB), R9
250 SBBQ $0x00, R10
251 SBBQ p256const1<>+0(SB), R11
252 SBBQ $0x00, R14
253 CMOVQCS R12, R8 // the subtraction borrowed: restore the saved copy
254 CMOVQCS R13, R9
255 CMOVQCS CX, R10
256 CMOVQCS R15, R11
257 MOVQ R8, (DI)
258 MOVQ R9, 8(DI)
259 MOVQ R10, 16(DI)
260 MOVQ R11, 24(DI)
261 MOVQ DI, SI // the next pass reads the value just stored at res
262 DECQ BX
263 JNE sqrLoop
264 RET
265
266 // func p256Mul(res *p256Element, in1 *p256Element, in2 *p256Element)
267 // Requires: CMOV
//
// p256Mul multiplies the four-limb values at in1 (called x in the comments
// below) and in2 (called y), reduces the product and stores the four-limb
// result at res.
//
// Multiplication and reduction are interleaved: after each row x * y[i] is
// added to the running value, one reduction step folds the lowest limb into
// the limbs above it (limb<<32, limb>>32 and limb*p256const1). The register
// that held the folded limb is then zeroed and reused as the new top limb, so
// the running value rotates through R8..R13. At the end the modulus
// (-1, p256const0, 0, p256const1) is subtracted and the difference is kept
// unless the subtraction borrowed.
268 TEXT ·p256Mul(SB), NOSPLIT, $0-24
269 MOVQ res+0(FP), DI // DI = res
270 MOVQ in1+8(FP), SI // SI = in1 (x)
271 MOVQ in2+16(FP), CX // CX = in2 (y)
272
273 // x * y[0]
274 MOVQ (CX), R14
275 MOVQ (SI), AX
276 MULQ R14
277 MOVQ AX, R8
278 MOVQ DX, R9
279 MOVQ 8(SI), AX
280 MULQ R14
281 ADDQ AX, R9
282 ADCQ $0x00, DX
283 MOVQ DX, R10
284 MOVQ 16(SI), AX
285 MULQ R14
286 ADDQ AX, R10
287 ADCQ $0x00, DX
288 MOVQ DX, R11
289 MOVQ 24(SI), AX
290 MULQ R14
291 ADDQ AX, R11
292 ADCQ $0x00, DX
293 MOVQ DX, R12
294 XORQ R13, R13
295
296 // First reduction step (folds R8 into R9..R13)
297 MOVQ R8, AX
298 MOVQ R8, R15
299 SHLQ $0x20, R8 // R8 = limb << 32
300 MULQ p256const1<>+0(SB) // DX:AX = limb * p256const1
301 SHRQ $0x20, R15 // R15 = limb >> 32
302 ADDQ R8, R9
303 ADCQ R15, R10
304 ADCQ AX, R11
305 ADCQ DX, R12
306 ADCQ $0x00, R13
307 XORQ R8, R8 // R8 becomes the new (empty) top limb
308
309 // x * y[1]
310 MOVQ 8(CX), R14
311 MOVQ (SI), AX
312 MULQ R14
313 ADDQ AX, R9
314 ADCQ $0x00, DX
315 MOVQ DX, R15
316 MOVQ 8(SI), AX
317 MULQ R14
318 ADDQ R15, R10
319 ADCQ $0x00, DX
320 ADDQ AX, R10
321 ADCQ $0x00, DX
322 MOVQ DX, R15
323 MOVQ 16(SI), AX
324 MULQ R14
325 ADDQ R15, R11
326 ADCQ $0x00, DX
327 ADDQ AX, R11
328 ADCQ $0x00, DX
329 MOVQ DX, R15
330 MOVQ 24(SI), AX
331 MULQ R14
332 ADDQ R15, R12
333 ADCQ $0x00, DX
334 ADDQ AX, R12
335 ADCQ DX, R13
336 ADCQ $0x00, R8
337
338 // Second reduction step (folds R9 into R10..R13, R8)
339 MOVQ R9, AX
340 MOVQ R9, R15
341 SHLQ $0x20, R9
342 MULQ p256const1<>+0(SB)
343 SHRQ $0x20, R15
344 ADDQ R9, R10
345 ADCQ R15, R11
346 ADCQ AX, R12
347 ADCQ DX, R13
348 ADCQ $0x00, R8
349 XORQ R9, R9 // R9 becomes the new (empty) top limb
350
351 // x * y[2]
352 MOVQ 16(CX), R14
353 MOVQ (SI), AX
354 MULQ R14
355 ADDQ AX, R10
356 ADCQ $0x00, DX
357 MOVQ DX, R15
358 MOVQ 8(SI), AX
359 MULQ R14
360 ADDQ R15, R11
361 ADCQ $0x00, DX
362 ADDQ AX, R11
363 ADCQ $0x00, DX
364 MOVQ DX, R15
365 MOVQ 16(SI), AX
366 MULQ R14
367 ADDQ R15, R12
368 ADCQ $0x00, DX
369 ADDQ AX, R12
370 ADCQ $0x00, DX
371 MOVQ DX, R15
372 MOVQ 24(SI), AX
373 MULQ R14
374 ADDQ R15, R13
375 ADCQ $0x00, DX
376 ADDQ AX, R13
377 ADCQ DX, R8
378 ADCQ $0x00, R9
379
380 // Third reduction step (folds R10 into R11..R13, R8, R9)
381 MOVQ R10, AX
382 MOVQ R10, R15
383 SHLQ $0x20, R10
384 MULQ p256const1<>+0(SB)
385 SHRQ $0x20, R15
386 ADDQ R10, R11
387 ADCQ R15, R12
388 ADCQ AX, R13
389 ADCQ DX, R8
390 ADCQ $0x00, R9
391 XORQ R10, R10 // R10 becomes the new (empty) top limb
392
393 // x * y[3]
394 MOVQ 24(CX), R14
395 MOVQ (SI), AX
396 MULQ R14
397 ADDQ AX, R11
398 ADCQ $0x00, DX
399 MOVQ DX, R15
400 MOVQ 8(SI), AX
401 MULQ R14
402 ADDQ R15, R12
403 ADCQ $0x00, DX
404 ADDQ AX, R12
405 ADCQ $0x00, DX
406 MOVQ DX, R15
407 MOVQ 16(SI), AX
408 MULQ R14
409 ADDQ R15, R13
410 ADCQ $0x00, DX
411 ADDQ AX, R13
412 ADCQ $0x00, DX
413 MOVQ DX, R15
414 MOVQ 24(SI), AX
415 MULQ R14
416 ADDQ R15, R8
417 ADCQ $0x00, DX
418 ADDQ AX, R8
419 ADCQ DX, R9
420 ADCQ $0x00, R10
421
422 // Last reduction step (folds R11 into R12, R13, R8, R9, R10)
423 MOVQ R11, AX
424 MOVQ R11, R15
425 SHLQ $0x20, R11
426 MULQ p256const1<>+0(SB)
427 SHRQ $0x20, R15
428 ADDQ R11, R12
429 ADCQ R15, R13
430 ADCQ AX, R8
431 ADCQ DX, R9
432 ADCQ $0x00, R10
433
434 // Copy result [255:0] (held in R12, R13, R8, R9; R10 is the carry limb)
435 MOVQ R12, SI
436 MOVQ R13, R11
437 MOVQ R8, R14
438 MOVQ R9, R15
439
440 // Subtract p256
441 SUBQ $-1, R12
442 SBBQ p256const0<>+0(SB), R13
443 SBBQ $0x00, R8
444 SBBQ p256const1<>+0(SB), R9
445 SBBQ $0x00, R10
446 CMOVQCS SI, R12 // the subtraction borrowed: restore the saved copy
447 CMOVQCS R11, R13
448 CMOVQCS R14, R8
449 CMOVQCS R15, R9
450 MOVQ R12, (DI)
451 MOVQ R13, 8(DI)
452 MOVQ R8, 16(DI)
453 MOVQ R9, 24(DI)
454 RET
455
456 // func p256FromMont(res *p256Element, in *p256Element)
457 // Requires: CMOV
//
// p256FromMont loads the four-limb value at in and runs the same four
// reduction stages that p256Mul interleaves with its multiplication rows, but
// with no multiplication rows in between. It then subtracts the modulus
// (-1, p256const0, 0, p256const1), keeps the difference unless the subtraction
// borrowed, and stores the four-limb result at res.
//
// Each stage folds the current lowest limb into the limbs above it using
// limb<<32, limb>>32 and limb*p256const1; the folded register is then zeroed
// and reused as the new top limb.
//
// NOTE(review): the final subtraction works on four limbs only; there is no
// separate carry limb here, unlike in p256Mul.
458 TEXT ·p256FromMont(SB), NOSPLIT, $0-16
459 MOVQ res+0(FP), DI // DI = res
460 MOVQ in+8(FP), SI // SI = in
461 MOVQ (SI), R8
462 MOVQ 8(SI), R9
463 MOVQ 16(SI), R10
464 MOVQ 24(SI), R11
465 XORQ R12, R12
466
467 // Only reduce, no multiplications are needed
468 // First stage (folds R8 into R9, R10, R11, R12)
469 MOVQ R8, AX
470 MOVQ R8, R15
471 SHLQ $0x20, R8 // R8 = limb << 32
472 MULQ p256const1<>+0(SB) // DX:AX = limb * p256const1
473 SHRQ $0x20, R15 // R15 = limb >> 32
474 ADDQ R8, R9
475 ADCQ R15, R10
476 ADCQ AX, R11
477 ADCQ DX, R12
478 XORQ R13, R13
479
480 // Second stage (folds R9 into R10, R11, R12, R13)
481 MOVQ R9, AX
482 MOVQ R9, R15
483 SHLQ $0x20, R9
484 MULQ p256const1<>+0(SB)
485 SHRQ $0x20, R15
486 ADDQ R9, R10
487 ADCQ R15, R11
488 ADCQ AX, R12
489 ADCQ DX, R13
490 XORQ R8, R8
491
492 // Third stage (folds R10 into R11, R12, R13, R8)
493 MOVQ R10, AX
494 MOVQ R10, R15
495 SHLQ $0x20, R10
496 MULQ p256const1<>+0(SB)
497 SHRQ $0x20, R15
498 ADDQ R10, R11
499 ADCQ R15, R12
500 ADCQ AX, R13
501 ADCQ DX, R8
502 XORQ R9, R9
503
504 // Last stage (folds R11 into R12, R13, R8, R9)
505 MOVQ R11, AX
506 MOVQ R11, R15
507 SHLQ $0x20, R11
508 MULQ p256const1<>+0(SB)
509 SHRQ $0x20, R15
510 ADDQ R11, R12
511 ADCQ R15, R13
512 ADCQ AX, R8
513 ADCQ DX, R9
514 MOVQ R12, SI // keep a copy of the value in SI, R11, R14, R15
515 MOVQ R13, R11
516 MOVQ R8, R14
517 MOVQ R9, R15
518 SUBQ $-1, R12 // subtract the modulus (-1, p256const0, 0, p256const1)
519 SBBQ p256const0<>+0(SB), R13
520 SBBQ $0x00, R8
521 SBBQ p256const1<>+0(SB), R9
522 CMOVQCS SI, R12 // the subtraction borrowed: restore the saved copy
523 CMOVQCS R11, R13
524 CMOVQCS R14, R8
525 CMOVQCS R15, R9
526 MOVQ R12, (DI)
527 MOVQ R13, 8(DI)
528 MOVQ R8, 16(DI)
529 MOVQ R9, 24(DI)
530 RET
531
532 // func p256Select(res *P256Point, table *p256Table, idx int)
533 // Requires: SSE2
//
// p256Select copies one 96-byte entry of table to res. The loop always reads
// 16 consecutive 96-byte entries and combines them under a mask that is all
// ones only for the entry whose 1-based position equals idx, so the sequence
// of memory reads does not depend on idx.
//
// The position counter starts at 1: idx == 1 selects the first entry and
// idx == 16 the last. If no position matches (for example idx == 0), res
// receives 96 zero bytes. Only the low 32 bits of idx are compared (MOVL).
534 TEXT ·p256Select(SB), NOSPLIT, $0-24
535 MOVQ idx+16(FP), AX
536 MOVQ table+8(FP), DI // DI = current table entry
537 MOVQ res+0(FP), DX // DX = res
538 PXOR X15, X15
539 PCMPEQL X14, X14 // X14 = all ones, i.e. -1 in every dword lane
540 PSUBL X14, X15 // X15 = 0 - (-1) = 1 in every dword lane
541 MOVL AX, X14
542 PSHUFD $0x00, X14, X14 // X14 = idx (low 32 bits) in every lane
543 PXOR X0, X0 // X0..X5 accumulate the selected entry
544 PXOR X1, X1
545 PXOR X2, X2
546 PXOR X3, X3
547 PXOR X4, X4
548 PXOR X5, X5
549 MOVQ $0x00000010, AX // 16 entries to scan
550 MOVOU X15, X13 // X13 = position counter, starts at 1
551
552 loop_select:
553 MOVOU X13, X12
554 PADDL X15, X13 // advance the position counter for the next pass
555 PCMPEQL X14, X12 // X12 = all ones if this position == idx, else zero
556 MOVOU (DI), X6
557 MOVOU 16(DI), X7
558 MOVOU 32(DI), X8
559 MOVOU 48(DI), X9
560 MOVOU 64(DI), X10
561 MOVOU 80(DI), X11
562 ADDQ $0x60, DI // step to the next 96-byte entry
563 PAND X12, X6
564 PAND X12, X7
565 PAND X12, X8
566 PAND X12, X9
567 PAND X12, X10
568 PAND X12, X11
569 PXOR X6, X0
570 PXOR X7, X1
571 PXOR X8, X2
572 PXOR X9, X3
573 PXOR X10, X4
574 PXOR X11, X5
575 DECQ AX
576 JNE loop_select
577 MOVOU X0, (DX)
578 MOVOU X1, 16(DX)
579 MOVOU X2, 32(DX)
580 MOVOU X3, 48(DX)
581 MOVOU X4, 64(DX)
582 MOVOU X5, 80(DX)
583 RET
584
585 // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
586 // Requires: SSE2
//
// p256SelectAffine copies one 64-byte entry of table to res. Each of the 16
// loop passes reads two consecutive 64-byte entries (128 bytes), so 32 entries
// are always read; each entry is combined under its own mask, which is all
// ones only when the entry's 1-based position equals idx.
//
// The position counter starts at 1: idx == 1 selects the first entry and
// idx == 32 the last. If no position matches (for example idx == 0), res
// receives 64 zero bytes. Only the low 32 bits of idx are compared (MOVL).
587 TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
588 MOVQ idx+16(FP), AX
589 MOVQ table+8(FP), DI // DI = current pair of table entries
590 MOVQ res+0(FP), DX // DX = res
591 PXOR X15, X15
592 PCMPEQL X14, X14 // X14 = all ones, i.e. -1 in every dword lane
593 PSUBL X14, X15 // X15 = 0 - (-1) = 1 in every dword lane
594 MOVL AX, X14
595 PSHUFD $0x00, X14, X14 // X14 = idx (low 32 bits) in every lane
596 PXOR X0, X0 // X0..X3 accumulate the selected entry
597 PXOR X1, X1
598 PXOR X2, X2
599 PXOR X3, X3
600 MOVQ $0x00000010, AX // 16 passes, two entries per pass
601 MOVOU X15, X13 // X13 = position counter, starts at 1
602
603 loop_select_base:
604 MOVOU X13, X12
605 PADDL X15, X13
606 PCMPEQL X14, X12 // mask for the first entry of this pair
607 MOVOU (DI), X4
608 MOVOU 16(DI), X5
609 MOVOU 32(DI), X6
610 MOVOU 48(DI), X7
611 MOVOU 64(DI), X8
612 MOVOU 80(DI), X9
613 MOVOU 96(DI), X10
614 MOVOU 112(DI), X11
615 ADDQ $0x80, DI // step over both 64-byte entries
616 PAND X12, X4
617 PAND X12, X5
618 PAND X12, X6
619 PAND X12, X7
620 MOVOU X13, X12
621 PADDL X15, X13
622 PCMPEQL X14, X12 // mask for the second entry of this pair
623 PAND X12, X8
624 PAND X12, X9
625 PAND X12, X10
626 PAND X12, X11
627 PXOR X4, X0
628 PXOR X5, X1
629 PXOR X6, X2
630 PXOR X7, X3
631 PXOR X8, X0
632 PXOR X9, X1
633 PXOR X10, X2
634 PXOR X11, X3
635 DECQ AX
636 JNE loop_select_base
637 MOVOU X0, (DX)
638 MOVOU X1, 16(DX)
639 MOVOU X2, 32(DX)
640 MOVOU X3, 48(DX)
641 RET
642
643 // func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
644 // Requires: CMOV
//
// p256OrdMul multiplies the four-limb values at in1 (called x in the comments
// below) and in2 (called y), reduces the product with respect to the four-limb
// constant p256ord and stores the four-limb result at res.
//
// Multiplication and reduction are interleaved. After each row x * y[i], one
// reduction step computes t = low 64 bits of (lowest limb * p256ordK0) and
// adds t * p256ord to the running value limb by limb; the register that held
// the lowest limb is then reused as the new top limb, so the running value
// rotates through R8..R13. At the end p256ord is subtracted and the
// difference is kept unless the subtraction borrowed.
645 TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
646 MOVQ res+0(FP), DI // DI = res
647 MOVQ in1+8(FP), SI // SI = in1 (x)
648 MOVQ in2+16(FP), CX // CX = in2 (y)
649
650 // x * y[0]
651 MOVQ (CX), R14
652 MOVQ (SI), AX
653 MULQ R14
654 MOVQ AX, R8
655 MOVQ DX, R9
656 MOVQ 8(SI), AX
657 MULQ R14
658 ADDQ AX, R9
659 ADCQ $0x00, DX
660 MOVQ DX, R10
661 MOVQ 16(SI), AX
662 MULQ R14
663 ADDQ AX, R10
664 ADCQ $0x00, DX
665 MOVQ DX, R11
666 MOVQ 24(SI), AX
667 MULQ R14
668 ADDQ AX, R11
669 ADCQ $0x00, DX
670 MOVQ DX, R12
671 XORQ R13, R13
672
673 // First reduction step (lowest limb is R8)
674 MOVQ R8, AX
675 MULQ p256ordK0<>+0(SB)
676 MOVQ AX, R14 // R14 = t = low 64 bits of R8 * p256ordK0
677 MOVQ p256ord<>+0(SB), AX
678 MULQ R14
679 ADDQ AX, R8
680 ADCQ $0x00, DX
681 MOVQ DX, R15
682 MOVQ p256ord<>+8(SB), AX
683 MULQ R14
684 ADDQ R15, R9
685 ADCQ $0x00, DX
686 ADDQ AX, R9
687 ADCQ $0x00, DX
688 MOVQ DX, R15
689 MOVQ p256ord<>+16(SB), AX
690 MULQ R14
691 ADDQ R15, R10
692 ADCQ $0x00, DX
693 ADDQ AX, R10
694 ADCQ $0x00, DX
695 MOVQ DX, R15
696 MOVQ p256ord<>+24(SB), AX
697 MULQ R14
698 ADDQ R15, R11
699 ADCQ $0x00, DX
700 ADDQ AX, R11
701 ADCQ DX, R12
702 ADCQ $0x00, R13
703
704 // x * y[1]
705 MOVQ 8(CX), R14
706 MOVQ (SI), AX
707 MULQ R14
708 ADDQ AX, R9
709 ADCQ $0x00, DX
710 MOVQ DX, R15
711 MOVQ 8(SI), AX
712 MULQ R14
713 ADDQ R15, R10
714 ADCQ $0x00, DX
715 ADDQ AX, R10
716 ADCQ $0x00, DX
717 MOVQ DX, R15
718 MOVQ 16(SI), AX
719 MULQ R14
720 ADDQ R15, R11
721 ADCQ $0x00, DX
722 ADDQ AX, R11
723 ADCQ $0x00, DX
724 MOVQ DX, R15
725 MOVQ 24(SI), AX
726 MULQ R14
727 ADDQ R15, R12
728 ADCQ $0x00, DX
729 ADDQ AX, R12
730 ADCQ DX, R13
731 ADCQ $0x00, R8 // R8 is reused as the top limb without being zeroed first; NOTE(review): presumably the first reduction step leaves it 0 — confirm
732
733 // Second reduction step (lowest limb is R9)
734 MOVQ R9, AX
735 MULQ p256ordK0<>+0(SB)
736 MOVQ AX, R14
737 MOVQ p256ord<>+0(SB), AX
738 MULQ R14
739 ADDQ AX, R9
740 ADCQ $0x00, DX
741 MOVQ DX, R15
742 MOVQ p256ord<>+8(SB), AX
743 MULQ R14
744 ADDQ R15, R10
745 ADCQ $0x00, DX
746 ADDQ AX, R10
747 ADCQ $0x00, DX
748 MOVQ DX, R15
749 MOVQ p256ord<>+16(SB), AX
750 MULQ R14
751 ADDQ R15, R11
752 ADCQ $0x00, DX
753 ADDQ AX, R11
754 ADCQ $0x00, DX
755 MOVQ DX, R15
756 MOVQ p256ord<>+24(SB), AX
757 MULQ R14
758 ADDQ R15, R12
759 ADCQ $0x00, DX
760 ADDQ AX, R12
761 ADCQ DX, R13
762 ADCQ $0x00, R8
763
764 // x * y[2]
765 MOVQ 16(CX), R14
766 MOVQ (SI), AX
767 MULQ R14
768 ADDQ AX, R10
769 ADCQ $0x00, DX
770 MOVQ DX, R15
771 MOVQ 8(SI), AX
772 MULQ R14
773 ADDQ R15, R11
774 ADCQ $0x00, DX
775 ADDQ AX, R11
776 ADCQ $0x00, DX
777 MOVQ DX, R15
778 MOVQ 16(SI), AX
779 MULQ R14
780 ADDQ R15, R12
781 ADCQ $0x00, DX
782 ADDQ AX, R12
783 ADCQ $0x00, DX
784 MOVQ DX, R15
785 MOVQ 24(SI), AX
786 MULQ R14
787 ADDQ R15, R13
788 ADCQ $0x00, DX
789 ADDQ AX, R13
790 ADCQ DX, R8
791 ADCQ $0x00, R9
792
793 // Third reduction step (lowest limb is R10)
794 MOVQ R10, AX
795 MULQ p256ordK0<>+0(SB)
796 MOVQ AX, R14
797 MOVQ p256ord<>+0(SB), AX
798 MULQ R14
799 ADDQ AX, R10
800 ADCQ $0x00, DX
801 MOVQ DX, R15
802 MOVQ p256ord<>+8(SB), AX
803 MULQ R14
804 ADDQ R15, R11
805 ADCQ $0x00, DX
806 ADDQ AX, R11
807 ADCQ $0x00, DX
808 MOVQ DX, R15
809 MOVQ p256ord<>+16(SB), AX
810 MULQ R14
811 ADDQ R15, R12
812 ADCQ $0x00, DX
813 ADDQ AX, R12
814 ADCQ $0x00, DX
815 MOVQ DX, R15
816 MOVQ p256ord<>+24(SB), AX
817 MULQ R14
818 ADDQ R15, R13
819 ADCQ $0x00, DX
820 ADDQ AX, R13
821 ADCQ DX, R8
822 ADCQ $0x00, R9
823
824 // x * y[3]
825 MOVQ 24(CX), R14
826 MOVQ (SI), AX
827 MULQ R14
828 ADDQ AX, R11
829 ADCQ $0x00, DX
830 MOVQ DX, R15
831 MOVQ 8(SI), AX
832 MULQ R14
833 ADDQ R15, R12
834 ADCQ $0x00, DX
835 ADDQ AX, R12
836 ADCQ $0x00, DX
837 MOVQ DX, R15
838 MOVQ 16(SI), AX
839 MULQ R14
840 ADDQ R15, R13
841 ADCQ $0x00, DX
842 ADDQ AX, R13
843 ADCQ $0x00, DX
844 MOVQ DX, R15
845 MOVQ 24(SI), AX
846 MULQ R14
847 ADDQ R15, R8
848 ADCQ $0x00, DX
849 ADDQ AX, R8
850 ADCQ DX, R9
851 ADCQ $0x00, R10
852
853 // Last reduction step (lowest limb is R11)
854 MOVQ R11, AX
855 MULQ p256ordK0<>+0(SB)
856 MOVQ AX, R14
857 MOVQ p256ord<>+0(SB), AX
858 MULQ R14
859 ADDQ AX, R11
860 ADCQ $0x00, DX
861 MOVQ DX, R15
862 MOVQ p256ord<>+8(SB), AX
863 MULQ R14
864 ADDQ R15, R12
865 ADCQ $0x00, DX
866 ADDQ AX, R12
867 ADCQ $0x00, DX
868 MOVQ DX, R15
869 MOVQ p256ord<>+16(SB), AX
870 MULQ R14
871 ADDQ R15, R13
872 ADCQ $0x00, DX
873 ADDQ AX, R13
874 ADCQ $0x00, DX
875 MOVQ DX, R15
876 MOVQ p256ord<>+24(SB), AX
877 MULQ R14
878 ADDQ R15, R8
879 ADCQ $0x00, DX
880 ADDQ AX, R8
881 ADCQ DX, R9
882 ADCQ $0x00, R10
883
884 // Copy result [255:0] (held in R12, R13, R8, R9; R10 is the carry limb)
885 MOVQ R12, SI
886 MOVQ R13, R11
887 MOVQ R8, R14
888 MOVQ R9, R15
889
890 // Subtract p256ord (the operands below are the four limbs of p256ord)
891 SUBQ p256ord<>+0(SB), R12
892 SBBQ p256ord<>+8(SB), R13
893 SBBQ p256ord<>+16(SB), R8
894 SBBQ p256ord<>+24(SB), R9
895 SBBQ $0x00, R10
896 CMOVQCS SI, R12 // the subtraction borrowed: restore the saved copy
897 CMOVQCS R11, R13
898 CMOVQCS R14, R8
899 CMOVQCS R15, R9
900 MOVQ R12, (DI)
901 MOVQ R13, 8(DI)
902 MOVQ R8, 16(DI)
903 MOVQ R9, 24(DI)
904 RET
905
906 DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f // per-step multiplier of the p256Ord* reduction: t = low 64 bits of (lowest limb * p256ordK0); NOTE(review): presumably -1/p256ord mod 2^64 — confirm
907 GLOBL p256ordK0<>(SB), RODATA, $8 // file-local, read-only, 8 bytes
908
909 DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551 // limb 0 (least significant) of the 256-bit modulus used by p256OrdMul and p256OrdSqr
910 DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84 // limb 1
911 DATA p256ord<>+16(SB)/8, $0xffffffffffffffff // limb 2 = 2^64 - 1; p256OrdSqr relies on this form to avoid a MULQ
912 DATA p256ord<>+24(SB)/8, $0xffffffff00000000 // limb 3 = 2^64 - 2^32; p256OrdSqr relies on this form to avoid a MULQ
913 GLOBL p256ord<>(SB), RODATA, $32 // file-local, read-only, 32 bytes
914
915 // func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
916 // Requires: CMOV
//
// p256OrdSqr squares the four-limb value at in, reduces it with respect to
// p256ord and stores the four-limb result at res. It repeats this n times;
// after the first pass the input pointer is replaced by res, so each pass
// squares the previous result.
//
// One pass:
//   1. The 512-bit square is built in R8, R9, R10, R11, R12, R13, CX, SI
//      (least significant limb first).
//   2. Four reduction steps. Each computes t = low 64 bits of
//      (lowest limb * p256ordK0) and adds t * p256ord. The two low limbs of
//      p256ord are handled with MULQ. The two high limbs are 2^64 - 1 and
//      2^64 - 2^32, so their products are formed without MULQ: t is added one
//      limb higher and t (respectively t << 32, split across AX and DX) is
//      subtracted.
//   3. The upper 256 bits are added, then p256ord is subtracted; the
//      difference is kept unless the subtraction borrowed.
//
// NOTE(review): the counter is decremented and tested after the body
// (DECQ/JNE), so n must be at least 1; n == 0 would wrap the counter.
917 TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
918 MOVQ res+0(FP), DI // DI = res
919 MOVQ in+8(FP), SI // SI = in (re-pointed at res at the end of every pass)
920 MOVQ n+16(FP), BX // BX = remaining passes
921
922 ordSqrLoop:
923 // y[1:] * y[0]
924 MOVQ (SI), R14
925 MOVQ 8(SI), AX
926 MULQ R14
927 MOVQ AX, R9
928 MOVQ DX, R10
929 MOVQ 16(SI), AX
930 MULQ R14
931 ADDQ AX, R10
932 ADCQ $0x00, DX
933 MOVQ DX, R11
934 MOVQ 24(SI), AX
935 MULQ R14
936 ADDQ AX, R11
937 ADCQ $0x00, DX
938 MOVQ DX, R12
939
940 // y[2:] * y[1]
941 MOVQ 8(SI), R14
942 MOVQ 16(SI), AX
943 MULQ R14
944 ADDQ AX, R11
945 ADCQ $0x00, DX
946 MOVQ DX, R15
947 MOVQ 24(SI), AX
948 MULQ R14
949 ADDQ R15, R12
950 ADCQ $0x00, DX
951 ADDQ AX, R12
952 ADCQ $0x00, DX
953 MOVQ DX, R13
954
955 // y[3] * y[2]
956 MOVQ 16(SI), R14
957 MOVQ 24(SI), AX
958 MULQ R14
959 ADDQ AX, R13
960 ADCQ $0x00, DX
961 MOVQ DX, CX
962 XORQ R15, R15 // R15 = 0, receives the bit shifted out by the doubling below
963
964 // *2 (double the sum of cross products held in R9, R10, R11, R12, R13, CX)
965 ADDQ R9, R9
966 ADCQ R10, R10
967 ADCQ R11, R11
968 ADCQ R12, R12
969 ADCQ R13, R13
970 ADCQ CX, CX
971 ADCQ $0x00, R15
972
973 // Missing products (the squares y[i] * y[i])
974 MOVQ (SI), AX
975 MULQ AX
976 MOVQ AX, R8
977 MOVQ DX, R14
978 MOVQ 8(SI), AX
979 MULQ AX
980 ADDQ R14, R9
981 ADCQ AX, R10
982 ADCQ $0x00, DX
983 MOVQ DX, R14
984 MOVQ 16(SI), AX
985 MULQ AX
986 ADDQ R14, R11
987 ADCQ AX, R12
988 ADCQ $0x00, DX
989 MOVQ DX, R14
990 MOVQ 24(SI), AX
991 MULQ AX
992 ADDQ R14, R13
993 ADCQ AX, CX
994 ADCQ DX, R15
995 MOVQ R15, SI // the input pointer is no longer needed; SI now holds the top limb of the square
996
997 // First reduction step (lowest limb is R8; R8 is reused as the new top limb)
998 MOVQ R8, AX
999 MULQ p256ordK0<>+0(SB)
1000 MOVQ AX, R14 // R14 = t = low 64 bits of R8 * p256ordK0
1001 MOVQ p256ord<>+0(SB), AX
1002 MULQ R14
1003 ADDQ AX, R8
1004 ADCQ $0x00, DX
1005 MOVQ DX, R15
1006 MOVQ p256ord<>+8(SB), AX
1007 MULQ R14
1008 ADDQ R15, R9
1009 ADCQ $0x00, DX
1010 ADDQ AX, R9
1011 MOVQ R14, R15 // t * (2^64 - 1): t goes one limb up (via R15) ...
1012 ADCQ DX, R10
1013 ADCQ $0x00, R15
1014 SUBQ R14, R10 // ... and t is subtracted at this limb
1015 SBBQ $0x00, R15
1016 MOVQ R14, AX
1017 MOVQ R14, DX
1018 MOVQ R14, R8 // t * (2^64 - 2^32): t goes one limb up (into R8) ...
1019 SHLQ $0x20, AX
1020 SHRQ $0x20, DX
1021 ADDQ R15, R11
1022 ADCQ $0x00, R8
1023 SUBQ AX, R11 // ... and t << 32, split across AX (low) and DX (high), is subtracted
1024 SBBQ DX, R8
1025
1026 // Second reduction step (lowest limb is R9; R9 is reused as the new top limb)
1027 MOVQ R9, AX
1028 MULQ p256ordK0<>+0(SB)
1029 MOVQ AX, R14
1030 MOVQ p256ord<>+0(SB), AX
1031 MULQ R14
1032 ADDQ AX, R9
1033 ADCQ $0x00, DX
1034 MOVQ DX, R15
1035 MOVQ p256ord<>+8(SB), AX
1036 MULQ R14
1037 ADDQ R15, R10
1038 ADCQ $0x00, DX
1039 ADDQ AX, R10
1040 MOVQ R14, R15
1041 ADCQ DX, R11
1042 ADCQ $0x00, R15
1043 SUBQ R14, R11
1044 SBBQ $0x00, R15
1045 MOVQ R14, AX
1046 MOVQ R14, DX
1047 MOVQ R14, R9
1048 SHLQ $0x20, AX
1049 SHRQ $0x20, DX
1050 ADDQ R15, R8
1051 ADCQ $0x00, R9
1052 SUBQ AX, R8
1053 SBBQ DX, R9
1054
1055 // Third reduction step (lowest limb is R10; R10 is reused as the new top limb)
1056 MOVQ R10, AX
1057 MULQ p256ordK0<>+0(SB)
1058 MOVQ AX, R14
1059 MOVQ p256ord<>+0(SB), AX
1060 MULQ R14
1061 ADDQ AX, R10
1062 ADCQ $0x00, DX
1063 MOVQ DX, R15
1064 MOVQ p256ord<>+8(SB), AX
1065 MULQ R14
1066 ADDQ R15, R11
1067 ADCQ $0x00, DX
1068 ADDQ AX, R11
1069 MOVQ R14, R15
1070 ADCQ DX, R8
1071 ADCQ $0x00, R15
1072 SUBQ R14, R8
1073 SBBQ $0x00, R15
1074 MOVQ R14, AX
1075 MOVQ R14, DX
1076 MOVQ R14, R10
1077 SHLQ $0x20, AX
1078 SHRQ $0x20, DX
1079 ADDQ R15, R9
1080 ADCQ $0x00, R10
1081 SUBQ AX, R9
1082 SBBQ DX, R10
1083
1084 // Last reduction step (lowest limb is R11; R11 is reused as the new top limb)
1085 MOVQ R11, AX
1086 MULQ p256ordK0<>+0(SB)
1087 MOVQ AX, R14
1088 MOVQ p256ord<>+0(SB), AX
1089 MULQ R14
1090 ADDQ AX, R11
1091 ADCQ $0x00, DX
1092 MOVQ DX, R15
1093 MOVQ p256ord<>+8(SB), AX
1094 MULQ R14
1095 ADDQ R15, R8
1096 ADCQ $0x00, DX
1097 ADDQ AX, R8
1098 ADCQ $0x00, DX // unlike the three earlier steps, the carry is folded into DX here before DX is added below
1099 MOVQ DX, R15 // NOTE(review): dead store — R15 is overwritten by the next instruction before it is read; the earlier steps have no counterpart of this line
1100 MOVQ R14, R15
1101 ADCQ DX, R9
1102 ADCQ $0x00, R15
1103 SUBQ R14, R9
1104 SBBQ $0x00, R15
1105 MOVQ R14, AX
1106 MOVQ R14, DX
1107 MOVQ R14, R11
1108 SHLQ $0x20, AX
1109 SHRQ $0x20, DX
1110 ADDQ R15, R10
1111 ADCQ $0x00, R11
1112 SUBQ AX, R10
1113 SBBQ DX, R11
1114 XORQ R14, R14 // R14 = 0 for the carry limb; XORQ also clears CF, so the ADCQ chain below starts with no carry in
1115
1116 // Add bits [511:256] of the sqr result
1117 ADCQ R12, R8
1118 ADCQ R13, R9
1119 ADCQ CX, R10
1120 ADCQ SI, R11
1121 ADCQ $0x00, R14
1122 MOVQ R8, R12 // keep a copy of the sum in R12, R13, CX, R15
1123 MOVQ R9, R13
1124 MOVQ R10, CX
1125 MOVQ R11, R15
1126
1127 // Subtract p256ord (the operands below are the four limbs of p256ord)
1128 SUBQ p256ord<>+0(SB), R8
1129 SBBQ p256ord<>+8(SB), R9
1130 SBBQ p256ord<>+16(SB), R10
1131 SBBQ p256ord<>+24(SB), R11
1132 SBBQ $0x00, R14
1133 CMOVQCS R12, R8 // the subtraction borrowed: restore the saved copy
1134 CMOVQCS R13, R9
1135 CMOVQCS CX, R10
1136 CMOVQCS R15, R11
1137 MOVQ R8, (DI)
1138 MOVQ R9, 8(DI)
1139 MOVQ R10, 16(DI)
1140 MOVQ R11, 24(DI)
1141 MOVQ DI, SI // the next pass reads the value just stored at res
1142 DECQ BX
1143 JNE ordSqrLoop
1144 RET
1145
1146 // func p256SubInternal()
1147 // Requires: CMOV
//
// p256SubInternal is a register-based helper; it takes no stack arguments.
//   In:  R10, R11, R12, R13 = minuend    (least significant limb first)
//        R14, R15, DI, SI   = subtrahend (least significant limb first)
//   Out: R10, R11, R12, R13 = minuend - subtrahend, with the modulus
//        (-1, p256const0, 0, p256const1) added back if the subtraction
//        borrowed
// R14, R15, DI and SI are only read. AX, BX, CX, R8, R9 and the flags are
// overwritten. Both candidate results are always computed; CMOV picks one.
1148 TEXT p256SubInternal(SB), NOSPLIT, $0
1149 XORQ AX, AX
1150 SUBQ R14, R10
1151 SBBQ R15, R11
1152 SBBQ DI, R12
1153 SBBQ SI, R13
1154 SBBQ $0x00, AX // AX = 0 if there was no borrow, all ones if there was
1155 MOVQ R10, BX // keep the raw difference in BX, CX, R8, R9
1156 MOVQ R11, CX
1157 MOVQ R12, R8
1158 MOVQ R13, R9
1159 ADDQ $-1, R10 // add the modulus to the difference
1160 ADCQ p256const0<>+0(SB), R11
1161 ADCQ $0x00, R12
1162 ADCQ p256const1<>+0(SB), R13
1163 ANDQ $0x01, AX // ZF = 1 when there was no borrow
1164 CMOVQEQ BX, R10 // no borrow: take the raw difference back
1165 CMOVQEQ CX, R11
1166 CMOVQEQ R8, R12
1167 CMOVQEQ R9, R13
1168 RET
1169
1170 // func p256MulInternal()
1171 // Requires: CMOV
//
// p256MulInternal is a register-based helper; it takes no stack arguments.
//   In:  R10, R11, R12, R13 = first operand  (least significant limb first)
//        R14, R15, DI, SI   = second operand (least significant limb first)
//   Out: R10, R11, R12, R13 = reduced product
// R14, R15, DI and SI are only read. AX, DX, BX, CX, R8, R9, BP and the flags
// are overwritten.
//
// The full 512-bit product is formed first (low half in BX, CX, R8, R9, high
// half in R10..R13). Four reduction steps then fold the low half, the high
// half is added, and the modulus (-1, p256const0, 0, p256const1) is
// subtracted; the difference is kept unless the subtraction borrowed.
//
// NOTE(review): BP is used as scratch; presumably the 8-byte frame is declared
// so that the toolchain saves and restores BP around this routine — confirm.
1172 TEXT p256MulInternal(SB), NOSPLIT, $8
1173 MOVQ R10, AX // row 0: first-operand limb 0 times each limb of the second operand
1174 MULQ R14
1175 MOVQ AX, BX
1176 MOVQ DX, CX
1177 MOVQ R10, AX
1178 MULQ R15
1179 ADDQ AX, CX
1180 ADCQ $0x00, DX
1181 MOVQ DX, R8
1182 MOVQ R10, AX
1183 MULQ DI
1184 ADDQ AX, R8
1185 ADCQ $0x00, DX
1186 MOVQ DX, R9
1187 MOVQ R10, AX
1188 MULQ SI
1189 ADDQ AX, R9
1190 ADCQ $0x00, DX
1191 MOVQ DX, R10 // R10 has been fully consumed and now holds product limb 4
1192 MOVQ R11, AX // row 1: first-operand limb 1
1193 MULQ R14
1194 ADDQ AX, CX
1195 ADCQ $0x00, DX
1196 MOVQ DX, BP
1197 MOVQ R11, AX
1198 MULQ R15
1199 ADDQ BP, R8
1200 ADCQ $0x00, DX
1201 ADDQ AX, R8
1202 ADCQ $0x00, DX
1203 MOVQ DX, BP
1204 MOVQ R11, AX
1205 MULQ DI
1206 ADDQ BP, R9
1207 ADCQ $0x00, DX
1208 ADDQ AX, R9
1209 ADCQ $0x00, DX
1210 MOVQ DX, BP
1211 MOVQ R11, AX
1212 MULQ SI
1213 ADDQ BP, R10
1214 ADCQ $0x00, DX
1215 ADDQ AX, R10
1216 ADCQ $0x00, DX
1217 MOVQ DX, R11 // R11 now holds product limb 5
1218 MOVQ R12, AX // row 2: first-operand limb 2
1219 MULQ R14
1220 ADDQ AX, R8
1221 ADCQ $0x00, DX
1222 MOVQ DX, BP
1223 MOVQ R12, AX
1224 MULQ R15
1225 ADDQ BP, R9
1226 ADCQ $0x00, DX
1227 ADDQ AX, R9
1228 ADCQ $0x00, DX
1229 MOVQ DX, BP
1230 MOVQ R12, AX
1231 MULQ DI
1232 ADDQ BP, R10
1233 ADCQ $0x00, DX
1234 ADDQ AX, R10
1235 ADCQ $0x00, DX
1236 MOVQ DX, BP
1237 MOVQ R12, AX
1238 MULQ SI
1239 ADDQ BP, R11
1240 ADCQ $0x00, DX
1241 ADDQ AX, R11
1242 ADCQ $0x00, DX
1243 MOVQ DX, R12 // R12 now holds product limb 6
1244 MOVQ R13, AX // row 3: first-operand limb 3
1245 MULQ R14
1246 ADDQ AX, R9
1247 ADCQ $0x00, DX
1248 MOVQ DX, BP
1249 MOVQ R13, AX
1250 MULQ R15
1251 ADDQ BP, R10
1252 ADCQ $0x00, DX
1253 ADDQ AX, R10
1254 ADCQ $0x00, DX
1255 MOVQ DX, BP
1256 MOVQ R13, AX
1257 MULQ DI
1258 ADDQ BP, R11
1259 ADCQ $0x00, DX
1260 ADDQ AX, R11
1261 ADCQ $0x00, DX
1262 MOVQ DX, BP
1263 MOVQ R13, AX
1264 MULQ SI
1265 ADDQ BP, R12
1266 ADCQ $0x00, DX
1267 ADDQ AX, R12
1268 ADCQ $0x00, DX
1269 MOVQ DX, R13 // R13 now holds product limb 7
1270
1271 // First reduction step (folds BX into CX, R8, R9; BX becomes the new top limb)
1272 MOVQ BX, AX
1273 MOVQ BX, BP
1274 SHLQ $0x20, BX // BX = limb << 32
1275 MULQ p256const1<>+0(SB) // DX:AX = limb * p256const1
1276 SHRQ $0x20, BP // BP = limb >> 32
1277 ADDQ BX, CX
1278 ADCQ BP, R8
1279 ADCQ AX, R9
1280 ADCQ $0x00, DX
1281 MOVQ DX, BX
1282
1283 // Second reduction step (folds CX into R8, R9, BX; CX becomes the new top limb)
1284 MOVQ CX, AX
1285 MOVQ CX, BP
1286 SHLQ $0x20, CX
1287 MULQ p256const1<>+0(SB)
1288 SHRQ $0x20, BP
1289 ADDQ CX, R8
1290 ADCQ BP, R9
1291 ADCQ AX, BX
1292 ADCQ $0x00, DX
1293 MOVQ DX, CX
1294
1295 // Third reduction step (folds R8 into R9, BX, CX; R8 becomes the new top limb)
1296 MOVQ R8, AX
1297 MOVQ R8, BP
1298 SHLQ $0x20, R8
1299 MULQ p256const1<>+0(SB)
1300 SHRQ $0x20, BP
1301 ADDQ R8, R9
1302 ADCQ BP, BX
1303 ADCQ AX, CX
1304 ADCQ $0x00, DX
1305 MOVQ DX, R8
1306
1307 // Last reduction step (folds R9 into BX, CX, R8; R9 becomes the new top limb)
1308 MOVQ R9, AX
1309 MOVQ R9, BP
1310 SHLQ $0x20, R9
1311 MULQ p256const1<>+0(SB)
1312 SHRQ $0x20, BP
1313 ADDQ R9, BX
1314 ADCQ BP, CX
1315 ADCQ AX, R8
1316 ADCQ $0x00, DX
1317 MOVQ DX, R9
1318 MOVQ $0x00000000, BP // BP = 0 for the carry limb; MOVQ leaves the flags untouched
1319
1320 // Add bits [511:256] of the result
1321 ADCQ BX, R10 // an ADCQ, so it also consumes the carry flag left by the ADCQ into DX above; NOTE(review): presumably always clear here — confirm
1322 ADCQ CX, R11
1323 ADCQ R8, R12
1324 ADCQ R9, R13
1325 ADCQ $0x00, BP
1326
1327 // Copy result
1328 MOVQ R10, BX
1329 MOVQ R11, CX
1330 MOVQ R12, R8
1331 MOVQ R13, R9
1332
1333 // Subtract p256
1334 SUBQ $-1, R10
1335 SBBQ p256const0<>+0(SB), R11
1336 SBBQ $0x00, R12
1337 SBBQ p256const1<>+0(SB), R13
1338 SBBQ $0x00, BP
1339
1340 // If the result of the subtraction is negative, restore the previous result
1341 CMOVQCS BX, R10
1342 CMOVQCS CX, R11
1343 CMOVQCS R8, R12
1344 CMOVQCS R9, R13
1345 RET
1346
1347 // func p256SqrInternal()
1348 // Requires: CMOV
//
// p256SqrInternal is a register-based helper; it takes no stack arguments.
//   In:  R10, R11, R12, R13 = operand (least significant limb first)
//   Out: R10, R11, R12, R13 = reduced square of the operand
// AX, DX, BX, CX, R8, R9, R14, R15, DI, SI, BP and the flags are overwritten.
//
// The 512-bit square is built in BX, CX, R8, R9, R14, R15, DI, SI (least
// significant limb first) from the doubled cross products plus the four limb
// squares. Four reduction steps then fold the low half, the high half is
// added, and the modulus (-1, p256const0, 0, p256const1) is subtracted; the
// difference is kept unless the subtraction borrowed.
//
// NOTE(review): BP is used as scratch; presumably the 8-byte frame is declared
// so that the toolchain saves and restores BP around this routine — confirm.
1349 TEXT p256SqrInternal(SB), NOSPLIT, $8
1350 MOVQ R10, AX // cross products: limb 0 times limbs 1, 2, 3
1351 MULQ R11
1352 MOVQ AX, CX
1353 MOVQ DX, R8
1354 MOVQ R10, AX
1355 MULQ R12
1356 ADDQ AX, R8
1357 ADCQ $0x00, DX
1358 MOVQ DX, R9
1359 MOVQ R10, AX
1360 MULQ R13
1361 ADDQ AX, R9
1362 ADCQ $0x00, DX
1363 MOVQ DX, R14
1364 MOVQ R11, AX // cross products: limb 1 times limbs 2, 3
1365 MULQ R12
1366 ADDQ AX, R9
1367 ADCQ $0x00, DX
1368 MOVQ DX, BP
1369 MOVQ R11, AX
1370 MULQ R13
1371 ADDQ BP, R14
1372 ADCQ $0x00, DX
1373 ADDQ AX, R14
1374 ADCQ $0x00, DX
1375 MOVQ DX, R15
1376 MOVQ R12, AX // cross product: limb 2 times limb 3
1377 MULQ R13
1378 ADDQ AX, R15
1379 ADCQ $0x00, DX
1380 MOVQ DX, DI
1381 XORQ SI, SI // SI = 0, receives the bit shifted out by the doubling below
1382
1383 // *2 (double the sum of cross products held in CX, R8, R9, R14, R15, DI)
1384 ADDQ CX, CX
1385 ADCQ R8, R8
1386 ADCQ R9, R9
1387 ADCQ R14, R14
1388 ADCQ R15, R15
1389 ADCQ DI, DI
1390 ADCQ $0x00, SI
1391
1392 // Missing products (the squares of the four limbs)
1393 MOVQ R10, AX
1394 MULQ AX
1395 MOVQ AX, BX
1396 MOVQ DX, R10 // limb 0 is no longer needed; R10 is reused as a carry temporary
1397 MOVQ R11, AX
1398 MULQ AX
1399 ADDQ R10, CX
1400 ADCQ AX, R8
1401 ADCQ $0x00, DX
1402 MOVQ DX, R10
1403 MOVQ R12, AX
1404 MULQ AX
1405 ADDQ R10, R9
1406 ADCQ AX, R14
1407 ADCQ $0x00, DX
1408 MOVQ DX, R10
1409 MOVQ R13, AX
1410 MULQ AX
1411 ADDQ R10, R15
1412 ADCQ AX, DI
1413 ADCQ DX, SI
1414
1415 // First reduction step (folds BX into CX, R8, R9; BX becomes the new top limb)
1416 MOVQ BX, AX
1417 MOVQ BX, BP
1418 SHLQ $0x20, BX // BX = limb << 32
1419 MULQ p256const1<>+0(SB) // DX:AX = limb * p256const1
1420 SHRQ $0x20, BP // BP = limb >> 32
1421 ADDQ BX, CX
1422 ADCQ BP, R8
1423 ADCQ AX, R9
1424 ADCQ $0x00, DX
1425 MOVQ DX, BX
1426
1427 // Second reduction step (folds CX into R8, R9, BX; CX becomes the new top limb)
1428 MOVQ CX, AX
1429 MOVQ CX, BP
1430 SHLQ $0x20, CX
1431 MULQ p256const1<>+0(SB)
1432 SHRQ $0x20, BP
1433 ADDQ CX, R8
1434 ADCQ BP, R9
1435 ADCQ AX, BX
1436 ADCQ $0x00, DX
1437 MOVQ DX, CX
1438
1439 // Third reduction step (folds R8 into R9, BX, CX; R8 becomes the new top limb)
1440 MOVQ R8, AX
1441 MOVQ R8, BP
1442 SHLQ $0x20, R8
1443 MULQ p256const1<>+0(SB)
1444 SHRQ $0x20, BP
1445 ADDQ R8, R9
1446 ADCQ BP, BX
1447 ADCQ AX, CX
1448 ADCQ $0x00, DX
1449 MOVQ DX, R8
1450
1451 // Last reduction step (folds R9 into BX, CX, R8; R9 becomes the new top limb)
1452 MOVQ R9, AX
1453 MOVQ R9, BP
1454 SHLQ $0x20, R9
1455 MULQ p256const1<>+0(SB)
1456 SHRQ $0x20, BP
1457 ADDQ R9, BX
1458 ADCQ BP, CX
1459 ADCQ AX, R8
1460 ADCQ $0x00, DX
1461 MOVQ DX, R9
1462 MOVQ $0x00000000, BP // BP = 0 for the carry limb; MOVQ leaves the flags untouched
1463
1464 // Add bits [511:256] of the result
1465 ADCQ BX, R14 // an ADCQ, so it also consumes the carry flag left by the ADCQ into DX above; NOTE(review): presumably always clear here — confirm
1466 ADCQ CX, R15
1467 ADCQ R8, DI
1468 ADCQ R9, SI
1469 ADCQ $0x00, BP
1470
1471 // Copy result
1472 MOVQ R14, R10
1473 MOVQ R15, R11
1474 MOVQ DI, R12
1475 MOVQ SI, R13
1476
1477 // Subtract p256
1478 SUBQ $-1, R10
1479 SBBQ p256const0<>+0(SB), R11
1480 SBBQ $0x00, R12
1481 SBBQ p256const1<>+0(SB), R13
1482 SBBQ $0x00, BP
1483
1484 // If the result of the subtraction is negative, restore the previous result
1485 CMOVQCS R14, R10
1486 CMOVQCS R15, R11
1487 CMOVQCS DI, R12
1488 CMOVQCS SI, R13
1489 RET
1490
1491 // func p256PointAddAffineAsm(res *P256Point, in1 *P256Point, in2 *p256AffinePoint, sign int, sel int, zero int)
1492 // Requires: CMOV, SSE2
1493 TEXT ·p256PointAddAffineAsm(SB), $512-48
1494 MOVQ res+0(FP), AX
1495 MOVQ in1+8(FP), BX
1496 MOVQ in2+16(FP), CX
1497 MOVQ sign+24(FP), DX
1498 MOVQ sel+32(FP), R15
1499 MOVQ zero+40(FP), DI
1500 MOVOU (BX), X0
1501 MOVOU 16(BX), X1
1502 MOVOU 32(BX), X2
1503 MOVOU 48(BX), X3
1504 MOVOU 64(BX), X4
1505 MOVOU 80(BX), X5
1506 MOVOU X0, (SP)
1507 MOVOU X1, 16(SP)
1508 MOVOU X2, 32(SP)
1509 MOVOU X3, 48(SP)
1510 MOVOU X4, 64(SP)
1511 MOVOU X5, 80(SP)
1512 MOVOU (CX), X0
1513 MOVOU 16(CX), X1
1514 MOVOU X0, 96(SP)
1515 MOVOU X1, 112(SP)
1516
1517 // Store pointer to result
1518 MOVQ AX, 480(SP)
1519 MOVL R15, 488(SP)
1520 MOVL DI, 492(SP)
1521
1522 // Negate y2in based on sign
1523 MOVQ 32(CX), R10
1524 MOVQ 40(CX), R11
1525 MOVQ 48(CX), R12
1526 MOVQ 56(CX), R13
1527 MOVQ $-1, BX
1528 MOVQ p256const0<>+0(SB), CX
1529 MOVQ $0x00000000, R8
1530 MOVQ p256const1<>+0(SB), R9
1531 XORQ AX, AX
1532
1533 // Speculatively subtract
1534 SUBQ R10, BX
1535 SBBQ R11, CX
1536 SBBQ R12, R8
1537 SBBQ R13, R9
1538 SBBQ $0x00, AX
1539 MOVQ BX, R14
1540 MOVQ CX, R15
1541 MOVQ R8, DI
1542 MOVQ R9, SI
1543
1544 // Add in case the operand was > p256
1545 ADDQ $-1, BX
1546 ADCQ p256const0<>+0(SB), CX
1547 ADCQ $0x00, R8
1548 ADCQ p256const1<>+0(SB), R9
1549 ADCQ $0x00, AX
1550 CMOVQNE R14, BX
1551 CMOVQNE R15, CX
1552 CMOVQNE DI, R8
1553 CMOVQNE SI, R9
1554
1555 // If condition is 0, keep original value
1556 TESTQ DX, DX
1557 CMOVQEQ R10, BX
1558 CMOVQEQ R11, CX
1559 CMOVQEQ R12, R8
1560 CMOVQEQ R13, R9
1561
1562 // Store result
1563 MOVQ BX, 128(SP)
1564 MOVQ CX, 136(SP)
1565 MOVQ R8, 144(SP)
1566 MOVQ R9, 152(SP)
1567
1568 // Begin point add
1569 MOVQ 64(SP), R10
1570 MOVQ 72(SP), R11
1571 MOVQ 80(SP), R12
1572 MOVQ 88(SP), R13
1573 CALL p256SqrInternal(SB)
1574 MOVQ R10, 288(SP)
1575 MOVQ R11, 296(SP)
1576 MOVQ R12, 304(SP)
1577 MOVQ R13, 312(SP)
1578 MOVQ 96(SP), R14
1579 MOVQ 104(SP), R15
1580 MOVQ 112(SP), DI
1581 MOVQ 120(SP), SI
1582 CALL p256MulInternal(SB)
1583 MOVQ (SP), R14
1584 MOVQ 8(SP), R15
1585 MOVQ 16(SP), DI
1586 MOVQ 24(SP), SI
1587 CALL p256SubInternal(SB)
1588 MOVQ R10, 320(SP)
1589 MOVQ R11, 328(SP)
1590 MOVQ R12, 336(SP)
1591 MOVQ R13, 344(SP)
1592 MOVQ 64(SP), R14
1593 MOVQ 72(SP), R15
1594 MOVQ 80(SP), DI
1595 MOVQ 88(SP), SI
1596 CALL p256MulInternal(SB)
1597 MOVQ R10, 224(SP)
1598 MOVQ R11, 232(SP)
1599 MOVQ R12, 240(SP)
1600 MOVQ R13, 248(SP)
1601 MOVQ 288(SP), R10
1602 MOVQ 296(SP), R11
1603 MOVQ 304(SP), R12
1604 MOVQ 312(SP), R13
1605 CALL p256MulInternal(SB)
1606 MOVQ 128(SP), R14
1607 MOVQ 136(SP), R15
1608 MOVQ 144(SP), DI
1609 MOVQ 152(SP), SI
1610 CALL p256MulInternal(SB)
1611 MOVQ R10, 256(SP)
1612 MOVQ R11, 264(SP)
1613 MOVQ R12, 272(SP)
1614 MOVQ R13, 280(SP)
1615 MOVQ 32(SP), R14
1616 MOVQ 40(SP), R15
1617 MOVQ 48(SP), DI
1618 MOVQ 56(SP), SI
1619 CALL p256SubInternal(SB)
1620 MOVQ R10, 352(SP)
1621 MOVQ R11, 360(SP)
1622 MOVQ R12, 368(SP)
1623 MOVQ R13, 376(SP)
1624 CALL p256SqrInternal(SB)
1625 MOVQ R10, 416(SP)
1626 MOVQ R11, 424(SP)
1627 MOVQ R12, 432(SP)
1628 MOVQ R13, 440(SP)
1629 MOVQ 320(SP), R10
1630 MOVQ 328(SP), R11
1631 MOVQ 336(SP), R12
1632 MOVQ 344(SP), R13
1633 CALL p256SqrInternal(SB)
1634 MOVQ R10, 384(SP)
1635 MOVQ R11, 392(SP)
1636 MOVQ R12, 400(SP)
1637 MOVQ R13, 408(SP)
1638 MOVQ 320(SP), R14
1639 MOVQ 328(SP), R15
1640 MOVQ 336(SP), DI
1641 MOVQ 344(SP), SI
1642 CALL p256MulInternal(SB)
1643 MOVQ R10, 448(SP)
1644 MOVQ R11, 456(SP)
1645 MOVQ R12, 464(SP)
1646 MOVQ R13, 472(SP)
1647 MOVQ 32(SP), R14
1648 MOVQ 40(SP), R15
1649 MOVQ 48(SP), DI
1650 MOVQ 56(SP), SI
1651 CALL p256MulInternal(SB)
1652 MOVQ R10, 256(SP)
1653 MOVQ R11, 264(SP)
1654 MOVQ R12, 272(SP)
1655 MOVQ R13, 280(SP)
1656 MOVQ (SP), R10
1657 MOVQ 8(SP), R11
1658 MOVQ 16(SP), R12
1659 MOVQ 24(SP), R13
1660 MOVQ 384(SP), R14
1661 MOVQ 392(SP), R15
1662 MOVQ 400(SP), DI
1663 MOVQ 408(SP), SI
1664 CALL p256MulInternal(SB)
1665 MOVQ R10, 320(SP)
1666 MOVQ R11, 328(SP)
1667 MOVQ R12, 336(SP)
1668 MOVQ R13, 344(SP)
1669 XORQ AX, AX
1670 ADDQ R10, R10
1671 ADCQ R11, R11
1672 ADCQ R12, R12
1673 ADCQ R13, R13
1674 ADCQ $+0, AX
1675 MOVQ R10, R14
1676 MOVQ R11, R15
1677 MOVQ R12, DI
1678 MOVQ R13, SI
1679 SUBQ $-1, R14
1680 SBBQ p256const0<>+0(SB), R15
1681 SBBQ $+0, DI
1682 SBBQ p256const1<>+0(SB), SI
1683 SBBQ $+0, AX
1684 CMOVQCS R10, R14
1685 CMOVQCS R11, R15
1686 CMOVQCS R12, DI
1687 CMOVQCS R13, SI
1688 MOVQ 416(SP), R10
1689 MOVQ 424(SP), R11
1690 MOVQ 432(SP), R12
1691 MOVQ 440(SP), R13
1692 CALL p256SubInternal(SB)
1693 MOVQ 448(SP), R14
1694 MOVQ 456(SP), R15
1695 MOVQ 464(SP), DI
1696 MOVQ 472(SP), SI
1697 CALL p256SubInternal(SB)
1698 MOVQ R10, 160(SP)
1699 MOVQ R11, 168(SP)
1700 MOVQ R12, 176(SP)
1701 MOVQ R13, 184(SP)
1702 MOVQ R10, R14
1703 MOVQ R11, R15
1704 MOVQ R12, DI
1705 MOVQ R13, SI
1706 MOVQ 320(SP), R10
1707 MOVQ 328(SP), R11
1708 MOVQ 336(SP), R12
1709 MOVQ 344(SP), R13
1710 CALL p256SubInternal(SB)
1711 MOVQ 352(SP), R14
1712 MOVQ 360(SP), R15
1713 MOVQ 368(SP), DI
1714 MOVQ 376(SP), SI
1715 CALL p256MulInternal(SB)
1716 MOVQ 256(SP), R14
1717 MOVQ 264(SP), R15
1718 MOVQ 272(SP), DI
1719 MOVQ 280(SP), SI
1720 CALL p256SubInternal(SB)
1721 MOVQ R10, 192(SP)
1722 MOVQ R11, 200(SP)
1723 MOVQ R12, 208(SP)
1724 MOVQ R13, 216(SP)
1725
1726 // Load stored values from stack
1727 MOVQ 480(SP), AX
1728 MOVL 488(SP), BX
1729 MOVL 492(SP), CX
1730
1731 // The result is not valid if (sel == 0), conditional choose
1732 MOVOU 160(SP), X0
1733 MOVOU 176(SP), X1
1734 MOVOU 192(SP), X2
1735 MOVOU 208(SP), X3
1736 MOVOU 224(SP), X4
1737 MOVOU 240(SP), X5
1738 MOVL BX, X6
1739 MOVL CX, X7
1740 PXOR X8, X8
1741 PCMPEQL X9, X9
1742 PSHUFD $0x00, X6, X6
1743 PSHUFD $0x00, X7, X7
1744 PCMPEQL X8, X6
1745 PCMPEQL X8, X7
1746 MOVOU X6, X15
1747 PANDN X9, X15
1748 MOVOU (SP), X9
1749 MOVOU 16(SP), X10
1750 MOVOU 32(SP), X11
1751 MOVOU 48(SP), X12
1752 MOVOU 64(SP), X13
1753 MOVOU 80(SP), X14
1754 PAND X15, X0
1755 PAND X15, X1
1756 PAND X15, X2
1757 PAND X15, X3
1758 PAND X15, X4
1759 PAND X15, X5
1760 PAND X6, X9
1761 PAND X6, X10
1762 PAND X6, X11
1763 PAND X6, X12
1764 PAND X6, X13
1765 PAND X6, X14
1766 PXOR X9, X0
1767 PXOR X10, X1
1768 PXOR X11, X2
1769 PXOR X12, X3
1770 PXOR X13, X4
1771 PXOR X14, X5
1772
1773 // Similarly if zero == 0
1774 PCMPEQL X9, X9
1775 MOVOU X7, X15
1776 PANDN X9, X15
1777 MOVOU 96(SP), X9
1778 MOVOU 112(SP), X10
1779 MOVOU 128(SP), X11
1780 MOVOU 144(SP), X12
1781 MOVOU p256one<>+0(SB), X13
1782 MOVOU p256one<>+16(SB), X14
1783 PAND X15, X0
1784 PAND X15, X1
1785 PAND X15, X2
1786 PAND X15, X3
1787 PAND X15, X4
1788 PAND X15, X5
1789 PAND X7, X9
1790 PAND X7, X10
1791 PAND X7, X11
1792 PAND X7, X12
1793 PAND X7, X13
1794 PAND X7, X14
1795 PXOR X9, X0
1796 PXOR X10, X1
1797 PXOR X11, X2
1798 PXOR X12, X3
1799 PXOR X13, X4
1800 PXOR X14, X5
1801
1802 // Finally output the result
1803 MOVOU X0, (AX)
1804 MOVOU X1, 16(AX)
1805 MOVOU X2, 32(AX)
1806 MOVOU X3, 48(AX)
1807 MOVOU X4, 64(AX)
1808 MOVOU X5, 80(AX)
1809 MOVQ $0x00000000, 480(SP)
1810 RET
1811
// p256one is a read-only 32-byte constant made of four 64-bit words, listed
// here from the lowest offset (+0) to the highest (+24). The code just above
// loads it 16 bytes at a time (p256one+0 and p256one+16) as the replacement
// for the last 32 bytes of the selected point.
// NOTE(review): if P is [-1, p256const0, 0, p256const1] with
// p256const1 = 0xffffffff00000001 (not visible from here), these limbs equal
// 2^256 mod P, presumably the Montgomery-domain value of 1 — confirm.
1812 DATA p256one<>+0(SB)/8, $0x0000000000000001
1813 DATA p256one<>+8(SB)/8, $0xffffffff00000000
1814 DATA p256one<>+16(SB)/8, $0xffffffffffffffff
1815 DATA p256one<>+24(SB)/8, $0x00000000fffffffe
1816 GLOBL p256one<>(SB), RODATA, $32
1817
1818 // func p256IsZero()
1819 // Requires: CMOV
//
// Internal helper with a register-based convention: it takes no Go arguments
// and uses no stack frame ($0).
//   In:  R10, R11, R12, R13 hold the 256-bit value, called [acc4..acc7] below.
//   Out: AX = 1 if all four limbs are zero, or if they match the limbs
//        [-1, p256const0, 0, p256const1] (the value p256NegCond loads under
//        its "acc = poly" comment); AX = 0 otherwise.
//   Clobbers: R10, R11, R13, R14, R15 and the flags. R12 is only read.
// Both comparisons always run and the result is picked with CMOV, so there is
// no branch that depends on the input value.
1820 TEXT p256IsZero(SB), NOSPLIT, $0
1821 // AX contains a flag that is set if the input is zero.
1822 XORQ AX, AX
1823 MOVQ $0x00000001, R15
1824
1825 // Check whether [acc4..acc7] are all zero.
// The OR of all four limbs is accumulated in R14; ZF from the last ORQ is
// what the CMOVQEQ below tests.
1826 MOVQ R10, R14
1827 ORQ R11, R14
1828 ORQ R12, R14
1829 ORQ R13, R14
1830
1831 // Set the zero flag if so. (CMOV of a constant to a register doesn't
1832 // appear to be supported in Go. Thus t1 = 1.)
1833 CMOVQEQ R15, AX
1834
1835 // XOR [acc4..acc7] with P and compare with zero again.
// R12 is not XORed because the corresponding limb of P is zero, so it is
// ORed in unchanged. This sequence destroys R10, R11 and R13.
1836 XORQ $-1, R10
1837 XORQ p256const0<>+0(SB), R11
1838 XORQ p256const1<>+0(SB), R13
1839 ORQ R11, R10
1840 ORQ R12, R10
1841 ORQ R13, R10
1842
1843 // Set the zero flag if so.
1844 CMOVQEQ R15, AX
1845 RET
1846
1847 // func p256PointAddAsm(res *P256Point, in1 *P256Point, in2 *P256Point) int
1848 // Requires: CMOV, SSE2
//
// Computes a 96-byte point from in1 and in2, writes it to res, and returns a
// flag in ret. The flag is the AND of two p256IsZero results (on r and on h
// below), so it is 1 only when both differences are zero (or equal to P).
// Both inputs are copied to the stack before res is written, so res may alias
// in1 or in2.
//
// Stack frame (byte offsets from SP, 32 bytes per field element):
//     0 x1      32 y1      64 z1       copy of in1
//    96 x2     128 y2     160 z2       copy of in2
//   192 xout   224 yout   256 zout     result, copied to res at the end
//   288 u1     320 u2     352 s1     384 s2
//   416 z1sqr  448 z2sqr  480 h      512 r
//   544 hsqr   576 rsqr   608 hcub
//   640 saved res pointer            648 return flag
// The u2 and s2 slots are reused later for u1*h^2 and s1*h^3.
//
// NOTE(review): the x/y/z naming follows the "Store x/y/z" offsets used in
// p256PointDoubleAsm. The formulas in the step comments assume the helper
// convention below; the helpers are defined outside this view — confirm:
//   p256SqrInternal: R10..R13 = (R10..R13)^2
//   p256MulInternal: R10..R13 = (R10..R13) * (R14, R15, DI, SI)
//   p256SubInternal: R10..R13 = (R10..R13) - (R14, R15, DI, SI)
// "acc" means R10..R13 and "t" means R14, R15, DI, SI.
1849 TEXT ·p256PointAddAsm(SB), $680-32
1850 // Move input to stack in order to free registers
1851 MOVQ res+0(FP), AX
1852 MOVQ in1+8(FP), BX
1853 MOVQ in2+16(FP), CX
1854 MOVOU (BX), X0
1855 MOVOU 16(BX), X1
1856 MOVOU 32(BX), X2
1857 MOVOU 48(BX), X3
1858 MOVOU 64(BX), X4
1859 MOVOU 80(BX), X5
1860 MOVOU X0, (SP)
1861 MOVOU X1, 16(SP)
1862 MOVOU X2, 32(SP)
1863 MOVOU X3, 48(SP)
1864 MOVOU X4, 64(SP)
1865 MOVOU X5, 80(SP)
1866 MOVOU (CX), X0
1867 MOVOU 16(CX), X1
1868 MOVOU 32(CX), X2
1869 MOVOU 48(CX), X3
1870 MOVOU 64(CX), X4
1871 MOVOU 80(CX), X5
1872 MOVOU X0, 96(SP)
1873 MOVOU X1, 112(SP)
1874 MOVOU X2, 128(SP)
1875 MOVOU X3, 144(SP)
1876 MOVOU X4, 160(SP)
1877 MOVOU X5, 176(SP)
1878
1879 // Store pointer to result
1880 MOVQ AX, 640(SP)
1881
1882 // Begin point add
// acc = z2; z2sqr = z2^2
1883 MOVQ 160(SP), R10
1884 MOVQ 168(SP), R11
1885 MOVQ 176(SP), R12
1886 MOVQ 184(SP), R13
1887 CALL p256SqrInternal(SB)
1888 MOVQ R10, 448(SP)
1889 MOVQ R11, 456(SP)
1890 MOVQ R12, 464(SP)
1891 MOVQ R13, 472(SP)
// acc = z2^2 * z2 = z2^3
1892 MOVQ 160(SP), R14
1893 MOVQ 168(SP), R15
1894 MOVQ 176(SP), DI
1895 MOVQ 184(SP), SI
1896 CALL p256MulInternal(SB)
// s1 = z2^3 * y1
1897 MOVQ 32(SP), R14
1898 MOVQ 40(SP), R15
1899 MOVQ 48(SP), DI
1900 MOVQ 56(SP), SI
1901 CALL p256MulInternal(SB)
1902 MOVQ R10, 352(SP)
1903 MOVQ R11, 360(SP)
1904 MOVQ R12, 368(SP)
1905 MOVQ R13, 376(SP)
// acc = z1; z1sqr = z1^2
1906 MOVQ 64(SP), R10
1907 MOVQ 72(SP), R11
1908 MOVQ 80(SP), R12
1909 MOVQ 88(SP), R13
1910 CALL p256SqrInternal(SB)
1911 MOVQ R10, 416(SP)
1912 MOVQ R11, 424(SP)
1913 MOVQ R12, 432(SP)
1914 MOVQ R13, 440(SP)
// acc = z1^2 * z1 = z1^3
1915 MOVQ 64(SP), R14
1916 MOVQ 72(SP), R15
1917 MOVQ 80(SP), DI
1918 MOVQ 88(SP), SI
1919 CALL p256MulInternal(SB)
// s2 = z1^3 * y2
1920 MOVQ 128(SP), R14
1921 MOVQ 136(SP), R15
1922 MOVQ 144(SP), DI
1923 MOVQ 152(SP), SI
1924 CALL p256MulInternal(SB)
1925 MOVQ R10, 384(SP)
1926 MOVQ R11, 392(SP)
1927 MOVQ R12, 400(SP)
1928 MOVQ R13, 408(SP)
// r = s2 - s1
1929 MOVQ 352(SP), R14
1930 MOVQ 360(SP), R15
1931 MOVQ 368(SP), DI
1932 MOVQ 376(SP), SI
1933 CALL p256SubInternal(SB)
1934 MOVQ R10, 512(SP)
1935 MOVQ R11, 520(SP)
1936 MOVQ R12, 528(SP)
1937 MOVQ R13, 536(SP)
// flag = (r is zero). r was stored first because p256IsZero overwrites
// R10, R11 and R13.
1938 CALL p256IsZero(SB)
1939 MOVQ AX, 648(SP)
// u1 = z2sqr * x1
1940 MOVQ 448(SP), R10
1941 MOVQ 456(SP), R11
1942 MOVQ 464(SP), R12
1943 MOVQ 472(SP), R13
1944 MOVQ (SP), R14
1945 MOVQ 8(SP), R15
1946 MOVQ 16(SP), DI
1947 MOVQ 24(SP), SI
1948 CALL p256MulInternal(SB)
1949 MOVQ R10, 288(SP)
1950 MOVQ R11, 296(SP)
1951 MOVQ R12, 304(SP)
1952 MOVQ R13, 312(SP)
// u2 = z1sqr * x2
1953 MOVQ 416(SP), R10
1954 MOVQ 424(SP), R11
1955 MOVQ 432(SP), R12
1956 MOVQ 440(SP), R13
1957 MOVQ 96(SP), R14
1958 MOVQ 104(SP), R15
1959 MOVQ 112(SP), DI
1960 MOVQ 120(SP), SI
1961 CALL p256MulInternal(SB)
1962 MOVQ R10, 320(SP)
1963 MOVQ R11, 328(SP)
1964 MOVQ R12, 336(SP)
1965 MOVQ R13, 344(SP)
// h = u2 - u1
1966 MOVQ 288(SP), R14
1967 MOVQ 296(SP), R15
1968 MOVQ 304(SP), DI
1969 MOVQ 312(SP), SI
1970 CALL p256SubInternal(SB)
1971 MOVQ R10, 480(SP)
1972 MOVQ R11, 488(SP)
1973 MOVQ R12, 496(SP)
1974 MOVQ R13, 504(SP)
// flag &= (h is zero): the returned flag is 1 only if both r and h are zero.
1975 CALL p256IsZero(SB)
1976 ANDQ 648(SP), AX
1977 MOVQ AX, 648(SP)
// rsqr = r^2
1978 MOVQ 512(SP), R10
1979 MOVQ 520(SP), R11
1980 MOVQ 528(SP), R12
1981 MOVQ 536(SP), R13
1982 CALL p256SqrInternal(SB)
1983 MOVQ R10, 576(SP)
1984 MOVQ R11, 584(SP)
1985 MOVQ R12, 592(SP)
1986 MOVQ R13, 600(SP)
// hsqr = h^2
1987 MOVQ 480(SP), R10
1988 MOVQ 488(SP), R11
1989 MOVQ 496(SP), R12
1990 MOVQ 504(SP), R13
1991 CALL p256SqrInternal(SB)
1992 MOVQ R10, 544(SP)
1993 MOVQ R11, 552(SP)
1994 MOVQ R12, 560(SP)
1995 MOVQ R13, 568(SP)
// hcub = h^2 * h
1996 MOVQ 480(SP), R14
1997 MOVQ 488(SP), R15
1998 MOVQ 496(SP), DI
1999 MOVQ 504(SP), SI
2000 CALL p256MulInternal(SB)
2001 MOVQ R10, 608(SP)
2002 MOVQ R11, 616(SP)
2003 MOVQ R12, 624(SP)
2004 MOVQ R13, 632(SP)
// acc = hcub * s1, stored into the s2 slot (the original s2 is no longer needed)
2005 MOVQ 352(SP), R14
2006 MOVQ 360(SP), R15
2007 MOVQ 368(SP), DI
2008 MOVQ 376(SP), SI
2009 CALL p256MulInternal(SB)
2010 MOVQ R10, 384(SP)
2011 MOVQ R11, 392(SP)
2012 MOVQ R12, 400(SP)
2013 MOVQ R13, 408(SP)
// acc = z1 * z2
2014 MOVQ 64(SP), R10
2015 MOVQ 72(SP), R11
2016 MOVQ 80(SP), R12
2017 MOVQ 88(SP), R13
2018 MOVQ 160(SP), R14
2019 MOVQ 168(SP), R15
2020 MOVQ 176(SP), DI
2021 MOVQ 184(SP), SI
2022 CALL p256MulInternal(SB)
// zout = z1 * z2 * h
2023 MOVQ 480(SP), R14
2024 MOVQ 488(SP), R15
2025 MOVQ 496(SP), DI
2026 MOVQ 504(SP), SI
2027 CALL p256MulInternal(SB)
2028 MOVQ R10, 256(SP)
2029 MOVQ R11, 264(SP)
2030 MOVQ R12, 272(SP)
2031 MOVQ R13, 280(SP)
// acc = hsqr * u1, stored into the u2 slot (the original u2 is no longer needed)
2032 MOVQ 544(SP), R10
2033 MOVQ 552(SP), R11
2034 MOVQ 560(SP), R12
2035 MOVQ 568(SP), R13
2036 MOVQ 288(SP), R14
2037 MOVQ 296(SP), R15
2038 MOVQ 304(SP), DI
2039 MOVQ 312(SP), SI
2040 CALL p256MulInternal(SB)
2041 MOVQ R10, 320(SP)
2042 MOVQ R11, 328(SP)
2043 MOVQ R12, 336(SP)
2044 MOVQ R13, 344(SP)
// Inline doubling: acc = 2*acc with the carry-out collected in AX, then
// t = acc - P. If that subtraction borrows (CF set), t is restored to the
// unreduced sum. The result, 2 * u1 * h^2, is left in t.
2045 XORQ AX, AX
2046 ADDQ R10, R10
2047 ADCQ R11, R11
2048 ADCQ R12, R12
2049 ADCQ R13, R13
2050 ADCQ $+0, AX
2051 MOVQ R10, R14
2052 MOVQ R11, R15
2053 MOVQ R12, DI
2054 MOVQ R13, SI
2055 SUBQ $-1, R14
2056 SBBQ p256const0<>+0(SB), R15
2057 SBBQ $+0, DI
2058 SBBQ p256const1<>+0(SB), SI
2059 SBBQ $+0, AX
2060 CMOVQCS R10, R14
2061 CMOVQCS R11, R15
2062 CMOVQCS R12, DI
2063 CMOVQCS R13, SI
// acc = rsqr - 2 * u1 * h^2
2064 MOVQ 576(SP), R10
2065 MOVQ 584(SP), R11
2066 MOVQ 592(SP), R12
2067 MOVQ 600(SP), R13
2068 CALL p256SubInternal(SB)
// xout = rsqr - 2 * u1 * h^2 - hcub
2069 MOVQ 608(SP), R14
2070 MOVQ 616(SP), R15
2071 MOVQ 624(SP), DI
2072 MOVQ 632(SP), SI
2073 CALL p256SubInternal(SB)
2074 MOVQ R10, 192(SP)
2075 MOVQ R11, 200(SP)
2076 MOVQ R12, 208(SP)
2077 MOVQ R13, 216(SP)
// t = xout; acc = u1 * h^2 - xout
2078 MOVQ R10, R14
2079 MOVQ R11, R15
2080 MOVQ R12, DI
2081 MOVQ R13, SI
2082 MOVQ 320(SP), R10
2083 MOVQ 328(SP), R11
2084 MOVQ 336(SP), R12
2085 MOVQ 344(SP), R13
2086 CALL p256SubInternal(SB)
// acc = r * (u1 * h^2 - xout)
2087 MOVQ 512(SP), R14
2088 MOVQ 520(SP), R15
2089 MOVQ 528(SP), DI
2090 MOVQ 536(SP), SI
2091 CALL p256MulInternal(SB)
// yout = r * (u1 * h^2 - xout) - s1 * hcub
2092 MOVQ 384(SP), R14
2093 MOVQ 392(SP), R15
2094 MOVQ 400(SP), DI
2095 MOVQ 408(SP), SI
2096 CALL p256SubInternal(SB)
2097 MOVQ R10, 224(SP)
2098 MOVQ R11, 232(SP)
2099 MOVQ R12, 240(SP)
2100 MOVQ R13, 248(SP)
// Load xout, yout and zout (96 contiguous bytes at 192(SP)) into X0..X5.
2101 MOVOU 192(SP), X0
2102 MOVOU 208(SP), X1
2103 MOVOU 224(SP), X2
2104 MOVOU 240(SP), X3
2105 MOVOU 256(SP), X4
2106 MOVOU 272(SP), X5
2107
2108 // Finally output the result
// The saved res pointer is reloaded and its stack slot is zeroed before the
// result is written.
2109 MOVQ 640(SP), AX
2110 MOVQ $0x00000000, 640(SP)
2111 MOVOU X0, (AX)
2112 MOVOU X1, 16(AX)
2113 MOVOU X2, 32(AX)
2114 MOVOU X3, 48(AX)
2115 MOVOU X4, 64(AX)
2116 MOVOU X5, 80(AX)
// Return the combined zero flag computed above.
2117 MOVQ 648(SP), AX
2118 MOVQ AX, ret+24(FP)
2119 RET
2120
2121 // func p256PointDoubleAsm(res *P256Point, in *P256Point)
2122 // Requires: CMOV, SSE2
//
// Computes a 96-byte point from in and writes it to res as three 32-byte
// fields: x at offset 0, y at offset 32, z at offset 64. The input is copied
// to the stack before any field of res is written, so res may alias in.
//
// Stack frame (byte offsets from SP, 32 bytes per field element):
//     0 x      32 y (later overwritten with the halved (2y)^4 value)
//    64 z      96 s      128 m      160 zsqr      192 tmp
//   224 saved res pointer
//
// NOTE(review): the formulas in the step comments assume the helper
// convention below; the helpers are defined outside this view — confirm:
//   p256SqrInternal: R10..R13 = (R10..R13)^2
//   p256MulInternal: R10..R13 = (R10..R13) * (R14, R15, DI, SI)
//   p256SubInternal: R10..R13 = (R10..R13) - (R14, R15, DI, SI)
// "acc" means R10..R13 and "t" means R14, R15, DI, SI.
2123 TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-16
2124 MOVQ res+0(FP), AX
2125 MOVQ in+8(FP), BX
// Copy the 96-byte input point to 0..95(SP).
2126 MOVOU (BX), X0
2127 MOVOU 16(BX), X1
2128 MOVOU 32(BX), X2
2129 MOVOU 48(BX), X3
2130 MOVOU 64(BX), X4
2131 MOVOU 80(BX), X5
2132 MOVOU X0, (SP)
2133 MOVOU X1, 16(SP)
2134 MOVOU X2, 32(SP)
2135 MOVOU X3, 48(SP)
2136 MOVOU X4, 64(SP)
2137 MOVOU X5, 80(SP)
2138
2139 // Store pointer to result
2140 MOVQ AX, 224(SP)
2141
2142 // Begin point double
// zsqr = z^2
2143 MOVQ 64(SP), R10
2144 MOVQ 72(SP), R11
2145 MOVQ 80(SP), R12
2146 MOVQ 88(SP), R13
2147 CALL p256SqrInternal(SB)
2148 MOVQ R10, 160(SP)
2149 MOVQ R11, 168(SP)
2150 MOVQ R12, 176(SP)
2151 MOVQ R13, 184(SP)
// t = x
2152 MOVQ (SP), R14
2153 MOVQ 8(SP), R15
2154 MOVQ 16(SP), DI
2155 MOVQ 24(SP), SI
// Inline addition: acc = acc + t with the carry-out collected in AX, then
// t = acc - P. If that subtraction borrows (CF set), t is restored to the
// unreduced sum. The result, m = zsqr + x, is left in t.
2156 XORQ AX, AX
2157 ADDQ R14, R10
2158 ADCQ R15, R11
2159 ADCQ DI, R12
2160 ADCQ SI, R13
2161 ADCQ $+0, AX
2162 MOVQ R10, R14
2163 MOVQ R11, R15
2164 MOVQ R12, DI
2165 MOVQ R13, SI
2166 SUBQ $-1, R14
2167 SBBQ p256const0<>+0(SB), R15
2168 SBBQ $+0, DI
2169 SBBQ p256const1<>+0(SB), SI
2170 SBBQ $+0, AX
2171 CMOVQCS R10, R14
2172 CMOVQCS R11, R15
2173 CMOVQCS R12, DI
2174 CMOVQCS R13, SI
2175 MOVQ R14, 128(SP)
2176 MOVQ R15, 136(SP)
2177 MOVQ DI, 144(SP)
2178 MOVQ SI, 152(SP)
// acc = z * y
2179 MOVQ 64(SP), R10
2180 MOVQ 72(SP), R11
2181 MOVQ 80(SP), R12
2182 MOVQ 88(SP), R13
2183 MOVQ 32(SP), R14
2184 MOVQ 40(SP), R15
2185 MOVQ 48(SP), DI
2186 MOVQ 56(SP), SI
2187 CALL p256MulInternal(SB)
// Inline doubling (same reduction as above): t = 2 * z * y
2188 XORQ AX, AX
2189 ADDQ R10, R10
2190 ADCQ R11, R11
2191 ADCQ R12, R12
2192 ADCQ R13, R13
2193 ADCQ $+0, AX
2194 MOVQ R10, R14
2195 MOVQ R11, R15
2196 MOVQ R12, DI
2197 MOVQ R13, SI
2198 SUBQ $-1, R14
2199 SBBQ p256const0<>+0(SB), R15
2200 SBBQ $+0, DI
2201 SBBQ p256const1<>+0(SB), SI
2202 SBBQ $+0, AX
2203 CMOVQCS R10, R14
2204 CMOVQCS R11, R15
2205 CMOVQCS R12, DI
2206 CMOVQCS R13, SI
2207 MOVQ 224(SP), AX
2208
2209 // Store z
2210 MOVQ R14, 64(AX)
2211 MOVQ R15, 72(AX)
2212 MOVQ DI, 80(AX)
2213 MOVQ SI, 88(AX)
// acc = x - zsqr
2214 MOVQ (SP), R10
2215 MOVQ 8(SP), R11
2216 MOVQ 16(SP), R12
2217 MOVQ 24(SP), R13
2218 MOVQ 160(SP), R14
2219 MOVQ 168(SP), R15
2220 MOVQ 176(SP), DI
2221 MOVQ 184(SP), SI
2222 CALL p256SubInternal(SB)
// m = (x - zsqr) * (x + zsqr)
2223 MOVQ 128(SP), R14
2224 MOVQ 136(SP), R15
2225 MOVQ 144(SP), DI
2226 MOVQ 152(SP), SI
2227 CALL p256MulInternal(SB)
2228 MOVQ R10, 128(SP)
2229 MOVQ R11, 136(SP)
2230 MOVQ R12, 144(SP)
2231 MOVQ R13, 152(SP)
2232
2233 // Multiply by 3
// Done as an inline doubling (t = 2*m) followed by an inline addition of the
// stored m (t = 2*m + m). The result is written back to the m slot.
2234 XORQ AX, AX
2235 ADDQ R10, R10
2236 ADCQ R11, R11
2237 ADCQ R12, R12
2238 ADCQ R13, R13
2239 ADCQ $+0, AX
2240 MOVQ R10, R14
2241 MOVQ R11, R15
2242 MOVQ R12, DI
2243 MOVQ R13, SI
2244 SUBQ $-1, R14
2245 SBBQ p256const0<>+0(SB), R15
2246 SBBQ $+0, DI
2247 SBBQ p256const1<>+0(SB), SI
2248 SBBQ $+0, AX
2249 CMOVQCS R10, R14
2250 CMOVQCS R11, R15
2251 CMOVQCS R12, DI
2252 CMOVQCS R13, SI
2253 MOVQ 128(SP), R10
2254 MOVQ 136(SP), R11
2255 MOVQ 144(SP), R12
2256 MOVQ 152(SP), R13
2257 XORQ AX, AX
2258 ADDQ R14, R10
2259 ADCQ R15, R11
2260 ADCQ DI, R12
2261 ADCQ SI, R13
2262 ADCQ $+0, AX
2263 MOVQ R10, R14
2264 MOVQ R11, R15
2265 MOVQ R12, DI
2266 MOVQ R13, SI
2267 SUBQ $-1, R14
2268 SBBQ p256const0<>+0(SB), R15
2269 SBBQ $+0, DI
2270 SBBQ p256const1<>+0(SB), SI
2271 SBBQ $+0, AX
2272 CMOVQCS R10, R14
2273 CMOVQCS R11, R15
2274 CMOVQCS R12, DI
2275 CMOVQCS R13, SI
2276 MOVQ R14, 128(SP)
2277 MOVQ R15, 136(SP)
2278 MOVQ DI, 144(SP)
2279 MOVQ SI, 152(SP)
2280
2281 // ////////////////////////
// acc = y; inline doubling gives t = 2*y, which is moved back into acc.
2282 MOVQ 32(SP), R10
2283 MOVQ 40(SP), R11
2284 MOVQ 48(SP), R12
2285 MOVQ 56(SP), R13
2286 XORQ AX, AX
2287 ADDQ R10, R10
2288 ADCQ R11, R11
2289 ADCQ R12, R12
2290 ADCQ R13, R13
2291 ADCQ $+0, AX
2292 MOVQ R10, R14
2293 MOVQ R11, R15
2294 MOVQ R12, DI
2295 MOVQ R13, SI
2296 SUBQ $-1, R14
2297 SBBQ p256const0<>+0(SB), R15
2298 SBBQ $+0, DI
2299 SBBQ p256const1<>+0(SB), SI
2300 SBBQ $+0, AX
2301 CMOVQCS R10, R14
2302 CMOVQCS R11, R15
2303 CMOVQCS R12, DI
2304 CMOVQCS R13, SI
2305 MOVQ R14, R10
2306 MOVQ R15, R11
2307 MOVQ DI, R12
2308 MOVQ SI, R13
// s = (2*y)^2
2309 CALL p256SqrInternal(SB)
2310 MOVQ R10, 96(SP)
2311 MOVQ R11, 104(SP)
2312 MOVQ R12, 112(SP)
2313 MOVQ R13, 120(SP)
// acc = (2*y)^4
2314 CALL p256SqrInternal(SB)
2315
2316 // Divide by 2
// t keeps the original value. acc = value + P with the carry-out in AX. If
// the original low bit is 0 (TESTQ sets ZF) the original value is restored,
// and ANDQ clears AX because bit 0 of R14 is 0. The four three-operand SHRQ
// instructions then shift AX:R13:R12:R11:R10 right by one bit as a unit.
2317 XORQ AX, AX
2318 MOVQ R10, R14
2319 MOVQ R11, R15
2320 MOVQ R12, DI
2321 MOVQ R13, SI
2322 ADDQ $-1, R10
2323 ADCQ p256const0<>+0(SB), R11
2324 ADCQ $0x00, R12
2325 ADCQ p256const1<>+0(SB), R13
2326 ADCQ $0x00, AX
2327 TESTQ $0x00000001, R14
2328 CMOVQEQ R14, R10
2329 CMOVQEQ R15, R11
2330 CMOVQEQ DI, R12
2331 CMOVQEQ SI, R13
2332 ANDQ R14, AX
2333 SHRQ $0x01, R11, R10
2334 SHRQ $0x01, R12, R11
2335 SHRQ $0x01, R13, R12
2336 SHRQ $0x01, AX, R13
// The halved value overwrites the stack copy of y, which is not read again
// as y after this point.
2337 MOVQ R10, 32(SP)
2338 MOVQ R11, 40(SP)
2339 MOVQ R12, 48(SP)
2340 MOVQ R13, 56(SP)
2341
2342 // /////////////////////////
// s = x * (2*y)^2, written back to the s slot
2343 MOVQ (SP), R10
2344 MOVQ 8(SP), R11
2345 MOVQ 16(SP), R12
2346 MOVQ 24(SP), R13
2347 MOVQ 96(SP), R14
2348 MOVQ 104(SP), R15
2349 MOVQ 112(SP), DI
2350 MOVQ 120(SP), SI
2351 CALL p256MulInternal(SB)
2352 MOVQ R10, 96(SP)
2353 MOVQ R11, 104(SP)
2354 MOVQ R12, 112(SP)
2355 MOVQ R13, 120(SP)
// Inline doubling: tmp = 2 * s
2356 XORQ AX, AX
2357 ADDQ R10, R10
2358 ADCQ R11, R11
2359 ADCQ R12, R12
2360 ADCQ R13, R13
2361 ADCQ $+0, AX
2362 MOVQ R10, R14
2363 MOVQ R11, R15
2364 MOVQ R12, DI
2365 MOVQ R13, SI
2366 SUBQ $-1, R14
2367 SBBQ p256const0<>+0(SB), R15
2368 SBBQ $+0, DI
2369 SBBQ p256const1<>+0(SB), SI
2370 SBBQ $+0, AX
2371 CMOVQCS R10, R14
2372 CMOVQCS R11, R15
2373 CMOVQCS R12, DI
2374 CMOVQCS R13, SI
2375 MOVQ R14, 192(SP)
2376 MOVQ R15, 200(SP)
2377 MOVQ DI, 208(SP)
2378 MOVQ SI, 216(SP)
// acc = m^2
2379 MOVQ 128(SP), R10
2380 MOVQ 136(SP), R11
2381 MOVQ 144(SP), R12
2382 MOVQ 152(SP), R13
2383 CALL p256SqrInternal(SB)
// acc = m^2 - tmp, which is the output x
2384 MOVQ 192(SP), R14
2385 MOVQ 200(SP), R15
2386 MOVQ 208(SP), DI
2387 MOVQ 216(SP), SI
2388 CALL p256SubInternal(SB)
2389 MOVQ 224(SP), AX
2390
2391 // Store x
2392 MOVQ R10, (AX)
2393 MOVQ R11, 8(AX)
2394 MOVQ R12, 16(AX)
2395 MOVQ R13, 24(AX)
// t = output x; acc = s - output x
2396 MOVQ R10, R14
2397 MOVQ R11, R15
2398 MOVQ R12, DI
2399 MOVQ R13, SI
2400 MOVQ 96(SP), R10
2401 MOVQ 104(SP), R11
2402 MOVQ 112(SP), R12
2403 MOVQ 120(SP), R13
2404 CALL p256SubInternal(SB)
// acc = m * (s - output x)
2405 MOVQ 128(SP), R14
2406 MOVQ 136(SP), R15
2407 MOVQ 144(SP), DI
2408 MOVQ 152(SP), SI
2409 CALL p256MulInternal(SB)
// acc = m * (s - output x) - (halved (2*y)^4 stored at 32(SP)), the output y
2410 MOVQ 32(SP), R14
2411 MOVQ 40(SP), R15
2412 MOVQ 48(SP), DI
2413 MOVQ 56(SP), SI
2414 CALL p256SubInternal(SB)
2415 MOVQ 224(SP), AX
2416
2417 // Store y
2418 MOVQ R10, 32(AX)
2419 MOVQ R11, 40(AX)
2420 MOVQ R12, 48(AX)
2421 MOVQ R13, 56(AX)
2422
2423 // ///////////////////////
// Zero the stack slot that held the res pointer before returning.
2424 MOVQ $0x00000000, 224(SP)
2425 RET
2426
View as plain text