p256_asm_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
// This file contains a constant-time, 64-bit assembly implementation of
     8  // P256. The optimizations performed here are described in detail in:
     9  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
    10  //                          256-bit primes"
    11  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    12  // https://eprint.iacr.org/2013/816.pdf
    13  
    14  #include "textflag.h"
    15  
// Register aliases shared by every routine in this file.

// Pointer arguments loaded from the Go argument frame.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

// acc0..acc7: 64-bit words of the multi-word accumulators.
#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
// t0..t3: scratch words. const0/const1: constants kept live across the
// internal multiply/square/subtract helpers.
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
#define const0 R15
#define const1 R16

// hlp1 shares a register with res_ptr, so a routine that uses hlp1 has to
// (re)load res from the frame when it finally needs the destination pointer.
// R18 is not assigned to any alias (presumably because it is
// platform-reserved - confirm).
#define hlp0 R17
#define hlp1 res_ptr

// x0..x3 and y0..y3: the two four-word operands of the internal helpers.
#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

// const2/const3 share registers with t2/t3: while they hold constants,
// t2/t3 are not available as scratch.
#define const2 t2
#define const3 t3
    50  
// p256const0 and p256const1 are words 1 and 3 (least significant word first)
// of the field modulus that p256NegCond calls "poly"; words 0 and 2 are the
// immediates -1 and 0 and are never loaded from memory.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the per-word multiplier used by the reduction steps of
// p256OrdMul and p256OrdSqr.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord is the four-word modulus (least significant word first) used by
// p256OrdMul and p256OrdSqr.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one is the value p256PointAddAffineAsm stores when it sets "z3 = 1"
// (presumably 1 in the Montgomery representation - confirm).
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
// Flag value 8 is RODATA in textflag.h; the last operand is the symbol size
// in bytes.
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
    67  
    68  /* ---------------------------------------*/
    69  // func p256MovCond(res, a, b *P256Point, cond int)
    70  // If cond == 0 res=b, else res=a
    71  TEXT ·p256MovCond(SB),NOSPLIT,$0
    72  	MOVD	res+0(FP), res_ptr
    73  	MOVD	a+8(FP), a_ptr
    74  	MOVD	b+16(FP), b_ptr
    75  	MOVD	cond+24(FP), R3
    76  
    77  	CMP	$0, R3
    78  	// Two remarks:
    79  	// 1) Will want to revisit NEON, when support is better
    80  	// 2) CSEL might not be constant time on all ARM processors
    81  	LDP	0*16(a_ptr), (R4, R5)
    82  	LDP	1*16(a_ptr), (R6, R7)
    83  	LDP	2*16(a_ptr), (R8, R9)
    84  	LDP	0*16(b_ptr), (R16, R17)
    85  	LDP	1*16(b_ptr), (R19, R20)
    86  	LDP	2*16(b_ptr), (R21, R22)
    87  	CSEL	EQ, R16, R4, R4
    88  	CSEL	EQ, R17, R5, R5
    89  	CSEL	EQ, R19, R6, R6
    90  	CSEL	EQ, R20, R7, R7
    91  	CSEL	EQ, R21, R8, R8
    92  	CSEL	EQ, R22, R9, R9
    93  	STP	(R4, R5), 0*16(res_ptr)
    94  	STP	(R6, R7), 1*16(res_ptr)
    95  	STP	(R8, R9), 2*16(res_ptr)
    96  
    97  	LDP	3*16(a_ptr), (R4, R5)
    98  	LDP	4*16(a_ptr), (R6, R7)
    99  	LDP	5*16(a_ptr), (R8, R9)
   100  	LDP	3*16(b_ptr), (R16, R17)
   101  	LDP	4*16(b_ptr), (R19, R20)
   102  	LDP	5*16(b_ptr), (R21, R22)
   103  	CSEL	EQ, R16, R4, R4
   104  	CSEL	EQ, R17, R5, R5
   105  	CSEL	EQ, R19, R6, R6
   106  	CSEL	EQ, R20, R7, R7
   107  	CSEL	EQ, R21, R8, R8
   108  	CSEL	EQ, R22, R9, R9
   109  	STP	(R4, R5), 3*16(res_ptr)
   110  	STP	(R6, R7), 4*16(res_ptr)
   111  	STP	(R8, R9), 5*16(res_ptr)
   112  
   113  	RET
   114  /* ---------------------------------------*/
   115  // func p256NegCond(val *p256Element, cond int)
   116  TEXT ·p256NegCond(SB),NOSPLIT,$0
   117  	MOVD	val+0(FP), a_ptr
   118  	MOVD	cond+8(FP), hlp0
   119  	MOVD	a_ptr, res_ptr
   120  	// acc = poly
   121  	MOVD	$-1, acc0
   122  	MOVD	p256const0<>(SB), acc1
   123  	MOVD	$0, acc2
   124  	MOVD	p256const1<>(SB), acc3
   125  	// Load the original value
   126  	LDP	0*16(a_ptr), (t0, t1)
   127  	LDP	1*16(a_ptr), (t2, t3)
   128  	// Speculatively subtract
   129  	SUBS	t0, acc0
   130  	SBCS	t1, acc1
   131  	SBCS	t2, acc2
   132  	SBC	t3, acc3
   133  	// If condition is 0, keep original value
   134  	CMP	$0, hlp0
   135  	CSEL	EQ, t0, acc0, acc0
   136  	CSEL	EQ, t1, acc1, acc1
   137  	CSEL	EQ, t2, acc2, acc2
   138  	CSEL	EQ, t3, acc3, acc3
   139  	// Store result
   140  	STP	(acc0, acc1), 0*16(res_ptr)
   141  	STP	(acc2, acc3), 1*16(res_ptr)
   142  
   143  	RET
   144  /* ---------------------------------------*/
   145  // func p256Sqr(res, in *p256Element, n int)
   146  TEXT ·p256Sqr(SB),NOSPLIT,$0
   147  	MOVD	res+0(FP), res_ptr
   148  	MOVD	in+8(FP), a_ptr
   149  	MOVD	n+16(FP), b_ptr
   150  
   151  	MOVD	p256const0<>(SB), const0
   152  	MOVD	p256const1<>(SB), const1
   153  
   154  	LDP	0*16(a_ptr), (x0, x1)
   155  	LDP	1*16(a_ptr), (x2, x3)
   156  
   157  sqrLoop:
   158  	SUB	$1, b_ptr
   159  	CALL	p256SqrInternal<>(SB)
   160  	MOVD	y0, x0
   161  	MOVD	y1, x1
   162  	MOVD	y2, x2
   163  	MOVD	y3, x3
   164  	CBNZ	b_ptr, sqrLoop
   165  
   166  	STP	(y0, y1), 0*16(res_ptr)
   167  	STP	(y2, y3), 1*16(res_ptr)
   168  	RET
   169  /* ---------------------------------------*/
   170  // func p256Mul(res, in1, in2 *p256Element)
   171  TEXT ·p256Mul(SB),NOSPLIT,$0
   172  	MOVD	res+0(FP), res_ptr
   173  	MOVD	in1+8(FP), a_ptr
   174  	MOVD	in2+16(FP), b_ptr
   175  
   176  	MOVD	p256const0<>(SB), const0
   177  	MOVD	p256const1<>(SB), const1
   178  
   179  	LDP	0*16(a_ptr), (x0, x1)
   180  	LDP	1*16(a_ptr), (x2, x3)
   181  
   182  	LDP	0*16(b_ptr), (y0, y1)
   183  	LDP	1*16(b_ptr), (y2, y3)
   184  
   185  	CALL	p256MulInternal<>(SB)
   186  
   187  	STP	(y0, y1), 0*16(res_ptr)
   188  	STP	(y2, y3), 1*16(res_ptr)
   189  	RET
   190  /* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
// Runs four word-wise reduction steps on the four-word input (no
// multiplication by a second operand), then conditionally subtracts the
// modulus {-1, const0, 0, const1} once and stores the result to res.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// Each step folds the current lowest word w into the three words above
	// it: w<<32 is added one word up, w>>32 two words up, the low half of
	// w*const1 three words up, and the high half of w*const1 (plus the
	// final carry) replaces w as the new top word. The roles of acc0..acc3
	// therefore rotate by one register per step.
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3

	// t = acc - modulus; the carry flag ends up set iff there was no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	// No borrow: keep the subtracted value; borrow: keep acc as it was.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
   249  /* ---------------------------------------*/
   250  // func p256Select(res *P256Point, table *p256Table, idx int)
   251  TEXT ·p256Select(SB),NOSPLIT,$0
   252  	MOVD	idx+16(FP), const0
   253  	MOVD	table+8(FP), b_ptr
   254  	MOVD	res+0(FP), res_ptr
   255  
   256  	EOR	x0, x0, x0
   257  	EOR	x1, x1, x1
   258  	EOR	x2, x2, x2
   259  	EOR	x3, x3, x3
   260  	EOR	y0, y0, y0
   261  	EOR	y1, y1, y1
   262  	EOR	y2, y2, y2
   263  	EOR	y3, y3, y3
   264  	EOR	t0, t0, t0
   265  	EOR	t1, t1, t1
   266  	EOR	t2, t2, t2
   267  	EOR	t3, t3, t3
   268  
   269  	MOVD	$0, const1
   270  
   271  loop_select:
   272  		ADD	$1, const1
   273  		CMP	const0, const1
   274  		LDP.P	16(b_ptr), (acc0, acc1)
   275  		CSEL	EQ, acc0, x0, x0
   276  		CSEL	EQ, acc1, x1, x1
   277  		LDP.P	16(b_ptr), (acc2, acc3)
   278  		CSEL	EQ, acc2, x2, x2
   279  		CSEL	EQ, acc3, x3, x3
   280  		LDP.P	16(b_ptr), (acc4, acc5)
   281  		CSEL	EQ, acc4, y0, y0
   282  		CSEL	EQ, acc5, y1, y1
   283  		LDP.P	16(b_ptr), (acc6, acc7)
   284  		CSEL	EQ, acc6, y2, y2
   285  		CSEL	EQ, acc7, y3, y3
   286  		LDP.P	16(b_ptr), (acc0, acc1)
   287  		CSEL	EQ, acc0, t0, t0
   288  		CSEL	EQ, acc1, t1, t1
   289  		LDP.P	16(b_ptr), (acc2, acc3)
   290  		CSEL	EQ, acc2, t2, t2
   291  		CSEL	EQ, acc3, t3, t3
   292  
   293  		CMP	$16, const1
   294  		BNE	loop_select
   295  
   296  	STP	(x0, x1), 0*16(res_ptr)
   297  	STP	(x2, x3), 1*16(res_ptr)
   298  	STP	(y0, y1), 2*16(res_ptr)
   299  	STP	(y2, y3), 3*16(res_ptr)
   300  	STP	(t0, t1), 4*16(res_ptr)
   301  	STP	(t2, t3), 5*16(res_ptr)
   302  	RET
   303  /* ---------------------------------------*/
   304  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   305  TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   306  	MOVD	idx+16(FP), t0
   307  	MOVD	table+8(FP), t1
   308  	MOVD	res+0(FP), res_ptr
   309  
   310  	EOR	x0, x0, x0
   311  	EOR	x1, x1, x1
   312  	EOR	x2, x2, x2
   313  	EOR	x3, x3, x3
   314  	EOR	y0, y0, y0
   315  	EOR	y1, y1, y1
   316  	EOR	y2, y2, y2
   317  	EOR	y3, y3, y3
   318  
   319  	MOVD	$0, t2
   320  
   321  loop_select:
   322  		ADD	$1, t2
   323  		CMP	t0, t2
   324  		LDP.P	16(t1), (acc0, acc1)
   325  		CSEL	EQ, acc0, x0, x0
   326  		CSEL	EQ, acc1, x1, x1
   327  		LDP.P	16(t1), (acc2, acc3)
   328  		CSEL	EQ, acc2, x2, x2
   329  		CSEL	EQ, acc3, x3, x3
   330  		LDP.P	16(t1), (acc4, acc5)
   331  		CSEL	EQ, acc4, y0, y0
   332  		CSEL	EQ, acc5, y1, y1
   333  		LDP.P	16(t1), (acc6, acc7)
   334  		CSEL	EQ, acc6, y2, y2
   335  		CSEL	EQ, acc7, y3, y3
   336  
   337  		CMP	$32, t2
   338  		BNE	loop_select
   339  
   340  	STP	(x0, x1), 0*16(res_ptr)
   341  	STP	(x2, x3), 1*16(res_ptr)
   342  	STP	(y0, y1), 2*16(res_ptr)
   343  	STP	(y2, y3), 3*16(res_ptr)
   344  	RET
   345  /* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
// Squares in n times in a row, running four word-wise reduction steps
// against p256ord and one conditional subtraction after every squaring,
// and stores the final value to res.
//
// The counter is decremented at the top of the loop and tested with CBNZ at
// the bottom, so the body always runs at least once (callers are presumably
// expected to pass n >= 1).
//
// Register notes: hlp1 shares a register with res_ptr and holds p256ordK0
// for the whole loop, which is why res is only loaded from the frame after
// the loop. const0..const3 hold the four words of p256ord; const2/const3
// share registers with t2/t3, so only t0/t1 are used as scratch here.
// y0..y3 are used as scratch as well.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
	SUB	$1, b_ptr

	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1..acc6.
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// Double the off-diagonal sum; the bit shifted out lands in acc7.
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Add the diagonal products x[i]*x[i]; acc0..acc7 then hold the full
	// eight-word square.
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// Each reduction step derives a factor hlp0 = (lowest word) * K0, adds
	// hlp0 * p256ord to the low four words, and drops the lowest word; the
	// roles of acc0..acc3 rotate by one register per step.
	// NOTE(review): the first addition of each step multiplies const0 by
	// hlp1 (K0), not by hlp0. The low word of that sum is discarded (the
	// register is overwritten a few lines later), so only the carry it
	// produces is used - presumably relying on const0*K0 == -1 mod 2^64;
	// confirm.
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	// Second pass: add the high halves of the four products.
	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the previous three steps, this carry goes into acc7 rather
	// than hlp0; acc7 is added to the same word (acc3) just below.
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper four words of the square; acc4 receives the carry out.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// y = acc - p256ord, with the borrow propagated through the carry
	// word acc4; the carry flag ends up set iff there was no borrow.
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	// No borrow: take the subtracted value; borrow: keep acc. The result
	// goes straight into x for the next iteration.
	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

	// hlp1 is no longer needed, so its register can take res.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
   544  /* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
// Multiplies in1 (x) by in2 (y) one word of y at a time, running one
// word-wise reduction step against p256ord after each partial product, then
// performs one conditional subtraction of p256ord and stores the result to
// res.
//
// The running value is kept in two register groups: acc0..acc3 hold the
// reduced words (their roles rotate by one register per reduction step) and
// acc4..acc7 collect the upper words of the partial products. The two groups
// are added together after the last reduction step.
//
// Register notes: hlp1 shares a register with res_ptr and holds p256ordK0,
// which is why res is only loaded from the frame just before the final
// store. const2/const3 share registers with t2/t3. y0 and y1 are reused as
// scratch once their own partial products have been formed.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// Each reduction step derives a factor hlp0 = (lowest word) * K0, adds
	// hlp0 * p256ord to the low four words, and drops the lowest word.
	// NOTE(review): the first addition of each step multiplies const0 by
	// hlp1 (K0), not by hlp0. The low word of that sum is discarded (the
	// register is overwritten a few lines later), so only the carry it
	// produces is used - presumably relying on const0*K0 == -1 mod 2^64;
	// confirm.
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	// y0 is free from here on: y[0] * x has already been formed.
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	// The carry of the low-half pass goes into the product group (acc4).
	ADC	$0, acc4

	// Second pass: add the high halves of the four products.
	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	// Last use of y1 as an operand; it is scratch from here on.
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Merge the product group into the reduced group; acc4 receives the
	// carry out.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p256ord, with the borrow propagated through the carry
	// word acc4. t2/t3 are const2/const3: each is read as the subtrahend
	// and overwritten with the difference by the same instruction.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: keep the subtracted value; borrow: keep acc as it was.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// hlp1 is no longer needed, so its register can take res.
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
   764  /* ---------------------------------------*/
// p256SubInternal computes x = y - x, adding the modulus
// {-1, const0, 0, const1} back when the subtraction borrows.
// In:       x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:      x0..x3.
// Clobbers: acc0..acc7, t0, flags. y0..y3 are left unchanged.
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	// t0 = 0 if there was no borrow, all ones otherwise.
	SBC	$0, ZR, t0

	// Always compute the corrected candidate acc4..acc7 = acc + modulus.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// EQ (no borrow): keep the plain difference; otherwise take the
	// candidate with the modulus added back.
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
   784  /* ---------------------------------------*/
// p256SqrInternal squares x and reduces the result: it forms the eight-word
// square, runs four word-wise reduction steps on the low half, adds the
// high half and conditionally subtracts the modulus
// {-1, const0, 0, const1} once.
// In:       x0..x3; const0/const1 must hold p256const0/p256const1.
// Out:      y0..y3. x0..x3 are left unchanged.
// Clobbers: acc0..acc7, t0..t3, flags.
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1..acc6.
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// Double the off-diagonal sum; the bit shifted out lands in acc7.
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Add the diagonal products x[i]*x[i]; acc0..acc7 then hold the full
	// eight-word square.
	// Missing products
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// Each step folds the current lowest word w into the three words above
	// it: w<<32 is added one word up, w>>32 two words up, the low half of
	// w*const1 three words up, and the high half of w*const1 (plus the
	// final carry) replaces w as the new top word. The roles of acc0..acc3
	// rotate by one register per step.
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - modulus, with the borrow propagated through the carry
	// word acc4; the carry flag ends up set iff there was no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: return the subtracted value; borrow: return acc.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
   893  /* ---------------------------------------*/
// p256MulInternal multiplies x by y one word of y at a time, running one
// word-wise reduction step after each partial product, then conditionally
// subtracts the modulus {-1, const0, 0, const1} once.
// In:       x0..x3, y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:      y0..y3 (the y input is overwritten). x0..x3 are left unchanged.
// Clobbers: acc0..acc7, t0..t3, hlp0, flags.
//
// The running value is kept in two register groups: acc0..acc3 hold the
// reduced words (their roles rotate by one register per reduction step) and
// acc4..acc7 collect the upper words of the partial products. The two groups
// are added together after the last reduction step.
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// Each reduction step folds the current lowest word w into the three
	// words above it: w<<32 one word up, w>>32 two words up, the low half
	// of w*const1 three words up, and the high half of w*const1 (plus the
	// final carry) replaces w as the new top word.
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Low halves of each partial product are added in a first carry chain;
	// the high halves (t1, t2, t3, hlp0) follow in a second one.
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - modulus, with the borrow propagated through the carry
	// word acc4; the carry flag ends up set iff there was no borrow.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: return the subtracted value; borrow: return acc.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
  1027  /* ---------------------------------------*/
// p256MulBy2Inline computes x = 2*y with one conditional subtraction of the
// modulus {-1, const0, 0, const1}. The doubling goes into x with its carry
// out in hlp0; t = x - modulus is then formed with the borrow propagated
// through hlp0. If that borrows (carry clear) x is kept, otherwise t is
// taken.
// In:       y0..y3; const0/const1 must hold p256const0/p256const1.
// Out:      x0..x3. y0..y3 are left unchanged.
// Clobbers: t0..t3, hlp0, flags.
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;
  1043  /* ---------------------------------------*/
// Memory operands for the 32-byte coordinates of the points addressed by
// a_ptr (x1in, y1in, z1in at offsets 0, 32, 64), b_ptr (x2in, z2in at
// offsets 0, 64) and res_ptr (x3out, y3out, z3out at offsets 0, 32, 64).
// off is a byte offset inside the coordinate.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// LDx/LDy load a 32-byte value into x0..x3 / y0..y3 and STx/STy store those
// registers back. src is the name of one of the one-argument operand macros,
// which gets expanded at byte offsets 0 and 16.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1056  /* ---------------------------------------*/
// 32-byte stack temporaries: slot k starts 32*k + 8 bytes above RSP.
// Slots 0-7 end at byte 264, which is the frame size declared by
// p256PointAddAffineAsm.
#define y2in(off)  (32*0 + 8 + off)(RSP)
#define s2(off)    (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off)	   (32*3 + 8 + off)(RSP)
#define r(off)	   (32*4 + 8 + off)(RSP)
#define hsqr(off)  (32*5 + 8 + off)(RSP)
#define rsqr(off)  (32*6 + 8 + off)(RSP)
#define hcub(off)  (32*7 + 8 + off)(RSP)

// Slots 8-11 lie beyond a 264-byte frame; presumably they are for a routine
// with a larger frame defined later in the file - confirm.
#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)
  1070  
  1071  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1072  TEXT ·p256PointAddAffineAsm(SB),0,$264-48
  1073  	MOVD	in1+8(FP), a_ptr
  1074  	MOVD	in2+16(FP), b_ptr
  1075  	MOVD	sign+24(FP), hlp0
  1076  	MOVD	sel+32(FP), hlp1
  1077  	MOVD	zero+40(FP), t2
  1078  
  1079  	MOVD	$1, t0
  1080  	CMP	$0, t2
  1081  	CSEL	EQ, ZR, t0, t2
  1082  	CMP	$0, hlp1
  1083  	CSEL	EQ, ZR, t0, hlp1
  1084  
  1085  	MOVD	p256const0<>(SB), const0
  1086  	MOVD	p256const1<>(SB), const1
  1087  	EOR	t2<<1, hlp1
  1088  
  1089  	// Negate y2in based on sign
  1090  	LDP	2*16(b_ptr), (y0, y1)
  1091  	LDP	3*16(b_ptr), (y2, y3)
  1092  	MOVD	$-1, acc0
  1093  
  1094  	SUBS	y0, acc0, acc0
  1095  	SBCS	y1, const0, acc1
  1096  	SBCS	y2, ZR, acc2
  1097  	SBCS	y3, const1, acc3
  1098  	SBC	$0, ZR, t0
  1099  
  1100  	ADDS	$-1, acc0, acc4
  1101  	ADCS	const0, acc1, acc5
  1102  	ADCS	$0, acc2, acc6
  1103  	ADCS	const1, acc3, acc7
  1104  	ADC	$0, t0, t0
  1105  
  1106  	CMP	$0, t0
  1107  	CSEL	EQ, acc4, acc0, acc0
  1108  	CSEL	EQ, acc5, acc1, acc1
  1109  	CSEL	EQ, acc6, acc2, acc2
  1110  	CSEL	EQ, acc7, acc3, acc3
  1111  	// If condition is 0, keep original value
  1112  	CMP	$0, hlp0
  1113  	CSEL	EQ, y0, acc0, y0
  1114  	CSEL	EQ, y1, acc1, y1
  1115  	CSEL	EQ, y2, acc2, y2
  1116  	CSEL	EQ, y3, acc3, y3
  1117  	// Store result
  1118  	STy(y2in)
  1119  	// Begin point add
  1120  	LDx(z1in)
  1121  	CALL	p256SqrInternal<>(SB)    // z1ˆ2
  1122  	STy(z1sqr)
  1123  
  1124  	LDx(x2in)
  1125  	CALL	p256MulInternal<>(SB)    // x2 * z1ˆ2
  1126  
  1127  	LDx(x1in)
  1128  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1129  	STx(h)
  1130  
  1131  	LDy(z1in)
  1132  	CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1133  
  1134  	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
  1135  	LDP	5*16(a_ptr), (acc2, acc3)
  1136  	ANDS	$1, hlp1, ZR
  1137  	CSEL	EQ, acc0, y0, y0
  1138  	CSEL	EQ, acc1, y1, y1
  1139  	CSEL	EQ, acc2, y2, y2
  1140  	CSEL	EQ, acc3, y3, y3
  1141  	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1142  	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1143  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
  1144  	CSEL	EQ, acc0, y0, y0
  1145  	CSEL	EQ, acc1, y1, y1
  1146  	CSEL	EQ, acc2, y2, y2
  1147  	CSEL	EQ, acc3, y3, y3
  1148  	LDx(z1in)
  1149  	MOVD	res+0(FP), t0
  1150  	STP	(y0, y1), 4*16(t0)
  1151  	STP	(y2, y3), 5*16(t0)
  1152  
  1153  	LDy(z1sqr)
  1154  	CALL	p256MulInternal<>(SB)    // z1 ^ 3
  1155  
  1156  	LDx(y2in)
  1157  	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1ˆ3
  1158  	STy(s2)
  1159  
  1160  	LDx(y1in)
  1161  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1162  	STx(r)
  1163  
  1164  	CALL	p256SqrInternal<>(SB)    // rsqr = rˆ2
  1165  	STy	(rsqr)
  1166  
  1167  	LDx(h)
  1168  	CALL	p256SqrInternal<>(SB)    // hsqr = hˆ2
  1169  	STy(hsqr)
  1170  
  1171  	CALL	p256MulInternal<>(SB)    // hcub = hˆ3
  1172  	STy(hcub)
  1173  
  1174  	LDx(y1in)
  1175  	CALL	p256MulInternal<>(SB)    // y1 * hˆ3
  1176  	STy(s2)
  1177  
  1178  	LDP	hsqr(0*8), (x0, x1)
  1179  	LDP	hsqr(2*8), (x2, x3)
  1180  	LDP	0*16(a_ptr), (y0, y1)
  1181  	LDP	1*16(a_ptr), (y2, y3)
  1182  	CALL	p256MulInternal<>(SB)    // u1 * hˆ2
  1183  	STP	(y0, y1), h(0*8)
  1184  	STP	(y2, y3), h(2*8)
  1185  
  1186  	p256MulBy2Inline               // u1 * hˆ2 * 2, inline
  1187  
  1188  	LDy(rsqr)
  1189  	CALL	p256SubInternal<>(SB)    // rˆ2 - u1 * hˆ2 * 2
  1190  
  1191  	MOVD	x0, y0
  1192  	MOVD	x1, y1
  1193  	MOVD	x2, y2
  1194  	MOVD	x3, y3
  1195  	LDx(hcub)
  1196  	CALL	p256SubInternal<>(SB)
  1197  
  1198  	LDP	0*16(a_ptr), (acc0, acc1)
  1199  	LDP	1*16(a_ptr), (acc2, acc3)
  1200  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1201  	CSEL	EQ, acc0, x0, x0
  1202  	CSEL	EQ, acc1, x1, x1
  1203  	CSEL	EQ, acc2, x2, x2
  1204  	CSEL	EQ, acc3, x3, x3
  1205  	LDP	0*16(b_ptr), (acc0, acc1)
  1206  	LDP	1*16(b_ptr), (acc2, acc3)
  1207  	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1208  	CSEL	EQ, acc0, x0, x0
  1209  	CSEL	EQ, acc1, x1, x1
  1210  	CSEL	EQ, acc2, x2, x2
  1211  	CSEL	EQ, acc3, x3, x3
  1212  	MOVD	res+0(FP), t0
  1213  	STP	(x0, x1), 0*16(t0)
  1214  	STP	(x2, x3), 1*16(t0)
  1215  
  1216  	LDP	h(0*8), (y0, y1)
  1217  	LDP	h(2*8), (y2, y3)
  1218  	CALL	p256SubInternal<>(SB)
  1219  
  1220  	LDP	r(0*8), (y0, y1)
  1221  	LDP	r(2*8), (y2, y3)
  1222  	CALL	p256MulInternal<>(SB)
  1223  
  1224  	LDP	s2(0*8), (x0, x1)
  1225  	LDP	s2(2*8), (x2, x3)
  1226  	CALL	p256SubInternal<>(SB)
  1227  	LDP	2*16(a_ptr), (acc0, acc1)
  1228  	LDP	3*16(a_ptr), (acc2, acc3)
  1229  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1230  	CSEL	EQ, acc0, x0, x0
  1231  	CSEL	EQ, acc1, x1, x1
  1232  	CSEL	EQ, acc2, x2, x2
  1233  	CSEL	EQ, acc3, x3, x3
  1234  	LDP	y2in(0*8), (acc0, acc1)
  1235  	LDP	y2in(2*8), (acc2, acc3)
  1236  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2
  1237  	CSEL	EQ, acc0, x0, x0
  1238  	CSEL	EQ, acc1, x1, x1
  1239  	CSEL	EQ, acc2, x2, x2
  1240  	CSEL	EQ, acc3, x3, x3
  1241  	MOVD	res+0(FP), t0
  1242  	STP	(x0, x1), 2*16(t0)
  1243  	STP	(x2, x3), 3*16(t0)
  1244  
  1245  	RET
  1246  
        // p256AddInline: (x3:x2:x1:x0) += (y3:y2:y1:y0), followed by one conditional
        // subtraction of the P-256 prime p.
        //
        // Steps:
        //   1. ADDS/ADCS form the 256-bit sum in x0..x3; ADC leaves the carry-out
        //      (bit 256) in hlp0.
        //   2. SUBS/SBCS compute sum - p into t0..t3. The limbs subtracted are, least
        //      significant first, $-1, const0, $0, const1, so const0 and const1 must
        //      already hold p256const0 (0x00000000ffffffff) and p256const1
        //      (0xffffffff00000001) when the macro is expanded.
        //   3. The last SBCS extends the borrow through hlp0. If the carry flag ends
        //      up clear (a borrow: the 257-bit sum is below p) CSEL keeps the plain
        //      sum, otherwise it takes sum - p.
        //
        // The choice is made with CSEL, not with a branch. A single subtraction
        // brings the result below p whenever both inputs were below p.
        //
        // Reads:    y0..y3, const0, const1
        // Writes:   x0..x3
        // Clobbers: t0..t3, hlp0, condition flags (t2/t3 are also const2/const3)
  1247  #define p256AddInline          \
  1248  	ADDS	y0, x0, x0;    \
  1249  	ADCS	y1, x1, x1;    \
  1250  	ADCS	y2, x2, x2;    \
  1251  	ADCS	y3, x3, x3;    \
  1252  	ADC	$0, ZR, hlp0;  \
  1253  	SUBS	$-1, x0, t0;   \
  1254  	SBCS	const0, x1, t1;\
  1255  	SBCS	$0, x2, t2;    \
  1256  	SBCS	const1, x3, t3;\
  1257  	SBCS	$0, hlp0, hlp0;\
  1258  	CSEL	CC, x0, t0, x0;\
  1259  	CSEL	CC, x1, t1, x1;\
  1260  	CSEL	CC, x2, t2, x2;\
  1261  	CSEL	CC, x3, t3, x3;
  1262  
        // Stack temporaries used by p256PointDoubleAsm. Each macro addresses byte
        // `off` inside a 32-byte slot; slot i begins 8 + 32*i bytes above RSP, so the
        // four slots span offsets 8..135 of that function's 136-byte frame.
  1263  #define s(off)	(32*0 + 8 + off)(RSP)
  1264  #define m(off)	(32*1 + 8 + off)(RSP)
  1265  #define zsqr(off) (32*2 + 8 + off)(RSP)
  1266  #define tmp(off)  (32*3 + 8 + off)(RSP)
  1267  
  1268  //func p256PointDoubleAsm(res, in *P256Point)
        //
        // Doubles the point at `in` and stores the result through `res`.
        //
        // The input is read through a_ptr. The explicit loads below take the
        // coordinate that these comments call x1 from offset 0 and z1 from offset 64;
        // y1 is reached through the y1in accessor. Scratch space is the four 32-byte
        // stack slots s, m, zsqr and tmp (136-byte frame).
        //
        // The step comments assume the helper conventions that their use in this
        // file suggests:
        //   p256SqrInternal:  y = x^2        p256MulInternal:  y = x * y
        //   p256SubInternal:  x = y - x      p256MulBy2Inline: x = 2 * y
        //   LDx/LDy/STx/STy:  load/store the x0..x3 / y0..y3 register groups
        // Those definitions, and the x1in/y1in/z1in/x3out/y3out/z3out accessors, lie
        // outside this part of the file - TODO confirm against them. Under that
        // reading the function computes
        //   m  = 3 * (x1 - z1^2) * (x1 + z1^2)
        //   s  = 4 * x1 * y1^2
        //   x3 = m^2 - 2*s
        //   y3 = m * (s - x3) - 8*y1^4
        //   z3 = 2 * y1 * z1
        //
        // The body contains no conditional branches; the only data-dependent choice
        // (in the halving step) is made with CSEL.
  1269  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
  1270  	MOVD	res+0(FP), res_ptr
  1271  	MOVD	in+8(FP), a_ptr
  1272  
        	// const0/const1 supply the two non-trivial limbs of p to p256AddInline
        	// and to the halving step below.
  1273  	MOVD	p256const0<>(SB), const0
  1274  	MOVD	p256const1<>(SB), const1
  1275  
  1276  	// Begin point double
        	// zsqr = z1^2: the value at offset 64 is squared and the result, which
        	// comes back in y0..y3, is saved to the zsqr slot.
  1277  	LDP	4*16(a_ptr), (x0, x1)
  1278  	LDP	5*16(a_ptr), (x2, x3)
  1279  	CALL	p256SqrInternal<>(SB)
  1280  	STP	(y0, y1), zsqr(0*8)
  1281  	STP	(y2, y3), zsqr(2*8)
  1282  
        	// m = x1 + z1^2: y0..y3 still hold z1^2, x0..x3 are loaded from offset 0.
  1283  	LDP	0*16(a_ptr), (x0, x1)
  1284  	LDP	1*16(a_ptr), (x2, x3)
  1285  	p256AddInline
  1286  	STx(m)
  1287  
        	// z3 = 2 * y1 * z1
  1288  	LDx(z1in)
  1289  	LDy(y1in)
  1290  	CALL	p256MulInternal<>(SB)
  1291  	p256MulBy2Inline
  1292  	STx(z3out)
  1293  
        	// y = (x1 - z1^2) * (x1 + z1^2)
  1294  	LDy(x1in)
  1295  	LDx(zsqr)
  1296  	CALL	p256SubInternal<>(SB)
  1297  	LDy(m)
  1298  	CALL	p256MulInternal<>(SB)
  1299  
  1300  	// Multiply by 3
        	// x = 2*y, then x += y; the m slot is overwritten with the tripled product.
  1301  	p256MulBy2Inline
  1302  	p256AddInline
  1303  	STx(m)
  1304  
        	// s = (2*y1)^2 = 4*y1^2, then y = s^2 = 16*y1^4
  1305  	LDy(y1in)
  1306  	p256MulBy2Inline
  1307  	CALL	p256SqrInternal<>(SB)
  1308  	STy(s)
  1309  	MOVD	y0, x0
  1310  	MOVD	y1, x1
  1311  	MOVD	y2, x2
  1312  	MOVD	y3, x3
  1313  	CALL	p256SqrInternal<>(SB)
  1314  
  1315  	// Divide by 2
        	// t = y + p, with the carry-out (bit 256) captured in hlp0.
  1316  	ADDS	$-1, y0, t0
  1317  	ADCS	const0, y1, t1
  1318  	ADCS	$0, y2, t2
  1319  	ADCS	const1, y3, t3
  1320  	ADC	$0, ZR, hlp0
  1321  
        	// If y is even, halve y itself; if y is odd, halve y + p, which is even
        	// because p is odd. ANDing hlp0 with y0 keeps the carry bit only in the
        	// odd case, where t = y + p was selected.
  1322  	ANDS	$1, y0, ZR
  1323  	CSEL	EQ, y0, t0, t0
  1324  	CSEL	EQ, y1, t1, t1
  1325  	CSEL	EQ, y2, t2, t2
  1326  	CSEL	EQ, y3, t3, t3
  1327  	AND	y0, hlp0, hlp0
  1328  
        	// Shift hlp0:t3:t2:t1:t0 right by one bit into y0..y3. The y3out slot
        	// temporarily holds this value (8*y1^4) until the final subtraction.
  1329  	EXTR	$1, t0, t1, y0
  1330  	EXTR	$1, t1, t2, y1
  1331  	EXTR	$1, t2, t3, y2
  1332  	EXTR	$1, t3, hlp0, y3
  1333  	STy(y3out)
  1334  
        	// s = x1 * 4*y1^2, tmp = 2*s
  1335  	LDx(x1in)
  1336  	LDy(s)
  1337  	CALL	p256MulInternal<>(SB)
  1338  	STy(s)
  1339  	p256MulBy2Inline
  1340  	STx(tmp)
  1341  
        	// x3 = m^2 - 2*s
  1342  	LDx(m)
  1343  	CALL	p256SqrInternal<>(SB)
  1344  	LDx(tmp)
  1345  	CALL	p256SubInternal<>(SB)
  1346  
  1347  	STx(x3out)
  1348  
        	// y3 = m * (s - x3) - 8*y1^4; x0..x3 still hold x3 for the first subtraction.
  1349  	LDy(s)
  1350  	CALL	p256SubInternal<>(SB)
  1351  
  1352  	LDy(m)
  1353  	CALL	p256MulInternal<>(SB)
  1354  
  1355  	LDx(y3out)
  1356  	CALL	p256SubInternal<>(SB)
  1357  	STx(y3out)
  1358  	RET
  1359  /* ---------------------------------------*/
        // Re-point the second-input accessor (y2in) and the output accessors
        // (x3out/y3out/z3out) at b_ptr for p256PointAddAsm.
        //
        // y2in and y3out expand to the same operand, (off + 32)(b_ptr). Which object
        // that addresses depends on what b_ptr holds at the point of use:
        // p256PointAddAsm enters with in2 in b_ptr and reloads b_ptr with res before
        // its first store through an *out accessor.
  1360  #undef y2in
  1361  #undef x3out
  1362  #undef y3out
  1363  #undef z3out
  1364  #define y2in(off) (off + 32)(b_ptr)
  1365  #define x3out(off) (off)(b_ptr)
  1366  #define y3out(off) (off + 32)(b_ptr)
  1367  #define z3out(off) (off + 64)(b_ptr)
  1368  // func p256PointAddAsm(res, in1, in2 *P256Point) int
        //
        // Adds the points at in1 and in2 and stores the sum through res.
        //
        // Return value (written to ret+24(FP)): 1 when both r = s2 - s1 and
        // h = u2 - u1 are zero mod p - each is compared against 0 and against p -
        // and 0 otherwise. The x3/y3/z3 formulas are evaluated unconditionally; this
        // routine only reports that case, it does not treat it specially.
        //
        // Register notes:
        //   - hlp1 is the same register as res_ptr (R0), so res is not kept in a
        //     register here; it is reloaded into b_ptr just before the first output
        //     store, after the last read through the in2 pointer.
        //   - hlp1 carries the "r is zero" flag across many CALLs. That relies on
        //     the internal helpers leaving R0 alone; their bodies are outside this
        //     part of the file - TODO confirm.
        //
        // The step comments assume the helper conventions that their use in this
        // file suggests:
        //   p256SqrInternal:  y = x^2        p256MulInternal:  y = x * y
        //   p256SubInternal:  x = y - x      p256MulBy2Inline: x = 2 * y
        //   LDx/LDy/STx/STy:  load/store the x0..x3 / y0..y3 register groups
        // Those definitions, the x1in/y1in/z1in/x2in/z2in accessors and the stack
        // slots (z1sqr, z2sqr, s1, s2, u1, u2, r, h, rsqr, hsqr, hcub) are defined
        // outside this part of the file - TODO confirm against them.
  1369  TEXT ·p256PointAddAsm(SB),0,$392-32
  1370  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1371  	// Move input to stack in order to free registers
        	// NOTE(review): the instructions that follow only load the two input
        	// pointers; no copy of the inputs to the stack is visible in this
        	// function. The comment above may be stale - confirm.
  1372  	MOVD	in1+8(FP), a_ptr
  1373  	MOVD	in2+16(FP), b_ptr
  1374  
        	// const0/const1 supply the two non-trivial limbs of p to the
        	// equal-to-p checks below.
  1375  	MOVD	p256const0<>(SB), const0
  1376  	MOVD	p256const1<>(SB), const1
  1377  
  1378  	// Begin point add
  1379  	LDx(z2in)
  1380  	CALL	p256SqrInternal<>(SB)    // z2^2
  1381  	STy(z2sqr)
  1382  
        	// x0..x3 are not reloaded here: the multiply reuses z2 from before the
        	// squaring together with z2^2 in y0..y3.
  1383  	CALL	p256MulInternal<>(SB)    // z2^3
  1384  
  1385  	LDx(y1in)
  1386  	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
  1387  	STy(s1)
  1388  
  1389  	LDx(z1in)
  1390  	CALL	p256SqrInternal<>(SB)    // z1^2
  1391  	STy(z1sqr)
  1392  
  1393  	CALL	p256MulInternal<>(SB)    // z1^3
  1394  
        	// s2 is left in y0..y3 and consumed by the subtraction; it is not
        	// stored at this point.
  1395  	LDx(y2in)
  1396  	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2
  1397  
  1398  	LDx(s1)
  1399  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1400  	STx(r)
  1401  
        	// hlp1 = 1 if r == 0 (all four limbs OR to zero), else 0.
  1402  	MOVD	$1, t2
  1403  	ORR	x0, x1, t0             // Check if zero mod p256
  1404  	ORR	x2, x3, t1
  1405  	ORR	t1, t0, t0
  1406  	CMP	$0, t0
  1407  	CSEL	EQ, t2, ZR, hlp1
  1408  
        	// Also accept r == p: XOR each limb with the matching limb of p
        	// ($-1, const0, 0, const1 from least to most significant; x2 is used
        	// as-is because that limb of p is 0) and OR the results together.
  1409  	EOR	$-1, x0, t0
  1410  	EOR	const0, x1, t1
  1411  	EOR	const1, x3, t3
  1412  
  1413  	ORR	t0, t1, t0
  1414  	ORR	x2, t3, t1
  1415  	ORR	t1, t0, t0
  1416  	CMP	$0, t0
  1417  	CSEL	EQ, t2, hlp1, hlp1
  1418  
  1419  	LDx(z2sqr)
  1420  	LDy(x1in)
  1421  	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
  1422  	STy(u1)
  1423  
  1424  	LDx(z1sqr)
  1425  	LDy(x2in)
  1426  	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1427  	STy(u2)
  1428  
  1429  	LDx(u1)
  1430  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1431  	STx(h)
  1432  
        	// hlp0 = 1 if h == 0 or h == p, using the same two tests as for r.
  1433  	MOVD	$1, t2
  1434  	ORR	x0, x1, t0             // Check if zero mod p256
  1435  	ORR	x2, x3, t1
  1436  	ORR	t1, t0, t0
  1437  	CMP	$0, t0
  1438  	CSEL	EQ, t2, ZR, hlp0
  1439  
  1440  	EOR	$-1, x0, t0
  1441  	EOR	const0, x1, t1
  1442  	EOR	const1, x3, t3
  1443  
  1444  	ORR	t0, t1, t0
  1445  	ORR	x2, t3, t1
  1446  	ORR	t1, t0, t0
  1447  	CMP	$0, t0
  1448  	CSEL	EQ, t2, hlp0, hlp0
  1449  
        	// hlp1 = (r is zero) AND (h is zero); this becomes the return value.
  1450  	AND	hlp0, hlp1, hlp1
  1451  
  1452  	LDx(r)
  1453  	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1454  	STy(rsqr)
  1455  
  1456  	LDx(h)
  1457  	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1458  	STy(hsqr)
  1459  
  1460  	LDx(h)
  1461  	CALL	p256MulInternal<>(SB)    // hcub = h^3
  1462  	STy(hcub)
  1463  
        	// s1 * h^3 (y0..y3 still hold hcub). The s2 slot is reused for this
        	// product.
  1464  	LDx(s1)
  1465  	CALL	p256MulInternal<>(SB)
  1466  	STy(s2)
  1467  
  1468  	LDx(z1in)
  1469  	LDy(z2in)
  1470  	CALL	p256MulInternal<>(SB)    // z1 * z2
  1471  	LDx(h)
  1472  	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
        	// From here on b_ptr points at res, so x3out/y3out/z3out address the
        	// result. z2in above was the last access through the in2 pointer.
  1473  	MOVD	res+0(FP), b_ptr
  1474  	STy(z3out)
  1475  
        	// The u2 slot is reused to hold u1 * h^2.
  1476  	LDx(hsqr)
  1477  	LDy(u1)
  1478  	CALL	p256MulInternal<>(SB)    // h^2 * u1
  1479  	STy(u2)
  1480  
  1481  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1482  	LDy(rsqr)
  1483  	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1484  
        	// x3 = (r^2 - 2*u1*h^2) - h^3: move the difference into y0..y3 so it
        	// becomes the minuend of the next subtraction.
  1485  	MOVD	x0, y0
  1486  	MOVD	x1, y1
  1487  	MOVD	x2, y2
  1488  	MOVD	x3, y3
  1489  	LDx(hcub)
  1490  	CALL	p256SubInternal<>(SB)
  1491  	STx(x3out)
  1492  
        	// y3 = r * (u1*h^2 - x3) - s1*h^3; x0..x3 still hold x3 for the first
        	// subtraction.
  1493  	LDy(u2)
  1494  	CALL	p256SubInternal<>(SB)
  1495  
  1496  	LDy(r)
  1497  	CALL	p256MulInternal<>(SB)
  1498  
  1499  	LDx(s2)
  1500  	CALL	p256SubInternal<>(SB)
  1501  	STx(y3out)
  1502  
        	// hlp1 is R0 itself, so the first MOVD is a register-to-itself move;
        	// the second stores the flag as the function result.
  1503  	MOVD	hlp1, R0
  1504  	MOVD	R0, ret+24(FP)
  1505  
  1506  	RET
  1507
View as plain text