Text file src/crypto/internal/nistec/p256_asm_arm64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  // This file contains constant-time, 64-bit assembly implementation of
     8  // P256. The optimizations performed here are described in detail in:
     9  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
    10  //                          256-bit primes"
    11  // http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    12  // https://eprint.iacr.org/2013/816.pdf
    13  
    14  #include "textflag.h"
    15  
        // Register aliases shared by the routines in this file.
        //   res_ptr, a_ptr, b_ptr  pointer arguments; b_ptr also serves as the loop
        //                          counter in p256Sqr and p256OrdSqr.
        //   acc0..acc7             accumulator limbs, least-significant first.
        //   t0..t3                 scratch.
        //   const0, const1         hold p256const0/p256const1 for the field helpers,
        //                          or the two low limbs of p256ord in the Ord
        //                          routines; p256Select reuses them for idx and the
        //                          entry counter.
        //   hlp0, hlp1             extra scratch. hlp1 is the SAME register as
        //                          res_ptr (R0), so routines that use hlp1 load res
        //                          from the frame only when they are ready to store.
        //   x0..x3, y0..y3         256-bit operands of the *Internal helpers.
        //   const2, const3         alias t2/t3; while they hold constants, t2 and t3
        //                          cannot be used as scratch.
        // Note: x0..x3 are macro names for R19..R22, not the AArch64 registers X0..X3.
    16  #define res_ptr R0
    17  #define a_ptr R1
    18  #define b_ptr R2
    19  
    20  #define acc0 R3
    21  #define acc1 R4
    22  #define acc2 R5
    23  #define acc3 R6
    24  
    25  #define acc4 R7
    26  #define acc5 R8
    27  #define acc6 R9
    28  #define acc7 R10
    29  #define t0 R11
    30  #define t1 R12
    31  #define t2 R13
    32  #define t3 R14
    33  #define const0 R15
    34  #define const1 R16
    35  
    36  #define hlp0 R17
    37  #define hlp1 res_ptr
    38  
    39  #define x0 R19
    40  #define x1 R20
    41  #define x2 R21
    42  #define x3 R22
    43  #define y0 R23
    44  #define y1 R24
    45  #define y2 R25
    46  #define y3 R26
    47  
    48  #define const2 t2
    49  #define const3 t3
    50  
        // Read-only constants. Multi-limb values are stored least-significant
        // 64-bit limb first.
        //   p256const0, p256const1  limbs 1 and 3 of the field prime p. Limb 0 is all
        //                           ones and limb 2 is zero, so the code uses $-1 and
        //                           $0/ZR for those (see "acc = poly" in p256NegCond).
        //   p256ordK0               per-limb multiplier used by the reduction steps of
        //                           p256OrdSqr and p256OrdMul (presumably
        //                           -1/p256ord mod 2^64 -- confirm).
        //   p256ord                 the modulus that p256OrdSqr/p256OrdMul reduce by.
        //   p256one                 2^256 - p, i.e. the value 1 in the Montgomery
        //                           domain; p256PointAddAffineAsm stores it as z3 = 1.
        // The GLOBL flag value 8 is RODATA in textflag.h.
    51  DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    52  DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
    53  DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
    54  DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    55  DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    56  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    57  DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
    58  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    59  DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    60  DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    61  DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
    62  GLOBL p256const0<>(SB), 8, $8
    63  GLOBL p256const1<>(SB), 8, $8
    64  GLOBL p256ordK0<>(SB), 8, $8
    65  GLOBL p256ord<>(SB), 8, $32
    66  GLOBL p256one<>(SB), 8, $32
    67  
    68  /* ---------------------------------------*/
    69  // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
        // Tail-jumps to p256BigToLittle. That routine reverses all 32 bytes, so the
        // same code performs the conversion in either direction; the arguments stay
        // where the caller put them for the jump target to read.
    70  TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
    71  	JMP	·p256BigToLittle(SB)
    72  /* ---------------------------------------*/
    73  // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
        // Tail-jumps to p256BigToLittle, which does the 32-byte reversal; res and in
        // are read by the jump target from the unchanged argument frame.
    74  TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
    75  	JMP	·p256BigToLittle(SB)
    76  /* ---------------------------------------*/
    77  // func p256LittleToBig(res *[32]byte, in *p256Element)
        // Tail-jumps to p256BigToLittle: reversing all 32 bytes is its own inverse,
        // so one implementation serves both directions.
    78  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    79  	JMP	·p256BigToLittle(SB)
    80  /* ---------------------------------------*/
    81  // func p256BigToLittle(res *p256Element, in *[32]byte)
        // Writes to res the 32 bytes at in with their order fully reversed.
        // All four words are loaded before anything is stored, so res may equal in.
        // Also the jump target of p256OrdLittleToBig, p256OrdBigToLittle and
        // p256LittleToBig.
    82  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    83  	MOVD	res+0(FP), res_ptr
    84  	MOVD	in+8(FP), a_ptr
    85  
    86  	LDP	0*16(a_ptr), (acc0, acc1)
    87  	LDP	1*16(a_ptr), (acc2, acc3)
    88  
        	// Reverse the bytes inside each 64-bit word ...
    89  	REV	acc0, acc0
    90  	REV	acc1, acc1
    91  	REV	acc2, acc2
    92  	REV	acc3, acc3
    93  
        	// ... and store the words in the opposite order, which completes the
        	// 32-byte reversal.
    94  	STP	(acc3, acc2), 0*16(res_ptr)
    95  	STP	(acc1, acc0), 1*16(res_ptr)
    96  	RET
    97  /* ---------------------------------------*/
    98  // func p256MovCond(res, a, b *P256Point, cond int)
    99  // If cond == 0 res=b, else res=a
        // Copies 96 bytes (six 16-byte pairs) in two halves of 48 bytes. Both a and b
        // are always loaded and the choice is made with CSEL, so there is no branch
        // on cond. The flags are set once by the CMP below and are still valid for
        // every CSEL because LDP, CSEL and STP do not modify them.
        // Temporaries: R3..R9, R16, R17, R19..R22.
   100  TEXT ·p256MovCond(SB),NOSPLIT,$0
   101  	MOVD	res+0(FP), res_ptr
   102  	MOVD	a+8(FP), a_ptr
   103  	MOVD	b+16(FP), b_ptr
   104  	MOVD	cond+24(FP), R3
   105  
   106  	CMP	$0, R3
   107  	// Two remarks:
   108  	// 1) Will want to revisit NEON, when support is better
   109  	// 2) CSEL might not be constant time on all ARM processors
        	// First 48 bytes: R4..R9 = a, R16..R22 = b, then select into R4..R9.
   110  	LDP	0*16(a_ptr), (R4, R5)
   111  	LDP	1*16(a_ptr), (R6, R7)
   112  	LDP	2*16(a_ptr), (R8, R9)
   113  	LDP	0*16(b_ptr), (R16, R17)
   114  	LDP	1*16(b_ptr), (R19, R20)
   115  	LDP	2*16(b_ptr), (R21, R22)
   116  	CSEL	EQ, R16, R4, R4
   117  	CSEL	EQ, R17, R5, R5
   118  	CSEL	EQ, R19, R6, R6
   119  	CSEL	EQ, R20, R7, R7
   120  	CSEL	EQ, R21, R8, R8
   121  	CSEL	EQ, R22, R9, R9
   122  	STP	(R4, R5), 0*16(res_ptr)
   123  	STP	(R6, R7), 1*16(res_ptr)
   124  	STP	(R8, R9), 2*16(res_ptr)
   125  
        	// Second 48 bytes, same pattern at offsets 48..95.
   126  	LDP	3*16(a_ptr), (R4, R5)
   127  	LDP	4*16(a_ptr), (R6, R7)
   128  	LDP	5*16(a_ptr), (R8, R9)
   129  	LDP	3*16(b_ptr), (R16, R17)
   130  	LDP	4*16(b_ptr), (R19, R20)
   131  	LDP	5*16(b_ptr), (R21, R22)
   132  	CSEL	EQ, R16, R4, R4
   133  	CSEL	EQ, R17, R5, R5
   134  	CSEL	EQ, R19, R6, R6
   135  	CSEL	EQ, R20, R7, R7
   136  	CSEL	EQ, R21, R8, R8
   137  	CSEL	EQ, R22, R9, R9
   138  	STP	(R4, R5), 3*16(res_ptr)
   139  	STP	(R6, R7), 4*16(res_ptr)
   140  	STP	(R8, R9), 5*16(res_ptr)
   141  
   142  	RET
   143  /* ---------------------------------------*/
   144  // func p256NegCond(val *p256Element, cond int)
        // In place: if cond != 0, val = p - val; if cond == 0, val is rewritten
        // unchanged. The subtraction is always performed and the result is chosen
        // with CSEL, so there is no branch on cond. No reduction follows the
        // subtraction (val == 0 yields p itself).
   145  TEXT ·p256NegCond(SB),NOSPLIT,$0
   146  	MOVD	val+0(FP), a_ptr
   147  	MOVD	cond+8(FP), hlp0
   148  	MOVD	a_ptr, res_ptr
   149  	// acc = poly
        	// The four limbs of p, least-significant first: -1, p256const0, 0, p256const1.
   150  	MOVD	$-1, acc0
   151  	MOVD	p256const0<>(SB), acc1
   152  	MOVD	$0, acc2
   153  	MOVD	p256const1<>(SB), acc3
   154  	// Load the original value
   155  	LDP	0*16(a_ptr), (t0, t1)
   156  	LDP	1*16(a_ptr), (t2, t3)
   157  	// Speculatively subtract
        	// acc = acc - t, borrow propagated limb to limb; the last step is a
        	// plain SBC because the final borrow is not needed.
   158  	SUBS	t0, acc0
   159  	SBCS	t1, acc1
   160  	SBCS	t2, acc2
   161  	SBC	t3, acc3
   162  	// If condition is 0, keep original value
   163  	CMP	$0, hlp0
   164  	CSEL	EQ, t0, acc0, acc0
   165  	CSEL	EQ, t1, acc1, acc1
   166  	CSEL	EQ, t2, acc2, acc2
   167  	CSEL	EQ, t3, acc3, acc3
   168  	// Store result
   169  	STP	(acc0, acc1), 0*16(res_ptr)
   170  	STP	(acc2, acc3), 1*16(res_ptr)
   171  
   172  	RET
   173  /* ---------------------------------------*/
   174  // func p256Sqr(res, in *p256Element, n int)
        // Applies p256SqrInternal n times, starting from in, and stores the final
        // value in res. n must be at least 1: the counter is decremented before it is
        // tested, so n == 0 would wrap around instead of skipping the loop.
        // const0/const1 are loaded once here because p256SqrInternal expects them.
        // p256SqrInternal reads x0..x3 and returns in y0..y3, and does not touch
        // res_ptr or b_ptr, so both stay live across the CALL.
   175  TEXT ·p256Sqr(SB),NOSPLIT,$0
   176  	MOVD	res+0(FP), res_ptr
   177  	MOVD	in+8(FP), a_ptr
   178  	MOVD	n+16(FP), b_ptr
   179  
   180  	MOVD	p256const0<>(SB), const0
   181  	MOVD	p256const1<>(SB), const1
   182  
   183  	LDP	0*16(a_ptr), (x0, x1)
   184  	LDP	1*16(a_ptr), (x2, x3)
   185  
   186  sqrLoop:
   187  	SUB	$1, b_ptr
   188  	CALL	p256SqrInternal<>(SB)
        	// Feed the result back as the next input.
   189  	MOVD	y0, x0
   190  	MOVD	y1, x1
   191  	MOVD	y2, x2
   192  	MOVD	y3, x3
   193  	CBNZ	b_ptr, sqrLoop
   194  
   195  	STP	(y0, y1), 0*16(res_ptr)
   196  	STP	(y2, y3), 1*16(res_ptr)
   197  	RET
   198  /* ---------------------------------------*/
   199  // func p256Mul(res, in1, in2 *p256Element)
        // Loads in1 into x0..x3 and in2 into y0..y3, calls p256MulInternal and stores
        // its result (returned in y0..y3) to res. Both inputs are fully loaded before
        // the store, so res may alias either input.
   200  TEXT ·p256Mul(SB),NOSPLIT,$0
   201  	MOVD	res+0(FP), res_ptr
   202  	MOVD	in1+8(FP), a_ptr
   203  	MOVD	in2+16(FP), b_ptr
   204  
        	// p256MulInternal expects these two limbs of p in const0/const1.
   205  	MOVD	p256const0<>(SB), const0
   206  	MOVD	p256const1<>(SB), const1
   207  
   208  	LDP	0*16(a_ptr), (x0, x1)
   209  	LDP	1*16(a_ptr), (x2, x3)
   210  
   211  	LDP	0*16(b_ptr), (y0, y1)
   212  	LDP	1*16(b_ptr), (y2, y3)
   213  
   214  	CALL	p256MulInternal<>(SB)
   215  
   216  	STP	(y0, y1), 0*16(res_ptr)
   217  	STP	(y2, y3), 1*16(res_ptr)
   218  	RET
   219  /* ---------------------------------------*/
   220  // func p256FromMont(res, in *p256Element)
        // Runs the four reduction steps of the Montgomery multiplication on in without
        // any multiplication by a second operand, then subtracts p once if the result
        // is not below p. The effect is res = in * 2^-256 mod p, i.e. conversion out
        // of the Montgomery domain.
        //
        // Each step adds (lowest limb) * p to the accumulator. With
        // p = 2^256 - 2^224 + 2^192 + 2^96 - 1 the lowest limb cancels, the
        // 2^96 term is the <<32 / LSR $32 pair, and the three top terms are the
        // product with const1 = 2^64 - 2^32 + 1. The register of the cancelled limb
        // is reused for the new top limb, so the limb order rotates every step.
   221  TEXT ·p256FromMont(SB),NOSPLIT,$0
   222  	MOVD	res+0(FP), res_ptr
   223  	MOVD	in+8(FP), a_ptr
   224  
   225  	MOVD	p256const0<>(SB), const0
   226  	MOVD	p256const1<>(SB), const1
   227  
   228  	LDP	0*16(a_ptr), (acc0, acc1)
   229  	LDP	1*16(a_ptr), (acc2, acc3)
   230  	// Only reduce, no multiplications are needed
   231  	// First reduction step
   232  	ADDS	acc0<<32, acc1, acc1
   233  	LSR	$32, acc0, t0
   234  	MUL	acc0, const1, t1
   235  	UMULH	acc0, const1, acc0
   236  	ADCS	t0, acc2
   237  	ADCS	t1, acc3
   238  	ADC	$0, acc0
   239  	// Second reduction step
   240  	ADDS	acc1<<32, acc2, acc2
   241  	LSR	$32, acc1, t0
   242  	MUL	acc1, const1, t1
   243  	UMULH	acc1, const1, acc1
   244  	ADCS	t0, acc3
   245  	ADCS	t1, acc0
   246  	ADC	$0, acc1
   247  	// Third reduction step
   248  	ADDS	acc2<<32, acc3, acc3
   249  	LSR	$32, acc2, t0
   250  	MUL	acc2, const1, t1
   251  	UMULH	acc2, const1, acc2
   252  	ADCS	t0, acc0
   253  	ADCS	t1, acc1
   254  	ADC	$0, acc2
   255  	// Last reduction step
   256  	ADDS	acc3<<32, acc0, acc0
   257  	LSR	$32, acc3, t0
   258  	MUL	acc3, const1, t1
   259  	UMULH	acc3, const1, acc3
   260  	ADCS	t0, acc1
   261  	ADCS	t1, acc2
   262  	ADC	$0, acc3
   263  
        	// t = acc - p. Carry set afterwards means no borrow, i.e. acc >= p.
   264  	SUBS	$-1, acc0, t0
   265  	SBCS	const0, acc1, t1
   266  	SBCS	$0, acc2, t2
   267  	SBCS	const1, acc3, t3
   268  
        	// Keep the subtracted value only when there was no borrow.
   269  	CSEL	CS, t0, acc0, acc0
   270  	CSEL	CS, t1, acc1, acc1
   271  	CSEL	CS, t2, acc2, acc2
   272  	CSEL	CS, t3, acc3, acc3
   273  
   274  	STP	(acc0, acc1), 0*16(res_ptr)
   275  	STP	(acc2, acc3), 1*16(res_ptr)
   276  
   277  	RET
   278  /* ---------------------------------------*/
   279  // func p256Select(res *P256Point, table *p256Table, idx int)
        // Copies one 96-byte table entry to res. The loop visits all 16 entries and
        // loads every one of them; an entry is kept (via CSEL) only when the
        // 1-based counter equals idx. There is no branch on idx and the memory
        // access pattern does not depend on it. idx == 0, or any value above 16,
        // matches nothing and leaves res all zero.
        // Register use: const0 = idx, const1 = counter, x/y/t = the 12 result words.
   280  TEXT ·p256Select(SB),NOSPLIT,$0
   281  	MOVD	idx+16(FP), const0
   282  	MOVD	table+8(FP), b_ptr
   283  	MOVD	res+0(FP), res_ptr
   284  
        	// Clear the result registers.
   285  	EOR	x0, x0, x0
   286  	EOR	x1, x1, x1
   287  	EOR	x2, x2, x2
   288  	EOR	x3, x3, x3
   289  	EOR	y0, y0, y0
   290  	EOR	y1, y1, y1
   291  	EOR	y2, y2, y2
   292  	EOR	y3, y3, y3
   293  	EOR	t0, t0, t0
   294  	EOR	t1, t1, t1
   295  	EOR	t2, t2, t2
   296  	EOR	t3, t3, t3
   297  
   298  	MOVD	$0, const1
   299  
   300  loop_select:
        		// The counter is incremented before the compare, so it runs 1..16.
        		// The flags from this CMP feed all twelve CSELs below; the LDP.P
        		// post-indexed loads advance b_ptr by 16 bytes each and leave the
        		// flags alone.
   301  		ADD	$1, const1
   302  		CMP	const0, const1
   303  		LDP.P	16(b_ptr), (acc0, acc1)
   304  		CSEL	EQ, acc0, x0, x0
   305  		CSEL	EQ, acc1, x1, x1
   306  		LDP.P	16(b_ptr), (acc2, acc3)
   307  		CSEL	EQ, acc2, x2, x2
   308  		CSEL	EQ, acc3, x3, x3
   309  		LDP.P	16(b_ptr), (acc4, acc5)
   310  		CSEL	EQ, acc4, y0, y0
   311  		CSEL	EQ, acc5, y1, y1
   312  		LDP.P	16(b_ptr), (acc6, acc7)
   313  		CSEL	EQ, acc6, y2, y2
   314  		CSEL	EQ, acc7, y3, y3
   315  		LDP.P	16(b_ptr), (acc0, acc1)
   316  		CSEL	EQ, acc0, t0, t0
   317  		CSEL	EQ, acc1, t1, t1
   318  		LDP.P	16(b_ptr), (acc2, acc3)
   319  		CSEL	EQ, acc2, t2, t2
   320  		CSEL	EQ, acc3, t3, t3
   321  
        		// Loop bound is the fixed table size, not idx.
   322  		CMP	$16, const1
   323  		BNE	loop_select
   324  
   325  	STP	(x0, x1), 0*16(res_ptr)
   326  	STP	(x2, x3), 1*16(res_ptr)
   327  	STP	(y0, y1), 2*16(res_ptr)
   328  	STP	(y2, y3), 3*16(res_ptr)
   329  	STP	(t0, t1), 4*16(res_ptr)
   330  	STP	(t2, t3), 5*16(res_ptr)
   331  	RET
   332  /* ---------------------------------------*/
   333  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
        // Copies one 64-byte table entry to res. All 32 entries are loaded; an entry
        // is kept (via CSEL) only when the 1-based counter equals idx, so neither the
        // control flow nor the memory access pattern depends on idx. idx == 0, or
        // any value above 32, matches nothing and leaves res all zero.
        // Register use: t0 = idx, t1 = table cursor, t2 = counter, x/y = result words.
   334  TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   335  	MOVD	idx+16(FP), t0
   336  	MOVD	table+8(FP), t1
   337  	MOVD	res+0(FP), res_ptr
   338  
        	// Clear the result registers.
   339  	EOR	x0, x0, x0
   340  	EOR	x1, x1, x1
   341  	EOR	x2, x2, x2
   342  	EOR	x3, x3, x3
   343  	EOR	y0, y0, y0
   344  	EOR	y1, y1, y1
   345  	EOR	y2, y2, y2
   346  	EOR	y3, y3, y3
   347  
   348  	MOVD	$0, t2
   349  
   350  loop_select:
        		// Counter runs 1..32. The flags from this CMP feed all eight CSELs;
        		// each LDP.P advances t1 by 16 bytes without touching the flags.
   351  		ADD	$1, t2
   352  		CMP	t0, t2
   353  		LDP.P	16(t1), (acc0, acc1)
   354  		CSEL	EQ, acc0, x0, x0
   355  		CSEL	EQ, acc1, x1, x1
   356  		LDP.P	16(t1), (acc2, acc3)
   357  		CSEL	EQ, acc2, x2, x2
   358  		CSEL	EQ, acc3, x3, x3
   359  		LDP.P	16(t1), (acc4, acc5)
   360  		CSEL	EQ, acc4, y0, y0
   361  		CSEL	EQ, acc5, y1, y1
   362  		LDP.P	16(t1), (acc6, acc7)
   363  		CSEL	EQ, acc6, y2, y2
   364  		CSEL	EQ, acc7, y3, y3
   365  
        		// Loop bound is the fixed table size, not idx.
   366  		CMP	$32, t2
   367  		BNE	loop_select
   368  
   369  	STP	(x0, x1), 0*16(res_ptr)
   370  	STP	(x2, x3), 1*16(res_ptr)
   371  	STP	(y0, y1), 2*16(res_ptr)
   372  	STP	(y2, y3), 3*16(res_ptr)
   373  	RET
   374  /* ---------------------------------------*/
   375  // func p256OrdSqr(res, in *p256OrdElement, n int)
        // Squares in n times with Montgomery reduction by p256ord after every
        // squaring, and stores the final value in res. n must be at least 1: the
        // counter is decremented at the top of the loop and tested at the bottom.
        //
        // Register use inside the loop:
        //   x0..x3          current value (input of each iteration, overwritten at
        //                   its end)
        //   acc0..acc7      512-bit square; acc0..acc3 are then recycled as the
        //                   reduction accumulator
        //   const0..const3  the four limbs of p256ord (const2/const3 are t2/t3, so
        //                   only t0 and t1 are free scratch here)
        //   hlp1            p256ordK0. hlp1 is R0, the same register as res_ptr,
        //                   which is why res is loaded only after the loop.
        //   hlp0, y0        scratch in the reduction steps; y0..y3 then hold the
        //                   trial subtraction.
   376  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
   377  	MOVD	in+8(FP), a_ptr
   378  	MOVD	n+16(FP), b_ptr
   379  
   380  	MOVD	p256ordK0<>(SB), hlp1
   381  	LDP	p256ord<>+0x00(SB), (const0, const1)
   382  	LDP	p256ord<>+0x10(SB), (const2, const3)
   383  
   384  	LDP	0*16(a_ptr), (x0, x1)
   385  	LDP	1*16(a_ptr), (x2, x3)
   386  
   387  ordSqrLoop:
   388  	SUB	$1, b_ptr
   389  
        	// Off-diagonal products x[i]*x[j], i < j, accumulated in acc1..acc6.
   390  	// x[1:] * x[0]
   391  	MUL	x0, x1, acc1
   392  	UMULH	x0, x1, acc2
   393  
   394  	MUL	x0, x2, t0
   395  	ADDS	t0, acc2, acc2
   396  	UMULH	x0, x2, acc3
   397  
   398  	MUL	x0, x3, t0
   399  	ADCS	t0, acc3, acc3
   400  	UMULH	x0, x3, acc4
   401  	ADC	$0, acc4, acc4
   402  	// x[2:] * x[1]
   403  	MUL	x1, x2, t0
   404  	ADDS	t0, acc3
   405  	UMULH	x1, x2, t1
   406  	ADCS	t1, acc4
   407  	ADC	$0, ZR, acc5
   408  
   409  	MUL	x1, x3, t0
   410  	ADDS	t0, acc4
   411  	UMULH	x1, x3, t1
   412  	ADC	t1, acc5
   413  	// x[3] * x[2]
   414  	MUL	x2, x3, t0
   415  	ADDS	t0, acc5
   416  	UMULH	x2, x3, acc6
   417  	ADC	$0, acc6
   418  
   419  	MOVD	$0, acc7
        	// Each off-diagonal product occurs twice in the square; the carry out
        	// of the doubling lands in acc7.
   420  	// *2
   421  	ADDS	acc1, acc1
   422  	ADCS	acc2, acc2
   423  	ADCS	acc3, acc3
   424  	ADCS	acc4, acc4
   425  	ADCS	acc5, acc5
   426  	ADCS	acc6, acc6
   427  	ADC	$0, acc7
        	// Add the diagonal products x[i]*x[i].
   428  	// Missing products
   429  	MUL	x0, x0, acc0
   430  	UMULH	x0, x0, t0
   431  	ADDS	t0, acc1, acc1
   432  
   433  	MUL	x1, x1, t0
   434  	ADCS	t0, acc2, acc2
   435  	UMULH	x1, x1, t1
   436  	ADCS	t1, acc3, acc3
   437  
   438  	MUL	x2, x2, t0
   439  	ADCS	t0, acc4, acc4
   440  	UMULH	x2, x2, t1
   441  	ADCS	t1, acc5, acc5
   442  
   443  	MUL	x3, x3, t0
   444  	ADCS	t0, acc6, acc6
   445  	UMULH	x3, x3, t1
   446  	ADC	t1, acc7, acc7
        	// Each reduction step computes hlp0 = (lowest limb) * K0 and adds
        	// hlp0 * ord to the accumulator: low halves of the four products in a
        	// first carry chain, high halves in a second one. The register of the
        	// lowest limb is reused for the new top limb.
        	// NOTE(review): the first MUL of each step multiplies const0 by hlp1
        	// (K0), not by hlp0. Its sum into the lowest limb is never read again
        	// (that register is overwritten by the UMULH const2 below); only the
        	// carry is used. Presumably ord[0]*K0 is all ones mod 2^64, which
        	// would make this carry equal to the carry of the true low-limb sum
        	// -- confirm.
   447  	// First reduction step
   448  	MUL	acc0, hlp1, hlp0
   449  
   450  	MUL	const0, hlp1, t0
   451  	ADDS	t0, acc0, acc0
   452  	UMULH	const0, hlp0, t1
   453  
   454  	MUL	const1, hlp0, t0
   455  	ADCS	t0, acc1, acc1
   456  	UMULH	const1, hlp0, y0
   457  
   458  	MUL	const2, hlp0, t0
   459  	ADCS	t0, acc2, acc2
   460  	UMULH	const2, hlp0, acc0
   461  
   462  	MUL	const3, hlp0, t0
   463  	ADCS	t0, acc3, acc3
   464  
   465  	UMULH	const3, hlp0, hlp0
   466  	ADC	$0, hlp0
   467  
   468  	ADDS	t1, acc1, acc1
   469  	ADCS	y0, acc2, acc2
   470  	ADCS	acc0, acc3, acc3
   471  	ADC	$0, hlp0, acc0
   472  	// Second reduction step
   473  	MUL	acc1, hlp1, hlp0
   474  
   475  	MUL	const0, hlp1, t0
   476  	ADDS	t0, acc1, acc1
   477  	UMULH	const0, hlp0, t1
   478  
   479  	MUL	const1, hlp0, t0
   480  	ADCS	t0, acc2, acc2
   481  	UMULH	const1, hlp0, y0
   482  
   483  	MUL	const2, hlp0, t0
   484  	ADCS	t0, acc3, acc3
   485  	UMULH	const2, hlp0, acc1
   486  
   487  	MUL	const3, hlp0, t0
   488  	ADCS	t0, acc0, acc0
   489  
   490  	UMULH	const3, hlp0, hlp0
   491  	ADC	$0, hlp0
   492  
   493  	ADDS	t1, acc2, acc2
   494  	ADCS	y0, acc3, acc3
   495  	ADCS	acc1, acc0, acc0
   496  	ADC	$0, hlp0, acc1
   497  	// Third reduction step
   498  	MUL	acc2, hlp1, hlp0
   499  
   500  	MUL	const0, hlp1, t0
   501  	ADDS	t0, acc2, acc2
   502  	UMULH	const0, hlp0, t1
   503  
   504  	MUL	const1, hlp0, t0
   505  	ADCS	t0, acc3, acc3
   506  	UMULH	const1, hlp0, y0
   507  
   508  	MUL	const2, hlp0, t0
   509  	ADCS	t0, acc0, acc0
   510  	UMULH	const2, hlp0, acc2
   511  
   512  	MUL	const3, hlp0, t0
   513  	ADCS	t0, acc1, acc1
   514  
   515  	UMULH	const3, hlp0, hlp0
   516  	ADC	$0, hlp0
   517  
   518  	ADDS	t1, acc3, acc3
   519  	ADCS	y0, acc0, acc0
   520  	ADCS	acc2, acc1, acc1
   521  	ADC	$0, hlp0, acc2
   522  
   523  	// Last reduction step
   524  	MUL	acc3, hlp1, hlp0
   525  
   526  	MUL	const0, hlp1, t0
   527  	ADDS	t0, acc3, acc3
   528  	UMULH	const0, hlp0, t1
   529  
   530  	MUL	const1, hlp0, t0
   531  	ADCS	t0, acc0, acc0
   532  	UMULH	const1, hlp0, y0
   533  
   534  	MUL	const2, hlp0, t0
   535  	ADCS	t0, acc1, acc1
   536  	UMULH	const2, hlp0, acc3
   537  
   538  	MUL	const3, hlp0, t0
   539  	ADCS	t0, acc2, acc2
   540  
   541  	UMULH	const3, hlp0, hlp0
        	// Unlike the three steps above, the carry goes into acc7 rather than
        	// hlp0. hlp0 becomes acc3 just below and acc7 is added to acc3 in the
        	// final sum, so the carry ends up at the same limb.
   542  	ADC	$0, acc7
   543  
   544  	ADDS	t1, acc0, acc0
   545  	ADCS	y0, acc1, acc1
   546  	ADCS	acc3, acc2, acc2
   547  	ADC	$0, hlp0, acc3
   548  
        	// Add the upper half of the square (acc4..acc7); acc4 is then reused
        	// as the carry-out limb.
   549  	ADDS	acc4, acc0, acc0
   550  	ADCS	acc5, acc1, acc1
   551  	ADCS	acc6, acc2, acc2
   552  	ADCS	acc7, acc3, acc3
   553  	ADC	$0, ZR, acc4
   554  
        	// y = acc - ord, with the borrow extended through the carry limb.
   555  	SUBS	const0, acc0, y0
   556  	SBCS	const1, acc1, y1
   557  	SBCS	const2, acc2, y2
   558  	SBCS	const3, acc3, y3
   559  	SBCS	$0, acc4, acc4
   560  
        	// No borrow (carry set): keep the subtracted value. The selection is
        	// written straight into x0..x3 as the next iteration's input.
   561  	CSEL	CS, y0, acc0, x0
   562  	CSEL	CS, y1, acc1, x1
   563  	CSEL	CS, y2, acc2, x2
   564  	CSEL	CS, y3, acc3, x3
   565  
   566  	CBNZ	b_ptr, ordSqrLoop
   567  
        	// hlp1 (= R0) is no longer needed, so R0 can now hold res.
   568  	MOVD	res+0(FP), res_ptr
   569  	STP	(x0, x1), 0*16(res_ptr)
   570  	STP	(x2, x3), 1*16(res_ptr)
   571  
   572  	RET
   573  /* ---------------------------------------*/
   574  // func p256OrdMul(res, in1, in2 *p256OrdElement)
        // Montgomery multiplication of in1 and in2 with the modulus p256ord; the
        // result is stored in res. Multiplication and reduction are interleaved:
        // after each y[i] * x row, one reduction step removes the lowest live limb.
        //
        // Register use:
        //   x0..x3, y0..y3  in1 and in2. y0 and y1 are reused as scratch once their
        //                   own rows have been consumed.
        //   acc4..acc7      upper limbs of the product rows
        //   acc0..acc3      lower limbs first, then recycled one by one as the
        //                   upper limbs produced by the reduction steps; the two
        //                   halves are added together at the end
        //   const0..const3  the four limbs of p256ord (const2/const3 are t2/t3, so
        //                   only t0 and t1 are free scratch until the final
        //                   subtraction)
        //   hlp1            p256ordK0. hlp1 is R0, the same register as res_ptr,
        //                   which is why res is loaded only at the very end.
   575  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   576  	MOVD	in1+8(FP), a_ptr
   577  	MOVD	in2+16(FP), b_ptr
   578  
   579  	MOVD	p256ordK0<>(SB), hlp1
   580  	LDP	p256ord<>+0x00(SB), (const0, const1)
   581  	LDP	p256ord<>+0x10(SB), (const2, const3)
   582  
   583  	LDP	0*16(a_ptr), (x0, x1)
   584  	LDP	1*16(a_ptr), (x2, x3)
   585  	LDP	0*16(b_ptr), (y0, y1)
   586  	LDP	1*16(b_ptr), (y2, y3)
   587  
   588  	// y[0] * x
   589  	MUL	y0, x0, acc0
   590  	UMULH	y0, x0, acc1
   591  
   592  	MUL	y0, x1, t0
   593  	ADDS	t0, acc1
   594  	UMULH	y0, x1, acc2
   595  
   596  	MUL	y0, x2, t0
   597  	ADCS	t0, acc2
   598  	UMULH	y0, x2, acc3
   599  
   600  	MUL	y0, x3, t0
   601  	ADCS	t0, acc3
   602  	UMULH	y0, x3, acc4
   603  	ADC	$0, acc4
        	// Each reduction step computes hlp0 = (lowest limb) * K0 and adds
        	// hlp0 * ord: low halves of the four products in a first carry chain
        	// (its carry-out goes to the matching product limb acc4..acc7), high
        	// halves in a second one. The register of the lowest limb is reused
        	// for the new top limb.
        	// NOTE(review): the first MUL of each step multiplies const0 by hlp1
        	// (K0), not by hlp0. Its sum into the lowest limb is never read again
        	// (that register is overwritten by the UMULH const2 below); only the
        	// carry is used. Presumably ord[0]*K0 is all ones mod 2^64, which
        	// would make this carry equal to the carry of the true low-limb sum
        	// -- confirm.
   604  	// First reduction step
   605  	MUL	acc0, hlp1, hlp0
   606  
   607  	MUL	const0, hlp1, t0
   608  	ADDS	t0, acc0, acc0
   609  	UMULH	const0, hlp0, t1
   610  
   611  	MUL	const1, hlp0, t0
   612  	ADCS	t0, acc1, acc1
        	// y0 is free from here on: the y[0] row is complete.
   613  	UMULH	const1, hlp0, y0
   614  
   615  	MUL	const2, hlp0, t0
   616  	ADCS	t0, acc2, acc2
   617  	UMULH	const2, hlp0, acc0
   618  
   619  	MUL	const3, hlp0, t0
   620  	ADCS	t0, acc3, acc3
   621  
   622  	UMULH	const3, hlp0, hlp0
   623  	ADC	$0, acc4
   624  
   625  	ADDS	t1, acc1, acc1
   626  	ADCS	y0, acc2, acc2
   627  	ADCS	acc0, acc3, acc3
   628  	ADC	$0, hlp0, acc0
   629  	// y[1] * x
   630  	MUL	y1, x0, t0
   631  	ADDS	t0, acc1
   632  	UMULH	y1, x0, t1
   633  
   634  	MUL	y1, x1, t0
   635  	ADCS	t0, acc2
   636  	UMULH	y1, x1, hlp0
   637  
   638  	MUL	y1, x2, t0
   639  	ADCS	t0, acc3
   640  	UMULH	y1, x2, y0
   641  
   642  	MUL	y1, x3, t0
   643  	ADCS	t0, acc4
        	// Last use of y1 as an operand; it becomes scratch afterwards.
   644  	UMULH	y1, x3, y1
   645  	ADC	$0, ZR, acc5
   646  
   647  	ADDS	t1, acc2
   648  	ADCS	hlp0, acc3
   649  	ADCS	y0, acc4
   650  	ADC	y1, acc5
   651  	// Second reduction step
   652  	MUL	acc1, hlp1, hlp0
   653  
   654  	MUL	const0, hlp1, t0
   655  	ADDS	t0, acc1, acc1
   656  	UMULH	const0, hlp0, t1
   657  
   658  	MUL	const1, hlp0, t0
   659  	ADCS	t0, acc2, acc2
   660  	UMULH	const1, hlp0, y0
   661  
   662  	MUL	const2, hlp0, t0
   663  	ADCS	t0, acc3, acc3
   664  	UMULH	const2, hlp0, acc1
   665  
   666  	MUL	const3, hlp0, t0
   667  	ADCS	t0, acc0, acc0
   668  
   669  	UMULH	const3, hlp0, hlp0
   670  	ADC	$0, acc5
   671  
   672  	ADDS	t1, acc2, acc2
   673  	ADCS	y0, acc3, acc3
   674  	ADCS	acc1, acc0, acc0
   675  	ADC	$0, hlp0, acc1
   676  	// y[2] * x
   677  	MUL	y2, x0, t0
   678  	ADDS	t0, acc2
   679  	UMULH	y2, x0, t1
   680  
   681  	MUL	y2, x1, t0
   682  	ADCS	t0, acc3
   683  	UMULH	y2, x1, hlp0
   684  
   685  	MUL	y2, x2, t0
   686  	ADCS	t0, acc4
   687  	UMULH	y2, x2, y0
   688  
   689  	MUL	y2, x3, t0
   690  	ADCS	t0, acc5
   691  	UMULH	y2, x3, y1
   692  	ADC	$0, ZR, acc6
   693  
   694  	ADDS	t1, acc3
   695  	ADCS	hlp0, acc4
   696  	ADCS	y0, acc5
   697  	ADC	y1, acc6
   698  	// Third reduction step
   699  	MUL	acc2, hlp1, hlp0
   700  
   701  	MUL	const0, hlp1, t0
   702  	ADDS	t0, acc2, acc2
   703  	UMULH	const0, hlp0, t1
   704  
   705  	MUL	const1, hlp0, t0
   706  	ADCS	t0, acc3, acc3
   707  	UMULH	const1, hlp0, y0
   708  
   709  	MUL	const2, hlp0, t0
   710  	ADCS	t0, acc0, acc0
   711  	UMULH	const2, hlp0, acc2
   712  
   713  	MUL	const3, hlp0, t0
   714  	ADCS	t0, acc1, acc1
   715  
   716  	UMULH	const3, hlp0, hlp0
   717  	ADC	$0, acc6
   718  
   719  	ADDS	t1, acc3, acc3
   720  	ADCS	y0, acc0, acc0
   721  	ADCS	acc2, acc1, acc1
   722  	ADC	$0, hlp0, acc2
   723  	// y[3] * x
   724  	MUL	y3, x0, t0
   725  	ADDS	t0, acc3
   726  	UMULH	y3, x0, t1
   727  
   728  	MUL	y3, x1, t0
   729  	ADCS	t0, acc4
   730  	UMULH	y3, x1, hlp0
   731  
   732  	MUL	y3, x2, t0
   733  	ADCS	t0, acc5
   734  	UMULH	y3, x2, y0
   735  
   736  	MUL	y3, x3, t0
   737  	ADCS	t0, acc6
   738  	UMULH	y3, x3, y1
   739  	ADC	$0, ZR, acc7
   740  
   741  	ADDS	t1, acc4
   742  	ADCS	hlp0, acc5
   743  	ADCS	y0, acc6
   744  	ADC	y1, acc7
   745  	// Last reduction step
   746  	MUL	acc3, hlp1, hlp0
   747  
   748  	MUL	const0, hlp1, t0
   749  	ADDS	t0, acc3, acc3
   750  	UMULH	const0, hlp0, t1
   751  
   752  	MUL	const1, hlp0, t0
   753  	ADCS	t0, acc0, acc0
   754  	UMULH	const1, hlp0, y0
   755  
   756  	MUL	const2, hlp0, t0
   757  	ADCS	t0, acc1, acc1
   758  	UMULH	const2, hlp0, acc3
   759  
   760  	MUL	const3, hlp0, t0
   761  	ADCS	t0, acc2, acc2
   762  
   763  	UMULH	const3, hlp0, hlp0
   764  	ADC	$0, acc7
   765  
   766  	ADDS	t1, acc0, acc0
   767  	ADCS	y0, acc1, acc1
   768  	ADCS	acc3, acc2, acc2
   769  	ADC	$0, hlp0, acc3
   770  
        	// Combine the reduction limbs with the product limbs; acc4 is then
        	// reused as the carry-out limb.
   771  	ADDS	acc4, acc0, acc0
   772  	ADCS	acc5, acc1, acc1
   773  	ADCS	acc6, acc2, acc2
   774  	ADCS	acc7, acc3, acc3
   775  	ADC	$0, ZR, acc4
   776  
        	// t = acc - ord. The last two limbs of ord are still needed as
        	// subtrahends here: t2 is overwritten by the very instruction that
        	// reads const2 (same register), likewise t3/const3.
   777  	SUBS	const0, acc0, t0
   778  	SBCS	const1, acc1, t1
   779  	SBCS	const2, acc2, t2
   780  	SBCS	const3, acc3, t3
   781  	SBCS	$0, acc4, acc4
   782  
        	// No borrow (carry set): keep the subtracted value.
   783  	CSEL	CS, t0, acc0, acc0
   784  	CSEL	CS, t1, acc1, acc1
   785  	CSEL	CS, t2, acc2, acc2
   786  	CSEL	CS, t3, acc3, acc3
   787  
        	// hlp1 (= R0) is no longer needed, so R0 can now hold res.
   788  	MOVD	res+0(FP), res_ptr
   789  	STP	(acc0, acc1), 0*16(res_ptr)
   790  	STP	(acc2, acc3), 1*16(res_ptr)
   791  
   792  	RET
   793  /* ---------------------------------------*/
        // p256SubInternal: register-only helper, no Go arguments.
        //   In:       x0..x3, y0..y3; const0/const1 = p256const0/p256const1
        //   Out:      x0..x3 = y - x mod p (note the operand order)
        //   Clobbers: acc0..acc7, t0, flags. y0..y3 are preserved.
        // Both y - x and y - x + p are computed; the borrow decides which one is
        // returned, without branching.
   794  TEXT p256SubInternal<>(SB),NOSPLIT,$0
   795  	SUBS	x0, y0, acc0
   796  	SBCS	x1, y1, acc1
   797  	SBCS	x2, y2, acc2
   798  	SBCS	x3, y3, acc3
        	// t0 = 0 if the subtraction did not borrow, all ones if it did.
   799  	SBC	$0, ZR, t0
   800  
        	// acc4..acc7 = (y - x) + p.
   801  	ADDS	$-1, acc0, acc4
   802  	ADCS	const0, acc1, acc5
   803  	ADCS	$0, acc2, acc6
   804  	ADC	const1, acc3, acc7
   805  
        	// EQ (bit 0 of t0 clear) means no borrow: keep the plain difference.
   806  	ANDS	$1, t0
   807  	CSEL	EQ, acc0, acc4, x0
   808  	CSEL	EQ, acc1, acc5, x1
   809  	CSEL	EQ, acc2, acc6, x2
   810  	CSEL	EQ, acc3, acc7, x3
   811  
   812  	RET
   813  /* ---------------------------------------*/
        // p256SqrInternal: register-only helper, no Go arguments.
        //   In:       x0..x3; const0/const1 = p256const0/p256const1
        //   Out:      y0..y3 = Montgomery square of x modulo p
        //             (x * x * 2^-256, brought below p by one conditional subtraction)
        //   Clobbers: acc0..acc7, t0..t3, flags. x0..x3 are preserved.
        // Layout: off-diagonal products, doubled, plus the diagonal products give the
        // 512-bit square in acc0..acc7; four reduction steps then clear acc0..acc3
        // one limb at a time, recycling each cleared register as a new upper limb.
   814  TEXT p256SqrInternal<>(SB),NOSPLIT,$0
   815  	// x[1:] * x[0]
   816  	MUL	x0, x1, acc1
   817  	UMULH	x0, x1, acc2
   818  
   819  	MUL	x0, x2, t0
   820  	ADDS	t0, acc2, acc2
   821  	UMULH	x0, x2, acc3
   822  
   823  	MUL	x0, x3, t0
   824  	ADCS	t0, acc3, acc3
   825  	UMULH	x0, x3, acc4
   826  	ADC	$0, acc4, acc4
   827  	// x[2:] * x[1]
   828  	MUL	x1, x2, t0
   829  	ADDS	t0, acc3
   830  	UMULH	x1, x2, t1
   831  	ADCS	t1, acc4
   832  	ADC	$0, ZR, acc5
   833  
   834  	MUL	x1, x3, t0
   835  	ADDS	t0, acc4
   836  	UMULH	x1, x3, t1
   837  	ADC	t1, acc5
   838  	// x[3] * x[2]
   839  	MUL	x2, x3, t0
   840  	ADDS	t0, acc5
   841  	UMULH	x2, x3, acc6
   842  	ADC	$0, acc6
   843  
   844  	MOVD	$0, acc7
        	// Each off-diagonal product occurs twice in the square; the carry out
        	// of the doubling lands in acc7.
   845  	// *2
   846  	ADDS	acc1, acc1
   847  	ADCS	acc2, acc2
   848  	ADCS	acc3, acc3
   849  	ADCS	acc4, acc4
   850  	ADCS	acc5, acc5
   851  	ADCS	acc6, acc6
   852  	ADC	$0, acc7
        	// Add the diagonal products x[i]*x[i].
   853  	// Missing products
   854  	MUL	x0, x0, acc0
   855  	UMULH	x0, x0, t0
   856  	ADDS	t0, acc1, acc1
   857  
   858  	MUL	x1, x1, t0
   859  	ADCS	t0, acc2, acc2
   860  	UMULH	x1, x1, t1
   861  	ADCS	t1, acc3, acc3
   862  
   863  	MUL	x2, x2, t0
   864  	ADCS	t0, acc4, acc4
   865  	UMULH	x2, x2, t1
   866  	ADCS	t1, acc5, acc5
   867  
   868  	MUL	x3, x3, t0
   869  	ADCS	t0, acc6, acc6
   870  	UMULH	x3, x3, t1
   871  	ADCS	t1, acc7, acc7
        	// Each reduction step adds (lowest limb) * p. With
        	// p = 2^256 - 2^224 + 2^192 + 2^96 - 1 the lowest limb cancels, the
        	// 2^96 term is the <<32 / LSR $32 pair, and the three top terms are
        	// the product with const1 = 2^64 - 2^32 + 1.
   872  	// First reduction step
   873  	ADDS	acc0<<32, acc1, acc1
   874  	LSR	$32, acc0, t0
   875  	MUL	acc0, const1, t1
   876  	UMULH	acc0, const1, acc0
   877  	ADCS	t0, acc2, acc2
   878  	ADCS	t1, acc3, acc3
   879  	ADC	$0, acc0, acc0
   880  	// Second reduction step
   881  	ADDS	acc1<<32, acc2, acc2
   882  	LSR	$32, acc1, t0
   883  	MUL	acc1, const1, t1
   884  	UMULH	acc1, const1, acc1
   885  	ADCS	t0, acc3, acc3
   886  	ADCS	t1, acc0, acc0
   887  	ADC	$0, acc1, acc1
   888  	// Third reduction step
   889  	ADDS	acc2<<32, acc3, acc3
   890  	LSR	$32, acc2, t0
   891  	MUL	acc2, const1, t1
   892  	UMULH	acc2, const1, acc2
   893  	ADCS	t0, acc0, acc0
   894  	ADCS	t1, acc1, acc1
   895  	ADC	$0, acc2, acc2
   896  	// Last reduction step
   897  	ADDS	acc3<<32, acc0, acc0
   898  	LSR	$32, acc3, t0
   899  	MUL	acc3, const1, t1
   900  	UMULH	acc3, const1, acc3
   901  	ADCS	t0, acc1, acc1
   902  	ADCS	t1, acc2, acc2
   903  	ADC	$0, acc3, acc3
   904  	// Add bits [511:256] of the sqr result
   905  	ADDS	acc4, acc0, acc0
   906  	ADCS	acc5, acc1, acc1
   907  	ADCS	acc6, acc2, acc2
   908  	ADCS	acc7, acc3, acc3
        	// acc4 is reused as the carry-out limb of the sum.
   909  	ADC	$0, ZR, acc4
   910  
        	// t = acc - p, with the borrow extended through the carry limb.
   911  	SUBS	$-1, acc0, t0
   912  	SBCS	const0, acc1, t1
   913  	SBCS	$0, acc2, t2
   914  	SBCS	const1, acc3, t3
   915  	SBCS	$0, acc4, acc4
   916  
        	// No borrow (carry set): return the subtracted value.
   917  	CSEL	CS, t0, acc0, y0
   918  	CSEL	CS, t1, acc1, y1
   919  	CSEL	CS, t2, acc2, y2
   920  	CSEL	CS, t3, acc3, y3
   921  	RET
   922  /* ---------------------------------------*/
        // p256MulInternal: register-only helper, no Go arguments.
        //   In:       x0..x3, y0..y3; const0/const1 = p256const0/p256const1
        //   Out:      y0..y3 = Montgomery product of x and y modulo p
        //             (x * y * 2^-256, brought below p by one conditional subtraction)
        //   Clobbers: acc0..acc7, t0..t3, hlp0, flags. x0..x3 are preserved.
        // Multiplication and reduction are interleaved: after each y[i] * x row, one
        // reduction step clears the lowest live limb. Product limbs above the fourth
        // collect in acc4..acc7; acc0..acc3 are recycled as the upper limbs produced
        // by the reduction steps, and the two halves are added at the end.
   923  TEXT p256MulInternal<>(SB),NOSPLIT,$0
   924  	// y[0] * x
   925  	MUL	y0, x0, acc0
   926  	UMULH	y0, x0, acc1
   927  
   928  	MUL	y0, x1, t0
   929  	ADDS	t0, acc1
   930  	UMULH	y0, x1, acc2
   931  
   932  	MUL	y0, x2, t0
   933  	ADCS	t0, acc2
   934  	UMULH	y0, x2, acc3
   935  
   936  	MUL	y0, x3, t0
   937  	ADCS	t0, acc3
   938  	UMULH	y0, x3, acc4
   939  	ADC	$0, acc4
        	// Each reduction step adds (lowest limb) * p. With
        	// p = 2^256 - 2^224 + 2^192 + 2^96 - 1 the lowest limb cancels, the
        	// 2^96 term is the <<32 / LSR $32 pair, and the three top terms are
        	// the product with const1 = 2^64 - 2^32 + 1.
   940  	// First reduction step
   941  	ADDS	acc0<<32, acc1, acc1
   942  	LSR	$32, acc0, t0
   943  	MUL	acc0, const1, t1
   944  	UMULH	acc0, const1, acc0
   945  	ADCS	t0, acc2
   946  	ADCS	t1, acc3
   947  	ADC	$0, acc0
   948  	// y[1] * x
        	// Low halves are added in the first carry chain; the high halves are
        	// parked in t1, t2, t3, hlp0 and added in a second chain below.
   949  	MUL	y1, x0, t0
   950  	ADDS	t0, acc1
   951  	UMULH	y1, x0, t1
   952  
   953  	MUL	y1, x1, t0
   954  	ADCS	t0, acc2
   955  	UMULH	y1, x1, t2
   956  
   957  	MUL	y1, x2, t0
   958  	ADCS	t0, acc3
   959  	UMULH	y1, x2, t3
   960  
   961  	MUL	y1, x3, t0
   962  	ADCS	t0, acc4
   963  	UMULH	y1, x3, hlp0
   964  	ADC	$0, ZR, acc5
   965  
   966  	ADDS	t1, acc2
   967  	ADCS	t2, acc3
   968  	ADCS	t3, acc4
   969  	ADC	hlp0, acc5
   970  	// Second reduction step
   971  	ADDS	acc1<<32, acc2, acc2
   972  	LSR	$32, acc1, t0
   973  	MUL	acc1, const1, t1
   974  	UMULH	acc1, const1, acc1
   975  	ADCS	t0, acc3
   976  	ADCS	t1, acc0
   977  	ADC	$0, acc1
   978  	// y[2] * x
   979  	MUL	y2, x0, t0
   980  	ADDS	t0, acc2
   981  	UMULH	y2, x0, t1
   982  
   983  	MUL	y2, x1, t0
   984  	ADCS	t0, acc3
   985  	UMULH	y2, x1, t2
   986  
   987  	MUL	y2, x2, t0
   988  	ADCS	t0, acc4
   989  	UMULH	y2, x2, t3
   990  
   991  	MUL	y2, x3, t0
   992  	ADCS	t0, acc5
   993  	UMULH	y2, x3, hlp0
   994  	ADC	$0, ZR, acc6
   995  
   996  	ADDS	t1, acc3
   997  	ADCS	t2, acc4
   998  	ADCS	t3, acc5
   999  	ADC	hlp0, acc6
  1000  	// Third reduction step
  1001  	ADDS	acc2<<32, acc3, acc3
  1002  	LSR	$32, acc2, t0
  1003  	MUL	acc2, const1, t1
  1004  	UMULH	acc2, const1, acc2
  1005  	ADCS	t0, acc0
  1006  	ADCS	t1, acc1
  1007  	ADC	$0, acc2
  1008  	// y[3] * x
  1009  	MUL	y3, x0, t0
  1010  	ADDS	t0, acc3
  1011  	UMULH	y3, x0, t1
  1012  
  1013  	MUL	y3, x1, t0
  1014  	ADCS	t0, acc4
  1015  	UMULH	y3, x1, t2
  1016  
  1017  	MUL	y3, x2, t0
  1018  	ADCS	t0, acc5
  1019  	UMULH	y3, x2, t3
  1020  
  1021  	MUL	y3, x3, t0
  1022  	ADCS	t0, acc6
  1023  	UMULH	y3, x3, hlp0
  1024  	ADC	$0, ZR, acc7
  1025  
  1026  	ADDS	t1, acc4
  1027  	ADCS	t2, acc5
  1028  	ADCS	t3, acc6
  1029  	ADC	hlp0, acc7
  1030  	// Last reduction step
  1031  	ADDS	acc3<<32, acc0, acc0
  1032  	LSR	$32, acc3, t0
  1033  	MUL	acc3, const1, t1
  1034  	UMULH	acc3, const1, acc3
  1035  	ADCS	t0, acc1
  1036  	ADCS	t1, acc2
  1037  	ADC	$0, acc3
  1038  	// Add bits [511:256] of the mul result
  1039  	ADDS	acc4, acc0, acc0
  1040  	ADCS	acc5, acc1, acc1
  1041  	ADCS	acc6, acc2, acc2
  1042  	ADCS	acc7, acc3, acc3
        	// acc4 is reused as the carry-out limb of the sum.
  1043  	ADC	$0, ZR, acc4
  1044  
        	// t = acc - p, with the borrow extended through the carry limb.
  1045  	SUBS	$-1, acc0, t0
  1046  	SBCS	const0, acc1, t1
  1047  	SBCS	$0, acc2, t2
  1048  	SBCS	const1, acc3, t3
  1049  	SBCS	$0, acc4, acc4
  1050  
        	// No borrow (carry set): return the subtracted value.
  1051  	CSEL	CS, t0, acc0, y0
  1052  	CSEL	CS, t1, acc1, y1
  1053  	CSEL	CS, t2, acc2, y2
  1054  	CSEL	CS, t3, acc3, y3
  1055  	RET
  1056  /* ---------------------------------------*/
        // p256MulBy2Inline: x0..x3 = 2 * y mod p, expanded inline (no CALL).
        //   In:       y0..y3; const0/const1 = p256const0/p256const1
        //   Out:      x0..x3
        //   Clobbers: t0..t3, hlp0, flags. y0..y3 are preserved.
        // Steps: x = y + y with the carry-out captured in hlp0; t = x - p with the
        // borrow extended through hlp0; if that borrowed (CC) the unsubtracted sum
        // is kept, otherwise t is used.
        // Do not put comments inside the macro body: a // comment would swallow the
        // line-continuation backslash.
  1057  #define p256MulBy2Inline       \
  1058  	ADDS	y0, y0, x0;    \
  1059  	ADCS	y1, y1, x1;    \
  1060  	ADCS	y2, y2, x2;    \
  1061  	ADCS	y3, y3, x3;    \
  1062  	ADC	$0, ZR, hlp0;  \
  1063  	SUBS	$-1, x0, t0;   \
  1064  	SBCS	const0, x1, t1;\
  1065  	SBCS	$0, x2, t2;    \
  1066  	SBCS	const1, x3, t3;\
  1067  	SBCS	$0, hlp0, hlp0;\
  1068  	CSEL	CC, x0, t0, x0;\
  1069  	CSEL	CC, x1, t1, x1;\
  1070  	CSEL	CC, x2, t2, x2;\
  1071  	CSEL	CC, x3, t3, x3;
  1072  /* ---------------------------------------*/
        // Addressing macros for point coordinates. Each takes a byte offset into a
        // 32-byte coordinate and expands to an offset(register) operand:
        //   x1in/y1in/z1in  coordinates at +0/+32/+64 from a_ptr (first input point)
        //   x2in/z2in       coordinates at +0/+64 from b_ptr (second input point)
        //   x3out/y3out/z3out  coordinates at +0/+32/+64 from res_ptr
        // There is no b_ptr-relative y2in in this group; the name y2in is defined
        // separately as a stack slot.
  1073  #define x1in(off) (off)(a_ptr)
  1074  #define y1in(off) (off + 32)(a_ptr)
  1075  #define z1in(off) (off + 64)(a_ptr)
  1076  #define x2in(off) (off)(b_ptr)
  1077  #define z2in(off) (off + 64)(b_ptr)
  1078  #define x3out(off) (off)(res_ptr)
  1079  #define y3out(off) (off + 32)(res_ptr)
  1080  #define z3out(off) (off + 64)(res_ptr)
        // LDx/LDy load, and STx/STy store, a full 32-byte value between the location
        // named by one of the addressing macros and x0..x3 / y0..y3. The parameter is
        // called src in all four, but for STx/STy it is the destination.
  1081  #define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
  1082  #define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
  1083  #define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
  1084  #define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
  1085  /* ---------------------------------------*/
        // Stack temporaries: 32-byte slots addressed from RSP, usable with
        // LDx/LDy/STx/STy. Slot k lives at 32*k + 8; the first 8 bytes at 0(RSP) are
        // skipped (presumably the saved return address -- confirm).
        // Slots 0..7 (y2in..hcub) end at byte 264, which matches the $264 frame of
        // p256PointAddAffineAsm.
  1086  #define y2in(off)  (32*0 + 8 + off)(RSP)
  1087  #define s2(off)    (32*1 + 8 + off)(RSP)
  1088  #define z1sqr(off) (32*2 + 8 + off)(RSP)
  1089  #define h(off)	   (32*3 + 8 + off)(RSP)
  1090  #define r(off)	   (32*4 + 8 + off)(RSP)
  1091  #define hsqr(off)  (32*5 + 8 + off)(RSP)
  1092  #define rsqr(off)  (32*6 + 8 + off)(RSP)
  1093  #define hcub(off)  (32*7 + 8 + off)(RSP)
  1094  
        // Slots 8..11 lie beyond a 264-byte frame, so they can only be used by a
        // routine that declares a larger frame (at least 392 bytes).
  1095  #define z2sqr(off) (32*8 + 8 + off)(RSP)
  1096  #define s1(off) (32*9 + 8 + off)(RSP)
  1097  #define u1(off) (32*10 + 8 + off)(RSP)
  1098  #define u2(off) (32*11 + 8 + off)(RSP)
  1099  
  1100  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
        //
        // Mixed point addition: res = in1 + in2, where in1 is read as three
        // 32-byte values (x1, y1, z1 at byte offsets 0, 32, 64) and in2 as two
        // (x2, y2 at byte offsets 0, 32). All selections are done with CSEL; the
        // function contains no conditional branches.
        //
        //   sign != 0  y2 is replaced by its negation (p - y2) before the addition.
        //   sel  == 0  the stored result is in1's x1, y1, z1 instead of the sum.
        //   zero == 0  the stored result is (x2, possibly negated y2, p256one)
        //              instead of the sum; this test is applied last, so it also
        //              overrides the sel == 0 case.
        //
        // Here p is the 256-bit constant whose 64-bit limbs, least significant
        // first, are (-1, p256const0, 0, p256const1).
        //
        // Values computed (u1 is x1 itself, s1 is y1 itself):
        //   h  = x2*z1^2 - x1            r  = y2*z1^3 - y1
        //   z3 = h*z1
        //   x3 = r^2 - h^3 - 2*x1*h^2
        //   y3 = r*(x1*h^2 - x3) - y1*h^3
        //
        // Helper conventions as used below (the helpers are defined outside this
        // section - confirm against their definitions): p256SqrInternal leaves
        // x^2 in y0..y3, p256MulInternal leaves x*y in y0..y3, p256SubInternal
        // leaves y - x in x0..x3, and p256MulBy2Inline leaves 2*y in x0..x3.
        //
        // hlp1 is the same register as res_ptr (R0) and carries the selector bits
        // for the whole function, so res is reloaded from the frame into t0
        // before every store. Frame: $264 = eight 32-byte scratch slots + 8.
  1101  TEXT ·p256PointAddAffineAsm(SB),0,$264-48
  1102  	MOVD	in1+8(FP), a_ptr
  1103  	MOVD	in2+16(FP), b_ptr
  1104  	MOVD	sign+24(FP), hlp0
  1105  	MOVD	sel+32(FP), hlp1
  1106  	MOVD	zero+40(FP), t2
  1107  
        	// Normalize sel and zero to 0/1.
  1108  	MOVD	$1, t0
  1109  	CMP	$0, t2
  1110  	CSEL	EQ, ZR, t0, t2    // t2 = (zero != 0)
  1111  	CMP	$0, hlp1
  1112  	CSEL	EQ, ZR, t0, hlp1    // hlp1 = (sel != 0)
  1113  
  1114  	MOVD	p256const0<>(SB), const0
  1115  	MOVD	p256const1<>(SB), const1
  1116  	EOR	t2<<1, hlp1    // select = (zero != 0)<<1 | (sel != 0)
  1117  
  1118  	// Negate y2 (read from in2 at byte offset 32) based on sign
  1119  	LDP	2*16(b_ptr), (y0, y1)
  1120  	LDP	3*16(b_ptr), (y2, y3)
  1121  	MOVD	$-1, acc0
  1122  
        	// acc0..acc3 = p - y2; t0 = -1 if the subtraction borrowed, else 0.
  1123  	SUBS	y0, acc0, acc0
  1124  	SBCS	y1, const0, acc1
  1125  	SBCS	y2, ZR, acc2
  1126  	SBCS	y3, const1, acc3
  1127  	SBC	$0, ZR, t0
  1128  
        	// acc4..acc7 = acc0..acc3 + p; the carry out is added into t0.
  1129  	ADDS	$-1, acc0, acc4
  1130  	ADCS	const0, acc1, acc5
  1131  	ADCS	$0, acc2, acc6
  1132  	ADCS	const1, acc3, acc7
  1133  	ADC	$0, t0, t0
  1134  
        	// Take the sum with p added when t0 == 0, otherwise keep p - y2.
  1135  	CMP	$0, t0
  1136  	CSEL	EQ, acc4, acc0, acc0
  1137  	CSEL	EQ, acc5, acc1, acc1
  1138  	CSEL	EQ, acc6, acc2, acc2
  1139  	CSEL	EQ, acc7, acc3, acc3
  1140  	// If sign is 0, keep the original y2; otherwise use the negated value
  1141  	CMP	$0, hlp0
  1142  	CSEL	EQ, y0, acc0, y0
  1143  	CSEL	EQ, y1, acc1, y1
  1144  	CSEL	EQ, y2, acc2, y2
  1145  	CSEL	EQ, y3, acc3, y3
  1146  	// Store the (possibly negated) y2 in the y2in stack slot
  1147  	STy(y2in)
  1148  	// Begin point add
  1149  	LDx(z1in)
  1150  	CALL	p256SqrInternal<>(SB)    // z1^2
  1151  	STy(z1sqr)
  1152  
  1153  	LDx(x2in)
  1154  	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1155  
  1156  	LDx(x1in)
  1157  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1158  	STx(h)
  1159  
  1160  	LDy(z1in)
  1161  	CALL	p256MulInternal<>(SB)    // z3 = h * z1
  1162  
  1163  	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0 (sel == 0), z3 = z1
  1164  	LDP	5*16(a_ptr), (acc2, acc3)
  1165  	ANDS	$1, hlp1, ZR
  1166  	CSEL	EQ, acc0, y0, y0
  1167  	CSEL	EQ, acc1, y1, y1
  1168  	CSEL	EQ, acc2, y2, y2
  1169  	CSEL	EQ, acc3, y3, y3
  1170  	LDP	p256one<>+0x00(SB), (acc0, acc1)
  1171  	LDP	p256one<>+0x10(SB), (acc2, acc3)
  1172  	ANDS	$2, hlp1, ZR            // iff select[1] == 0 (zero == 0), z3 = p256one
  1173  	CSEL	EQ, acc0, y0, y0
  1174  	CSEL	EQ, acc1, y1, y1
  1175  	CSEL	EQ, acc2, y2, y2
  1176  	CSEL	EQ, acc3, y3, y3
  1177  	LDx(z1in)
  1178  	MOVD	res+0(FP), t0    // R0 holds the selector bits, so res is reloaded here
  1179  	STP	(y0, y1), 4*16(t0)    // res bytes 64..95 = selected z3
  1180  	STP	(y2, y3), 5*16(t0)
  1181  
  1182  	LDy(z1sqr)
  1183  	CALL	p256MulInternal<>(SB)    // z1^3
  1184  
  1185  	LDx(y2in)
  1186  	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
  1187  	STy(s2)
  1188  
  1189  	LDx(y1in)
  1190  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1191  	STx(r)
  1192  
  1193  	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1194  	STy	(rsqr)
  1195  
  1196  	LDx(h)
  1197  	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1198  	STy(hsqr)
  1199  
  1200  	CALL	p256MulInternal<>(SB)    // hcub = h^3 (x still holds h, y holds h^2)
  1201  	STy(hcub)
  1202  
  1203  	LDx(y1in)
  1204  	CALL	p256MulInternal<>(SB)    // y1 * h^3
  1205  	STy(s2)    // reuses the s2 slot for y1 * h^3
  1206  
  1207  	LDP	hsqr(0*8), (x0, x1)
  1208  	LDP	hsqr(2*8), (x2, x3)
  1209  	LDP	0*16(a_ptr), (y0, y1)    // y = x1 (in1 bytes 0..31), which serves as u1
  1210  	LDP	1*16(a_ptr), (y2, y3)
  1211  	CALL	p256MulInternal<>(SB)    // u1 * h^2
  1212  	STP	(y0, y1), h(0*8)    // reuses the h slot for u1 * h^2
  1213  	STP	(y2, y3), h(2*8)
  1214  
  1215  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1216  
  1217  	LDy(rsqr)
  1218  	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1219  
  1220  	MOVD	x0, y0
  1221  	MOVD	x1, y1
  1222  	MOVD	x2, y2
  1223  	MOVD	x3, y3
  1224  	LDx(hcub)
  1225  	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
  1226  
  1227  	LDP	0*16(a_ptr), (acc0, acc1)
  1228  	LDP	1*16(a_ptr), (acc2, acc3)
  1229  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
  1230  	CSEL	EQ, acc0, x0, x0
  1231  	CSEL	EQ, acc1, x1, x1
  1232  	CSEL	EQ, acc2, x2, x2
  1233  	CSEL	EQ, acc3, x3, x3
  1234  	LDP	0*16(b_ptr), (acc0, acc1)
  1235  	LDP	1*16(b_ptr), (acc2, acc3)
  1236  	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
  1237  	CSEL	EQ, acc0, x0, x0
  1238  	CSEL	EQ, acc1, x1, x1
  1239  	CSEL	EQ, acc2, x2, x2
  1240  	CSEL	EQ, acc3, x3, x3
  1241  	MOVD	res+0(FP), t0
  1242  	STP	(x0, x1), 0*16(t0)    // res bytes 0..31 = selected x3
  1243  	STP	(x2, x3), 1*16(t0)
  1244  
  1245  	LDP	h(0*8), (y0, y1)    // y = u1 * h^2 (kept in the h slot)
  1246  	LDP	h(2*8), (y2, y3)
  1247  	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3 (x is the value just stored as x3)
  1248  
  1249  	LDP	r(0*8), (y0, y1)
  1250  	LDP	r(2*8), (y2, y3)
  1251  	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1252  
  1253  	LDP	s2(0*8), (x0, x1)    // x = y1 * h^3 (kept in the s2 slot)
  1254  	LDP	s2(2*8), (x2, x3)
  1255  	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - y1 * h^3
  1256  	LDP	2*16(a_ptr), (acc0, acc1)
  1257  	LDP	3*16(a_ptr), (acc2, acc3)
  1258  	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
  1259  	CSEL	EQ, acc0, x0, x0
  1260  	CSEL	EQ, acc1, x1, x1
  1261  	CSEL	EQ, acc2, x2, x2
  1262  	CSEL	EQ, acc3, x3, x3
  1263  	LDP	y2in(0*8), (acc0, acc1)
  1264  	LDP	y2in(2*8), (acc2, acc3)
  1265  	ANDS	$2, hlp1, ZR            // iff select[1] == 0, y3 = y2 (the possibly negated copy)
  1266  	CSEL	EQ, acc0, x0, x0
  1267  	CSEL	EQ, acc1, x1, x1
  1268  	CSEL	EQ, acc2, x2, x2
  1269  	CSEL	EQ, acc3, x3, x3
  1270  	MOVD	res+0(FP), t0
  1271  	STP	(x0, x1), 2*16(t0)    // res bytes 32..63 = selected y3
  1272  	STP	(x2, x3), 3*16(t0)
  1273  
  1274  	RET
  1275  
  1276  #define p256AddInline          \
  1277  	ADDS	y0, x0, x0;    \
  1278  	ADCS	y1, x1, x1;    \
  1279  	ADCS	y2, x2, x2;    \
  1280  	ADCS	y3, x3, x3;    \
  1281  	ADC	$0, ZR, hlp0;  \
  1282  	SUBS	$-1, x0, t0;   \
  1283  	SBCS	const0, x1, t1;\
  1284  	SBCS	$0, x2, t2;    \
  1285  	SBCS	const1, x3, t3;\
  1286  	SBCS	$0, hlp0, hlp0;\
  1287  	CSEL	CC, x0, t0, x0;\
  1288  	CSEL	CC, x1, t1, x1;\
  1289  	CSEL	CC, x2, t2, x2;\
  1290  	CSEL	CC, x3, t3, x3;
        // p256AddInline (above): x0..x3 += y0..y3 with the carry out kept in hlp0,
        // then the constant with limbs (-1, const0, 0, const1) is subtracted from
        // the 5-limb sum into t0..t3. If that subtraction borrows (carry clear)
        // the unsubtracted sum is kept, otherwise the difference replaces it, so
        // the sum is reduced by at most one multiple of that constant.
        // Requires const0/const1 to be loaded by the caller; overwrites t0..t3,
        // hlp0 and the condition flags; y0..y3 are left unchanged.
  1291  
        // Scratch slots of p256PointDoubleAsm: 32 bytes each, slot k at
        // 32*k + 8 from RSP (its frame is $136 = 4*32 + 8).
  1292  #define s(off)	(32*0 + 8 + off)(RSP)
  1293  #define m(off)	(32*1 + 8 + off)(RSP)
  1294  #define zsqr(off) (32*2 + 8 + off)(RSP)
  1295  #define tmp(off)  (32*3 + 8 + off)(RSP)
  1296  
  1297  //func p256PointDoubleAsm(res, in *P256Point)
        //
        // Point doubling. With x1, y1, z1 read from in (z1 and x1 are loaded from
        // byte offsets 64 and 0 below), the sequence computes:
        //   m  = 3 * (x1 - z1^2) * (x1 + z1^2)
        //   s  = 4 * x1 * y1^2
        //   z3 = 2 * y1 * z1
        //   x3 = m^2 - 2*s
        //   y3 = m * (s - x3) - 8 * y1^4
        // and writes them through the x3out/y3out/z3out macros.
        // NOTE(review): those macros and x1in/y1in/z1in are defined outside this
        // section; presumably the outputs are res_ptr-relative - confirm.
        //
        // Helper conventions as used below (defined outside this section -
        // confirm): p256SqrInternal leaves x^2 in y, p256MulInternal leaves x*y
        // in y, p256SubInternal leaves y - x in x, p256MulBy2Inline leaves 2*y
        // in x. Frame: $136 = four 32-byte scratch slots (s, m, zsqr, tmp) + 8.
  1298  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
  1299  	MOVD	res+0(FP), res_ptr
  1300  	MOVD	in+8(FP), a_ptr
  1301  
  1302  	MOVD	p256const0<>(SB), const0
  1303  	MOVD	p256const1<>(SB), const1
  1304  
  1305  	// Begin point double
  1306  	LDP	4*16(a_ptr), (x0, x1)    // x = z1
  1307  	LDP	5*16(a_ptr), (x2, x3)
  1308  	CALL	p256SqrInternal<>(SB)    // z1^2
  1309  	STP	(y0, y1), zsqr(0*8)
  1310  	STP	(y2, y3), zsqr(2*8)
  1311  
  1312  	LDP	0*16(a_ptr), (x0, x1)    // x = x1
  1313  	LDP	1*16(a_ptr), (x2, x3)
  1314  	p256AddInline    // x1 + z1^2
  1315  	STx(m)
  1316  
  1317  	LDx(z1in)
  1318  	LDy(y1in)
  1319  	CALL	p256MulInternal<>(SB)    // y1 * z1
  1320  	p256MulBy2Inline    // z3 = 2 * y1 * z1
  1321  	STx(z3out)
  1322  
  1323  	LDy(x1in)
  1324  	LDx(zsqr)
  1325  	CALL	p256SubInternal<>(SB)    // x1 - z1^2
  1326  	LDy(m)
  1327  	CALL	p256MulInternal<>(SB)    // (x1 - z1^2) * (x1 + z1^2)
  1328  
  1329  	// Multiply by 3: x = 2*y, then x += y
  1330  	p256MulBy2Inline
  1331  	p256AddInline
  1332  	STx(m)    // m = 3 * (x1 - z1^2) * (x1 + z1^2)
  1333  
  1334  	LDy(y1in)
  1335  	p256MulBy2Inline    // 2 * y1
  1336  	CALL	p256SqrInternal<>(SB)    // 4 * y1^2
  1337  	STy(s)
  1338  	MOVD	y0, x0
  1339  	MOVD	y1, x1
  1340  	MOVD	y2, x2
  1341  	MOVD	y3, x3
  1342  	CALL	p256SqrInternal<>(SB)    // 16 * y1^4
  1343  
  1344  	// Divide by 2: t0..t3 = y + p, with the carry out in hlp0
  1345  	ADDS	$-1, y0, t0
  1346  	ADCS	const0, y1, t1
  1347  	ADCS	$0, y2, t2
  1348  	ADCS	const1, y3, t3
  1349  	ADC	$0, ZR, hlp0
  1350  
        	// If y is even use y itself, otherwise use y + p (even, because the
        	// low limb of p is odd).
  1351  	ANDS	$1, y0, ZR
  1352  	CSEL	EQ, y0, t0, t0
  1353  	CSEL	EQ, y1, t1, t1
  1354  	CSEL	EQ, y2, t2, t2
  1355  	CSEL	EQ, y3, t3, t3
  1356  	AND	y0, hlp0, hlp0    // keep the carry bit only when y was odd (y + p selected)
  1357  
        	// Shift the 257-bit value hlp0:t3:t2:t1:t0 right by one bit into y0..y3.
  1358  	EXTR	$1, t0, t1, y0
  1359  	EXTR	$1, t1, t2, y1
  1360  	EXTR	$1, t2, t3, y2
  1361  	EXTR	$1, t3, hlp0, y3
  1362  	STy(y3out)    // 8 * y1^4, parked in y3out until the final subtraction
  1363  
  1364  	LDx(x1in)
  1365  	LDy(s)
  1366  	CALL	p256MulInternal<>(SB)    // s = 4 * x1 * y1^2
  1367  	STy(s)
  1368  	p256MulBy2Inline    // 2 * s
  1369  	STx(tmp)
  1370  
  1371  	LDx(m)
  1372  	CALL	p256SqrInternal<>(SB)    // m^2
  1373  	LDx(tmp)
  1374  	CALL	p256SubInternal<>(SB)    // x3 = m^2 - 2 * s
  1375  
  1376  	STx(x3out)
  1377  
  1378  	LDy(s)
  1379  	CALL	p256SubInternal<>(SB)    // s - x3
  1380  
  1381  	LDy(m)
  1382  	CALL	p256MulInternal<>(SB)    // m * (s - x3)
  1383  
  1384  	LDx(y3out)
  1385  	CALL	p256SubInternal<>(SB)    // y3 = m * (s - x3) - 8 * y1^4
  1386  	STx(y3out)
  1387  	RET
  1388  /* ---------------------------------------*/
        // Retarget four macros for p256PointAddAsm: from here on they address
        // memory relative to b_ptr instead of their earlier definitions.
        // y2in reads 32 bytes into the second input while b_ptr still holds in2;
        // x3out/y3out/z3out (byte offsets 0/32/64) are used only after
        // p256PointAddAsm has reloaded b_ptr with res.
  1389  #undef y2in
  1390  #undef x3out
  1391  #undef y3out
  1392  #undef z3out
  1393  #define y2in(off) (off + 32)(b_ptr)
  1394  #define x3out(off) (off)(b_ptr)
  1395  #define y3out(off) (off + 32)(b_ptr)
  1396  #define z3out(off) (off + 64)(b_ptr)
  1397  // func p256PointAddAsm(res, in1, in2 *P256Point) int
        //
        // General point addition, res = in1 + in2, using:
        //   u1 = x1*z2^2   u2 = x2*z1^2   s1 = y1*z2^3   s2 = y2*z1^3
        //   h  = u2 - u1   r  = s2 - s1
        //   x3 = r^2 - h^3 - 2*u1*h^2
        //   y3 = r*(u1*h^2 - x3) - s1*h^3
        //   z3 = z1*z2*h
        //
        // Returns 1 when both r and h are zero mod p (each is tested against 0
        // and against p, whose limbs are (-1, const0, 0, const1)), else 0.
        // NOTE(review): that is the equal-inputs case, where this formula does
        // not give the sum; callers presumably double instead - confirm.
        //
        // Helper conventions as used below (defined outside this section -
        // confirm): p256SqrInternal leaves x^2 in y, p256MulInternal leaves x*y
        // in y, p256SubInternal leaves y - x in x, p256MulBy2Inline leaves 2*y
        // in x. hlp1 is the same register as R0 and carries the return flag
        // across the helper calls. Frame: $392 = twelve 32-byte slots + 8.
  1398  TEXT ·p256PointAddAsm(SB),0,$392-32
  1399  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  1400  	// Load the two input pointers; coordinates are read through them as needed
  1401  	MOVD	in1+8(FP), a_ptr
  1402  	MOVD	in2+16(FP), b_ptr
  1403  
  1404  	MOVD	p256const0<>(SB), const0
  1405  	MOVD	p256const1<>(SB), const1
  1406  
  1407  	// Begin point add
  1408  	LDx(z2in)
  1409  	CALL	p256SqrInternal<>(SB)    // z2^2
  1410  	STy(z2sqr)
  1411  
  1412  	CALL	p256MulInternal<>(SB)    // z2^3 (x still holds z2, y holds z2^2)
  1413  
  1414  	LDx(y1in)
  1415  	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
  1416  	STy(s1)
  1417  
  1418  	LDx(z1in)
  1419  	CALL	p256SqrInternal<>(SB)    // z1^2
  1420  	STy(z1sqr)
  1421  
  1422  	CALL	p256MulInternal<>(SB)    // z1^3
  1423  
  1424  	LDx(y2in)
  1425  	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2
  1426  
  1427  	LDx(s1)
  1428  	CALL	p256SubInternal<>(SB)    // r = s2 - s1
  1429  	STx(r)
  1430  
        	// hlp1 = 1 if r is all-zero ...
  1431  	MOVD	$1, t2
  1432  	ORR	x0, x1, t0             // Check if zero mod p256
  1433  	ORR	x2, x3, t1
  1434  	ORR	t1, t0, t0
  1435  	CMP	$0, t0
  1436  	CSEL	EQ, t2, ZR, hlp1
  1437  
        	// ... or if r equals p limb for limb (x2 is compared against 0 directly).
  1438  	EOR	$-1, x0, t0
  1439  	EOR	const0, x1, t1
  1440  	EOR	const1, x3, t3
  1441  
  1442  	ORR	t0, t1, t0
  1443  	ORR	x2, t3, t1
  1444  	ORR	t1, t0, t0
  1445  	CMP	$0, t0
  1446  	CSEL	EQ, t2, hlp1, hlp1
  1447  
  1448  	LDx(z2sqr)
  1449  	LDy(x1in)
  1450  	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
  1451  	STy(u1)
  1452  
  1453  	LDx(z1sqr)
  1454  	LDy(x2in)
  1455  	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
  1456  	STy(u2)
  1457  
  1458  	LDx(u1)
  1459  	CALL	p256SubInternal<>(SB)    // h = u2 - u1
  1460  	STx(h)
  1461  
        	// hlp0 = 1 if h is all-zero ...
  1462  	MOVD	$1, t2
  1463  	ORR	x0, x1, t0             // Check if zero mod p256
  1464  	ORR	x2, x3, t1
  1465  	ORR	t1, t0, t0
  1466  	CMP	$0, t0
  1467  	CSEL	EQ, t2, ZR, hlp0
  1468  
        	// ... or if h equals p limb for limb.
  1469  	EOR	$-1, x0, t0
  1470  	EOR	const0, x1, t1
  1471  	EOR	const1, x3, t3
  1472  
  1473  	ORR	t0, t1, t0
  1474  	ORR	x2, t3, t1
  1475  	ORR	t1, t0, t0
  1476  	CMP	$0, t0
  1477  	CSEL	EQ, t2, hlp0, hlp0
  1478  
  1479  	AND	hlp0, hlp1, hlp1    // return flag: 1 only if both r and h are zero mod p
  1480  
  1481  	LDx(r)
  1482  	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
  1483  	STy(rsqr)
  1484  
  1485  	LDx(h)
  1486  	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
  1487  	STy(hsqr)
  1488  
  1489  	LDx(h)
  1490  	CALL	p256MulInternal<>(SB)    // hcub = h^3
  1491  	STy(hcub)
  1492  
  1493  	LDx(s1)
  1494  	CALL	p256MulInternal<>(SB)    // s1 * h^3
  1495  	STy(s2)    // the s2 slot holds s1 * h^3
  1496  
  1497  	LDx(z1in)
  1498  	LDy(z2in)
  1499  	CALL	p256MulInternal<>(SB)    // z1 * z2
  1500  	LDx(h)
  1501  	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
  1502  	MOVD	res+0(FP), b_ptr    // in2 is not read again; b_ptr now addresses res
  1503  	STy(z3out)
  1504  
  1505  	LDx(hsqr)
  1506  	LDy(u1)
  1507  	CALL	p256MulInternal<>(SB)    // h^2 * u1
  1508  	STy(u2)    // reuses the u2 slot for u1 * h^2
  1509  
  1510  	p256MulBy2Inline               // u1 * h^2 * 2, inline
  1511  	LDy(rsqr)
  1512  	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2
  1513  
  1514  	MOVD	x0, y0
  1515  	MOVD	x1, y1
  1516  	MOVD	x2, y2
  1517  	MOVD	x3, y3
  1518  	LDx(hcub)
  1519  	CALL	p256SubInternal<>(SB)    // x3 = r^2 - u1 * h^2 * 2 - h^3
  1520  	STx(x3out)
  1521  
  1522  	LDy(u2)
  1523  	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3
  1524  
  1525  	LDy(r)
  1526  	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)
  1527  
  1528  	LDx(s2)
  1529  	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
  1530  	STx(y3out)
  1531  
  1532  	MOVD	hlp1, R0    // hlp1 is R0 already, so this is a self-move
  1533  	MOVD	R0, ret+24(FP)
  1534  
  1535  	RET
  1536  

View as plain text