Text file src/crypto/internal/nistec/p256_asm_amd64.s

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  // This file contains constant-time, 64-bit assembly implementation of
     8  // P256. The optimizations performed here are described in detail in:
     9  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
    10  //                          256-bit primes"
    11  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
    12  // https://eprint.iacr.org/2013/816.pdf
    13  
    14  #include "textflag.h"
    15  
    16  #define res_ptr DI
    17  #define x_ptr SI
    18  #define y_ptr CX
        // res_ptr, x_ptr and y_ptr are the registers into which the routines
        // below load their result pointer and their first and second input
        // pointers. Several routines reuse x_ptr and/or y_ptr as plain scratch
        // registers once the pointer they held is no longer needed.
    19  
    20  #define acc0 R8
    21  #define acc1 R9
    22  #define acc2 R10
    23  #define acc3 R11
    24  #define acc4 R12
    25  #define acc5 R13
        // acc0..acc5 hold 64-bit limbs of the value being accumulated.
    26  #define t0 R14
    27  #define t1 R15
        // t0 and t1 are temporaries: typically the current multiplier limb and a
        // carry word or saved copy.
    28  
    29  DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
    30  DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
        // p256const0 and p256const1 are the second and fourth 64-bit limbs
        // (least significant limb first) of the modulus that the field routines
        // in this file add or subtract; the first and third limbs are supplied
        // as the immediates $-1 and $0. p256const1 is also the multiplier used
        // in every reduction step of p256Sqr, p256Mul and p256FromMont.
    31  DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
        // p256ordK0 multiplies the lowest live limb in each reduction step of
        // p256OrdMul and p256OrdSqr; the low 64 bits of that product are then
        // multiplied by the limbs of p256ord.
    32  DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
    33  DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
    34  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    35  DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
        // p256ord is the four-limb modulus (least significant limb at offset 0)
        // used by p256OrdMul and p256OrdSqr.
    36  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    37  DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
    38  DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
    39  DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
        // The four limbs of p256one equal 2^256 minus the field modulus described
        // above. It is not referenced in this part of the file.
        // NOTE(review): presumably used as the value 1 in Montgomery form by
        // code further down — confirm.
    40  GLOBL p256const0<>(SB), 8, $8
    41  GLOBL p256const1<>(SB), 8, $8
    42  GLOBL p256ordK0<>(SB), 8, $8
    43  GLOBL p256ord<>(SB), 8, $32
    44  GLOBL p256one<>(SB), 8, $32
        // Symbol sizes are 8 bytes for the single-limb constants and 32 bytes
        // for the four-limb ones. NOTE(review): the flag value 8 is presumably
        // RODATA from textflag.h — confirm.
    45  
    46  /* ---------------------------------------*/
    47  // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
        // Shares the body of p256BigToLittle via a tail jump: no frame is set up
        // here, so the target reads this call's res and in arguments. The shared
        // body reverses all 32 bytes, an operation that is its own inverse, so
        // the same code converts in either direction.
    48  TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
    49  	JMP ·p256BigToLittle(SB)
    50  /* ---------------------------------------*/
    51  // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
        // Tail-jumps to p256BigToLittle, which performs the 32-byte reversal on
        // this call's res and in arguments; nothing else is done here.
    52  TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
    53  	JMP ·p256BigToLittle(SB)
    54  /* ---------------------------------------*/
    55  // func p256LittleToBig(res *[32]byte, in *p256Element)
        // Tail-jumps to p256BigToLittle. That routine reverses the order of all
        // 32 bytes, which undoes itself, so it also serves as the inverse
        // conversion.
    56  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    57  	JMP ·p256BigToLittle(SB)
    58  /* ---------------------------------------*/
    59  // func p256BigToLittle(res *p256Element, in *[32]byte)
        // Reads the 32 input bytes as four 64-bit words, byte-swaps each word and
        // stores the words in reverse order; the net effect is a reversal of all
        // 32 bytes. All four words are loaded before the first store, so the
        // routine also works when res and in are the same address.
        // Uses DI, SI and R8-R11 (acc0..acc3). Also the jump target of the three
        // conversion stubs above.
    60  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    61  	MOVQ res+0(FP), res_ptr
    62  	MOVQ in+8(FP), x_ptr
    63  
    64  	MOVQ (8*0)(x_ptr), acc0
    65  	MOVQ (8*1)(x_ptr), acc1
    66  	MOVQ (8*2)(x_ptr), acc2
    67  	MOVQ (8*3)(x_ptr), acc3
    68  
        	// Reverse the bytes inside each 64-bit word.
    69  	BSWAPQ acc0
    70  	BSWAPQ acc1
    71  	BSWAPQ acc2
    72  	BSWAPQ acc3
    73  
        	// Reverse the word order: input word 3 becomes output word 0, etc.
    74  	MOVQ acc3, (8*0)(res_ptr)
    75  	MOVQ acc2, (8*1)(res_ptr)
    76  	MOVQ acc1, (8*2)(res_ptr)
    77  	MOVQ acc0, (8*3)(res_ptr)
    78  
    79  	RET
    80  /* ---------------------------------------*/
    81  // func p256MovCond(res, a, b *P256Point, cond int)
        // Branch-free select over 96 bytes (six 16-byte lanes):
        //   cond == 0  ->  *res = *b
        //   cond != 0  ->  *res = *a
        // Only the low 32 bits of cond take part in the test, because PSHUFD $0
        // broadcasts the low doubleword and PCMPEQL compares doublewords.
        // Both a and b are always read in full and res is always written.
        // Uses DI, SI, CX and X0-X13.
    82  TEXT ·p256MovCond(SB),NOSPLIT,$0
    83  	MOVQ res+0(FP), res_ptr
    84  	MOVQ a+8(FP), x_ptr
    85  	MOVQ b+16(FP), y_ptr
    86  	MOVQ cond+24(FP), X12
    87  
        	// X12 = all ones if the low doubleword of cond is 0, else all zeros.
    88  	PXOR X13, X13
    89  	PSHUFD $0, X12, X12
    90  	PCMPEQL X13, X12
    91  
        	// X0..X5 = a AND NOT mask (PANDN inverts its destination operand, which
        	// was preloaded with the mask), i.e. a when cond != 0, zero otherwise.
    92  	MOVOU X12, X0
    93  	MOVOU (16*0)(x_ptr), X6
    94  	PANDN X6, X0
    95  	MOVOU X12, X1
    96  	MOVOU (16*1)(x_ptr), X7
    97  	PANDN X7, X1
    98  	MOVOU X12, X2
    99  	MOVOU (16*2)(x_ptr), X8
   100  	PANDN X8, X2
   101  	MOVOU X12, X3
   102  	MOVOU (16*3)(x_ptr), X9
   103  	PANDN X9, X3
   104  	MOVOU X12, X4
   105  	MOVOU (16*4)(x_ptr), X10
   106  	PANDN X10, X4
   107  	MOVOU X12, X5
   108  	MOVOU (16*5)(x_ptr), X11
   109  	PANDN X11, X5
   110  
   111  	MOVOU (16*0)(y_ptr), X6
   112  	MOVOU (16*1)(y_ptr), X7
   113  	MOVOU (16*2)(y_ptr), X8
   114  	MOVOU (16*3)(y_ptr), X9
   115  	MOVOU (16*4)(y_ptr), X10
   116  	MOVOU (16*5)(y_ptr), X11
   117  
        	// X6..X11 = b AND mask, i.e. b when cond == 0, zero otherwise.
   118  	PAND X12, X6
   119  	PAND X12, X7
   120  	PAND X12, X8
   121  	PAND X12, X9
   122  	PAND X12, X10
   123  	PAND X12, X11
   124  
        	// Exactly one of the two masked values is non-zero per lane, so XOR
        	// merges them into the selected input.
   125  	PXOR X6, X0
   126  	PXOR X7, X1
   127  	PXOR X8, X2
   128  	PXOR X9, X3
   129  	PXOR X10, X4
   130  	PXOR X11, X5
   131  
   132  	MOVOU X0, (16*0)(res_ptr)
   133  	MOVOU X1, (16*1)(res_ptr)
   134  	MOVOU X2, (16*2)(res_ptr)
   135  	MOVOU X3, (16*3)(res_ptr)
   136  	MOVOU X4, (16*4)(res_ptr)
   137  	MOVOU X5, (16*5)(res_ptr)
   138  
   139  	RET
   140  /* ---------------------------------------*/
   141  // func p256NegCond(val *p256Element, cond int)
        // In place, branch-free: if cond != 0, *val becomes (modulus - *val);
        // if cond == 0, *val is rewritten with its original limbs. The
        // subtraction is always computed and the outcome is picked with CMOVQEQ.
        // There is no reduction after the subtraction, so an input of 0 with
        // cond != 0 yields the modulus itself.
        // x_ptr and y_ptr are used here as plain data registers, not pointers.
        // Uses DI, SI, CX, R8-R11 (acc0..acc3), R13 (acc5), R14, R15.
   142  TEXT ·p256NegCond(SB),NOSPLIT,$0
   143  	MOVQ val+0(FP), res_ptr
   144  	MOVQ cond+8(FP), t0
   145  	// acc = poly
   146  	MOVQ $-1, acc0
   147  	MOVQ p256const0<>(SB), acc1
   148  	MOVQ $0, acc2
   149  	MOVQ p256const1<>(SB), acc3
   150  	// Load the original value
   151  	MOVQ (8*0)(res_ptr), acc5
   152  	MOVQ (8*1)(res_ptr), x_ptr
   153  	MOVQ (8*2)(res_ptr), y_ptr
   154  	MOVQ (8*3)(res_ptr), t1
   155  	// Speculatively subtract
   156  	SUBQ acc5, acc0
   157  	SBBQ x_ptr, acc1
   158  	SBBQ y_ptr, acc2
   159  	SBBQ t1, acc3
   160  	// If condition is 0, keep original value
   161  	TESTQ t0, t0
   162  	CMOVQEQ acc5, acc0
   163  	CMOVQEQ x_ptr, acc1
   164  	CMOVQEQ y_ptr, acc2
   165  	CMOVQEQ t1, acc3
   166  	// Store result
   167  	MOVQ acc0, (8*0)(res_ptr)
   168  	MOVQ acc1, (8*1)(res_ptr)
   169  	MOVQ acc2, (8*2)(res_ptr)
   170  	MOVQ acc3, (8*3)(res_ptr)
   171  
   172  	RET
   173  /* ---------------------------------------*/
   174  // func p256Sqr(res, in *p256Element, n int)
        // Performs n successive Montgomery squarings modulo the field modulus
        // (limbs -1, p256const0, 0, p256const1). The first iteration squares *in;
        // every later iteration squares the value just stored in *res, because
        // x_ptr is re-pointed at res_ptr at the bottom of the loop.
        //
        // The loop test is at the bottom (DECQ BX / JNE), so the body always runs
        // at least once; n == 0 is not handled and would wrap the counter.
        // The only branch depends on n, never on limb values.
        //
        // Register use: BX = remaining iterations; acc0..acc5, y_ptr and x_ptr
        // hold the eight limbs of the 512-bit square (x_ptr and y_ptr stop being
        // pointers part-way through each iteration); AX/DX are the MULQ pair;
        // t0/t1 are temporaries.
   175  TEXT ·p256Sqr(SB),NOSPLIT,$0
   176  	MOVQ res+0(FP), res_ptr
   177  	MOVQ in+8(FP), x_ptr
   178  	MOVQ n+16(FP), BX
   179  
   180  sqrLoop:
   181  
        	// Off-diagonal products y[i]*y[j] with i < j are computed once here
        	// and doubled below.
   182  	// y[1:] * y[0]
   183  	MOVQ (8*0)(x_ptr), t0
   184  
   185  	MOVQ (8*1)(x_ptr), AX
   186  	MULQ t0
   187  	MOVQ AX, acc1
   188  	MOVQ DX, acc2
   189  
   190  	MOVQ (8*2)(x_ptr), AX
   191  	MULQ t0
   192  	ADDQ AX, acc2
   193  	ADCQ $0, DX
   194  	MOVQ DX, acc3
   195  
   196  	MOVQ (8*3)(x_ptr), AX
   197  	MULQ t0
   198  	ADDQ AX, acc3
   199  	ADCQ $0, DX
   200  	MOVQ DX, acc4
   201  	// y[2:] * y[1]
   202  	MOVQ (8*1)(x_ptr), t0
   203  
   204  	MOVQ (8*2)(x_ptr), AX
   205  	MULQ t0
   206  	ADDQ AX, acc3
   207  	ADCQ $0, DX
   208  	MOVQ DX, t1
   209  
   210  	MOVQ (8*3)(x_ptr), AX
   211  	MULQ t0
   212  	ADDQ t1, acc4
   213  	ADCQ $0, DX
   214  	ADDQ AX, acc4
   215  	ADCQ $0, DX
   216  	MOVQ DX, acc5
   217  	// y[3] * y[2]
   218  	MOVQ (8*2)(x_ptr), t0
   219  
   220  	MOVQ (8*3)(x_ptr), AX
   221  	MULQ t0
   222  	ADDQ AX, acc5
   223  	ADCQ $0, DX
        	// y_ptr is used as a data register (limb 6) from here on.
   224  	MOVQ DX, y_ptr
   225  	XORQ t1, t1
   226  	// *2
   227  	ADDQ acc1, acc1
   228  	ADCQ acc2, acc2
   229  	ADCQ acc3, acc3
   230  	ADCQ acc4, acc4
   231  	ADCQ acc5, acc5
   232  	ADCQ y_ptr, y_ptr
   233  	ADCQ $0, t1
        	// The diagonal terms y[i]*y[i] are added next.
   234  	// Missing products
   235  	MOVQ (8*0)(x_ptr), AX
   236  	MULQ AX
   237  	MOVQ AX, acc0
   238  	MOVQ DX, t0
   239  
   240  	MOVQ (8*1)(x_ptr), AX
   241  	MULQ AX
   242  	ADDQ t0, acc1
   243  	ADCQ AX, acc2
   244  	ADCQ $0, DX
   245  	MOVQ DX, t0
   246  
   247  	MOVQ (8*2)(x_ptr), AX
   248  	MULQ AX
   249  	ADDQ t0, acc3
   250  	ADCQ AX, acc4
   251  	ADCQ $0, DX
   252  	MOVQ DX, t0
   253  
   254  	MOVQ (8*3)(x_ptr), AX
   255  	MULQ AX
   256  	ADDQ t0, acc5
   257  	ADCQ AX, y_ptr
   258  	ADCQ DX, t1
        	// The input pointer is no longer needed in this iteration: x_ptr now
        	// holds the top limb (limb 7) of the square.
   259  	MOVQ t1, x_ptr
        	// Each reduction step adds (lowest limb * modulus) to the running
        	// value so that the lowest limb cancels, then reuses that limb's
        	// register for the new top limb. Adding limb*modulus and dropping
        	// the cancelled limb amounts to adding limb<<32 one limb up (split
        	// into SHLQ/SHRQ halves) and limb*p256const1 three limbs up.
   260  	// First reduction step
   261  	MOVQ acc0, AX
   262  	MOVQ acc0, t1
   263  	SHLQ $32, acc0
   264  	MULQ p256const1<>(SB)
   265  	SHRQ $32, t1
   266  	ADDQ acc0, acc1
   267  	ADCQ t1, acc2
   268  	ADCQ AX, acc3
   269  	ADCQ $0, DX
   270  	MOVQ DX, acc0
   271  	// Second reduction step
   272  	MOVQ acc1, AX
   273  	MOVQ acc1, t1
   274  	SHLQ $32, acc1
   275  	MULQ p256const1<>(SB)
   276  	SHRQ $32, t1
   277  	ADDQ acc1, acc2
   278  	ADCQ t1, acc3
   279  	ADCQ AX, acc0
   280  	ADCQ $0, DX
   281  	MOVQ DX, acc1
   282  	// Third reduction step
   283  	MOVQ acc2, AX
   284  	MOVQ acc2, t1
   285  	SHLQ $32, acc2
   286  	MULQ p256const1<>(SB)
   287  	SHRQ $32, t1
   288  	ADDQ acc2, acc3
   289  	ADCQ t1, acc0
   290  	ADCQ AX, acc1
   291  	ADCQ $0, DX
   292  	MOVQ DX, acc2
   293  	// Last reduction step
        	// t0 is zeroed here so that it can collect the carry out of the
        	// 256-bit sum further down.
   294  	XORQ t0, t0
   295  	MOVQ acc3, AX
   296  	MOVQ acc3, t1
   297  	SHLQ $32, acc3
   298  	MULQ p256const1<>(SB)
   299  	SHRQ $32, t1
   300  	ADDQ acc3, acc0
   301  	ADCQ t1, acc1
   302  	ADCQ AX, acc2
   303  	ADCQ $0, DX
   304  	MOVQ DX, acc3
        	// The first ADCQ below consumes the carry flag left by "ADCQ $0, DX"
        	// (MOVQ does not change flags). The high half of a 64x64-bit product
        	// is at most 2^64-2, so that instruction cannot carry and the flag is
        	// clear here.
   305  	// Add bits [511:256] of the sqr result
   306  	ADCQ acc4, acc0
   307  	ADCQ acc5, acc1
   308  	ADCQ y_ptr, acc2
   309  	ADCQ x_ptr, acc3
   310  	ADCQ $0, t0
   311  
        	// Keep a copy of the unreduced value for the conditional restore.
   312  	MOVQ acc0, acc4
   313  	MOVQ acc1, acc5
   314  	MOVQ acc2, y_ptr
   315  	MOVQ acc3, t1
   316  	// Subtract p256
   317  	SUBQ $-1, acc0
   318  	SBBQ p256const0<>(SB) ,acc1
   319  	SBBQ $0, acc2
   320  	SBBQ p256const1<>(SB), acc3
   321  	SBBQ $0, t0
   322  
        	// A borrow out of the fifth limb (t0) means the value was already
        	// below the modulus: restore the saved copy.
   323  	CMOVQCS acc4, acc0
   324  	CMOVQCS acc5, acc1
   325  	CMOVQCS y_ptr, acc2
   326  	CMOVQCS t1, acc3
   327  
   328  	MOVQ acc0, (8*0)(res_ptr)
   329  	MOVQ acc1, (8*1)(res_ptr)
   330  	MOVQ acc2, (8*2)(res_ptr)
   331  	MOVQ acc3, (8*3)(res_ptr)
        	// Next iteration squares the result just stored.
   332  	MOVQ res_ptr, x_ptr
   333  	DECQ BX
   334  	JNE  sqrLoop
   335  
   336  	RET
   337  /* ---------------------------------------*/
   338  // func p256Mul(res, in1, in2 *p256Element)
        // Montgomery multiplication modulo the field modulus (limbs -1,
        // p256const0, 0, p256const1): multiplication passes "x * y[i]" alternate
        // with reduction steps, followed by one conditional subtraction of the
        // modulus. Straight-line code with no branches.
        //
        // The six accumulator registers acc0..acc5 are used as a rotating window:
        // each reduction step cancels the lowest live limb and its register is
        // zeroed and reused as the new top (carry) limb. After the last step the
        // result sits in acc4, acc5, acc0, acc1 with acc2 as the carry limb.
        // x_ptr is overwritten with a saved limb once the inputs have been
        // consumed. Uses AX, DX, DI, SI, CX and R8-R15.
   339  TEXT ·p256Mul(SB),NOSPLIT,$0
   340  	MOVQ res+0(FP), res_ptr
   341  	MOVQ in1+8(FP), x_ptr
   342  	MOVQ in2+16(FP), y_ptr
   343  	// x * y[0]
   344  	MOVQ (8*0)(y_ptr), t0
   345  
   346  	MOVQ (8*0)(x_ptr), AX
   347  	MULQ t0
   348  	MOVQ AX, acc0
   349  	MOVQ DX, acc1
   350  
   351  	MOVQ (8*1)(x_ptr), AX
   352  	MULQ t0
   353  	ADDQ AX, acc1
   354  	ADCQ $0, DX
   355  	MOVQ DX, acc2
   356  
   357  	MOVQ (8*2)(x_ptr), AX
   358  	MULQ t0
   359  	ADDQ AX, acc2
   360  	ADCQ $0, DX
   361  	MOVQ DX, acc3
   362  
   363  	MOVQ (8*3)(x_ptr), AX
   364  	MULQ t0
   365  	ADDQ AX, acc3
   366  	ADCQ $0, DX
   367  	MOVQ DX, acc4
   368  	XORQ acc5, acc5
        	// A reduction step adds (lowest limb * modulus) and drops the limb
        	// that cancels: limb<<32 goes one limb up (split into SHLQ/SHRQ
        	// halves) and limb*p256const1 goes three limbs up.
   369  	// First reduction step
   370  	MOVQ acc0, AX
   371  	MOVQ acc0, t1
   372  	SHLQ $32, acc0
   373  	MULQ p256const1<>(SB)
   374  	SHRQ $32, t1
   375  	ADDQ acc0, acc1
   376  	ADCQ t1, acc2
   377  	ADCQ AX, acc3
   378  	ADCQ DX, acc4
   379  	ADCQ $0, acc5
   380  	XORQ acc0, acc0
   381  	// x * y[1]
   382  	MOVQ (8*1)(y_ptr), t0
   383  
   384  	MOVQ (8*0)(x_ptr), AX
   385  	MULQ t0
   386  	ADDQ AX, acc1
   387  	ADCQ $0, DX
   388  	MOVQ DX, t1
   389  
   390  	MOVQ (8*1)(x_ptr), AX
   391  	MULQ t0
   392  	ADDQ t1, acc2
   393  	ADCQ $0, DX
   394  	ADDQ AX, acc2
   395  	ADCQ $0, DX
   396  	MOVQ DX, t1
   397  
   398  	MOVQ (8*2)(x_ptr), AX
   399  	MULQ t0
   400  	ADDQ t1, acc3
   401  	ADCQ $0, DX
   402  	ADDQ AX, acc3
   403  	ADCQ $0, DX
   404  	MOVQ DX, t1
   405  
   406  	MOVQ (8*3)(x_ptr), AX
   407  	MULQ t0
   408  	ADDQ t1, acc4
   409  	ADCQ $0, DX
   410  	ADDQ AX, acc4
   411  	ADCQ DX, acc5
   412  	ADCQ $0, acc0
   413  	// Second reduction step
   414  	MOVQ acc1, AX
   415  	MOVQ acc1, t1
   416  	SHLQ $32, acc1
   417  	MULQ p256const1<>(SB)
   418  	SHRQ $32, t1
   419  	ADDQ acc1, acc2
   420  	ADCQ t1, acc3
   421  	ADCQ AX, acc4
   422  	ADCQ DX, acc5
   423  	ADCQ $0, acc0
   424  	XORQ acc1, acc1
   425  	// x * y[2]
   426  	MOVQ (8*2)(y_ptr), t0
   427  
   428  	MOVQ (8*0)(x_ptr), AX
   429  	MULQ t0
   430  	ADDQ AX, acc2
   431  	ADCQ $0, DX
   432  	MOVQ DX, t1
   433  
   434  	MOVQ (8*1)(x_ptr), AX
   435  	MULQ t0
   436  	ADDQ t1, acc3
   437  	ADCQ $0, DX
   438  	ADDQ AX, acc3
   439  	ADCQ $0, DX
   440  	MOVQ DX, t1
   441  
   442  	MOVQ (8*2)(x_ptr), AX
   443  	MULQ t0
   444  	ADDQ t1, acc4
   445  	ADCQ $0, DX
   446  	ADDQ AX, acc4
   447  	ADCQ $0, DX
   448  	MOVQ DX, t1
   449  
   450  	MOVQ (8*3)(x_ptr), AX
   451  	MULQ t0
   452  	ADDQ t1, acc5
   453  	ADCQ $0, DX
   454  	ADDQ AX, acc5
   455  	ADCQ DX, acc0
   456  	ADCQ $0, acc1
   457  	// Third reduction step
   458  	MOVQ acc2, AX
   459  	MOVQ acc2, t1
   460  	SHLQ $32, acc2
   461  	MULQ p256const1<>(SB)
   462  	SHRQ $32, t1
   463  	ADDQ acc2, acc3
   464  	ADCQ t1, acc4
   465  	ADCQ AX, acc5
   466  	ADCQ DX, acc0
   467  	ADCQ $0, acc1
   468  	XORQ acc2, acc2
   469  	// x * y[3]
   470  	MOVQ (8*3)(y_ptr), t0
   471  
   472  	MOVQ (8*0)(x_ptr), AX
   473  	MULQ t0
   474  	ADDQ AX, acc3
   475  	ADCQ $0, DX
   476  	MOVQ DX, t1
   477  
   478  	MOVQ (8*1)(x_ptr), AX
   479  	MULQ t0
   480  	ADDQ t1, acc4
   481  	ADCQ $0, DX
   482  	ADDQ AX, acc4
   483  	ADCQ $0, DX
   484  	MOVQ DX, t1
   485  
   486  	MOVQ (8*2)(x_ptr), AX
   487  	MULQ t0
   488  	ADDQ t1, acc5
   489  	ADCQ $0, DX
   490  	ADDQ AX, acc5
   491  	ADCQ $0, DX
   492  	MOVQ DX, t1
   493  
   494  	MOVQ (8*3)(x_ptr), AX
   495  	MULQ t0
   496  	ADDQ t1, acc0
   497  	ADCQ $0, DX
   498  	ADDQ AX, acc0
   499  	ADCQ DX, acc1
   500  	ADCQ $0, acc2
   501  	// Last reduction step
   502  	MOVQ acc3, AX
   503  	MOVQ acc3, t1
   504  	SHLQ $32, acc3
   505  	MULQ p256const1<>(SB)
   506  	SHRQ $32, t1
   507  	ADDQ acc3, acc4
   508  	ADCQ t1, acc5
   509  	ADCQ AX, acc0
   510  	ADCQ DX, acc1
   511  	ADCQ $0, acc2
        	// Save the unreduced limbs; x_ptr and acc3 are free to hold copies
        	// because the inputs and the old acc3 are no longer needed.
   512  	// Copy result [255:0]
   513  	MOVQ acc4, x_ptr
   514  	MOVQ acc5, acc3
   515  	MOVQ acc0, t0
   516  	MOVQ acc1, t1
   517  	// Subtract p256
   518  	SUBQ $-1, acc4
   519  	SBBQ p256const0<>(SB) ,acc5
   520  	SBBQ $0, acc0
   521  	SBBQ p256const1<>(SB), acc1
   522  	SBBQ $0, acc2
   523  
        	// A borrow out of the carry limb (acc2) means the value was already
        	// below the modulus: restore the saved copy.
   524  	CMOVQCS x_ptr, acc4
   525  	CMOVQCS acc3, acc5
   526  	CMOVQCS t0, acc0
   527  	CMOVQCS t1, acc1
   528  
   529  	MOVQ acc4, (8*0)(res_ptr)
   530  	MOVQ acc5, (8*1)(res_ptr)
   531  	MOVQ acc0, (8*2)(res_ptr)
   532  	MOVQ acc1, (8*3)(res_ptr)
   533  
   534  	RET
   535  /* ---------------------------------------*/
   536  // func p256FromMont(res, in *p256Element)
        // Applies four reduction steps to *in with no multiplication passes and
        // then conditionally subtracts the modulus once. Each step cancels the
        // lowest limb by adding a multiple of the modulus and drops it, so the
        // four steps together divide by 2^256 modulo the field modulus, which is
        // the conversion out of Montgomery form. Straight-line, no branches.
        //
        // Unlike p256Mul, no carry limb is tracked beyond the fourth limb; the
        // final selection uses only the borrow of the four-limb subtraction.
        // x_ptr is reused as a data register after the input has been loaded.
        // Uses AX, DX, DI, SI and R8-R15.
   537  TEXT ·p256FromMont(SB),NOSPLIT,$0
   538  	MOVQ res+0(FP), res_ptr
   539  	MOVQ in+8(FP), x_ptr
   540  
   541  	MOVQ (8*0)(x_ptr), acc0
   542  	MOVQ (8*1)(x_ptr), acc1
   543  	MOVQ (8*2)(x_ptr), acc2
   544  	MOVQ (8*3)(x_ptr), acc3
   545  	XORQ acc4, acc4
   546  
   547  	// Only reduce, no multiplications are needed
   548  	// First stage
   549  	MOVQ acc0, AX
   550  	MOVQ acc0, t1
   551  	SHLQ $32, acc0
   552  	MULQ p256const1<>(SB)
   553  	SHRQ $32, t1
   554  	ADDQ acc0, acc1
   555  	ADCQ t1, acc2
   556  	ADCQ AX, acc3
   557  	ADCQ DX, acc4
   558  	XORQ acc5, acc5
   559  	// Second stage
   560  	MOVQ acc1, AX
   561  	MOVQ acc1, t1
   562  	SHLQ $32, acc1
   563  	MULQ p256const1<>(SB)
   564  	SHRQ $32, t1
   565  	ADDQ acc1, acc2
   566  	ADCQ t1, acc3
   567  	ADCQ AX, acc4
   568  	ADCQ DX, acc5
   569  	XORQ acc0, acc0
   570  	// Third stage
   571  	MOVQ acc2, AX
   572  	MOVQ acc2, t1
   573  	SHLQ $32, acc2
   574  	MULQ p256const1<>(SB)
   575  	SHRQ $32, t1
   576  	ADDQ acc2, acc3
   577  	ADCQ t1, acc4
   578  	ADCQ AX, acc5
   579  	ADCQ DX, acc0
   580  	XORQ acc1, acc1
   581  	// Last stage
   582  	MOVQ acc3, AX
   583  	MOVQ acc3, t1
   584  	SHLQ $32, acc3
   585  	MULQ p256const1<>(SB)
   586  	SHRQ $32, t1
   587  	ADDQ acc3, acc4
   588  	ADCQ t1, acc5
   589  	ADCQ AX, acc0
   590  	ADCQ DX, acc1
   591  
        	// Result limbs are acc4, acc5, acc0, acc1; keep a copy for the
        	// conditional restore below.
   592  	MOVQ acc4, x_ptr
   593  	MOVQ acc5, acc3
   594  	MOVQ acc0, t0
   595  	MOVQ acc1, t1
   596  
        	// Trial subtraction of the modulus.
   597  	SUBQ $-1, acc4
   598  	SBBQ p256const0<>(SB), acc5
   599  	SBBQ $0, acc0
   600  	SBBQ p256const1<>(SB), acc1
   601  
        	// On borrow the value was already below the modulus: restore it.
   602  	CMOVQCS x_ptr, acc4
   603  	CMOVQCS acc3, acc5
   604  	CMOVQCS t0, acc0
   605  	CMOVQCS t1, acc1
   606  
   607  	MOVQ acc4, (8*0)(res_ptr)
   608  	MOVQ acc5, (8*1)(res_ptr)
   609  	MOVQ acc0, (8*2)(res_ptr)
   610  	MOVQ acc1, (8*3)(res_ptr)
   611  
   612  	RET
   613  /* ---------------------------------------*/
   614  // func p256Select(res *P256Point, table *p256Table, idx int)
        // Copies one 96-byte table entry to *res without an idx-dependent memory
        // access pattern: all 16 entries are read, each is ANDed with an
        // all-ones or all-zeros mask, and the masked values are XORed together.
        // Entries are numbered from 1: entry k is selected when the low 32 bits
        // of idx equal k (MOVL takes only the low doubleword of idx). If no
        // counter value in 1..16 matches, e.g. idx == 0, *res is set to all zero
        // bytes.
        // Registers: DI = table cursor, DX = res, AX = idx and then the loop
        // counter; X0-X5 = accumulated result, X6-X11 = current entry,
        // X12 = mask, X13 = running entry number, X14 = broadcast idx,
        // X15 = constant 1 in every doubleword lane.
   615  TEXT ·p256Select(SB),NOSPLIT,$0
   616  	MOVQ idx+16(FP),AX
   617  	MOVQ table+8(FP),DI
   618  	MOVQ res+0(FP),DX
   619  
   620  	PXOR X15, X15	// X15 = 0
   621  	PCMPEQL X14, X14 // X14 = -1
   622  	PSUBL X14, X15   // X15 = 1
   623  	MOVL AX, X14
   624  	PSHUFD $0, X14, X14
   625  
   626  	PXOR X0, X0
   627  	PXOR X1, X1
   628  	PXOR X2, X2
   629  	PXOR X3, X3
   630  	PXOR X4, X4
   631  	PXOR X5, X5
        	// 16 table entries are scanned.
   632  	MOVQ $16, AX
   633  
        	// The entry counter starts at 1.
   634  	MOVOU X15, X13
   635  
   636  loop_select:
   637  
        		// X12 = all ones if the current entry number equals idx;
        		// X13 advances to the next entry number.
   638  		MOVOU X13, X12
   639  		PADDL X15, X13
   640  		PCMPEQL X14, X12
   641  
   642  		MOVOU (16*0)(DI), X6
   643  		MOVOU (16*1)(DI), X7
   644  		MOVOU (16*2)(DI), X8
   645  		MOVOU (16*3)(DI), X9
   646  		MOVOU (16*4)(DI), X10
   647  		MOVOU (16*5)(DI), X11
   648  		ADDQ $(16*6), DI
   649  
   650  		PAND X12, X6
   651  		PAND X12, X7
   652  		PAND X12, X8
   653  		PAND X12, X9
   654  		PAND X12, X10
   655  		PAND X12, X11
   656  
   657  		PXOR X6, X0
   658  		PXOR X7, X1
   659  		PXOR X8, X2
   660  		PXOR X9, X3
   661  		PXOR X10, X4
   662  		PXOR X11, X5
   663  
   664  		DECQ AX
   665  		JNE loop_select
   666  
   667  	MOVOU X0, (16*0)(DX)
   668  	MOVOU X1, (16*1)(DX)
   669  	MOVOU X2, (16*2)(DX)
   670  	MOVOU X3, (16*3)(DX)
   671  	MOVOU X4, (16*4)(DX)
   672  	MOVOU X5, (16*5)(DX)
   673  
   674  	RET
   675  /* ---------------------------------------*/
   676  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
        // Copies one 64-byte table entry to *res without an idx-dependent memory
        // access pattern. The loop runs 16 times and handles two consecutive
        // entries per iteration (128 bytes), so 32 entries are read in total.
        // Entries are numbered from 1: entry k is selected when the low 32 bits
        // of idx equal k. If no counter value in 1..32 matches, e.g. idx == 0,
        // *res is set to all zero bytes.
        // Registers: DI = table cursor, DX = res, AX = idx and then the loop
        // counter; X0-X3 = accumulated result, X4-X7 / X8-X11 = the two entries
        // of the current iteration, X12 = mask, X13 = running entry number,
        // X14 = broadcast idx, X15 = constant 1 in every doubleword lane.
   677  TEXT ·p256SelectAffine(SB),NOSPLIT,$0
   678  	MOVQ idx+16(FP),AX
   679  	MOVQ table+8(FP),DI
   680  	MOVQ res+0(FP),DX
   681  
   682  	PXOR X15, X15	// X15 = 0
   683  	PCMPEQL X14, X14 // X14 = -1
   684  	PSUBL X14, X15   // X15 = 1
   685  	MOVL AX, X14
   686  	PSHUFD $0, X14, X14
   687  
   688  	PXOR X0, X0
   689  	PXOR X1, X1
   690  	PXOR X2, X2
   691  	PXOR X3, X3
        	// 16 iterations, two entries each.
   692  	MOVQ $16, AX
   693  
        	// The entry counter starts at 1.
   694  	MOVOU X15, X13
   695  
   696  loop_select_base:
   697  
        		// Mask for the first entry of the pair.
   698  		MOVOU X13, X12
   699  		PADDL X15, X13
   700  		PCMPEQL X14, X12
   701  
   702  		MOVOU (16*0)(DI), X4
   703  		MOVOU (16*1)(DI), X5
   704  		MOVOU (16*2)(DI), X6
   705  		MOVOU (16*3)(DI), X7
   706  
   707  		MOVOU (16*4)(DI), X8
   708  		MOVOU (16*5)(DI), X9
   709  		MOVOU (16*6)(DI), X10
   710  		MOVOU (16*7)(DI), X11
   711  
   712  		ADDQ $(16*8), DI
   713  
   714  		PAND X12, X4
   715  		PAND X12, X5
   716  		PAND X12, X6
   717  		PAND X12, X7
   718  
        		// Mask for the second entry of the pair.
   719  		MOVOU X13, X12
   720  		PADDL X15, X13
   721  		PCMPEQL X14, X12
   722  
   723  		PAND X12, X8
   724  		PAND X12, X9
   725  		PAND X12, X10
   726  		PAND X12, X11
   727  
   728  		PXOR X4, X0
   729  		PXOR X5, X1
   730  		PXOR X6, X2
   731  		PXOR X7, X3
   732  
   733  		PXOR X8, X0
   734  		PXOR X9, X1
   735  		PXOR X10, X2
   736  		PXOR X11, X3
   737  
   738  		DECQ AX
   739  		JNE loop_select_base
   740  
   741  	MOVOU X0, (16*0)(DX)
   742  	MOVOU X1, (16*1)(DX)
   743  	MOVOU X2, (16*2)(DX)
   744  	MOVOU X3, (16*3)(DX)
   745  
   746  	RET
   747  /* ---------------------------------------*/
   748  // func p256OrdMul(res, in1, in2 *p256OrdElement)
        // Montgomery multiplication modulo the four-limb constant p256ord:
        // multiplication passes "x * y[i]" alternate with reduction steps,
        // followed by one conditional subtraction of p256ord. Straight-line code
        // with no branches.
        //
        // Each reduction step computes t0 = low 64 bits of (lowest live limb *
        // p256ordK0) and adds t0 * p256ord to the running value. The ADDQ into
        // the lowest limb is relied on to leave that register at zero, so it is
        // reused as the new top (carry) limb without being cleared explicitly.
        // After the last step the result sits in acc4, acc5, acc0, acc1 with
        // acc2 as the carry limb.
        // Uses AX, DX, DI, SI, CX and R8-R15.
   749  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   750  	MOVQ res+0(FP), res_ptr
   751  	MOVQ in1+8(FP), x_ptr
   752  	MOVQ in2+16(FP), y_ptr
   753  	// x * y[0]
   754  	MOVQ (8*0)(y_ptr), t0
   755  
   756  	MOVQ (8*0)(x_ptr), AX
   757  	MULQ t0
   758  	MOVQ AX, acc0
   759  	MOVQ DX, acc1
   760  
   761  	MOVQ (8*1)(x_ptr), AX
   762  	MULQ t0
   763  	ADDQ AX, acc1
   764  	ADCQ $0, DX
   765  	MOVQ DX, acc2
   766  
   767  	MOVQ (8*2)(x_ptr), AX
   768  	MULQ t0
   769  	ADDQ AX, acc2
   770  	ADCQ $0, DX
   771  	MOVQ DX, acc3
   772  
   773  	MOVQ (8*3)(x_ptr), AX
   774  	MULQ t0
   775  	ADDQ AX, acc3
   776  	ADCQ $0, DX
   777  	MOVQ DX, acc4
   778  	XORQ acc5, acc5
   779  	// First reduction step
   780  	MOVQ acc0, AX
   781  	MULQ p256ordK0<>(SB)
   782  	MOVQ AX, t0
   783  
   784  	MOVQ p256ord<>+0x00(SB), AX
   785  	MULQ t0
   786  	ADDQ AX, acc0
   787  	ADCQ $0, DX
   788  	MOVQ DX, t1
   789  
   790  	MOVQ p256ord<>+0x08(SB), AX
   791  	MULQ t0
   792  	ADDQ t1, acc1
   793  	ADCQ $0, DX
   794  	ADDQ AX, acc1
   795  	ADCQ $0, DX
   796  	MOVQ DX, t1
   797  
   798  	MOVQ p256ord<>+0x10(SB), AX
   799  	MULQ t0
   800  	ADDQ t1, acc2
   801  	ADCQ $0, DX
   802  	ADDQ AX, acc2
   803  	ADCQ $0, DX
   804  	MOVQ DX, t1
   805  
   806  	MOVQ p256ord<>+0x18(SB), AX
   807  	MULQ t0
   808  	ADDQ t1, acc3
   809  	ADCQ $0, DX
   810  	ADDQ AX, acc3
   811  	ADCQ DX, acc4
   812  	ADCQ $0, acc5
   813  	// x * y[1]
   814  	MOVQ (8*1)(y_ptr), t0
   815  
   816  	MOVQ (8*0)(x_ptr), AX
   817  	MULQ t0
   818  	ADDQ AX, acc1
   819  	ADCQ $0, DX
   820  	MOVQ DX, t1
   821  
   822  	MOVQ (8*1)(x_ptr), AX
   823  	MULQ t0
   824  	ADDQ t1, acc2
   825  	ADCQ $0, DX
   826  	ADDQ AX, acc2
   827  	ADCQ $0, DX
   828  	MOVQ DX, t1
   829  
   830  	MOVQ (8*2)(x_ptr), AX
   831  	MULQ t0
   832  	ADDQ t1, acc3
   833  	ADCQ $0, DX
   834  	ADDQ AX, acc3
   835  	ADCQ $0, DX
   836  	MOVQ DX, t1
   837  
   838  	MOVQ (8*3)(x_ptr), AX
   839  	MULQ t0
   840  	ADDQ t1, acc4
   841  	ADCQ $0, DX
   842  	ADDQ AX, acc4
   843  	ADCQ DX, acc5
   844  	ADCQ $0, acc0
   845  	// Second reduction step
   846  	MOVQ acc1, AX
   847  	MULQ p256ordK0<>(SB)
   848  	MOVQ AX, t0
   849  
   850  	MOVQ p256ord<>+0x00(SB), AX
   851  	MULQ t0
   852  	ADDQ AX, acc1
   853  	ADCQ $0, DX
   854  	MOVQ DX, t1
   855  
   856  	MOVQ p256ord<>+0x08(SB), AX
   857  	MULQ t0
   858  	ADDQ t1, acc2
   859  	ADCQ $0, DX
   860  	ADDQ AX, acc2
   861  	ADCQ $0, DX
   862  	MOVQ DX, t1
   863  
   864  	MOVQ p256ord<>+0x10(SB), AX
   865  	MULQ t0
   866  	ADDQ t1, acc3
   867  	ADCQ $0, DX
   868  	ADDQ AX, acc3
   869  	ADCQ $0, DX
   870  	MOVQ DX, t1
   871  
   872  	MOVQ p256ord<>+0x18(SB), AX
   873  	MULQ t0
   874  	ADDQ t1, acc4
   875  	ADCQ $0, DX
   876  	ADDQ AX, acc4
   877  	ADCQ DX, acc5
   878  	ADCQ $0, acc0
   879  	// x * y[2]
   880  	MOVQ (8*2)(y_ptr), t0
   881  
   882  	MOVQ (8*0)(x_ptr), AX
   883  	MULQ t0
   884  	ADDQ AX, acc2
   885  	ADCQ $0, DX
   886  	MOVQ DX, t1
   887  
   888  	MOVQ (8*1)(x_ptr), AX
   889  	MULQ t0
   890  	ADDQ t1, acc3
   891  	ADCQ $0, DX
   892  	ADDQ AX, acc3
   893  	ADCQ $0, DX
   894  	MOVQ DX, t1
   895  
   896  	MOVQ (8*2)(x_ptr), AX
   897  	MULQ t0
   898  	ADDQ t1, acc4
   899  	ADCQ $0, DX
   900  	ADDQ AX, acc4
   901  	ADCQ $0, DX
   902  	MOVQ DX, t1
   903  
   904  	MOVQ (8*3)(x_ptr), AX
   905  	MULQ t0
   906  	ADDQ t1, acc5
   907  	ADCQ $0, DX
   908  	ADDQ AX, acc5
   909  	ADCQ DX, acc0
   910  	ADCQ $0, acc1
   911  	// Third reduction step
   912  	MOVQ acc2, AX
   913  	MULQ p256ordK0<>(SB)
   914  	MOVQ AX, t0
   915  
   916  	MOVQ p256ord<>+0x00(SB), AX
   917  	MULQ t0
   918  	ADDQ AX, acc2
   919  	ADCQ $0, DX
   920  	MOVQ DX, t1
   921  
   922  	MOVQ p256ord<>+0x08(SB), AX
   923  	MULQ t0
   924  	ADDQ t1, acc3
   925  	ADCQ $0, DX
   926  	ADDQ AX, acc3
   927  	ADCQ $0, DX
   928  	MOVQ DX, t1
   929  
   930  	MOVQ p256ord<>+0x10(SB), AX
   931  	MULQ t0
   932  	ADDQ t1, acc4
   933  	ADCQ $0, DX
   934  	ADDQ AX, acc4
   935  	ADCQ $0, DX
   936  	MOVQ DX, t1
   937  
   938  	MOVQ p256ord<>+0x18(SB), AX
   939  	MULQ t0
   940  	ADDQ t1, acc5
   941  	ADCQ $0, DX
   942  	ADDQ AX, acc5
   943  	ADCQ DX, acc0
   944  	ADCQ $0, acc1
   945  	// x * y[3]
   946  	MOVQ (8*3)(y_ptr), t0
   947  
   948  	MOVQ (8*0)(x_ptr), AX
   949  	MULQ t0
   950  	ADDQ AX, acc3
   951  	ADCQ $0, DX
   952  	MOVQ DX, t1
   953  
   954  	MOVQ (8*1)(x_ptr), AX
   955  	MULQ t0
   956  	ADDQ t1, acc4
   957  	ADCQ $0, DX
   958  	ADDQ AX, acc4
   959  	ADCQ $0, DX
   960  	MOVQ DX, t1
   961  
   962  	MOVQ (8*2)(x_ptr), AX
   963  	MULQ t0
   964  	ADDQ t1, acc5
   965  	ADCQ $0, DX
   966  	ADDQ AX, acc5
   967  	ADCQ $0, DX
   968  	MOVQ DX, t1
   969  
   970  	MOVQ (8*3)(x_ptr), AX
   971  	MULQ t0
   972  	ADDQ t1, acc0
   973  	ADCQ $0, DX
   974  	ADDQ AX, acc0
   975  	ADCQ DX, acc1
   976  	ADCQ $0, acc2
   977  	// Last reduction step
   978  	MOVQ acc3, AX
   979  	MULQ p256ordK0<>(SB)
   980  	MOVQ AX, t0
   981  
   982  	MOVQ p256ord<>+0x00(SB), AX
   983  	MULQ t0
   984  	ADDQ AX, acc3
   985  	ADCQ $0, DX
   986  	MOVQ DX, t1
   987  
   988  	MOVQ p256ord<>+0x08(SB), AX
   989  	MULQ t0
   990  	ADDQ t1, acc4
   991  	ADCQ $0, DX
   992  	ADDQ AX, acc4
   993  	ADCQ $0, DX
   994  	MOVQ DX, t1
   995  
   996  	MOVQ p256ord<>+0x10(SB), AX
   997  	MULQ t0
   998  	ADDQ t1, acc5
   999  	ADCQ $0, DX
  1000  	ADDQ AX, acc5
  1001  	ADCQ $0, DX
  1002  	MOVQ DX, t1
  1003  
  1004  	MOVQ p256ord<>+0x18(SB), AX
  1005  	MULQ t0
  1006  	ADDQ t1, acc0
  1007  	ADCQ $0, DX
  1008  	ADDQ AX, acc0
  1009  	ADCQ DX, acc1
  1010  	ADCQ $0, acc2
  1011  	// Copy result [255:0]
  1012  	MOVQ acc4, x_ptr
  1013  	MOVQ acc5, acc3
  1014  	MOVQ acc0, t0
  1015  	MOVQ acc1, t1
  1016  	// Subtract p256ord (this routine's modulus, not the field modulus)
  1017  	SUBQ p256ord<>+0x00(SB), acc4
  1018  	SBBQ p256ord<>+0x08(SB) ,acc5
  1019  	SBBQ p256ord<>+0x10(SB), acc0
  1020  	SBBQ p256ord<>+0x18(SB), acc1
  1021  	SBBQ $0, acc2
  1022  
        	// A borrow out of the carry limb (acc2) means the value was already
        	// below p256ord: restore the saved copy.
  1023  	CMOVQCS x_ptr, acc4
  1024  	CMOVQCS acc3, acc5
  1025  	CMOVQCS t0, acc0
  1026  	CMOVQCS t1, acc1
  1027  
  1028  	MOVQ acc4, (8*0)(res_ptr)
  1029  	MOVQ acc5, (8*1)(res_ptr)
  1030  	MOVQ acc0, (8*2)(res_ptr)
  1031  	MOVQ acc1, (8*3)(res_ptr)
  1032  
  1033  	RET
  1034  /* ---------------------------------------*/
  1035  // func p256OrdSqr(res, in *p256OrdElement, n int)
        // Performs n successive Montgomery squarings modulo the four-limb
        // constant p256ord. The first iteration squares *in; every later
        // iteration squares the value just stored in *res, because x_ptr is
        // re-pointed at res_ptr at the bottom of the loop.
        //
        // The loop test is at the bottom (DECQ BX / JNE), so the body always runs
        // at least once; n == 0 is not handled and would wrap the counter.
        // The only branch depends on n, never on limb values.
        //
        // Register use: BX = remaining iterations; acc0..acc5, y_ptr and x_ptr
        // hold the eight limbs of the 512-bit square (x_ptr and y_ptr stop being
        // pointers part-way through each iteration); AX/DX are the MULQ pair;
        // t0/t1 are temporaries.
  1036  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1037  	MOVQ res+0(FP), res_ptr
  1038  	MOVQ in+8(FP), x_ptr
  1039  	MOVQ n+16(FP), BX
  1040  
  1041  ordSqrLoop:
  1042  
        	// Off-diagonal products y[i]*y[j] with i < j are computed once here
        	// and doubled below.
  1043  	// y[1:] * y[0]
  1044  	MOVQ (8*0)(x_ptr), t0
  1045  
  1046  	MOVQ (8*1)(x_ptr), AX
  1047  	MULQ t0
  1048  	MOVQ AX, acc1
  1049  	MOVQ DX, acc2
  1050  
  1051  	MOVQ (8*2)(x_ptr), AX
  1052  	MULQ t0
  1053  	ADDQ AX, acc2
  1054  	ADCQ $0, DX
  1055  	MOVQ DX, acc3
  1056  
  1057  	MOVQ (8*3)(x_ptr), AX
  1058  	MULQ t0
  1059  	ADDQ AX, acc3
  1060  	ADCQ $0, DX
  1061  	MOVQ DX, acc4
  1062  	// y[2:] * y[1]
  1063  	MOVQ (8*1)(x_ptr), t0
  1064  
  1065  	MOVQ (8*2)(x_ptr), AX
  1066  	MULQ t0
  1067  	ADDQ AX, acc3
  1068  	ADCQ $0, DX
  1069  	MOVQ DX, t1
  1070  
  1071  	MOVQ (8*3)(x_ptr), AX
  1072  	MULQ t0
  1073  	ADDQ t1, acc4
  1074  	ADCQ $0, DX
  1075  	ADDQ AX, acc4
  1076  	ADCQ $0, DX
  1077  	MOVQ DX, acc5
  1078  	// y[3] * y[2]
  1079  	MOVQ (8*2)(x_ptr), t0
  1080  
  1081  	MOVQ (8*3)(x_ptr), AX
  1082  	MULQ t0
  1083  	ADDQ AX, acc5
  1084  	ADCQ $0, DX
        	// y_ptr is used as a data register (limb 6) from here on.
  1085  	MOVQ DX, y_ptr
  1086  	XORQ t1, t1
  1087  	// *2
  1088  	ADDQ acc1, acc1
  1089  	ADCQ acc2, acc2
  1090  	ADCQ acc3, acc3
  1091  	ADCQ acc4, acc4
  1092  	ADCQ acc5, acc5
  1093  	ADCQ y_ptr, y_ptr
  1094  	ADCQ $0, t1
        	// The diagonal terms y[i]*y[i] are added next.
  1095  	// Missing products
  1096  	MOVQ (8*0)(x_ptr), AX
  1097  	MULQ AX
  1098  	MOVQ AX, acc0
  1099  	MOVQ DX, t0
  1100  
  1101  	MOVQ (8*1)(x_ptr), AX
  1102  	MULQ AX
  1103  	ADDQ t0, acc1
  1104  	ADCQ AX, acc2
  1105  	ADCQ $0, DX
  1106  	MOVQ DX, t0
  1107  
  1108  	MOVQ (8*2)(x_ptr), AX
  1109  	MULQ AX
  1110  	ADDQ t0, acc3
  1111  	ADCQ AX, acc4
  1112  	ADCQ $0, DX
  1113  	MOVQ DX, t0
  1114  
  1115  	MOVQ (8*3)(x_ptr), AX
  1116  	MULQ AX
  1117  	ADDQ t0, acc5
  1118  	ADCQ AX, y_ptr
  1119  	ADCQ DX, t1
        	// The input pointer is no longer needed in this iteration: x_ptr now
        	// holds the top limb (limb 7) of the square.
  1120  	MOVQ t1, x_ptr
        	// Each reduction step computes t0 = low 64 bits of (lowest live limb
        	// * p256ordK0) and adds t0 * p256ord. Only the two low limbs of
        	// p256ord are multiplied with MULQ. The upper two limbs are 2^64-1
        	// and 2^64-2^32, so their products are formed without MULQ:
        	//   t0 * (2^64 - 1)    = add t0 one limb higher, subtract t0 here
        	//   t0 * (2^64 - 2^32) = add t0 one limb higher, subtract t0<<32 here
        	//                        (AX = low half, DX = high half of the shift)
        	// The register of the cancelled lowest limb is reused as the new top
        	// limb.
  1121  	// First reduction step
  1122  	MOVQ acc0, AX
  1123  	MULQ p256ordK0<>(SB)
  1124  	MOVQ AX, t0
  1125  
  1126  	MOVQ p256ord<>+0x00(SB), AX
  1127  	MULQ t0
  1128  	ADDQ AX, acc0
  1129  	ADCQ $0, DX
  1130  	MOVQ DX, t1
  1131  
  1132  	MOVQ p256ord<>+0x08(SB), AX
  1133  	MULQ t0
  1134  	ADDQ t1, acc1
  1135  	ADCQ $0, DX
  1136  	ADDQ AX, acc1
  1137  
  1138  	MOVQ t0, t1
  1139  	ADCQ DX, acc2
  1140  	ADCQ $0, t1
  1141  	SUBQ t0, acc2
  1142  	SBBQ $0, t1
  1143  
  1144  	MOVQ t0, AX
  1145  	MOVQ t0, DX
  1146  	MOVQ t0, acc0
  1147  	SHLQ $32, AX
  1148  	SHRQ $32, DX
  1149  
  1150  	ADDQ t1, acc3
  1151  	ADCQ $0, acc0
  1152  	SUBQ AX, acc3
  1153  	SBBQ DX, acc0
  1154  	// Second reduction step
  1155  	MOVQ acc1, AX
  1156  	MULQ p256ordK0<>(SB)
  1157  	MOVQ AX, t0
  1158  
  1159  	MOVQ p256ord<>+0x00(SB), AX
  1160  	MULQ t0
  1161  	ADDQ AX, acc1
  1162  	ADCQ $0, DX
  1163  	MOVQ DX, t1
  1164  
  1165  	MOVQ p256ord<>+0x08(SB), AX
  1166  	MULQ t0
  1167  	ADDQ t1, acc2
  1168  	ADCQ $0, DX
  1169  	ADDQ AX, acc2
  1170  
  1171  	MOVQ t0, t1
  1172  	ADCQ DX, acc3
  1173  	ADCQ $0, t1
  1174  	SUBQ t0, acc3
  1175  	SBBQ $0, t1
  1176  
  1177  	MOVQ t0, AX
  1178  	MOVQ t0, DX
  1179  	MOVQ t0, acc1
  1180  	SHLQ $32, AX
  1181  	SHRQ $32, DX
  1182  
  1183  	ADDQ t1, acc0
  1184  	ADCQ $0, acc1
  1185  	SUBQ AX, acc0
  1186  	SBBQ DX, acc1
  1187  	// Third reduction step
  1188  	MOVQ acc2, AX
  1189  	MULQ p256ordK0<>(SB)
  1190  	MOVQ AX, t0
  1191  
  1192  	MOVQ p256ord<>+0x00(SB), AX
  1193  	MULQ t0
  1194  	ADDQ AX, acc2
  1195  	ADCQ $0, DX
  1196  	MOVQ DX, t1
  1197  
  1198  	MOVQ p256ord<>+0x08(SB), AX
  1199  	MULQ t0
  1200  	ADDQ t1, acc3
  1201  	ADCQ $0, DX
  1202  	ADDQ AX, acc3
  1203  
  1204  	MOVQ t0, t1
  1205  	ADCQ DX, acc0
  1206  	ADCQ $0, t1
  1207  	SUBQ t0, acc0
  1208  	SBBQ $0, t1
  1209  
  1210  	MOVQ t0, AX
  1211  	MOVQ t0, DX
  1212  	MOVQ t0, acc2
  1213  	SHLQ $32, AX
  1214  	SHRQ $32, DX
  1215  
  1216  	ADDQ t1, acc1
  1217  	ADCQ $0, acc2
  1218  	SUBQ AX, acc1
  1219  	SBBQ DX, acc2
  1220  	// Last reduction step
  1221  	MOVQ acc3, AX
  1222  	MULQ p256ordK0<>(SB)
  1223  	MOVQ AX, t0
  1224  
  1225  	MOVQ p256ord<>+0x00(SB), AX
  1226  	MULQ t0
  1227  	ADDQ AX, acc3
  1228  	ADCQ $0, DX
  1229  	MOVQ DX, t1
  1230  
  1231  	MOVQ p256ord<>+0x08(SB), AX
  1232  	MULQ t0
  1233  	ADDQ t1, acc0
  1234  	ADCQ $0, DX
  1235  	ADDQ AX, acc0
        	// Unlike the three steps above, this step folds the carry into DX
        	// before the next addition. The value stored in t1 by the MOVQ below
        	// is overwritten by the very next instruction.
        	// NOTE(review): that store looks redundant — confirm before touching.
  1236  	ADCQ $0, DX
  1237  	MOVQ DX, t1
  1238  
  1239  	MOVQ t0, t1
  1240  	ADCQ DX, acc1
  1241  	ADCQ $0, t1
  1242  	SUBQ t0, acc1
  1243  	SBBQ $0, t1
  1244  
  1245  	MOVQ t0, AX
  1246  	MOVQ t0, DX
  1247  	MOVQ t0, acc3
  1248  	SHLQ $32, AX
  1249  	SHRQ $32, DX
  1250  
  1251  	ADDQ t1, acc2
  1252  	ADCQ $0, acc3
  1253  	SUBQ AX, acc2
  1254  	SBBQ DX, acc3
        	// XORQ zeroes t0 (it will collect the carry out of the 256-bit sum)
        	// and clears the carry flag, so the first ADCQ below adds no carry.
  1255  	XORQ t0, t0
  1256  	// Add bits [511:256] of the sqr result
  1257  	ADCQ acc4, acc0
  1258  	ADCQ acc5, acc1
  1259  	ADCQ y_ptr, acc2
  1260  	ADCQ x_ptr, acc3
  1261  	ADCQ $0, t0
  1262  
        	// Keep a copy of the unreduced value for the conditional restore.
  1263  	MOVQ acc0, acc4
  1264  	MOVQ acc1, acc5
  1265  	MOVQ acc2, y_ptr
  1266  	MOVQ acc3, t1
  1267  	// Subtract p256ord (this routine's modulus, not the field modulus)
  1268  	SUBQ p256ord<>+0x00(SB), acc0
  1269  	SBBQ p256ord<>+0x08(SB) ,acc1
  1270  	SBBQ p256ord<>+0x10(SB), acc2
  1271  	SBBQ p256ord<>+0x18(SB), acc3
  1272  	SBBQ $0, t0
  1273  
        	// A borrow out of the fifth limb (t0) means the value was already
        	// below p256ord: restore the saved copy.
  1274  	CMOVQCS acc4, acc0
  1275  	CMOVQCS acc5, acc1
  1276  	CMOVQCS y_ptr, acc2
  1277  	CMOVQCS t1, acc3
  1278  
  1279  	MOVQ acc0, (8*0)(res_ptr)
  1280  	MOVQ acc1, (8*1)(res_ptr)
  1281  	MOVQ acc2, (8*2)(res_ptr)
  1282  	MOVQ acc3, (8*3)(res_ptr)
        	// Next iteration squares the result just stored.
  1283  	MOVQ res_ptr, x_ptr
  1284  	DECQ BX
  1285  	JNE ordSqrLoop
  1286  
  1287  	RET
  1288  /* ---------------------------------------*/
  1289  #undef res_ptr
  1290  #undef x_ptr
  1291  #undef y_ptr
  1292  
  1293  #undef acc0
  1294  #undef acc1
  1295  #undef acc2
  1296  #undef acc3
  1297  #undef acc4
  1298  #undef acc5
  1299  #undef t0
  1300  #undef t1
        // The register aliases used by the routines above end here. The names
        // acc0..acc5, t0 and t1 are redefined below with different register
        // assignments, so the same name refers to a different register from this
        // point on.
  1301  /* ---------------------------------------*/
        // mul0/mul1 are AX/DX, the implicit operand and result pair of MULQ.
  1302  #define mul0 AX
  1303  #define mul1 DX
        // acc0..acc7 are eight 64-bit limb registers. Note that acc1 is CX and
        // that t2/t3 below are DI/SI, the registers that served as y_ptr,
        // res_ptr and x_ptr in the first half of the file.
  1304  #define acc0 BX
  1305  #define acc1 CX
  1306  #define acc2 R8
  1307  #define acc3 R9
  1308  #define acc4 R10
  1309  #define acc5 R11
  1310  #define acc6 R12
  1311  #define acc7 R13
        // t0..t3 carry the four limbs of the second operand of the internal
        // helpers that follow.
  1312  #define t0 R14
  1313  #define t1 R15
  1314  #define t2 DI
  1315  #define t3 SI
        // hlp is a scratch register used by p256MulInternal to carry the high
        // half of a product into the next limb.
  1316  #define hlp BP
  1317  /* ---------------------------------------*/
// p256SubInternal computes [acc4..acc7] = [acc4..acc7] - [t0..t3] mod p.
//
// In:       acc4..acc7 (minuend), t0..t3 (subtrahend), least significant
//           limb first.
// Out:      acc4..acc7.
// Clobbers: mul0 (AX), acc0..acc3 and the flags. t0..t3 are only read.
//
// The routine is branch-free: it always computes both the raw difference and
// the difference plus p, then selects one of them with CMOV.
TEXT p256SubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	// mul0 becomes all-ones if the subtraction borrowed, zero otherwise.
	SBBQ $0, mul0

	// Keep the raw difference in acc0..acc3.
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Add p = {-1, p256const0, 0, p256const1} to the difference.
	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	// ZF is set iff there was no borrow.
	ANDQ $1, mul0

	// No borrow: the raw difference was already in range, restore it.
	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
  1343  /* ---------------------------------------*/
// p256MulInternal multiplies [acc4..acc7] by [t0..t3] and reduces the 512-bit
// product with four word-wise reduction steps followed by one conditional
// subtraction of p (Montgomery-style: each step cancels the lowest limb and
// shifts the accumulator down by 64 bits).
//
// In:       acc4..acc7 and t0..t3, least significant limb first.
// Out:      acc4..acc7.
// Clobbers: mul0 (AX), mul1 (DX), acc0..acc3, hlp (BP) and the flags.
//           t0..t3 are only read, so callers can reuse them after the call.
//
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// saves and restores BP, which is used as hlp here — confirm.
TEXT p256MulInternal(SB),NOSPLIT,$8
	// Row 0: acc4 * [t0..t3] -> acc0..acc3, top limb replaces acc4.
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	// Row 1: acc5 * [t0..t3] added at acc1..acc4, top limb replaces acc5.
	// hlp carries the high half of each partial product to the next column.
	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	// Row 2: acc6 * [t0..t3] added at acc2..acc5, top limb replaces acc6.
	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	// Row 3: acc7 * [t0..t3] added at acc3..acc6, top limb replaces acc7.
	// Afterwards acc0..acc7 hold the full 512-bit product.
	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7
	// First reduction step
	// Each step folds the lowest live limb L into the limbs above it: L<<32
	// and L>>32 go one and two limbs up, L*p256const1 goes three and four
	// limbs up. The register that held L is reused for the new top limb.
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Zero hlp with MOVQ, which leaves the flags alone; the ADCQ chain below
	// starts from the carry flag left by the ADCQ above.
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	// hlp receives the carry out of the 256-bit sum.
	ADCQ $0, hlp
	// Copy result
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB) ,acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
  1526  /* ---------------------------------------*/
// p256SqrInternal squares [acc4..acc7] and reduces the 512-bit result with
// the same four reduction steps and final conditional subtraction of p that
// p256MulInternal uses.
//
// In:       acc4..acc7, least significant limb first.
// Out:      acc4..acc7.
// Clobbers: mul0 (AX), mul1 (DX), acc0..acc3, t0..t3, hlp (BP) and the
//           flags. Unlike p256MulInternal, t0..t3 are overwritten.
//
// The square is built from the six cross products a_i*a_j (i < j), doubled,
// plus the four diagonal products a_i*a_i.
//
// NOTE(review): the 8-byte frame is presumably declared so that the assembler
// saves and restores BP, which is used as hlp here — confirm.
TEXT p256SqrInternal(SB),NOSPLIT,$8

	// Cross products, accumulated in acc1, acc2, acc3, t0, t1, t2.
	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3
	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3
	// Missing products
	// Diagonal terms a_i*a_i. acc4 is consumed first and then reused to
	// carry the high half of each square into the next pair of limbs.
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ DX, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, DX
	MOVQ DX, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ DX, t3
	// The 512-bit square is now in acc0..acc3 (low) and t0..t3 (high).
	// First reduction step
	MOVQ acc0, mul0
	MOVQ acc0, hlp
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc0, acc1
	ADCQ hlp, acc2
	ADCQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc0
	// Second reduction step
	MOVQ acc1, mul0
	MOVQ acc1, hlp
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc1, acc2
	ADCQ hlp, acc3
	ADCQ mul0, acc0
	ADCQ $0, mul1
	MOVQ mul1, acc1
	// Third reduction step
	MOVQ acc2, mul0
	MOVQ acc2, hlp
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc2, acc3
	ADCQ hlp, acc0
	ADCQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2
	// Last reduction step
	MOVQ acc3, mul0
	MOVQ acc3, hlp
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, hlp
	ADDQ acc3, acc0
	ADCQ hlp, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3
	// Zero hlp with MOVQ, which leaves the flags alone; the ADCQ chain below
	// starts from the carry flag left by the ADCQ above.
	MOVQ $0, BP
	// Add bits [511:256] of the result
	ADCQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	// hlp receives the carry out of the 256-bit sum.
	ADCQ $0, hlp
	// Copy result
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7
	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB) ,acc5
	SBBQ $0, acc6
	SBBQ p256const1<>(SB), acc7
	SBBQ $0, hlp
	// If the result of the subtraction is negative, restore the previous result
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
  1668  /* ---------------------------------------*/
// p256MulBy2Inline computes [t0..t3] = 2 * [acc4..acc7] mod p.
//
// The reduced result is left in t0..t3, NOT in acc4..acc7: acc4..acc7 end up
// holding the low 256 bits of the unreduced doubling. The bit shifted out is
// kept in mul0 and takes part in the trial subtraction of p; if that
// subtraction borrows, the unreduced value is selected instead.
// Clobbers mul0 (AX) and the flags.
#define p256MulBy2Inline\
	XORQ mul0, mul0;\
	ADDQ acc4, acc4;\
	ADCQ acc5, acc5;\
	ADCQ acc6, acc6;\
	ADCQ acc7, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
  1689  /* ---------------------------------------*/
// p256AddInline computes [t0..t3] = [acc4..acc7] + [t0..t3] mod p.
//
// The reduced result is left in t0..t3, NOT in acc4..acc7: acc4..acc7 end up
// holding the low 256 bits of the unreduced sum. The carry out of the sum is
// kept in mul0 and takes part in the trial subtraction of p; if that
// subtraction borrows, the unreduced sum is selected instead.
// Clobbers mul0 (AX) and the flags.
#define p256AddInline \
	XORQ mul0, mul0;\
	ADDQ t0, acc4;\
	ADCQ t1, acc5;\
	ADCQ t2, acc6;\
	ADCQ t3, acc7;\
	ADCQ $0, mul0;\
	MOVQ acc4, t0;\
	MOVQ acc5, t1;\
	MOVQ acc6, t2;\
	MOVQ acc7, t3;\
	SUBQ $-1, t0;\
	SBBQ p256const0<>(SB), t1;\
	SBBQ $0, t2;\
	SBBQ p256const1<>(SB), t3;\
	SBBQ $0, mul0;\
	CMOVQCS acc4, t0;\
	CMOVQCS acc5, t1;\
	CMOVQCS acc6, t2;\
	CMOVQCS acc7, t3;
  1710  /* ---------------------------------------*/
// Helpers for moving a four-limb value between a frame slot and the two
// register groups. src/dst is the name of a slot macro taking a byte offset
// (for example x1in), so "LDacc (x1in)" expands to loads from x1in(8*0) ..
// x1in(8*3).
//   LDacc(src)  load slot into acc4..acc7
//   LDt(src)    load slot into t0..t3
//   ST(dst)     store acc4..acc7 into slot
//   STt(dst)    store t0..t3 into slot
//   acc2t       copy acc4..acc7 into t0..t3
//   t2acc       copy t0..t3 into acc4..acc7
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1717  /* ---------------------------------------*/
// Stack frame layout for p256PointAddAffineAsm. Each name(off) macro
// addresses byte `off` of a 32-byte slot relative to SP. The last 32-byte
// slot is split into an 8-byte result pointer followed by two 4-byte saved
// arguments (sel and zero are stored and reloaded with MOVL).
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off)   (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off)	  (32*10 + off)(SP)
#define r(off)	  (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr	  (32*15)(SP)
#define sel_save  (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)
  1736  
// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
//
// Adds the affine point in2 to the point in1 (three 32-byte coordinates
// x, y, z) and writes a three-coordinate point to res.
//
//   sign  if non-zero, in2's y coordinate is replaced by p - y before the
//         addition.
//   sel   if zero, the computed sum is discarded and in1 is written to res.
//   zero  if zero, (x2, y2, p256one) is written to res instead, where y2 is
//         in2's y after the optional negation.
//
// Only the low 32 bits of sel and zero are saved and tested. Both selections
// are done with SSE masks rather than branches, and the zero selection is
// applied last. All inputs are copied to the frame first, so res may be
// written only at the very end.
TEXT ·p256PointAddAffineAsm(SB),0,$512-48
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX
	MOVQ sign+24(FP), DX
	MOVQ sel+32(FP), t1
	MOVQ zero+40(FP), t2

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	// Store pointer to result
	// (mul0 is AX, which still holds res.)
	MOVQ mul0, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save
	// Negate y2in based on sign
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	// acc0..acc3 = p. acc1 aliases CX, so in2 must not be read after this.
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3
	XORQ mul0, mul0
	// Speculatively subtract
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3
	// Add in case the operand was > p256
	ADDQ $-1, acc0
	ADCQ p256const0<>(SB), acc1
	ADCQ $0, acc2
	ADCQ p256const1<>(SB), acc3
	// ZF is set only when the subtraction borrowed and the addition carried;
	// otherwise the speculative difference in t0..t3 is kept.
	ADCQ $0, mul0
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// If condition is 0, keep original value
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3
	// Store result
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)
	// Begin point add
	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1^2
	ST (z1sqr)

	LDt (x2in)
	CALL p256MulInternal(SB)	// x2 * z1^2

	LDt (x1in)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL p256MulInternal(SB)	// z3 = h * z1
	ST (zout)

	// t0..t3 still hold z1: p256MulInternal does not modify them.
	LDacc (z1sqr)
	CALL p256MulInternal(SB)	// z1^3

	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = y2 * z1^3
	ST (s2)

	LDt (y1in)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)

	CALL p256SqrInternal(SB)	// rsqr = r^2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = h^2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = h^3
	ST (hcub)

	// The s2 slot is reused for y1 * h^3 from here on.
	LDt (y1in)
	CALL p256MulInternal(SB)	// y1 * h^3
	ST (s2)

	// The h slot is reused for u1 * h^2 from here on.
	LDacc (x1in)
	LDt (hsqr)
	CALL p256MulInternal(SB)	// u1 * h^2
	ST (h)

	p256MulBy2Inline			// u1 * h^2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// r^2 - u1 * h^2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (h)
	CALL p256SubInternal(SB)	// u1 * h^2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1 * h^2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r * (u1 * h^2 - x3) - y1 * h^3
	ST (yout)
	// Load stored values from stack
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX
	// The result is not valid if (sel == 0), conditional choose
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL BX, X6
	MOVL CX, X7

	// X8 = all zeros, X9 = all ones.
	PXOR X8, X8
	PCMPEQL X9, X9

	// Broadcast sel and zero to every 32-bit lane.
	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	// X6 = all ones if sel == 0, X7 = all ones if zero == 0.
	PCMPEQL X8, X6
	PCMPEQL X8, X7

	// X15 = ^X6: mask that keeps the computed sum.
	MOVOU X6, X15
	PANDN X9, X15

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Similarly if zero == 0
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	// Candidate replacement: (x2, y2, p256one).
	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU p256one<>+0x00(SB), X13
	MOVOU p256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5
	// Finally output the result
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	// NOTE(review): presumably cleared so no stale pointer is left in the
	// frame — confirm.
	MOVQ $0, rptr

	RET
// Release the p256PointAddAffineAsm frame-slot names; several of them are
// redefined with different offsets for the next routine.
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save
  1993  
// p256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
// otherwise. Both the all-zero value and the value equal to p are treated as
// zero. It overwrites acc4, acc5, acc7, t0, t1 and the flags; acc6 is only
// read. The check is branch-free.
TEXT p256IsZero(SB),NOSPLIT,$0
	// AX contains a flag that is set if the input is zero.
	XORQ AX, AX
	MOVQ $1, t1

	// Check whether [acc4..acc7] are all zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0

	// Set the zero flag if so. (CMOV of a constant to a register doesn't
	// appear to be supported in Go. Thus t1 = 1.)
	CMOVQEQ t1, AX

	// XOR [acc4..acc7] with P and compare with zero again.
	// (The third limb of P is zero, so acc6 needs no XOR.)
	XORQ $-1, acc4
	XORQ p256const0<>(SB), acc5
	XORQ p256const1<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4

	// Set the zero flag if so.
	CMOVQEQ t1, AX
	RET
  2022  
  2023  /* ---------------------------------------*/
// Stack frame layout for p256PointAddAsm. Each name(off) macro addresses
// byte `off` of a 32-byte slot relative to SP; rptr and points_eq are two
// 8-byte values placed after the last 32-byte slot.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off)    (32*9 + off)(SP)
#define u2(off)    (32*10 + off)(SP)
#define s1(off)    (32*11 + off)(SP)
#define s2(off)    (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off)     (32*15 + off)(SP)
#define r(off)     (32*16 + off)(SP)
#define hsqr(off)  (32*17 + off)(SP)
#define rsqr(off)  (32*18 + off)(SP)
#define hcub(off)  (32*19 + off)(SP)
#define rptr       (32*20)(SP)
#define points_eq  (32*20+8)(SP)
  2048  
//func p256PointAddAsm(res, in1, in2 *P256Point) int
//
// Adds the points in1 and in2 (three 32-byte coordinates x, y, z each) and
// writes the sum to res. Both inputs are copied to the frame before anything
// is written to res.
//
// The return value is 1 when both r = s2 - s1 and h = u2 - u1 are zero
// according to p256IsZero, and 0 otherwise. The sum is written to res in
// either case; this routine itself does nothing special when the return
// value is 1.
TEXT ·p256PointAddAsm(SB),0,$680-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in1+8(FP), BX
	MOVQ in2+16(FP), CX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point add
	LDacc (z2in)
	CALL p256SqrInternal(SB)	// z2^2
	ST (z2sqr)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z2^3
	LDt (y1in)
	CALL p256MulInternal(SB)	// s1 = z2^3*y1
	ST (s1)

	LDacc (z1in)
	CALL p256SqrInternal(SB)	// z1^2
	ST (z1sqr)
	LDt (z1in)
	CALL p256MulInternal(SB)	// z1^3
	LDt (y2in)
	CALL p256MulInternal(SB)	// s2 = z1^3*y2
	ST (s2)

	LDt (s1)
	CALL p256SubInternal(SB)	// r = s2 - s1
	ST (r)
	// acc4..acc7 still hold r; record whether it is zero.
	CALL p256IsZero(SB)
	MOVQ AX, points_eq

	LDacc (z2sqr)
	LDt (x1in)
	CALL p256MulInternal(SB)	// u1 = x1 * z2^2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL p256MulInternal(SB)	// u2 = x2 * z1^2
	ST (u2)

	LDt (u1)
	CALL p256SubInternal(SB)	// h = u2 - u1
	ST (h)
	// points_eq = (r is zero) AND (h is zero).
	CALL p256IsZero(SB)
	ANDQ points_eq, AX
	MOVQ AX, points_eq

	LDacc (r)
	CALL p256SqrInternal(SB)	// rsqr = r^2
	ST (rsqr)

	LDacc (h)
	CALL p256SqrInternal(SB)	// hsqr = h^2
	ST (hsqr)

	LDt (h)
	CALL p256MulInternal(SB)	// hcub = h^3
	ST (hcub)

	// The s2 slot is reused for s1 * h^3 from here on.
	LDt (s1)
	CALL p256MulInternal(SB)	// s1 * h^3
	ST (s2)

	LDacc (z1in)
	LDt (z2in)
	CALL p256MulInternal(SB)	// z1 * z2
	LDt (h)
	CALL p256MulInternal(SB)	// z1 * z2 * h
	ST (zout)

	// The u2 slot is reused for h^2 * u1 from here on.
	LDacc (hsqr)
	LDt (u1)
	CALL p256MulInternal(SB)	// h^2 * u1
	ST (u2)

	p256MulBy2Inline	// u1 * h^2 * 2, inline
	LDacc (rsqr)
	CALL p256SubInternal(SB)	// r^2 - u1 * h^2 * 2

	LDt (hcub)
	CALL p256SubInternal(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
	ST (xout)

	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3
	LDacc (u2)
	CALL p256SubInternal(SB)	// u1 * h^2 - x3

	LDt (r)
	CALL p256MulInternal(SB)	// r * (u1 * h^2 - x3)

	LDt (s2)
	CALL p256SubInternal(SB)	// y3 = r * (u1 * h^2 - x3) - s1 * h^3
	ST (yout)

	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5
	// Finally output the result
	MOVQ rptr, AX
	// NOTE(review): presumably cleared so no stale pointer is left in the
	// frame — confirm.
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	MOVQ points_eq, AX
	MOVQ AX, ret+24(FP)

	RET
  2197  #undef x1in
  2198  #undef y1in
  2199  #undef z1in
  2200  #undef x2in
  2201  #undef y2in
  2202  #undef z2in
  2203  #undef xout
  2204  #undef yout
  2205  #undef zout
  2206  #undef s1
  2207  #undef s2
  2208  #undef u1
  2209  #undef u2
  2210  #undef z1sqr
  2211  #undef z2sqr
  2212  #undef h
  2213  #undef r
  2214  #undef hsqr
  2215  #undef rsqr
  2216  #undef hcub
  2217  #undef rptr
  2218  /* ---------------------------------------*/
// Stack frame layout for p256PointDoubleAsm. Each name(off) macro addresses
// byte `off` of a 32-byte slot relative to SP; rptr is an 8-byte saved
// result pointer in the last slot.
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off)	(32*3 + off)(SP)
#define m(off)	(32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off)  (32*6 + off)(SP)
#define rptr	  (32*7)(SP)
  2228  
//func p256PointDoubleAsm(res, in *P256Point)
//
// Doubles the point in (three 32-byte coordinates x, y, z) and writes the
// result to res. The input is copied to the frame first; res is written
// coordinate by coordinate (z, then x, then y) as each one becomes
// available.
//
// Computed values, in terms of the input coordinates:
//   z3 = 2*y*z
//   m  = 3*(x - z^2)*(x + z^2)
//   s  = x*(2*y)^2
//   x3 = m^2 - 2*s
//   y3 = m*(s - x3) - (2*y)^4/2
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
	// Move input to stack in order to free registers
	MOVQ res+0(FP), AX
	MOVQ in+8(FP), BX

	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)
	// Store pointer to result
	MOVQ AX, rptr
	// Begin point double
	LDacc (z)
	CALL p256SqrInternal(SB)	// zsqr = z^2
	ST (zsqr)

	// m = x + z^2 (p256AddInline leaves its result in t0..t3).
	LDt (x)
	p256AddInline
	STt (m)

	// z3 = 2 * y * z (p256MulBy2Inline leaves its result in t0..t3).
	LDacc (z)
	LDt (y)
	CALL p256MulInternal(SB)
	p256MulBy2Inline
	MOVQ rptr, AX
	// Store z
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	// m = (x - z^2) * (x + z^2)
	LDacc (x)
	LDt (zsqr)
	CALL p256SubInternal(SB)
	LDt (m)
	CALL p256MulInternal(SB)
	ST (m)
	// Multiply by 3
	// (2*m lands in t0..t3, then m is reloaded and added to it.)
	p256MulBy2Inline
	LDacc (m)
	p256AddInline
	STt (m)
	////////////////////////
	// s = (2*y)^2, then acc = (2*y)^4.
	LDacc (y)
	p256MulBy2Inline
	t2acc
	CALL p256SqrInternal(SB)
	ST (s)
	CALL p256SqrInternal(SB)
	// Divide by 2
	// Keep the original value in t0..t3 and compute value + p in
	// mul0:acc4..acc7. If the value is even it is used as is; if it is odd,
	// value + p is used instead, so the right shift below is exact.
	XORQ mul0, mul0
	MOVQ acc4, t0
	MOVQ acc5, t1
	MOVQ acc6, t2
	MOVQ acc7, t3

	ADDQ $-1, acc4
	ADCQ p256const0<>(SB), acc5
	ADCQ $0, acc6
	ADCQ p256const1<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	// mul0 is 0 or 1, so this keeps the carry only when the value was odd.
	ANDQ t0, mul0

	// Shift the 257-bit value mul0:acc7..acc4 right by one bit.
	SHRQ $1, acc5, acc4
	SHRQ $1, acc6, acc5
	SHRQ $1, acc7, acc6
	SHRQ $1, mul0, acc7
	// The y slot now holds (2*y)^4 / 2.
	ST (y)
	/////////////////////////
	// s = x * (2*y)^2, tmp = 2 * s.
	LDacc (x)
	LDt (s)
	CALL p256MulInternal(SB)
	ST (s)
	p256MulBy2Inline
	STt (tmp)

	// x3 = m^2 - 2*s
	LDacc (m)
	CALL p256SqrInternal(SB)
	LDt (tmp)
	CALL p256SubInternal(SB)

	MOVQ rptr, AX
	// Store x
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	// y3 = m * (s - x3) - (2*y)^4 / 2
	acc2t
	LDacc (s)
	CALL p256SubInternal(SB)

	LDt (m)
	CALL p256MulInternal(SB)

	LDt (y)
	CALL p256SubInternal(SB)
	MOVQ rptr, AX
	// Store y
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	///////////////////////
	// NOTE(review): presumably cleared so no stale pointer is left in the
	// frame — confirm.
	MOVQ $0, rptr

	RET
  2352  /* ---------------------------------------*/
  2353  

View as plain text