Text file src/crypto/internal/fips140/nistec/p256_asm_ppc64le.s

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
     9  // This is a port of the s390x asm implementation
    10  // to ppc64le.
    11  
    12  // Some changes were needed due to differences in
    13  // the Go opcodes and/or available instructions
    14  // between s390x and ppc64le.
    15  
    16  // 1. There were operand order differences in the
    17  // VSUBUQM, VSUBCUQ, and VSEL instructions.
    18  
    19  // 2. ppc64 does not have a multiply high and low
    20  // like s390x, so those were implemented using
    21  // macros to compute the equivalent values.
    22  
    23  // 3. The LVX, STVX instructions on ppc64 require
    24  // 16 byte alignment of the data.  To avoid that
    25  // requirement, data is loaded using LXVD2X and
    26  // STXVD2X with VPERM to reorder bytes correctly.
    27  
    28  // I have identified some areas where I believe
    29  // changes would be needed to make this work for big
    30  // endian; however additional changes beyond what I
    31  // have noted are most likely needed to make it work.
    32  // - The string used with VPERM to swap the byte order
    33  //   for loads and stores.
    34  // - The constants that are loaded from CPOOL.
    35  //
    36  
    37  // The following constants are defined in an order
    38  // that is correct for use with LXVD2X/STXVD2X
    39  // on little endian.
        // P-256 prime and VPERM byte-selector constants, stored as two
        // 8-byte doublewords per 16-byte vector in the order expected by
        // LXVD2X/STXVD2X on little endian (see the file header note).
    40  DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    41  DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    42  DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    43  DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    44  DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    45  DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    46  DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    47  DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    48  DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    49  DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    50  DATA p256mul<>+0x00(SB)/8, $0x00000000ffffffff // P256 original
    51  DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
    52  DATA p256mul<>+0x10(SB)/8, $0xffffffff00000001 // P256 original
    53  DATA p256mul<>+0x18(SB)/8, $0x0000000000000000 // P256
    54  DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    55  DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    56  DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    57  DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    58  DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    59  DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    60  DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    61  DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    62  DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    63  DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    64  DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    65  DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    66  DATA p256mul<>+0x80(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    67  DATA p256mul<>+0x88(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    68  DATA p256mul<>+0x90(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    69  DATA p256mul<>+0x98(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    70  
    71  // External declarations for constants
        // The flag value 8 is RODATA (see textflag.h).
        // NOTE(review): p256ord<> is declared here but its DATA entries
        // are not in this part of the file — presumably defined further
        // down; confirm.
    72  GLOBL p256ord<>(SB), 8, $32
    73  GLOBL p256<>(SB), 8, $80
    74  GLOBL p256mul<>(SB), 8, $160
    75  
    76  // The following macros are used to implement the ppc64le
    77  // equivalent function from the corresponding s390x
    78  // instruction for vector multiply high, low, and add,
    79  // since there aren't exact equivalent instructions.
    80  // The corresponding s390x instructions appear in the
    81  // comments.
    82  // Implementation for big endian would have to be
    83  // investigated, I think it would be different.
    84  //
    85  //
    86  // Vector multiply word
    87  //
    88  //	VMLF  x0, x1, out_low
    89  //	VMLHF x0, x1, out_hi
        // VMULT multiplies corresponding 32-bit words of x1 and x2,
        // leaving the low 32 bits of each 64-bit product in out_low and
        // the high 32 bits in out_hi (the s390x VMLF/VMLHF pair above).
        // Even/odd word multiplies produce the 64-bit products; the
        // merge-even/merge-odd pair splits them back into a high-word
        // vector and a low-word vector.  Clobbers TMP1 and TMP2.
    90  #define VMULT(x1, x2, out_low, out_hi) \
    91  	VMULEUW x1, x2, TMP1; \
    92  	VMULOUW x1, x2, TMP2; \
    93  	VMRGEW TMP1, TMP2, out_hi; \
    94  	VMRGOW TMP1, TMP2, out_low
    95  
    96  //
    97  // Vector multiply add word
    98  //
    99  //	VMALF  x0, x1, y, out_low
   100  //	VMALHF x0, x1, y, out_hi
        // VMULT_ADD computes x1*x2 + y word-by-word (the s390x
        // VMALF/VMALHF pair above).  The 32-bit addend y is widened to
        // 64 bits by multiplying with 'one' — a vector whose words are
        // all 1 (callers build it with VSPLTISW $1) — then added to the
        // 64-bit products of x1*x2; the merge-odd/merge-even pair splits
        // the sums into low words (out_low) and high words (out_hi).
        // Clobbers TMP1 and TMP2.
    101  #define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
    102  	VMULEUW  y, one, TMP2; \
    103  	VMULOUW  y, one, TMP1; \
    104  	VMULEUW  x1, x2, out_low; \
    105  	VMULOUW  x1, x2, out_hi; \
    106  	VADDUDM  TMP2, out_low, TMP2; \
    107  	VADDUDM  TMP1, out_hi, TMP1; \
    108  	VMRGOW   TMP2, TMP1, out_low; \
    109  	VMRGEW   TMP2, TMP1, out_hi
   110  
    111  #define res_ptr R3
    112  #define a_ptr R4
    113  
        // NOTE(review): these two aliases are defined and immediately
        // undefined without use — leftover scaffolding from the port.
    114  #undef res_ptr
    115  #undef a_ptr
   116  
    117  #define P1ptr   R3
    118  #define CPOOL   R7
    119  
    120  #define Y1L   V0
    121  #define Y1H   V1
    122  #define T1L   V2
    123  #define T1H   V3
    124  
    125  #define PL    V30
    126  #define PH    V31
    127  
    128  #define CAR1  V6
        // p256NegCond computes P256 - (*val) and stores it back to *val
        // when cond != 0; when cond == 0 it returns without touching
        // memory.  The 256-bit value is handled as two 128-bit halves
        // (low at val+0, high at val+16) with a full 256-bit subtract
        // propagating the borrow between the halves.
        // NOTE(review): the argument is typed *p256Point but only the
        // first 32 bytes at val are negated — presumably the caller
        // passes the address of the coordinate to negate; confirm
        // against the Go caller.
    129  // func p256NegCond(val *p256Point, cond int)
    130  TEXT ·p256NegCond(SB), NOSPLIT, $0-16
    131  	MOVD val+0(FP), P1ptr
    132  	MOVD $16, R16
    133  
    134  	MOVD cond+8(FP), R6
    135  	CMP  $0, R6
    136  	BC   12, 2, LR      // just return if cond == 0
    137  
        	// P256 is read from the p256mul constant pool, which stores
        	// it in LXVD2X order for little endian.
    138  	MOVD $p256mul<>+0x00(SB), CPOOL
    139  
    140  	LXVD2X (P1ptr)(R0), Y1L
    141  	LXVD2X (P1ptr)(R16), Y1H
    142  
        	// Swap the doublewords loaded by LXVD2X into true little
        	// endian order before doing vector arithmetic.
    143  	XXPERMDI Y1H, Y1H, $2, Y1H
    144  	XXPERMDI Y1L, Y1L, $2, Y1L
    145  
    146  	LXVD2X (CPOOL)(R0), PL
    147  	LXVD2X (CPOOL)(R16), PH
    148  
    149  	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
    150  	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
    151  	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2
    152  
        	// Swap back to STXVD2X doubleword order before storing.
    153  	XXPERMDI T1H, T1H, $2, T1H
    154  	XXPERMDI T1L, T1L, $2, T1L
    155  
    156  	STXVD2X T1L, (R0+P1ptr)
    157  	STXVD2X T1H, (R16+P1ptr)
    158  	RET
    159  
    160  #undef P1ptr
    161  #undef CPOOL
    162  #undef Y1L
    163  #undef Y1H
    164  #undef T1L
    165  #undef T1H
    166  #undef PL
    167  #undef PH
    168  #undef CAR1
   169  
    170  #define P3ptr   R3
    171  #define P1ptr   R4
    172  #define P2ptr   R5
    173  
    174  #define X1L    V0
    175  #define X1H    V1
    176  #define Y1L    V2
    177  #define Y1H    V3
    178  #define Z1L    V4
    179  #define Z1H    V5
    180  #define X2L    V6
    181  #define X2H    V7
    182  #define Y2L    V8
    183  #define Y2H    V9
    184  #define Z2L    V10
    185  #define Z2H    V11
    186  #define SEL    V12
    187  #define ZER    V13
    188  
    189  // This function uses LXVD2X and STXVD2X to avoid the
    190  // data alignment requirement for LVX, STVX. Since
    191  // this code is just moving bytes and not doing arithmetic,
    192  // order of the bytes doesn't matter.
    193  //
        // p256MovCond copies the 96-byte point b into res when cond == 0,
        // and copies a into res otherwise.  The choice is made with a
        // branch-free VSEL on an all-ones/all-zeros mask, so it takes
        // constant time regardless of cond.
    194  // func p256MovCond(res, a, b *p256Point, cond int)
    195  TEXT ·p256MovCond(SB), NOSPLIT, $0-32
    196  	MOVD res+0(FP), P3ptr
    197  	MOVD a+8(FP), P1ptr
    198  	MOVD b+16(FP), P2ptr
    199  	MOVD $16, R16
    200  	MOVD $32, R17
    201  	MOVD $48, R18
    202  	MOVD $56, R21
    203  	MOVD $64, R19
    204  	MOVD $80, R20
    205  	// cond is R1 + 24 (cond offset) + 32
    206  	LXVDSX (R1)(R21), SEL
    207  	VSPLTISB $0, ZER
    208  	// SEL controls whether to store a or b
        	// After the compare, SEL is all ones iff cond == 0.
    209  	VCMPEQUD SEL, ZER, SEL
    210  
        	// Load all six 16-byte halves (X, Y, Z; high and low) of both
        	// points.  Byte order inside the vectors is irrelevant here
        	// because the data is only copied, never computed on.
    211  	LXVD2X (P1ptr+R0), X1H
    212  	LXVD2X (P1ptr+R16), X1L
    213  	LXVD2X (P1ptr+R17), Y1H
    214  	LXVD2X (P1ptr+R18), Y1L
    215  	LXVD2X (P1ptr+R19), Z1H
    216  	LXVD2X (P1ptr+R20), Z1L
    217  
    218  	LXVD2X (P2ptr+R0), X2H
    219  	LXVD2X (P2ptr+R16), X2L
    220  	LXVD2X (P2ptr+R17), Y2H
    221  	LXVD2X (P2ptr+R18), Y2L
    222  	LXVD2X (P2ptr+R19), Z2H
    223  	LXVD2X (P2ptr+R20), Z2L
    224  
        	// VSEL picks the second operand where the mask bit is set:
        	// each result half is b's when cond == 0, a's otherwise.
    225  	VSEL X1H, X2H, SEL, X1H
    226  	VSEL X1L, X2L, SEL, X1L
    227  	VSEL Y1H, Y2H, SEL, Y1H
    228  	VSEL Y1L, Y2L, SEL, Y1L
    229  	VSEL Z1H, Z2H, SEL, Z1H
    230  	VSEL Z1L, Z2L, SEL, Z1L
    231  
    232  	STXVD2X X1H, (P3ptr+R0)
    233  	STXVD2X X1L, (P3ptr+R16)
    234  	STXVD2X Y1H, (P3ptr+R17)
    235  	STXVD2X Y1L, (P3ptr+R18)
    236  	STXVD2X Z1H, (P3ptr+R19)
    237  	STXVD2X Z1L, (P3ptr+R20)
    238  
    239  	RET
    240  
    241  #undef P3ptr
    242  #undef P1ptr
    243  #undef P2ptr
    244  #undef X1L
    245  #undef X1H
    246  #undef Y1L
    247  #undef Y1H
    248  #undef Z1L
    249  #undef Z1H
    250  #undef X2L
    251  #undef X2H
    252  #undef Y2L
    253  #undef Y2H
    254  #undef Z2L
    255  #undef Z2H
    256  #undef SEL
    257  #undef ZER
   258  
    259  #define P3ptr   R3
    260  #define P1ptr   R4
    261  #define COUNT   R5
    262  
    263  #define X1L    V0
    264  #define X1H    V1
    265  #define Y1L    V2
    266  #define Y1H    V3
    267  #define Z1L    V4
    268  #define Z1H    V5
    269  #define X2L    V6
    270  #define X2H    V7
    271  #define Y2L    V8
    272  #define Y2H    V9
    273  #define Z2L    V10
    274  #define Z2H    V11
    275  
    276  #define ONE   V18
    277  #define IDX   V19
    278  #define SEL1  V20
    279  #define SEL2  V21
        // p256Select performs a constant-time lookup of entry idx in a
        // 16-entry table of 96-byte points: every entry is read, and a
        // branch-free VSEL keeps only the one whose 1-based position
        // matches idx.  When idx is 0 (or out of range) the accumulators
        // stay zero and a zeroed point is stored.
        // NOTE(review): this comment previously named the first argument
        // "point", but the code references res+0(FP); the Go declaration
        // must use the name res for vet to accept this.
    280  // func p256Select(res *p256Point, table *p256Table, idx int)
    281  TEXT ·p256Select(SB), NOSPLIT, $0-24
    282  	MOVD res+0(FP), P3ptr
    283  	MOVD table+8(FP), P1ptr
    284  	MOVD $16, R16
    285  	MOVD $32, R17
    286  	MOVD $48, R18
    287  	MOVD $64, R19
    288  	MOVD $80, R20
    289  
        	// idx is at 16(FP) = 48(R1); splat its low byte across IDX.
    290  	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
    291  	VSPLTB   $7, SEL1, IDX    // splat byte
    292  	VSPLTISB $1, ONE          // VREPIB $1, ONE
    293  	VSPLTISB $1, SEL2         // VREPIB $1, SEL2
    294  	MOVD     $16, COUNT	  // len(p256Table)
    295  	MOVD     COUNT, CTR       // set up ctr
    296  
        	// Zero the accumulators; they only change on an index match.
    297  	VSPLTISB $0, X1H // VZERO  X1H
    298  	VSPLTISB $0, X1L // VZERO  X1L
    299  	VSPLTISB $0, Y1H // VZERO  Y1H
    300  	VSPLTISB $0, Y1L // VZERO  Y1L
    301  	VSPLTISB $0, Z1H // VZERO  Z1H
    302  	VSPLTISB $0, Z1L // VZERO  Z1L
    303  
    304  loop_select:
    305  
    306  	// LVXD2X is used here since data alignment doesn't
    307  	// matter.
    308  
    309  	LXVD2X (P1ptr+R0), X2H
    310  	LXVD2X (P1ptr+R16), X2L
    311  	LXVD2X (P1ptr+R17), Y2H
    312  	LXVD2X (P1ptr+R18), Y2L
    313  	LXVD2X (P1ptr+R19), Z2H
    314  	LXVD2X (P1ptr+R20), Z2L
    315  
        	// SEL2 holds the current 1-based entry number in every byte.
    316  	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK
    317  
    318  	// This will result in SEL1 being all 0s or 1s, meaning
    319  	// the result is either X1L or X2L, no individual byte
    320  	// selection.
    321  
    322  	VSEL X1L, X2L, SEL1, X1L
    323  	VSEL X1H, X2H, SEL1, X1H
    324  	VSEL Y1L, Y2L, SEL1, Y1L
    325  	VSEL Y1H, Y2H, SEL1, Y1H
    326  	VSEL Z1L, Z2L, SEL1, Z1L
    327  	VSEL Z1H, Z2H, SEL1, Z1H
    328  
    329  	// Add 1 to all bytes in SEL2
    330  	VADDUBM SEL2, ONE, SEL2    // VAB  SEL2, ONE, SEL2 OK
    331  	ADD     $96, P1ptr
    332  	BDNZ    loop_select
    333  
    334  	// STXVD2X is used here so that alignment doesn't
    335  	// need to be verified. Since values were loaded
    336  	// using LXVD2X this is OK.
    337  	STXVD2X X1H, (P3ptr+R0)
    338  	STXVD2X X1L, (P3ptr+R16)
    339  	STXVD2X Y1H, (P3ptr+R17)
    340  	STXVD2X Y1L, (P3ptr+R18)
    341  	STXVD2X Z1H, (P3ptr+R19)
    342  	STXVD2X Z1L, (P3ptr+R20)
    343  	RET
    344  
    345  #undef P3ptr
    346  #undef P1ptr
    347  #undef COUNT
    348  #undef X1L
    349  #undef X1H
    350  #undef Y1L
    351  #undef Y1H
    352  #undef Z1L
    353  #undef Z1H
    354  #undef X2L
    355  #undef X2H
    356  #undef Y2L
    357  #undef Y2H
    358  #undef Z2L
    359  #undef Z2H
    360  #undef ONE
    361  #undef IDX
    362  #undef SEL1
    363  #undef SEL2
   364  
    365  #define P3ptr   R3
    366  #define P1ptr   R4
    367  #define COUNT   R5
    368  
    369  #define X1L    V0
    370  #define X1H    V1
    371  #define Y1L    V2
    372  #define Y1H    V3
    373  #define Z1L    V4
    374  #define Z1H    V5
    375  #define X2L    V6
    376  #define X2H    V7
    377  #define Y2L    V8
    378  #define Y2H    V9
    379  #define Z2L    V10
    380  #define Z2H    V11
    381  
    382  #define ONE   V18
    383  #define IDX   V19
    384  #define SEL1  V20
    385  #define SEL2  V21
    386  
        // p256SelectAffine is the affine variant of p256Select: a
        // constant-time lookup of entry idx in a 32-entry table of
        // 64-byte (x, y) points.  Every entry is read and a branch-free
        // VSEL keeps the one whose 1-based position matches idx; idx 0
        // (or out of range) leaves the zeroed accumulators, so a zero
        // point is stored.
    387  // func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
    388  TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
    389  	MOVD res+0(FP), P3ptr
    390  	MOVD table+8(FP), P1ptr
    391  	MOVD $16, R16
    392  	MOVD $32, R17
    393  	MOVD $48, R18
    394  
        	// idx is at 16(FP) = 48(R1); splat its low byte across IDX.
        	// R18 ($48) is then reused below as the fourth vector offset.
    395  	LXVDSX (R1)(R18), SEL1
    396  	VSPLTB $7, SEL1, IDX    // splat byte
    397  
    398  	VSPLTISB $1, ONE    // Vector with byte 1s
    399  	VSPLTISB $1, SEL2   // Vector with byte 1s
    400  	MOVD     $32, COUNT // len(p256AffineTable)
    401  	MOVD     COUNT, CTR // loop count
    402  
        	// Zero the accumulators; they only change on an index match.
    403  	VSPLTISB $0, X1H // VZERO  X1H
    404  	VSPLTISB $0, X1L // VZERO  X1L
    405  	VSPLTISB $0, Y1H // VZERO  Y1H
    406  	VSPLTISB $0, Y1L // VZERO  Y1L
    407  
    408  loop_select:
    409  	LXVD2X (P1ptr+R0), X2H
    410  	LXVD2X (P1ptr+R16), X2L
    411  	LXVD2X (P1ptr+R17), Y2H
    412  	LXVD2X (P1ptr+R18), Y2L
    413  
    414  	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx
    415  
    416  	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
    417  	VSEL X1H, X2H, SEL1, X1H
    418  	VSEL Y1L, Y2L, SEL1, Y1L
    419  	VSEL Y1H, Y2H, SEL1, Y1H
    420  
    421  	VADDUBM SEL2, ONE, SEL2    // Increment SEL2 bytes by 1
    422  	ADD     $64, P1ptr         // Next chunk
    423  	BDNZ	loop_select
    424  
    425  	STXVD2X X1H, (P3ptr+R0)
    426  	STXVD2X X1L, (P3ptr+R16)
    427  	STXVD2X Y1H, (P3ptr+R17)
    428  	STXVD2X Y1L, (P3ptr+R18)
    429  	RET
    430  
    431  #undef P3ptr
    432  #undef P1ptr
    433  #undef COUNT
    434  #undef X1L
    435  #undef X1H
    436  #undef Y1L
    437  #undef Y1H
    438  #undef Z1L
    439  #undef Z1H
    440  #undef X2L
    441  #undef X2H
    442  #undef Y2L
    443  #undef Y2H
    444  #undef Z2L
    445  #undef Z2H
    446  #undef ONE
    447  #undef IDX
    448  #undef SEL1
    449  #undef SEL2
   450  
    451  #define res_ptr R3
    452  #define x_ptr   R4
    453  #define CPOOL   R7
    454  
    455  #define T0   V0
    456  #define T1   V1
    457  #define T2   V2
    458  #define TT0  V3
    459  #define TT1  V4
    460  
    461  #define ZER   V6
    462  #define SEL1  V7
    463  #define SEL2  V8
    464  #define CAR1  V9
    465  #define CAR2  V10
    466  #define RED1  V11
    467  #define RED2  V12
    468  #define PL    V13
    469  #define PH    V14
    470  
        // p256FromMont converts in out of Montgomery form into res:
        // four identical reduction rounds each fold 64 bits out of the
        // value, and a final conditional subtraction brings the result
        // into [0, P).
        // NOTE(review): the implied contract res = in * 2^-256 mod P256
        // is inferred from the name and the four 64-bit rounds — confirm
        // against the Go declaration's documentation.
    471  // func p256FromMont(res, in *p256Element)
    472  TEXT ·p256FromMont(SB), NOSPLIT, $0-16
    473  	MOVD res+0(FP), res_ptr
    474  	MOVD in+8(FP), x_ptr
    475  
    476  	MOVD $16, R16
    477  	MOVD $32, R17
    478  	MOVD $48, R18
    479  	MOVD $64, R19
    480  	MOVD $p256<>+0x00(SB), CPOOL
    481  
    482  	VSPLTISB $0, T2  // VZERO T2
    483  	VSPLTISB $0, ZER // VZERO ZER
    484  
    485  	// Constants are defined so that the LXVD2X is correct
    486  	LXVD2X (CPOOL+R0), PH
    487  	LXVD2X (CPOOL+R16), PL
    488  
    489  	// VPERM byte selections
    490  	LXVD2X (CPOOL+R18), SEL2
    491  	LXVD2X (CPOOL+R19), SEL1
    492  
    493  	LXVD2X (R16)(x_ptr), T1
    494  	LXVD2X (R0)(x_ptr), T0
    495  
    496  	// Put in true little endian order
    497  	XXPERMDI T0, T0, $2, T0
    498  	XXPERMDI T1, T1, $2, T1
    499  
        	// Each round: build the reduction terms RED1/RED2 from the
        	// low digits of T1||T0 with VPERM, shift the accumulator
        	// T2||T1||T0 right by 64 bits (the VSLDOI pair), then add
        	// the reduction terms with 128-bit carry propagation.
    500  	// First round
    501  	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
    502  	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
    503  	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
    504  
    505  	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
    506  	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
    507  
    508  	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
    509  	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
    510  	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
    511  	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
    512  	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2
    513  
    514  	// Second round
    515  	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
    516  	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
    517  	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
    518  
    519  	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
    520  	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
    521  
    522  	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
    523  	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
    524  	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
    525  	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
    526  	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2
    527  
    528  	// Third round
    529  	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
    530  	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
    531  	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
    532  
    533  	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
    534  	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
    535  
    536  	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
    537  	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
    538  	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
    539  	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
    540  	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2
    541  
    542  	// Last round
    543  	VPERM   T1, T0, SEL1, RED2    // d1 d0 d1 d0
    544  	VPERM   ZER, RED2, SEL2, RED1 // 0  d1 d0  0
    545  	VSUBUQM RED2, RED1, RED2      // VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
    546  
    547  	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
    548  	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1
    549  
    550  	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
    551  	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
    552  	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
    553  	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
    554  	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2
    555  
    556  	// ---------------------------------------------------
    557  
        	// Conditional subtraction: compute T - P with full borrow
        	// propagation through T2.
    558  	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
    559  	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
    560  	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
    561  	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
    562  	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2
    563  
        	// VSEL picks the second operand where the mask is set: T2 is
        	// all ones exactly when T2||T1||T0 < P (the subtract
        	// borrowed), so the original T0/T1 are kept in that case and
        	// the reduced TT0/TT1 otherwise.
    564  	VSEL TT0, T0, T2, T0
    565  	VSEL TT1, T1, T2, T1
    566  
    567  	// Reorder the bytes so STXVD2X can be used.
    568  	// TT0, TT1 used for VPERM result in case
    569  	// the caller expects T0, T1 to be good.
    570  	XXPERMDI T0, T0, $2, TT0
    571  	XXPERMDI T1, T1, $2, TT1
    572  
    573  	STXVD2X TT0, (R0)(res_ptr)
    574  	STXVD2X TT1, (R16)(res_ptr)
    575  	RET
    576  
    577  #undef res_ptr
    578  #undef x_ptr
    579  #undef CPOOL
    580  #undef T0
    581  #undef T1
    582  #undef T2
    583  #undef TT0
    584  #undef TT1
    585  #undef ZER
    586  #undef SEL1
    587  #undef SEL2
    588  #undef CAR1
    589  #undef CAR2
    590  #undef RED1
    591  #undef RED2
    592  #undef PL
    593  #undef PH
   594  
   595  // ---------------------------------------
   596  // p256MulInternal
   597  // V0-V3 V30,V31 - Not Modified
   598  // V4-V15 V27-V29 - Volatile
   599  
   600  #define CPOOL   R7
   601  
   602  // Parameters
   603  #define X0    V0 // Not modified
   604  #define X1    V1 // Not modified
   605  #define Y0    V2 // Not modified
   606  #define Y1    V3 // Not modified
   607  #define T0    V4 // Result
   608  #define T1    V5 // Result
   609  #define P0    V30 // Not modified
   610  #define P1    V31 // Not modified
   611  
   612  // Temporaries: lots of reused vector regs
   613  #define YDIG  V6 // Overloaded with CAR2
   614  #define ADD1H V7 // Overloaded with ADD3H
   615  #define ADD2H V8 // Overloaded with ADD4H
   616  #define ADD3  V9 // Overloaded with SEL2,SEL5
   617  #define ADD4  V10 // Overloaded with SEL3,SEL6
   618  #define RED1  V11 // Overloaded with CAR2
   619  #define RED2  V12
   620  #define RED3  V13 // Overloaded with SEL1
   621  #define T2    V14
   622  // Overloaded temporaries
   623  #define ADD1  V4 // Overloaded with T0
   624  #define ADD2  V5 // Overloaded with T1
   625  #define ADD3H V7 // Overloaded with ADD1H
   626  #define ADD4H V8 // Overloaded with ADD2H
   627  #define ZER   V28 // Overloaded with TMP1
   628  #define CAR1  V6 // Overloaded with YDIG
   629  #define CAR2  V11 // Overloaded with RED1
   630  // Constant Selects
   631  #define SEL1  V13 // Overloaded with RED3
   632  #define SEL2  V9 // Overloaded with ADD3,SEL5
   633  #define SEL3  V10 // Overloaded with ADD4,SEL6
   634  #define SEL4  V6 // Overloaded with YDIG,CAR1
   635  #define SEL5  V9 // Overloaded with ADD3,SEL2
   636  #define SEL6  V10 // Overloaded with ADD4,SEL3
   637  
   638  // TMP1, TMP2 used in
   639  // VMULT macros
   640  #define TMP1  V13 // Overloaded with RED3
   641  #define TMP2  V27
   642  #define ONE   V29 // 1s splatted by word
   643  
   644  /* *
   645   * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   646   * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   647   * With you, SIMD be...
   648   *
   649   *                                           +--------+--------+
   650   *                                  +--------|  RED2  |  RED1  |
   651   *                                  |        +--------+--------+
   652   *                                  |       ---+--------+--------+
   653   *                                  |  +---- T2|   T1   |   T0   |--+
   654   *                                  |  |    ---+--------+--------+  |
   655   *                                  |  |                            |
   656   *                                  |  |    ======================= |
   657   *                                  |  |                            |
   658   *                                  |  |       +--------+--------+<-+
   659   *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   660   *                                  |  |       +--------+--------+  |     |
   661   *                                  |  |     +--------+--------+<---+     |
   662   *                                  |  |     | ADD2H  | ADD1H  |--+       |
   663   *                                  |  |     +--------+--------+  |       |
   664   *                                  |  |     +--------+--------+<-+       |
   665   *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   666   *                                  |  |     +--------+--------+  | |     |
   667   *                                  |  |   +--------+--------+<---+ |     |
   668   *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   669   *                                  |  |   +--------+--------+      | |   V
   670   *                                  |  | ------------------------   | | +--------+
   671   *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   672   *                                  |  |                            | | +--------+
   673   *                                  |  +---->+--------+--------+    | |   |
   674   *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   675   *                                  |        +--------+--------+    | |   |
   676   *                                  +---->---+--------+--------+    | |   |
   677   *                                         T2|   T1   |   T0   |----+ |   |
   678   *                                        ---+--------+--------+    | |   |
   679   *                                        ---+--------+--------+<---+ |   |
   680   *                                    +--- T2|   T1   |   T0   |----------+
   681   *                                    |   ---+--------+--------+      |   |
   682   *                                    |  +--------+--------+<-------------+
   683   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   684   *                                    |  +--------+--------+     |    |   |
   685   *                                    |  +--------+<----------------------+
   686   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   687   *                                    |  +--------+              |    |
   688   *                                    +--->+--------+--------+   |    |
   689   *                                         |   T1   |   T0   |--------+
   690   *                                         +--------+--------+   |    |
   691   *                                   --------------------------- |    |
   692   *                                                               |    |
   693   *                                       +--------+--------+<----+    |
   694   *                                       |  RED2  |  RED1  |          |
   695   *                                       +--------+--------+          |
   696   *                                      ---+--------+--------+<-------+
   697   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   698   *                                      ---+--------+--------+
   699   *
   700   *                                                                *Mi obra de arte de siglo XXI @vpaprots
   701   *
   702   *
   703   * First group is special, doesn't get the two inputs:
   704   *                                             +--------+--------+<-+
   705   *                                     +-------|  ADD2  |  ADD1  |--|-----+
   706   *                                     |       +--------+--------+  |     |
   707   *                                     |     +--------+--------+<---+     |
   708   *                                     |     | ADD2H  | ADD1H  |--+       |
   709   *                                     |     +--------+--------+  |       |
   710   *                                     |     +--------+--------+<-+       |
   711   *                                     |     |  ADD4  |  ADD3  |--|-+     |
   712   *                                     |     +--------+--------+  | |     |
   713   *                                     |   +--------+--------+<---+ |     |
   714   *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   715   *                                     |   +--------+--------+      | |   V
   716   *                                     | ------------------------   | | +--------+
   717   *                                     |                            | | |  RED3  |  [d0 0 0 d0]
   718   *                                     |                            | | +--------+
   719   *                                     +---->+--------+--------+    | |   |
   720   *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
   721   *                                           +--------+--------+    | |   |
   722   *                                        ---+--------+--------+<---+ |   |
   723   *                                    +--- T2|   T1   |   T0   |----------+
   724   *                                    |   ---+--------+--------+      |   |
   725   *                                    |  +--------+--------+<-------------+
   726   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   727   *                                    |  +--------+--------+     |    |   |
   728   *                                    |  +--------+<----------------------+
   729   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   730   *                                    |  +--------+              |    |
   731   *                                    +--->+--------+--------+   |    |
   732   *                                         |   T1   |   T0   |--------+
   733   *                                         +--------+--------+   |    |
   734   *                                   --------------------------- |    |
   735   *                                                               |    |
   736   *                                       +--------+--------+<----+    |
   737   *                                       |  RED2  |  RED1  |          |
   738   *                                       +--------+--------+          |
   739   *                                      ---+--------+--------+<-------+
   740   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   741   *                                      ---+--------+--------+
   742   *
   743   * Last 'group' needs to RED2||RED1 shifted less
   744   */
   745  TEXT p256MulInternal<>(SB), NOSPLIT, $0-16
   746  	// CPOOL loaded from caller
   747  	MOVD $16, R16
   748  	MOVD $32, R17
   749  	MOVD $48, R18
   750  	MOVD $64, R19
   751  	MOVD $80, R20
   752  	MOVD $96, R21
   753  	MOVD $112, R22
   754  
   755  	// ---------------------------------------------------
   756  
   757  	VSPLTW $3, Y0, YDIG // VREPF Y0 is input
   758  
   759  	//	VMLHF X0, YDIG, ADD1H
   760  	//	VMLHF X1, YDIG, ADD2H
   761  	//	VMLF  X0, YDIG, ADD1
   762  	//	VMLF  X1, YDIG, ADD2
   763  	//
   764  	VMULT(X0, YDIG, ADD1, ADD1H)
   765  	VMULT(X1, YDIG, ADD2, ADD2H)
   766  
   767  	VSPLTISW $1, ONE
   768  	VSPLTW $2, Y0, YDIG // VREPF
   769  
   770  	//	VMALF  X0, YDIG, ADD1H, ADD3
   771  	//	VMALF  X1, YDIG, ADD2H, ADD4
   772  	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   773  	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   774  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   775  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   776  
   777  	LXVD2X   (R17)(CPOOL), SEL1
   778  	VSPLTISB $0, ZER               // VZERO ZER
   779  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   780  
   781  	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free	// VSLDB
   782  	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free	// VSLDB
   783  
   784  	VADDCUQ  T0, ADD3, CAR1     // VACCQ
   785  	VADDUQM  T0, ADD3, T0       // ADD3 Free	// VAQ
   786  	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
   787  	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free	// VACQ
   788  
   789  	LXVD2X  (R18)(CPOOL), SEL2
   790  	LXVD2X  (R19)(CPOOL), SEL3
   791  	LXVD2X  (R20)(CPOOL), SEL4
   792  	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   793  	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   794  	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   795  	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow -->? // VSQ
   796  
   797  	VSLDOI $12, T1, T0, T0 // VSLDB
   798  	VSLDOI $12, T2, T1, T1 // VSLDB
   799  
   800  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   801  	VADDUQM  T0, ADD3H, T0       // VAQ
   802  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   803  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   804  
   805  	// ---------------------------------------------------
   806  
   807  	VSPLTW $1, Y0, YDIG                // VREPF
   808  
   809  	//	VMALHF X0, YDIG, T0, ADD1H
   810  	//	VMALHF X1, YDIG, T1, ADD2H
   811  	//	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
   812  	//	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
   813  	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
   814  	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
   815  
   816  	VSPLTW $0, Y0, YDIG // VREPF
   817  
   818  	//	VMALF  X0, YDIG, ADD1H, ADD3
   819  	//	VMALF  X1, YDIG, ADD2H, ADD4
   820  	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
   821  	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
   822  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   823  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   824  
   825  	VSPLTISB $0, ZER               // VZERO ZER
   826  	LXVD2X   (R17)(CPOOL), SEL1
   827  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   828  
   829  	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0		// VSLDB
   830  	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free	// VSLDB
   831  
   832  	VADDCUQ  T0, RED1, CAR1     // VACCQ
   833  	VADDUQM  T0, RED1, T0       // VAQ
   834  	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
   835  	VADDEUQM T1, RED2, CAR1, T1 // VACQ
   836  
   837  	VADDCUQ  T0, ADD3, CAR1       // VACCQ
   838  	VADDUQM  T0, ADD3, T0         // VAQ
   839  	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
   840  	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
   841  	VADDUQM  T2, CAR2, T2         // VAQ
   842  
   843  	LXVD2X  (R18)(CPOOL), SEL2
   844  	LXVD2X  (R19)(CPOOL), SEL3
   845  	LXVD2X  (R20)(CPOOL), SEL4
   846  	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   847  	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   848  	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   849  	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow	// VSQ
   850  
   851  	VSLDOI $12, T1, T0, T0 // VSLDB
   852  	VSLDOI $12, T2, T1, T1 // VSLDB
   853  
   854  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   855  	VADDUQM  T0, ADD3H, T0       // VAQ
   856  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   857  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   858  
   859  	// ---------------------------------------------------
   860  
   861  	VSPLTW $3, Y1, YDIG                // VREPF
   862  
   863  	//	VMALHF X0, YDIG, T0, ADD1H
   864  	//	VMALHF X1, YDIG, T1, ADD2H
   865  	//	VMALF  X0, YDIG, T0, ADD1
   866  	//	VMALF  X1, YDIG, T1, ADD2
   867  	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
   868  	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
   869  
   870  	VSPLTW $2, Y1, YDIG // VREPF
   871  
   872  	//	VMALF  X0, YDIG, ADD1H, ADD3
   873  	//	VMALF  X1, YDIG, ADD2H, ADD4
   874  	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   875  	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   876  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   877  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   878  
   879  	LXVD2X   (R17)(CPOOL), SEL1
   880  	VSPLTISB $0, ZER               // VZERO ZER
   881  	LXVD2X   (R17)(CPOOL), SEL1
   882  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   883  
   884  	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free		// VSLDB
   885  	VSLDOI $12, T2, ADD2, T1   // ADD2 Free		// VSLDB
   886  
   887  	VADDCUQ  T0, RED1, CAR1     // VACCQ
   888  	VADDUQM  T0, RED1, T0       // VAQ
   889  	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
   890  	VADDEUQM T1, RED2, CAR1, T1 // VACQ
   891  
   892  	VADDCUQ  T0, ADD3, CAR1       // VACCQ
   893  	VADDUQM  T0, ADD3, T0         // VAQ
   894  	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
   895  	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
   896  	VADDUQM  T2, CAR2, T2         // VAQ
   897  
   898  	LXVD2X  (R18)(CPOOL), SEL2
   899  	LXVD2X  (R19)(CPOOL), SEL3
   900  	LXVD2X  (R20)(CPOOL), SEL4
   901  	VPERM   RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   902  	VPERM   RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   903  	VPERM   RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   904  	VSUBUQM RED2, RED3, RED2     // Guaranteed not to underflow	// VSQ
   905  
   906  	VSLDOI $12, T1, T0, T0 // VSLDB
   907  	VSLDOI $12, T2, T1, T1 // VSLDB
   908  
   909  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   910  	VADDUQM  T0, ADD3H, T0       // VAQ
   911  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   912  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   913  
   914  	// ---------------------------------------------------
   915  
   916  	VSPLTW $1, Y1, YDIG                // VREPF
   917  
   918  	//	VMALHF X0, YDIG, T0, ADD1H
   919  	//	VMALHF X1, YDIG, T1, ADD2H
   920  	//	VMALF  X0, YDIG, T0, ADD1
   921  	//	VMALF  X1, YDIG, T1, ADD2
   922  	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
   923  	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)
   924  
   925  	VSPLTW $0, Y1, YDIG // VREPF
   926  
   927  	//	VMALF  X0, YDIG, ADD1H, ADD3
   928  	//	VMALF  X1, YDIG, ADD2H, ADD4
   929  	//	VMALHF X0, YDIG, ADD1H, ADD3H
   930  	//	VMALHF X1, YDIG, ADD2H, ADD4H
   931  	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
   932  	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)
   933  
   934  	VSPLTISB $0, ZER               // VZERO ZER
   935  	LXVD2X   (R17)(CPOOL), SEL1
   936  	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   937  
   938  	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
   939  	VSLDOI $12, T2, ADD2, T1   // VSLDB
   940  
   941  	VADDCUQ  T0, RED1, CAR1     // VACCQ
   942  	VADDUQM  T0, RED1, T0       // VAQ
   943  	VADDECUQ T1, RED2, CAR1, T2 // VACCCQ
   944  	VADDEUQM T1, RED2, CAR1, T1 // VACQ
   945  
   946  	VADDCUQ  T0, ADD3, CAR1       // VACCQ
   947  	VADDUQM  T0, ADD3, T0         // VAQ
   948  	VADDECUQ T1, ADD4, CAR1, CAR2 // VACCCQ
   949  	VADDEUQM T1, ADD4, CAR1, T1   // VACQ
   950  	VADDUQM  T2, CAR2, T2         // VAQ
   951  
   952  	LXVD2X  (R21)(CPOOL), SEL5
   953  	LXVD2X  (R22)(CPOOL), SEL6
   954  	VPERM   T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
   955  	VPERM   T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
   956  	VSUBUQM RED2, RED1, RED2     // Guaranteed not to underflow	// VSQ
   957  
   958  	VSLDOI $12, T1, T0, T0 // VSLDB
   959  	VSLDOI $12, T2, T1, T1 // VSLDB
   960  
   961  	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
   962  	VADDUQM  T0, ADD3H, T0       // VAQ
   963  	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
   964  	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ
   965  
   966  	VADDCUQ  T0, RED1, CAR1       // VACCQ
   967  	VADDUQM  T0, RED1, T0         // VAQ
   968  	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
   969  	VADDEUQM T1, RED2, CAR1, T1   // VACQ
   970  	VADDUQM  T2, CAR2, T2         // VAQ
   971  
   972  	// ---------------------------------------------------
   973  
   974  	VSPLTISB $0, RED3            // VZERO   RED3
   975  	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
   976  	VSUBUQM  T0, P0, ADD1H       // VSQ
   977  	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
   978  	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
   979  	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ
   980  
   981  	// what output to use, ADD2H||ADD1H or T1||T0?
   982  	VSEL ADD1H, T0, T2, T0
   983  	VSEL ADD2H, T1, T2, T1
   984  	RET
   985  
   986  #undef CPOOL
   987  
   988  #undef X0
   989  #undef X1
   990  #undef Y0
   991  #undef Y1
   992  #undef T0
   993  #undef T1
   994  #undef P0
   995  #undef P1
   996  
   997  #undef SEL1
   998  #undef SEL2
   999  #undef SEL3
  1000  #undef SEL4
  1001  #undef SEL5
  1002  #undef SEL6
  1003  
  1004  #undef YDIG
  1005  #undef ADD1H
  1006  #undef ADD2H
  1007  #undef ADD3
  1008  #undef ADD4
  1009  #undef RED1
  1010  #undef RED2
  1011  #undef RED3
  1012  #undef T2
  1013  #undef ADD1
  1014  #undef ADD2
  1015  #undef ADD3H
  1016  #undef ADD4H
  1017  #undef ZER
  1018  #undef CAR1
  1019  #undef CAR2
  1020  
  1021  #undef TMP1
  1022  #undef TMP2
  1023  
// p256SubInternal(T1, T0, X1, X0, Y1, Y0) computes the 256-bit modular
// difference (T1||T0) = (X1||X0) - (Y1||Y0) mod p256, where each 256-bit
// value is held as a high/low pair of 128-bit vector registers.
// Strategy: subtract with borrow across the two halves, then add the prime
// (PH||PL) back into the raw difference and use the borrow-derived mask in
// SEL1 to pick the corrected value when the subtraction underflowed.
// Clobbers: ZER, SEL1, CAR1, TT0, TT1.
#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER            \ // VZERO ZER = 0
	VSUBCUQ  X0, Y0, CAR1       \ // borrow out of the low 128 bits
	VSUBUQM  X0, Y0, T0         \ // low half of raw difference
	VSUBECUQ X1, Y1, CAR1, SEL1 \ // final borrow (1 = no underflow)
	VSUBEUQM X1, Y1, CAR1, T1   \ // high half of raw difference
	VSUBUQM  ZER, SEL1, SEL1    \ // VSQ: turn the borrow bit into a select mask
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ: carry from adding the prime back (low)
	VADDUQM  T0, PL, TT0        \ // VAQ: candidate corrected low half
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ: candidate corrected high half
	                            \
	VSEL     TT0, T0, SEL1, T0  \ // keep raw or corrected low per mask
	VSEL     TT1, T1, SEL1, T1  \ // keep raw or corrected high per mask
  1038  
// p256AddInternal(T1, T0, X1, X0, Y1, Y0) computes the 256-bit modular sum
// (T1||T0) = (X1||X0) + (Y1||Y0) mod p256.
// Strategy: add with carry across the two 128-bit halves, then trial-subtract
// the prime (PH||PL); the final borrow/carry propagated through SEL1 selects
// the reduced value when the sum reached or exceeded the prime.
// Clobbers: ZER, SEL1, CAR1, CAR2, T2, TT0, TT1.
#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1        \ // carry out of the low 128 bits
	VADDUQM  X0, Y0, T0          \ // low half of raw sum
	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ: carry out of the high 128 bits
	VADDEUQM X1, Y1, CAR1, T1    \ // high half of raw sum
	                             \
	VSPLTISB $0, ZER             \ // ZER = 0
	VSUBCUQ  T0, PL, CAR1        \ // VSCBIQ: borrow from trial subtract (low)
	VSUBUQM  T0, PL, TT0         \ // candidate reduced low half
	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ: propagate borrow through high half
	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ: candidate reduced high half
	VSUBEUQM T2, ZER, CAR2, SEL1 \ // fold 257th bit + borrow into select mask
	                             \
	VSEL     TT0, T0, SEL1, T0   \ // keep raw or reduced low per mask
	VSEL     TT1, T1, SEL1, T1
  1054  
// p256HalfInternal(T1, T0, X1, X0) computes (T1||T0) = (X1||X0)/2 mod p256.
// Strategy: build a mask in SEL1 from the low bit of X0 (odd/even); when the
// input is odd, select X + prime (a 257-bit value whose top bit lands in T2),
// otherwise keep X with T2 = 0. Then shift the selected 257-bit value right
// by one, moving each half's low bit into the half below via VSLDOI/VSL.
// Clobbers: ZER, SEL1, CAR1, T2, TT0, TT1.
#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER            \ // ZER = 0
	VSUBEUQM ZER, ZER, X0, SEL1 \ // mask from the parity bit of X0
	                            \
	VADDCUQ  X0, PL, CAR1       \ // X + prime, low-half carry
	VADDUQM  X0, PL, T0         \ // candidate low half
	VADDECUQ X1, PH, CAR1, T2   \ // 257th bit of X + prime
	VADDEUQM X1, PH, CAR1, T1   \ // candidate high half
	                            \
	VSEL     T0, X0, SEL1, T0   \ // odd: X+prime; even: X (low)
	VSEL     T1, X1, SEL1, T1   \ // odd: X+prime; even: X (high)
	VSEL     T2, ZER, SEL1, T2  \ // odd: top carry bit; even: 0
	                            \
	VSLDOI   $15, T2, ZER, TT1  \ // bit to shift down into the high half
	VSLDOI   $15, T1, ZER, TT0  \ // bit to shift down into the low half
	VSPLTISB $1, SEL1           \ // shift count = 1
	VSR      T0, SEL1, T0       \ // VSRL: low half >> 1
	VSR      T1, SEL1, T1       \ // high half >> 1
	VSPLTISB $7, SEL1           \ // VREPIB: shift count = 7
	VSL      TT0, SEL1, TT0     \ // position incoming bit at the top
	VSL      TT1, SEL1, TT1     \
	VOR      T0, TT0, T0        \ // merge shifted-in bit (low)
	VOR      T1, TT1, T1
  1078  
  1079  #define res_ptr R3
  1080  #define x_ptr   R4
  1081  #define y_ptr   R5
  1082  #define CPOOL   R7
  1083  #define TEMP    R8
  1084  #define N       R9
  1085  
  1086  // Parameters
  1087  #define X0    V0
  1088  #define X1    V1
  1089  #define Y0    V2
  1090  #define Y1    V3
  1091  #define T0    V4
  1092  #define T1    V5
  1093  
  1094  // Constants
  1095  #define P0    V30
  1096  #define P1    V31
// func p256Mul(res, in1, in2 *p256Element)
//
// Montgomery multiplication mod p256: loads the two 256-bit operands,
// swaps each 16-byte vector into the doubleword order p256MulInternal
// expects (LXVD2X has no alignment requirement, unlike LVX, but loads
// little-endian doublewords), multiplies via p256MulInternal, and stores
// the result. p256MulInternal takes X1:X0 * Y1:Y0 with the prime in
// P1:P0 and returns the product in T1:T0.
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL


	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	// Reorder doublewords for use by the multiply.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	// Load the prime into P1:P0 for p256MulInternal.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	// Reload CPOOL: R7 may have been clobbered by the call.
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap back to memory order and store the product.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
  1132  
// func p256Sqr(res, in *p256Element, n int)
//
// Montgomery squaring mod p256, repeated n times: res = in^(2^n).
// Implemented by feeding the operand to p256MulInternal as both X and Y,
// looping n times with the previous result as the next input. The loop
// counter is kept in the argument slot n+16(FP) because p256MulInternal
// clobbers general-purpose registers.
TEXT ·p256Sqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	// Reorder doublewords for use by the multiply.
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

sqrLoop:
	// Sqr uses same value for both

	VOR	X0, X0, Y0
	VOR	X1, X1, Y1

	// Reload the prime each iteration; the call clobbers registers.
	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL p256MulInternal<>(SB)

	MOVD	n+16(FP), N
	ADD	$-1, N
	CMP	$0, N
	BEQ	done
	MOVD	N, n+16(FP)	// Save counter to avoid clobber
	VOR	T0, T0, X0      // next iteration squares the current result
	VOR	T1, T1, X1
	BR	sqrLoop

done:
	MOVD $p256mul<>+0x00(SB), CPOOL

	// Swap back to memory order and store the final square.
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET
  1176  
  1177  #undef res_ptr
  1178  #undef x_ptr
  1179  #undef y_ptr
  1180  #undef CPOOL
  1181  
  1182  #undef X0
  1183  #undef X1
  1184  #undef Y0
  1185  #undef Y1
  1186  #undef T0
  1187  #undef T1
  1188  #undef P0
  1189  #undef P1
  1190  
  1191  #define P3ptr   R3
  1192  #define P1ptr   R4
  1193  #define P2ptr   R5
  1194  #define CPOOL   R7
  1195  
  1196  // Temporaries in REGs
  1197  #define Y2L    V15
  1198  #define Y2H    V16
  1199  #define T1L    V17
  1200  #define T1H    V18
  1201  #define T2L    V19
  1202  #define T2H    V20
  1203  #define T3L    V21
  1204  #define T3H    V22
  1205  #define T4L    V23
  1206  #define T4H    V24
  1207  
  1208  // Temps for Sub and Add
  1209  #define TT0  V11
  1210  #define TT1  V12
  1211  #define T2   V13
  1212  
  1213  // p256MulAsm Parameters
  1214  #define X0    V0
  1215  #define X1    V1
  1216  #define Y0    V2
  1217  #define Y1    V3
  1218  #define T0    V4
  1219  #define T1    V5
  1220  
  1221  #define PL    V30
  1222  #define PH    V31
  1223  
  1224  // Names for zero/sel selects
  1225  #define X1L    V0
  1226  #define X1H    V1
  1227  #define Y1L    V2 // p256MulAsmParmY
  1228  #define Y1H    V3 // p256MulAsmParmY
  1229  #define Z1L    V4
  1230  #define Z1H    V5
  1231  #define X2L    V0
  1232  #define X2H    V1
  1233  #define Z2L    V4
  1234  #define Z2H    V5
  1235  #define X3L    V17 // T1L
  1236  #define X3H    V18 // T1H
  1237  #define Y3L    V21 // T3L
  1238  #define Y3H    V22 // T3H
  1239  #define Z3L    V25
  1240  #define Z3H    V26
  1241  
  1242  #define ZER   V6
  1243  #define SEL1  V7
  1244  #define CAR1  V8
  1245  #define CAR2  V9
  1246  /* *
  1247   * Three operand formula:
  1248   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1249   * T1 = Z1²
  1250   * T2 = T1*Z1
  1251   * T1 = T1*X2
  1252   * T2 = T2*Y2
  1253   * T1 = T1-X1
  1254   * T2 = T2-Y1
  1255   * Z3 = Z1*T1
  1256   * T3 = T1²
  1257   * T4 = T3*T1
  1258   * T3 = T3*X1
  1259   * T1 = 2*T3
  1260   * X3 = T2²
  1261   * X3 = X3-T1
  1262   * X3 = X3-T4
  1263   * T3 = T3-X3
  1264   * T3 = T3*T2
  1265   * T4 = T4*Y1
  1266   * Y3 = T3-T4
  1267  
  1268   * Three operand formulas, but with MulInternal X,Y used to store temps
  1269  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1270  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1271  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1272  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1273  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1274  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1275  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1276  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1277  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1278  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1279  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1280  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1281  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1282  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1283  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1284  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1285  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1286  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1287  
  1288  	*/
  1289  //
  1290  // V27 is clobbered by p256MulInternal so must be
  1291  // saved in a temp.
  1292  //
  1293  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
// Mixed Jacobian + affine point addition, constant time with respect to
// the sign/sel/zero selectors. The affine Y2 is conditionally negated
// (sign), the sum is computed unconditionally, and the final selects
// substitute P1 (when sel == 0) or the affine P2 (when zero == 0).
// R16..R25 hold the byte offsets 16..160 used for indexed loads/stores.
TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $16-48
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD in2+16(FP), P2ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20
	MOVD $96, R21
	MOVD $112, R22
	MOVD $128, R23
	MOVD $144, R24
	MOVD $160, R25
	MOVD $104, R26 // offset of sign+24(FP)

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	LXVD2X (R17)(P2ptr), Y2L
	LXVD2X (R18)(P2ptr), Y2H
	XXPERMDI Y2H, Y2H, $2, Y2H
	XXPERMDI Y2L, Y2L, $2, Y2L

	// Equivalent of VLREPG sign+24(FP), SEL1
	LXVDSX   (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	// Conditionally negate Y2: compute P - Y2, keep it when sign != 0.
	VSUBCUQ  PL, Y2L, CAR1
	VSUBUQM  PL, Y2L, T1L
	VSUBEUQM PH, Y2H, CAR1, T1H

	VSEL T1L, Y2L, SEL1, Y2L
	VSEL T1H, Y2H, SEL1, Y2H

/* *
 * Three operand formula:
 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
 */
	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
	LXVD2X (R19)(P1ptr), X0     // Z1H
	LXVD2X (R20)(P1ptr), X1     // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
	VOR  T0, T0, X0
	VOR  T1, T1, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T2L
	VOR  T1, T1, T2H

	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
	MOVD   in2+16(FP), P2ptr    // reload: P2ptr was clobbered by the call
	LXVD2X (R0)(P2ptr), Y0      // X2H
	LXVD2X (R16)(P2ptr), Y1     // X2L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T1L
	VOR    T1, T1, T1H

	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  Y2L, Y2L, Y0
	VOR  Y2H, Y2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)

	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L
	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)

	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
	LXVD2X (R19)(P1ptr), X0     // Z1H
	LXVD2X (R20)(P1ptr), X1     // Z1L
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL   p256MulInternal<>(SB)

	VOR T0, T0, Z3L
	VOR T1, T1, Z3H

	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
	VOR  Y0, Y0, X0
	VOR  Y1, Y1, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, X0
	VOR  T1, T1, X1

	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T4L
	VOR  T1, T1, T4H

	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), Y0      // X1H
	LXVD2X (R16)(P1ptr), Y1     // X1L
	XXPERMDI Y1, Y1, $2, Y1
	XXPERMDI Y0, Y0, $2, Y0
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T3L
	VOR    T1, T1, T3H

	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
	p256AddInternal(T1H,T1L, T1,T0,T1,T0)

	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
	p256SubInternal(T1,T0,T1,T0,T1H,T1L)

	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
	VOR T0, T0, X3L
	VOR T1, T1, X3H

	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
	p256SubInternal(X1,X0,T3H,T3L,T1,T0)

	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T3L
	VOR  T1, T1, T3H

	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
	VOR    T4L, T4L, X0
	VOR    T4H, T4H, X1
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), Y0     // Y1H
	LXVD2X (R18)(P1ptr), Y1     // Y1L
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)

	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)

	//	if (sel == 0) {
	//		copy(P3.x[:], X1)
	//		copy(P3.y[:], Y1)
	//		copy(P3.z[:], Z1)
	//	}

	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1H, X1H, $2, X1H
	XXPERMDI X1L, X1L, $2, X1L

	// Y1 already loaded, left over from addition
	LXVD2X (R19)(P1ptr), Z1L
	LXVD2X (R20)(P1ptr), Z1H
	XXPERMDI Z1H, Z1H, $2, Z1H
	XXPERMDI Z1L, Z1L, $2, Z1L

	MOVD     $112, R26        // Get offset to sel+32
	LXVDSX   (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	// Constant-time select: keep P1's coordinates when sel == 0.
	VSEL X3L, X1L, SEL1, X3L
	VSEL X3H, X1H, SEL1, X3H
	VSEL Y3L, Y1L, SEL1, Y3L
	VSEL Y3H, Y1H, SEL1, Y3H
	VSEL Z3L, Z1L, SEL1, Z3L
	VSEL Z3H, Z1H, SEL1, Z3H

	MOVD   in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X2L
	LXVD2X (R16)(P2ptr), X2H
	XXPERMDI X2H, X2H, $2, X2H
	XXPERMDI X2L, X2L, $2, X2L

	// Y2 already loaded
	LXVD2X (R23)(CPOOL), Z2L
	LXVD2X (R24)(CPOOL), Z2H

	MOVD     $120, R26        // Get the value from zero+40(FP)
	LXVDSX   (R1)(R26), SEL1
	VSPLTISB $0, ZER
	VCMPEQUD SEL1, ZER, SEL1

	// Constant-time select: keep P2 (affine, Z from CPOOL) when zero == 0.
	VSEL X3L, X2L, SEL1, X3L
	VSEL X3H, X2H, SEL1, X3H
	VSEL Y3L, Y2L, SEL1, Y3L
	VSEL Y3H, Y2H, SEL1, Y3H
	VSEL Z3L, Z2L, SEL1, Z3L
	VSEL Z3H, Z2H, SEL1, Z3H

	// Reorder the bytes so they can be stored using STXVD2X.
	MOVD    res+0(FP), P3ptr
	XXPERMDI X3H, X3H, $2, X3H
	XXPERMDI X3L, X3L, $2, X3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Z3H, Z3H, $2, Z3H
	XXPERMDI Z3L, Z3L, $2, Z3L
	STXVD2X X3L, (R0)(P3ptr)
	STXVD2X X3H, (R16)(P3ptr)
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	STXVD2X Z3L, (R19)(P3ptr)
	STXVD2X Z3H, (R20)(P3ptr)

	RET
  1523  
  1524  #undef P3ptr
  1525  #undef P1ptr
  1526  #undef P2ptr
  1527  #undef CPOOL
  1528  
  1529  #undef Y2L
  1530  #undef Y2H
  1531  #undef T1L
  1532  #undef T1H
  1533  #undef T2L
  1534  #undef T2H
  1535  #undef T3L
  1536  #undef T3H
  1537  #undef T4L
  1538  #undef T4H
  1539  
  1540  #undef TT0
  1541  #undef TT1
  1542  #undef T2
  1543  
  1544  #undef X0
  1545  #undef X1
  1546  #undef Y0
  1547  #undef Y1
  1548  #undef T0
  1549  #undef T1
  1550  
  1551  #undef PL
  1552  #undef PH
  1553  
  1554  #undef X1L
  1555  #undef X1H
  1556  #undef Y1L
  1557  #undef Y1H
  1558  #undef Z1L
  1559  #undef Z1H
  1560  #undef X2L
  1561  #undef X2H
  1562  #undef Z2L
  1563  #undef Z2H
  1564  #undef X3L
  1565  #undef X3H
  1566  #undef Y3L
  1567  #undef Y3H
  1568  #undef Z3L
  1569  #undef Z3H
  1570  
  1571  #undef ZER
  1572  #undef SEL1
  1573  #undef CAR1
  1574  #undef CAR2
  1575  
  1576  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1577  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1578  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1579  #define P3ptr   R3
  1580  #define P1ptr   R4
  1581  #define CPOOL   R7
  1582  
  1583  // Temporaries in REGs
  1584  #define X3L    V15
  1585  #define X3H    V16
  1586  #define Y3L    V17
  1587  #define Y3H    V18
  1588  #define T1L    V19
  1589  #define T1H    V20
  1590  #define T2L    V21
  1591  #define T2H    V22
  1592  #define T3L    V23
  1593  #define T3H    V24
  1594  
  1595  #define X1L    V6
  1596  #define X1H    V7
  1597  #define Y1L    V8
  1598  #define Y1H    V9
  1599  #define Z1L    V10
  1600  #define Z1H    V11
  1601  
  1602  // Temps for Sub and Add
  1603  #define TT0  V11
  1604  #define TT1  V12
  1605  #define T2   V13
  1606  
  1607  // p256MulAsm Parameters
  1608  #define X0    V0
  1609  #define X1    V1
  1610  #define Y0    V2
  1611  #define Y1    V3
  1612  #define T0    V4
  1613  #define T1    V5
  1614  
  1615  #define PL    V30
  1616  #define PH    V31
  1617  
  1618  #define Z3L    V23
  1619  #define Z3H    V24
  1620  
  1621  #define ZER   V26
  1622  #define SEL1  V27
  1623  #define CAR1  V28
  1624  #define CAR2  V29
  1625  /*
  1626   * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1627   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1628   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1629   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1630   * 	B  = 2Y₁
  1631   * 	Z₃ = B×Z₁
  1632   * 	C  = B²
  1633   * 	D  = C×X₁
  1634   * 	X₃ = A²-2D
  1635   * 	Y₃ = (D-X₃)×A-C²/2
  1636   *
  1637   * Three-operand formula:
  1638   *       T1 = Z1²
  1639   *       T2 = X1-T1
  1640   *       T1 = X1+T1
  1641   *       T2 = T2*T1
  1642   *       T2 = 3*T2
  1643   *       Y3 = 2*Y1
  1644   *       Z3 = Y3*Z1
  1645   *       Y3 = Y3²
  1646   *       T3 = Y3*X1
  1647   *       Y3 = Y3²
  1648   *       Y3 = half*Y3
  1649   *       X3 = T2²
  1650   *       T1 = 2*T3
  1651   *       X3 = X3-T1
  1652   *       T1 = T3-X3
  1653   *       T1 = T1*T2
  1654   *       Y3 = T1-Y3
  1655   */
// p256PointDoubleAsm(res, in *P256Point)
//
// Jacobian point doubling following the three-operand schedule in the
// comment above. Each coordinate is a pair of 128-bit vector registers
// (H/L); loads and stores go through XXPERMDI to match the doubleword
// order p256MulInternal expects. R16..R20 hold byte offsets 16..80 into
// the point (X at 0, Y at 32, Z at 64).
TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
	MOVD res+0(FP), P3ptr
	MOVD in+8(FP), P1ptr

	MOVD $p256mul<>+0x00(SB), CPOOL

	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
	LXVD2X (R19)(P1ptr), X0 // Z1H
	LXVD2X (R20)(P1ptr), X1 // Z1L

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// SUB(X<X1-T)            // T2 = X1-T1
	LXVD2X (R0)(P1ptr), X1L
	LXVD2X (R16)(P1ptr), X1H
	XXPERMDI X1L, X1L, $2, X1L
	XXPERMDI X1H, X1H, $2, X1H

	p256SubInternal(X1,X0,X1H,X1L,T1,T0)

	// ADD(Y<X1+T)            // T1 = X1+T1
	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)

	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
	CALL p256MulInternal<>(SB)

	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)

	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
	LXVD2X (R17)(P1ptr), Y1L
	LXVD2X (R18)(P1ptr), Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L
	XXPERMDI Y1H, Y1H, $2, Y1H

	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)

	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
	LXVD2X (R19)(P1ptr), Y0
	LXVD2X (R20)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	CALL p256MulInternal<>(SB)

	// Leave T0, T1 as is.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1
	STXVD2X TT0, (R19)(P3ptr) // store Z3 now; T0/T1 are still needed below
	STXVD2X TT1, (R20)(P3ptr)

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
	VOR    T0, T0, X0
	VOR    T1, T1, X1
	LXVD2X (R0)(P1ptr), Y0
	LXVD2X (R16)(P1ptr), Y1
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, T3L
	VOR    T1, T1, T3H

	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
	VOR  X0, X0, Y0
	VOR  X1, X1, Y1
	CALL p256MulInternal<>(SB)

	// HAL(Y3<T)              // Y3 = half*Y3
	p256HalfInternal(Y3H,Y3L, T1,T0)

	// X=T2; Y=T2; MUL; T-    // X3 = T2²
	VOR  T2L, T2L, X0
	VOR  T2H, T2H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// ADD(T1<T3+T3)          // T1 = 2*T3
	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)

	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)

	// Store X3 in memory byte order.
	XXPERMDI X3L, X3L, $2, TT0
	XXPERMDI X3H, X3H, $2, TT1
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(X<T3-X3)           // T1 = T3-X3
	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)

	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
	CALL p256MulInternal<>(SB)

	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)

	// Store Y3 in memory byte order.
	XXPERMDI Y3L, Y3L, $2, Y3L
	XXPERMDI Y3H, Y3H, $2, Y3H
	STXVD2X Y3L, (R17)(P3ptr)
	STXVD2X Y3H, (R18)(P3ptr)
	RET
  1779  
  1780  #undef P3ptr
  1781  #undef P1ptr
  1782  #undef CPOOL
  1783  #undef X3L
  1784  #undef X3H
  1785  #undef Y3L
  1786  #undef Y3H
  1787  #undef T1L
  1788  #undef T1H
  1789  #undef T2L
  1790  #undef T2H
  1791  #undef T3L
  1792  #undef T3H
  1793  #undef X1L
  1794  #undef X1H
  1795  #undef Y1L
  1796  #undef Y1H
  1797  #undef Z1L
  1798  #undef Z1H
  1799  #undef TT0
  1800  #undef TT1
  1801  #undef T2
  1802  #undef X0
  1803  #undef X1
  1804  #undef Y0
  1805  #undef Y1
  1806  #undef T0
  1807  #undef T1
  1808  #undef PL
  1809  #undef PH
  1810  #undef Z3L
  1811  #undef Z3H
  1812  #undef ZER
  1813  #undef SEL1
  1814  #undef CAR1
  1815  #undef CAR2
  1816  
  1817  #define P3ptr  R3
  1818  #define P1ptr  R4
  1819  #define P2ptr  R5
  1820  #define CPOOL  R7
  1821  #define TRUE   R14
  1822  #define RES1   R9
  1823  #define RES2   R10
  1824  
  1825  // Temporaries in REGs
  1826  #define T1L   V16
  1827  #define T1H   V17
  1828  #define T2L   V18
  1829  #define T2H   V19
  1830  #define U1L   V20
  1831  #define U1H   V21
  1832  #define S1L   V22
  1833  #define S1H   V23
  1834  #define HL    V24
  1835  #define HH    V25
  1836  #define RL    V26
  1837  #define RH    V27
  1838  
  1839  // Temps for Sub and Add
  1840  #define ZER   V6
  1841  #define SEL1  V7
  1842  #define CAR1  V8
  1843  #define CAR2  V9
  1844  #define TT0  V11
  1845  #define TT1  V12
  1846  #define T2   V13
  1847  
  1848  // p256MulAsm Parameters
  1849  #define X0    V0
  1850  #define X1    V1
  1851  #define Y0    V2
  1852  #define Y1    V3
  1853  #define T0    V4
  1854  #define T1    V5
  1855  
  1856  #define PL    V30
  1857  #define PH    V31
  1858  /*
  1859   * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  1860   *
  1861   * A = X₁×Z₂²
  1862   * B = Y₁×Z₂³
  1863   * C = X₂×Z₁²-A
  1864   * D = Y₂×Z₁³-B
  1865   * X₃ = D² - 2A×C² - C³
  1866   * Y₃ = D×(A×C² - X₃) - B×C³
  1867   * Z₃ = Z₁×Z₂×C
  1868   *
  1869   * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  1870   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  1871   *
  1872   * T1 = Z1*Z1
  1873   * T2 = Z2*Z2
  1874   * U1 = X1*T2
  1875   * H  = X2*T1
  1876   * H  = H-U1
  1877   * Z3 = Z1*Z2
  1878   * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  1879   *
  1880   * S1 = Z2*T2
  1881   * S1 = Y1*S1
  1882   * R  = Z1*T1
  1883   * R  = Y2*R
  1884   * R  = R-S1
  1885   *
  1886   * T1 = H*H
  1887   * T2 = H*T1
  1888   * U1 = U1*T1
  1889   *
  1890   * X3 = R*R
  1891   * X3 = X3-T2
  1892   * T1 = 2*U1
  1893   * X3 = X3-T1 << store-out X3 result reg
  1894   *
  1895   * T2 = S1*T2
  1896   * Y3 = U1-X3
  1897   * Y3 = R*Y3
  1898   * Y3 = Y3-T2 << store-out Y3 result reg
  1899  
  1900  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  1901  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  1902  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  1903  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  1904  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  1905  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  1906  	// SUB(H<H-T)            // H  = H-U1
  1907  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  1908  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  1909  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  1910  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  1911  	// SUB(R<T-S1)           // R  = R-S1
  1912  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  1913  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  1914  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  1915  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  1916  	// SUB(T<T-T2)           // X3 = X3-T2
  1917  	// ADD(X<U1+U1)          // T1 = 2*U1
  1918  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  1919  	// SUB(Y<U1-T)           // Y3 = U1-X3
  1920  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  1921  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  1922  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  1923  	*/
// p256PointAddAsm(res, in1, in2 *p256Point) int
//
// Jacobian point addition res = in1 + in2 using the add-1998-cmo-2
// schedule transcribed in the comment block above.  Each point is laid
// out as X (offsets 0,16), Y (offsets 32,48), Z (offsets 64,80), and
// every load/store goes through XXPERMDI to fix up LXVD2X/STXVD2X
// doubleword order on little endian.
//
// Return value: 1 is stored to ret+24(FP) exactly when both
// H = X2*Z1^2 - X1*Z2^2 and R = Y2*Z1^3 - Y1*Z2^3 are congruent to 0
// mod P (each is tested against both 0 and P), i.e. when in1 and in2
// represent the same affine point.  The addition formula degenerates in
// that case, so the caller must fall back to point doubling; otherwise
// 0 is returned.
TEXT ·p256PointAddAsm(SB), NOSPLIT, $16-32
	MOVD res+0(FP), P3ptr
	MOVD in1+8(FP), P1ptr
	MOVD $p256mul<>+0x00(SB), CPOOL
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVD2X (R16)(CPOOL), PH
	LXVD2X (R0)(CPOOL), PL

	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
	LXVD2X (R19)(P1ptr), X0     // Z1L
	LXVD2X (R20)(P1ptr), X1     // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, RL            // SAVE: RL
	VOR  T1, T1, RH            // SAVE: RH

	// V27 (RH) has to be saved: spill it to the stack at 32(R1) so it
	// survives the p256MulInternal calls below.
	STXVD2X RH, (R1)(R17) // V27 has to be saved

	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R0)(P2ptr), X0      // X2L
	LXVD2X (R16)(P2ptr), X1     // X2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, HL            // SAVE: HL
	VOR    T1, T1, HH            // SAVE: HH

	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R19)(P2ptr), X0     // Z2L
	LXVD2X (R20)(P2ptr), X1     // Z2H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    X0, X0, Y0
	VOR    X1, X1, Y1
	CALL   p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, S1L           // SAVE: S1L
	VOR  T1, T1, S1H           // SAVE: S1H

	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R0)(P1ptr), X0      // X1L
	LXVD2X (R16)(P1ptr), X1     // X1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, U1L           // SAVE: U1L
	VOR    T1, T1, U1H           // SAVE: U1H

	// SUB(H<H-T)            // H  = H-U1
	p256SubInternal(HH,HL,HH,HL,T1,T0)

	// if H == 0 or H^P == 0 then ret=1 else ret=0
	// clobbers T1H and T1L
	MOVD       $1, TRUE
	VSPLTISB   $0, ZER
	VOR        HL, HH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL       $26, R0, TRUE, RES1
	VXOR       HL, PL, T1L         // SAVE: T1L
	VXOR       HH, PH, T1H         // SAVE: T1H
	VOR        T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	// ret = (H == 0) || (H == P); the R test below ANDs into this.
	OR   RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
	MOVD   in1+8(FP), P1ptr
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R19)(P1ptr), X0        // Z1L
	LXVD2X (R20)(P1ptr), X1        // Z1H
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	LXVD2X (R19)(P2ptr), Y0        // Z2L
	LXVD2X (R20)(P2ptr), Y1        // Z2H
	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1
	CALL   p256MulInternal<>(SB)

	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
	VOR     T0, T0, X0
	VOR     T1, T1, X1
	VOR     HL, HL, Y0
	VOR     HH, HH, Y1
	CALL    p256MulInternal<>(SB)
	MOVD    res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R19)(P3ptr)
	STXVD2X TT1, (R20)(P3ptr)

	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
	MOVD   in1+8(FP), P1ptr
	LXVD2X (R17)(P1ptr), X0
	LXVD2X (R18)(P1ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    S1L, S1L, Y0
	VOR    S1H, S1H, Y1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, S1L
	VOR    T1, T1, S1H

	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
	MOVD   in2+16(FP), P2ptr
	LXVD2X (R17)(P2ptr), X0
	LXVD2X (R18)(P2ptr), X1
	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1
	VOR    RL, RL, Y0

	// VOR RH, RH, Y1   RH was saved above in D2X format
	LXVD2X (R1)(R17), Y1
	CALL   p256MulInternal<>(SB)

	// SUB(R<T-S1)           // R  = T-S1
	p256SubInternal(RH,RL,T1,T0,S1H,S1L)

	// Re-spill the freshly computed RH (V27) to 32(R1).
	STXVD2X RH, (R1)(R17) // Save RH

	// if R == 0 or R^P == 0 then ret=ret else ret=0
	// clobbers T1H and T1L
	// Redo this using ISEL??
	MOVD       $1, TRUE
	VSPLTISB   $0, ZER
	VOR        RL, RH, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL       $26, R0, TRUE, RES1
	VXOR       RL, PL, T1L
	VXOR       RH, PH, T1H         // SAVE: T1H
	VOR        T1L, T1H, T1H
	VCMPEQUDCC ZER, T1H, T1H

	// 26 = CR6 NE
	ISEL $26, R0, TRUE, RES2
	OR   RES2, RES1, RES1
	// ret = ((H==0)||(H==P)) && ((R==0)||(R==P))
	MOVD ret+24(FP), RES2
	AND  RES2, RES1, RES1
	MOVD RES1, ret+24(FP)

	// X=H ; Y=H ; MUL; T-   // T1 = H*H
	VOR  HL, HL, X0
	VOR  HH, HH, X1
	VOR  HL, HL, Y0
	VOR  HH, HH, Y1
	CALL p256MulInternal<>(SB)

	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
	VOR  T0, T0, Y0
	VOR  T1, T1, Y1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, T2L
	VOR  T1, T1, T2H

	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
	VOR  U1L, U1L, X0
	VOR  U1H, U1H, X1
	CALL p256MulInternal<>(SB)
	VOR  T0, T0, U1L
	VOR  T1, T1, U1H

	// X=R ; Y=R ; MUL; T-   // X3 = R*R
	VOR RL, RL, X0

	// VOR  RH, RH, X1
	VOR RL, RL, Y0

	// RH was saved above using STXVD2X
	LXVD2X (R1)(R17), X1
	VOR    X1, X1, Y1

	// VOR  RH, RH, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<T-T2)           // X3 = X3-T2
	p256SubInternal(T1,T0,T1,T0,T2H,T2L)

	// ADD(X<U1+U1)          // T1 = 2*U1
	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)

	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
	p256SubInternal(T1,T0,T1,T0,X1,X0)
	MOVD    res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R0)(P3ptr)
	STXVD2X TT1, (R16)(P3ptr)

	// SUB(Y<U1-T)           // Y3 = U1-X3
	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)

	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
	VOR RL, RL, X0

	// VOR  RH, RH, X1
	LXVD2X (R1)(R17), X1
	CALL   p256MulInternal<>(SB)
	VOR    T0, T0, U1L
	VOR    T1, T1, U1H

	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
	VOR  S1L, S1L, X0
	VOR  S1H, S1H, X1
	VOR  T2L, T2L, Y0
	VOR  T2H, T2H, Y1
	CALL p256MulInternal<>(SB)

	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
	MOVD    res+0(FP), P3ptr
	XXPERMDI T1, T1, $2, TT1
	XXPERMDI T0, T0, $2, TT0
	STXVD2X TT0, (R17)(P3ptr)
	STXVD2X TT1, (R18)(P3ptr)

	RET
  2167  

View as plain text