Text file src/crypto/internal/fips140/aes/gcm/gcm_ppc64x.s

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build (ppc64 || ppc64le) && !purego
     6  
     7  // Portions based on CRYPTOGAMS code with the following comment:
     8  // # ====================================================================
     9  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    10  // # project. The module is, however, dual licensed under OpenSSL and
    11  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    12  // # details see http://www.openssl.org/~appro/cryptogams/.
    13  // # ====================================================================
    14  
    15  // The implementations for gcmHash and gcmInit are based on the generated asm
    16  // from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
    17  // from commit d47afb3c.
    18  
    19  // Changes were made to account for differences in the ABI and in register
    20  // usage, and some arguments were changed due to the way the Go code passes them.
    21  
    22  // Portions that use the stitched AES-GCM approach in counterCryptASM
    23  // are based on code found in
    24  // https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
    25  
    26  #include "textflag.h"
    27  
    28  #define XIP    R3
    29  #define HTBL   R4
    30  #define INP    R5
    31  #define LEN    R6
    32  
    33  #define XL     V0
    34  #define XM     V1
    35  #define XH     V2
    36  #define IN     V3
    37  #define ZERO   V4
    38  #define T0     V5
    39  #define T1     V6
    40  #define T2     V7
    41  #define XC2    V8
    42  #define H      V9
    43  #define HH     V10
    44  #define HL     V11
    45  #define LEMASK V12
    46  #define XL1    V13
    47  #define XM1    V14
    48  #define XH1    V15
    49  #define IN1    V16
    50  #define H2     V17
    51  #define H2H    V18
    52  #define H2L    V19
    53  #define XL3    V20
    54  #define XM2    V21
    55  #define IN2    V22
    56  #define H3L    V23
    57  #define H3     V24
    58  #define H3H    V25
    59  #define XH3    V26
    60  #define XM3    V27
    61  #define IN3    V28
    62  #define H4L    V29
    63  #define H4     V30
    64  #define H4H    V31
    65  
    66  #define IN0    IN
    67  #define H21L   HL
    68  #define H21H   HH
    69  #define LOPERM H2L
    70  #define HIPERM H2H
    71  
    72  #define VXL    VS32
    73  #define VIN    VS35
    74  #define VXC2   VS40
    75  #define VH     VS41
    76  #define VHH    VS42
    77  #define VHL    VS43
    78  #define VIN1   VS48
    79  #define VH2    VS49
    80  #define VH2H   VS50
    81  #define VH2L   VS51
    82  
    83  #define VIN2   VS54
    84  #define VH3L   VS55
    85  #define VH3    VS56
    86  #define VH3H   VS57
    87  #define VIN3   VS60
    88  #define VH4L   VS61
    89  #define VH4    VS62
    90  #define VH4H   VS63
    91  
    92  #define VIN0   VIN
    93  
    94  #define ESPERM V10
    95  #define TMP2 V11
    96  
    97  DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
    98  DATA ·rcon+0x08(SB)/8, $0x0706050403020100
    99  DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
   100  DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
   101  DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
   102  DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
   103  DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
   104  DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
   105  DATA ·rcon+0x40(SB)/8, $0x0000000000000000
   106  DATA ·rcon+0x48(SB)/8, $0x0000000000000000
   107  GLOBL ·rcon(SB), RODATA, $80
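        // Note: within this file only the first quadword of ·rcon (the doubleword
        // endian-swap permute) is referenced; it is loaded into ESPERM for the
        // P8_ load/store macros below.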
   108  
   109  // The following macros provide appropriate
   110  // implementations for the target endianness
   111  // as well as for the POWER8 and POWER9 ISAs.
   112  #ifdef GOARCH_ppc64le
   113  #  ifdef GOPPC64_power9
   114  #define P8_LXVB16X(RA,RB,VT)   LXVB16X (RA)(RB), VT
   115  #define P8_STXVB16X(VS,RA,RB)  STXVB16X VS, (RA)(RB)
   116  #  else
   117  #define NEEDS_ESPERM
   118  #define P8_LXVB16X(RA,RB,VT) \
   119  	LXVD2X  (RA+RB), VT; \
   120  	VPERM	VT, VT, ESPERM, VT
   121  
   122  #define P8_STXVB16X(VS,RA,RB) \
   123  	VPERM	VS, VS, ESPERM, TMP2; \
   124  	STXVD2X TMP2, (RA+RB)
   125  
   126  #  endif
   127  #else
   128  #define P8_LXVB16X(RA,RB,VT) \
   129  	LXVD2X  (RA+RB), VT
   130  
   131  #define P8_STXVB16X(VS,RA,RB) \
   132  	STXVD2X VS, (RA+RB)
   133  
   134  #endif
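        // In short: on POWER9 little endian the byte-reversing LXVB16X/STXVB16X
        // instructions are used directly; on POWER8 little endian the same effect
        // is obtained with LXVD2X/STXVD2X plus a VPERM through ESPERM (loaded from
        // ·rcon); on big endian a plain LXVD2X/STXVD2X already gives the desired
        // byte order.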
   135  
   136  #define MASK_PTR   R8
   137  
   138  #define MASKV   V0
   139  #define INV     V1
   140  
   141  // The following macros are used for
   142  // the stitched implementation within
   143  // counterCryptASM.
   144  
   145  // Load the initial GCM counter value
   146  // in V30 and set up the counter increment
   147  // in V31
   148  #define SETUP_COUNTER \
   149  	P8_LXVB16X(COUNTER, R0, V30); \
   150  	VSPLTISB $1, V28; \
   151  	VXOR V31, V31, V31; \
   152  	VSLDOI $1, V31, V28, V31
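        // For reference: V31 ends up holding {0, 0, 0, 1} (a single 1 in the last
        // word), so the VADDUWM in the macros below performs GCM's inc32: only the
        // low 32-bit big-endian word of the counter is incremented, wrapping mod 2^32.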
   153  
   154  // These macros set up the initial value
   155  // for a single encryption, or 4 or 8
   156  // stitched encryptions implemented
   157  // with interleaving vciphers.
   158  //
   159  // The input value for each encryption
   160  // is generated by XORing the counter
   161  // from V30 with the first key in VS0
   162  // and incrementing the counter.
   163  //
   164  // Single encryption in V15
   165  #define GEN_VCIPHER_INPUT \
   166  	XXLOR VS0, VS0, V29; \
   167  	VXOR V30, V29, V15; \
   168  	VADDUWM V30, V31, V30
   169  
   170  // 4 encryptions in V15 - V18
   171  #define GEN_VCIPHER_4_INPUTS \
   172  	XXLOR VS0, VS0, V29; \
   173  	VXOR V30, V29, V15; \
   174  	VADDUWM V30, V31, V30; \
   175  	VXOR V30, V29, V16; \
   176  	VADDUWM V30, V31, V30; \
   177  	VXOR V30, V29, V17; \
   178  	VADDUWM V30, V31, V30; \
   179  	VXOR V30, V29, V18; \
   180  	VADDUWM V30, V31, V30
   181  
   182  // 8 encryptions in V15 - V22
   183  #define GEN_VCIPHER_8_INPUTS \
   184  	XXLOR VS0, VS0, V29; \
   185  	VXOR V30, V29, V15; \
   186  	VADDUWM V30, V31, V30; \
   187  	VXOR V30, V29, V16; \
   188  	VADDUWM V30, V31, V30; \
   189  	VXOR V30, V29, V17; \
   190  	VADDUWM V30, V31, V30; \
   191  	VXOR V30, V29, V18; \
   192  	VADDUWM V30, V31, V30; \
   193  	VXOR V30, V29, V19; \
   194  	VADDUWM V30, V31, V30; \
   195  	VXOR V30, V29, V20; \
   196  	VADDUWM V30, V31, V30; \
   197  	VXOR V30, V29, V21; \
   198  	VADDUWM V30, V31, V30; \
   199  	VXOR V30, V29, V22; \
   200  	VADDUWM V30, V31, V30
   201  
   202  // Load the keys to be used for
   203  // encryption based on key_len.
   204  // Keys are in VS0 - VS14
   205  // depending on key_len.
   206  // Valid key sizes are verified here
   207  // (an invalid size forces a crash).
   208  // CR2 is set and used throughout to check key_len.
   209  #define LOAD_KEYS(blk_key, key_len) \
   210  	MOVD	$16, R16; \
   211  	MOVD	$32, R17; \
   212  	MOVD	$48, R18; \
   213  	MOVD	$64, R19; \
   214  	LXVD2X (blk_key)(R0), VS0; \
   215  	LXVD2X (blk_key)(R16), VS1; \
   216  	LXVD2X (blk_key)(R17), VS2; \
   217  	LXVD2X (blk_key)(R18), VS3; \
   218  	LXVD2X (blk_key)(R19), VS4; \
   219  	ADD $64, R16; \
   220  	ADD $64, R17; \
   221  	ADD $64, R18; \
   222  	ADD $64, R19; \
   223  	LXVD2X (blk_key)(R16), VS5; \
   224  	LXVD2X (blk_key)(R17), VS6; \
   225  	LXVD2X (blk_key)(R18), VS7; \
   226  	LXVD2X (blk_key)(R19), VS8; \
   227  	ADD $64, R16; \
   228  	ADD $64, R17; \
   229  	ADD $64, R18; \
   230  	ADD $64, R19; \
   231  	LXVD2X (blk_key)(R16), VS9; \
   232  	LXVD2X (blk_key)(R17), VS10; \
   233  	CMP key_len, $12, CR2; \
   234  	CMP key_len, $10; \
   235  	BEQ keysLoaded; \
   236  	LXVD2X (blk_key)(R18), VS11; \
   237  	LXVD2X (blk_key)(R19), VS12; \
   238  	BEQ CR2, keysLoaded; \
   239  	ADD $64, R16; \
   240  	ADD $64, R17; \
   241  	LXVD2X (blk_key)(R16), VS13; \
   242  	LXVD2X (blk_key)(R17), VS14; \
   243  	CMP key_len, $14; \
   244  	BEQ keysLoaded; \
   245  	MOVD R0,0(R0); \
   246  keysLoaded:
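        // Note: key_len here is the AES round count, so 10, 12 and 14 correspond to
        // AES-128, AES-192 and AES-256, and to 11, 13 and 15 round keys loaded into
        // VS0-VS10, VS0-VS12 and VS0-VS14 respectively.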
   247  
   248  // Encrypt 1 (vin) with first 9
   249  // keys from VS1 - VS9.
   250  #define VCIPHER_1X9_KEYS(vin) \
   251  	XXLOR VS1, VS1, V23; \
   252  	XXLOR VS2, VS2, V24; \
   253  	XXLOR VS3, VS3, V25; \
   254  	XXLOR VS4, VS4, V26; \
   255  	XXLOR VS5, VS5, V27; \
   256  	VCIPHER vin, V23, vin; \
   257  	VCIPHER vin, V24, vin; \
   258  	VCIPHER vin, V25, vin; \
   259  	VCIPHER vin, V26, vin; \
   260  	VCIPHER vin, V27, vin; \
   261  	XXLOR VS6, VS6, V23; \
   262  	XXLOR VS7, VS7, V24; \
   263  	XXLOR VS8, VS8, V25; \
   264  	XXLOR VS9, VS9, V26; \
   265  	VCIPHER vin, V23, vin; \
   266  	VCIPHER vin, V24, vin; \
   267  	VCIPHER vin, V25, vin; \
   268  	VCIPHER	vin, V26, vin
   269  
   270  // Encrypt 1 value (vin) with
   271  // 2 specified keys
   272  #define VCIPHER_1X2_KEYS(vin, key1, key2) \
   273  	XXLOR key1, key1, V25; \
   274  	XXLOR key2, key2, V26; \
   275  	VCIPHER vin, V25, vin; \
   276  	VCIPHER vin, V26, vin
   277  
   278  // Encrypt 4 values in V15 - V18
   279  // with the specified key from
   280  // VS1 - VS9.
   281  #define VCIPHER_4X1_KEY(key) \
   282  	XXLOR key, key, V23; \
   283  	VCIPHER V15, V23, V15; \
   284  	VCIPHER V16, V23, V16; \
   285  	VCIPHER V17, V23, V17; \
   286  	VCIPHER V18, V23, V18
   287  
   288  // Encrypt 8 values in V15 - V22
   289  // with the specified key,
   290  // assuming it is a VSreg
   291  #define VCIPHER_8X1_KEY(key) \
   292  	XXLOR key, key, V23; \
   293  	VCIPHER V15, V23, V15; \
   294  	VCIPHER V16, V23, V16; \
   295  	VCIPHER V17, V23, V17; \
   296  	VCIPHER V18, V23, V18; \
   297  	VCIPHER V19, V23, V19; \
   298  	VCIPHER V20, V23, V20; \
   299  	VCIPHER V21, V23, V21; \
   300  	VCIPHER V22, V23, V22
   301  
   302  // Load input block into V1-V4
   303  // in big endian order and
   304  // update blk_inp by 64.
   305  #define LOAD_INPUT_BLOCK64(blk_inp) \
   306  	MOVD $16, R16; \
   307  	MOVD $32, R17; \
   308  	MOVD $48, R18; \
   309  	P8_LXVB16X(blk_inp,R0,V1); \
   310  	P8_LXVB16X(blk_inp,R16,V2); \
   311  	P8_LXVB16X(blk_inp,R17,V3); \
   312  	P8_LXVB16X(blk_inp,R18,V4); \
   313  	ADD $64, blk_inp
   314  
   315  // Load input block into V1-V8
   316  // in big endian order and
   317  // update blk_inp by 128.
   318  #define LOAD_INPUT_BLOCK128(blk_inp) \
   319  	MOVD $16, R16; \
   320  	MOVD $32, R17; \
   321  	MOVD $48, R18; \
   322  	MOVD $64, R19; \
   323  	MOVD $80, R20; \
   324  	MOVD $96, R21; \
   325  	MOVD $112, R22; \
   326  	P8_LXVB16X(blk_inp,R0,V1); \
   327  	P8_LXVB16X(blk_inp,R16,V2); \
   328  	P8_LXVB16X(blk_inp,R17,V3); \
   329  	P8_LXVB16X(blk_inp,R18,V4); \
   330  	P8_LXVB16X(blk_inp,R19,V5); \
   331  	P8_LXVB16X(blk_inp,R20,V6); \
   332  	P8_LXVB16X(blk_inp,R21,V7); \
   333  	P8_LXVB16X(blk_inp,R22,V8); \
   334  	ADD $128, blk_inp
   335  
   336  // Finish encryption on 8 streams and
   337  // XOR with input block
   338  #define VCIPHERLAST8_XOR_INPUT \
   339  	VCIPHERLAST     V15, V23, V15; \
   340  	VCIPHERLAST     V16, V23, V16; \
   341  	VCIPHERLAST     V17, V23, V17; \
   342  	VCIPHERLAST     V18, V23, V18; \
   343  	VCIPHERLAST     V19, V23, V19; \
   344  	VCIPHERLAST     V20, V23, V20; \
   345  	VCIPHERLAST     V21, V23, V21; \
   346  	VCIPHERLAST     V22, V23, V22; \
   347  	XXLXOR          V1, V15, V1; \
   348  	XXLXOR          V2, V16, V2; \
   349  	XXLXOR          V3, V17, V3; \
   350  	XXLXOR          V4, V18, V4; \
   351  	XXLXOR          V5, V19, V5; \
   352  	XXLXOR          V6, V20, V6; \
   353  	XXLXOR          V7, V21, V7; \
   354  	XXLXOR          V8, V22, V8
   355  
   356  // Finish encryption on 4 streams and
   357  // XOR with input block
   358  #define VCIPHERLAST4_XOR_INPUT \
   359  	VCIPHERLAST     V15, V23, V15; \
   360  	VCIPHERLAST     V16, V23, V16; \
   361  	VCIPHERLAST     V17, V23, V17; \
   362  	VCIPHERLAST     V18, V23, V18; \
   363  	XXLXOR          V1, V15, V1; \
   364  	XXLXOR          V2, V16, V2; \
   365  	XXLXOR          V3, V17, V3; \
   366  	XXLXOR          V4, V18, V4
   367  
   368  // Store output block from V1-V8
   369  // in big endian order and
   370  // update blk_out by 128.
   371  #define STORE_OUTPUT_BLOCK128(blk_out) \
   372  	P8_STXVB16X(V1,blk_out,R0); \
   373  	P8_STXVB16X(V2,blk_out,R16); \
   374  	P8_STXVB16X(V3,blk_out,R17); \
   375  	P8_STXVB16X(V4,blk_out,R18); \
   376  	P8_STXVB16X(V5,blk_out,R19); \
   377  	P8_STXVB16X(V6,blk_out,R20); \
   378  	P8_STXVB16X(V7,blk_out,R21); \
   379  	P8_STXVB16X(V8,blk_out,R22); \
   380  	ADD $128, blk_out
   381  
   382  // Store output block from V1-V4
   383  // in big endian order and
   384  // update blk_out by 64.
   385  #define STORE_OUTPUT_BLOCK64(blk_out) \
   386  	P8_STXVB16X(V1,blk_out,R0); \
   387  	P8_STXVB16X(V2,blk_out,R16); \
   388  	P8_STXVB16X(V3,blk_out,R17); \
   389  	P8_STXVB16X(V4,blk_out,R18); \
   390  	ADD $64, blk_out
   391  
   392  // func gcmInit(productTable *[256]byte, h []byte)
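        // The precomputed productTable is laid out, as stored below, as:
        //   0x00:        the 0xc2...01 reduction constant (XC2)
        //   0x10 - 0x30: H   split into low/middle/high doublewords
        //   0x40 - 0x60: H^2 split the same way
        //   0x70 - 0x90: H^3 split the same way
        //   0xa0 - 0xc0: H^4 split the same way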
   393  TEXT ·gcmInit(SB), NOSPLIT, $0-32
   394  	MOVD productTable+0(FP), XIP
   395  	MOVD h+8(FP), HTBL
   396  
   397  	MOVD   $0x10, R8
   398  	MOVD   $0x20, R9
   399  	MOVD   $0x30, R10
   400  	LXVD2X (HTBL)(R0), VH // Load H
   401  
   402  	VSPLTISB $-16, XC2           // 0xf0
   403  	VSPLTISB $1, T0              // one
   404  	VADDUBM  XC2, XC2, XC2       // 0xe0
   405  	VXOR     ZERO, ZERO, ZERO
   406  	VOR      XC2, T0, XC2        // 0xe1
   407  	VSLDOI   $15, XC2, ZERO, XC2 // 0xe1...
   408  	VSLDOI   $1, ZERO, T0, T1    // ...1
   409  	VADDUBM  XC2, XC2, XC2       // 0xc2...
   410  	VSPLTISB $7, T2
   411  	VOR      XC2, T1, XC2        // 0xc2....01
   412  	VSPLTB   $0, H, T1           // most significant byte
   413  	VSL      H, T0, H            // H<<=1
   414  	VSRAB    T1, T2, T1          // broadcast carry bit
   415  	VAND     T1, XC2, T1
   416  	VXOR     H, T1, IN           // twisted H
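        // At this point XC2 holds the GHASH reduction constant 0xc2000...0001 and IN
        // holds H<<1, conditionally XORed with that constant when the shifted-out bit
        // was set -- the "twisted" H that the VPMSUMD products below are formed from.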
   417  
   418  	VSLDOI $8, IN, IN, H      // twist even more ...
   419  	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
   420  	VSLDOI $8, ZERO, H, HL    // ... and split
   421  	VSLDOI $8, H, ZERO, HH
   422  
   423  	STXVD2X VXC2, (XIP+R0) // save pre-computed table
   424  	STXVD2X VHL, (XIP+R8)
   425  	MOVD    $0x40, R8
   426  	STXVD2X VH, (XIP+R9)
   427  	MOVD    $0x50, R9
   428  	STXVD2X VHH, (XIP+R10)
   429  	MOVD    $0x60, R10
   430  
   431  	VPMSUMD IN, HL, XL // H.lo·H.lo
   432  	VPMSUMD IN, H, XM  // H.hi·H.lo+H.lo·H.hi
   433  	VPMSUMD IN, HH, XH // H.hi·H.hi
   434  
   435  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   436  
   437  	VSLDOI $8, XM, ZERO, T0
   438  	VSLDOI $8, ZERO, XM, T1
   439  	VXOR   XL, T0, XL
   440  	VXOR   XH, T1, XH
   441  
   442  	VSLDOI $8, XL, XL, XL
   443  	VXOR   XL, T2, XL
   444  
   445  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   446  	VPMSUMD XL, XC2, XL
   447  	VXOR    T1, XH, T1
   448  	VXOR    XL, T1, IN1
   449  
   450  	VSLDOI $8, IN1, IN1, H2
   451  	VSLDOI $8, ZERO, H2, H2L
   452  	VSLDOI $8, H2, ZERO, H2H
   453  
   454  	STXVD2X VH2L, (XIP+R8)  // save H^2
   455  	MOVD    $0x70, R8
   456  	STXVD2X VH2, (XIP+R9)
   457  	MOVD    $0x80, R9
   458  	STXVD2X VH2H, (XIP+R10)
   459  	MOVD    $0x90, R10
   460  
   461  	VPMSUMD IN, H2L, XL   // H.lo·H^2.lo
   462  	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
   463  	VPMSUMD IN, H2, XM    // H.hi·H^2.lo+H.lo·H^2.hi
   464  	VPMSUMD IN1, H2, XM1  // H^2.hi·H^2.lo+H^2.lo·H^2.hi
   465  	VPMSUMD IN, H2H, XH   // H.hi·H^2.hi
   466  	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi
   467  
   468  	VPMSUMD XL, XC2, T2  // 1st reduction phase
   469  	VPMSUMD XL1, XC2, HH // 1st reduction phase
   470  
   471  	VSLDOI $8, XM, ZERO, T0
   472  	VSLDOI $8, ZERO, XM, T1
   473  	VSLDOI $8, XM1, ZERO, HL
   474  	VSLDOI $8, ZERO, XM1, H
   475  	VXOR   XL, T0, XL
   476  	VXOR   XH, T1, XH
   477  	VXOR   XL1, HL, XL1
   478  	VXOR   XH1, H, XH1
   479  
   480  	VSLDOI $8, XL, XL, XL
   481  	VSLDOI $8, XL1, XL1, XL1
   482  	VXOR   XL, T2, XL
   483  	VXOR   XL1, HH, XL1
   484  
   485  	VSLDOI  $8, XL, XL, T1  // 2nd reduction phase
   486  	VSLDOI  $8, XL1, XL1, H // 2nd reduction phase
   487  	VPMSUMD XL, XC2, XL
   488  	VPMSUMD XL1, XC2, XL1
   489  	VXOR    T1, XH, T1
   490  	VXOR    H, XH1, H
   491  	VXOR    XL, T1, XL
   492  	VXOR    XL1, H, XL1
   493  
   494  	VSLDOI $8, XL, XL, H
   495  	VSLDOI $8, XL1, XL1, H2
   496  	VSLDOI $8, ZERO, H, HL
   497  	VSLDOI $8, H, ZERO, HH
   498  	VSLDOI $8, ZERO, H2, H2L
   499  	VSLDOI $8, H2, ZERO, H2H
   500  
   501  	STXVD2X VHL, (XIP+R8)   // save H^3
   502  	MOVD    $0xa0, R8
   503  	STXVD2X VH, (XIP+R9)
   504  	MOVD    $0xb0, R9
   505  	STXVD2X VHH, (XIP+R10)
   506  	MOVD    $0xc0, R10
   507  	STXVD2X VH2L, (XIP+R8)  // save H^4
   508  	STXVD2X VH2, (XIP+R9)
   509  	STXVD2X VH2H, (XIP+R10)
   510  
   511  	RET
   512  
   513  // func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
   514  TEXT ·gcmHash(SB), NOSPLIT, $0-64
   515  	MOVD output+0(FP), XIP
   516  	MOVD productTable+24(FP), HTBL
   517  	MOVD inp+32(FP), INP
   518  	MOVD len+56(FP), LEN
   519  
   520  	MOVD   $0x10, R8
   521  	MOVD   $0x20, R9
   522  	MOVD   $0x30, R10
   523  	LXVD2X (XIP)(R0), VXL // load Xi
   524  
   525  	LXVD2X   (HTBL)(R8), VHL    // load pre-computed table
   526  	MOVD     $0x40, R8
   527  	LXVD2X   (HTBL)(R9), VH
   528  	MOVD     $0x50, R9
   529  	LXVD2X   (HTBL)(R10), VHH
   530  	MOVD     $0x60, R10
   531  	LXVD2X   (HTBL)(R0), VXC2
   532  #ifdef GOARCH_ppc64le
   533  	LVSL     (R0)(R0), LEMASK
   534  	VSPLTISB $0x07, T0
   535  	VXOR     LEMASK, T0, LEMASK
   536  	VPERM    XL, XL, LEMASK, XL
   537  #endif
   538  	VXOR     ZERO, ZERO, ZERO
   539  
   540  	CMPU LEN, $64
   541  	BGE  gcm_ghash_p8_4x
   542  
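        // Fewer than 64 bytes of input: blocks are processed two at a time in
        // loop_2x, with a final single block handled at short.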
   543  	LXVD2X (INP)(R0), VIN
   544  	ADD    $16, INP, INP
   545  	SUBCCC $16, LEN, LEN
   546  #ifdef GOARCH_ppc64le
   547  	VPERM  IN, IN, LEMASK, IN
   548  #endif
   549  	VXOR   IN, XL, IN
   550  	BEQ    short
   551  
   552  	LXVD2X (HTBL)(R8), VH2L  // load H^2
   553  	MOVD   $16, R8
   554  	LXVD2X (HTBL)(R9), VH2
   555  	ADD    LEN, INP, R9      // end of input
   556  	LXVD2X (HTBL)(R10), VH2H
   557  
   558  loop_2x:
   559  	LXVD2X (INP)(R0), VIN1
   560  #ifdef GOARCH_ppc64le
   561  	VPERM  IN1, IN1, LEMASK, IN1
   562  #endif
   563  
   564  	SUBC    $32, LEN, LEN
   565  	VPMSUMD IN, H2L, XL   // H^2.lo·Xi.lo
   566  	VPMSUMD IN1, HL, XL1  // H.lo·Xi+1.lo
   567  	SUBE    R11, R11, R11 // borrow?-1:0
   568  	VPMSUMD IN, H2, XM    // H^2.hi·Xi.lo+H^2.lo·Xi.hi
   569  	VPMSUMD IN1, H, XM1   // H.hi·Xi+1.lo+H.lo·Xi+1.hi
   570  	AND     LEN, R11, R11
   571  	VPMSUMD IN, H2H, XH   // H^2.hi·Xi.hi
   572  	VPMSUMD IN1, HH, XH1  // H.hi·Xi+1.hi
   573  	ADD     R11, INP, INP
   574  
   575  	VXOR XL, XL1, XL
   576  	VXOR XM, XM1, XM
   577  
   578  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   579  
   580  	VSLDOI $8, XM, ZERO, T0
   581  	VSLDOI $8, ZERO, XM, T1
   582  	VXOR   XH, XH1, XH
   583  	VXOR   XL, T0, XL
   584  	VXOR   XH, T1, XH
   585  
   586  	VSLDOI $8, XL, XL, XL
   587  	VXOR   XL, T2, XL
   588  	LXVD2X (INP)(R8), VIN
   589  	ADD    $32, INP, INP
   590  
   591  	VSLDOI  $8, XL, XL, T1     // 2nd reduction phase
   592  	VPMSUMD XL, XC2, XL
   593  #ifdef GOARCH_ppc64le
   594  	VPERM   IN, IN, LEMASK, IN
   595  #endif
   596  	VXOR    T1, XH, T1
   597  	VXOR    IN, T1, IN
   598  	VXOR    IN, XL, IN
   599  	CMP     R9, INP
   600  	BGT     loop_2x            // done yet?
   601  
   602  	CMPWU LEN, $0
   603  	BNE   even
   604  
   605  short:
   606  	VPMSUMD IN, HL, XL // H.lo·Xi.lo
   607  	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
   608  	VPMSUMD IN, HH, XH // H.hi·Xi.hi
   609  
   610  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   611  
   612  	VSLDOI $8, XM, ZERO, T0
   613  	VSLDOI $8, ZERO, XM, T1
   614  	VXOR   XL, T0, XL
   615  	VXOR   XH, T1, XH
   616  
   617  	VSLDOI $8, XL, XL, XL
   618  	VXOR   XL, T2, XL
   619  
   620  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   621  	VPMSUMD XL, XC2, XL
   622  	VXOR    T1, XH, T1
   623  
   624  even:
   625  	VXOR    XL, T1, XL
   626  #ifdef GOARCH_ppc64le
   627  	VPERM   XL, XL, LEMASK, XL
   628  #endif
   629  	STXVD2X VXL, (XIP+R0)
   630  
   631  	OR R12, R12, R12 // write out Xi
   632  	RET
   633  
   634  gcm_ghash_p8_4x:
   635  	LVSL     (R8)(R0), T0      // 0x0001..0e0f
   636  	MOVD     $0x70, R8
   637  	LXVD2X   (HTBL)(R9), VH2
   638  	MOVD     $0x80, R9
   639  	VSPLTISB $8, T1            // 0x0808..0808
   640  	MOVD     $0x90, R10
   641  	LXVD2X   (HTBL)(R8), VH3L  // load H^3
   642  	MOVD     $0xa0, R8
   643  	LXVD2X   (HTBL)(R9), VH3
   644  	MOVD     $0xb0, R9
   645  	LXVD2X   (HTBL)(R10), VH3H
   646  	MOVD     $0xc0, R10
   647  	LXVD2X   (HTBL)(R8), VH4L  // load H^4
   648  	MOVD     $0x10, R8
   649  	LXVD2X   (HTBL)(R9), VH4
   650  	MOVD     $0x20, R9
   651  	LXVD2X   (HTBL)(R10), VH4H
   652  	MOVD     $0x30, R10
   653  
   654  	VSLDOI  $8, ZERO, T1, T2   // 0x0000..0808
   655  	VADDUBM T0, T2, HIPERM     // 0x0001..1617
   656  	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f
   657  
   658  	SRD $4, LEN, LEN // this allows the sign bit to be used as carry
   659  
   660  	LXVD2X (INP)(R0), VIN0       // load input
   661  	LXVD2X (INP)(R8), VIN1
   662  	SUBCCC $8, LEN, LEN
   663  	LXVD2X (INP)(R9), VIN2
   664  	LXVD2X (INP)(R10), VIN3
   665  	ADD    $0x40, INP, INP
   666  #ifdef GOARCH_ppc64le
   667  	VPERM  IN0, IN0, LEMASK, IN0
   668  	VPERM  IN1, IN1, LEMASK, IN1
   669  	VPERM  IN2, IN2, LEMASK, IN2
   670  	VPERM  IN3, IN3, LEMASK, IN3
   671  #endif
   672  
   673  	VXOR IN0, XL, XH
   674  
   675  	VPMSUMD IN1, H3L, XL1
   676  	VPMSUMD IN1, H3, XM1
   677  	VPMSUMD IN1, H3H, XH1
   678  
   679  	VPERM   H2, H, HIPERM, H21L
   680  	VPERM   IN2, IN3, LOPERM, T0
   681  	VPERM   H2, H, LOPERM, H21H
   682  	VPERM   IN2, IN3, HIPERM, T1
   683  	VPMSUMD IN2, H2, XM2         // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
   684  	VPMSUMD T0, H21L, XL3        // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
   685  	VPMSUMD IN3, H, XM3          // H.hi·Xi+3.lo  +H.lo·Xi+3.hi
   686  	VPMSUMD T1, H21H, XH3        // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
   687  
   688  	VXOR XM2, XM1, XM2
   689  	VXOR XL3, XL1, XL3
   690  	VXOR XM3, XM2, XM3
   691  	VXOR XH3, XH1, XH3
   692  
   693  	BLT tail_4x
   694  
   695  loop_4x:
   696  	LXVD2X (INP)(R0), VIN0
   697  	LXVD2X (INP)(R8), VIN1
   698  	SUBCCC $4, LEN, LEN
   699  	LXVD2X (INP)(R9), VIN2
   700  	LXVD2X (INP)(R10), VIN3
   701  	ADD    $0x40, INP, INP
   702  #ifdef GOARCH_ppc64le
   703  	VPERM  IN1, IN1, LEMASK, IN1
   704  	VPERM  IN2, IN2, LEMASK, IN2
   705  	VPERM  IN3, IN3, LEMASK, IN3
   706  	VPERM  IN0, IN0, LEMASK, IN0
   707  #endif
   708  
   709  	VPMSUMD XH, H4L, XL   // H^4.lo·Xi.lo
   710  	VPMSUMD XH, H4, XM    // H^4.hi·Xi.lo+H^4.lo·Xi.hi
   711  	VPMSUMD XH, H4H, XH   // H^4.hi·Xi.hi
   712  	VPMSUMD IN1, H3L, XL1
   713  	VPMSUMD IN1, H3, XM1
   714  	VPMSUMD IN1, H3H, XH1
   715  
   716  	VXOR  XL, XL3, XL
   717  	VXOR  XM, XM3, XM
   718  	VXOR  XH, XH3, XH
   719  	VPERM IN2, IN3, LOPERM, T0
   720  	VPERM IN2, IN3, HIPERM, T1
   721  
   722  	VPMSUMD XL, XC2, T2   // 1st reduction phase
   723  	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
   724  	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
   725  
   726  	VSLDOI $8, XM, ZERO, T0
   727  	VSLDOI $8, ZERO, XM, T1
   728  	VXOR   XL, T0, XL
   729  	VXOR   XH, T1, XH
   730  
   731  	VSLDOI $8, XL, XL, XL
   732  	VXOR   XL, T2, XL
   733  
   734  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   735  	VPMSUMD IN2, H2, XM2   // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
   736  	VPMSUMD IN3, H, XM3    // H.hi·Xi+3.lo  +H.lo·Xi+3.hi
   737  	VPMSUMD XL, XC2, XL
   738  
   739  	VXOR XL3, XL1, XL3
   740  	VXOR XH3, XH1, XH3
   741  	VXOR XH, IN0, XH
   742  	VXOR XM2, XM1, XM2
   743  	VXOR XH, T1, XH
   744  	VXOR XM3, XM2, XM3
   745  	VXOR XH, XL, XH
   746  	BGE  loop_4x
   747  
   748  tail_4x:
   749  	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
   750  	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
   751  	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
   752  
   753  	VXOR XL, XL3, XL
   754  	VXOR XM, XM3, XM
   755  
   756  	VPMSUMD XL, XC2, T2 // 1st reduction phase
   757  
   758  	VSLDOI $8, XM, ZERO, T0
   759  	VSLDOI $8, ZERO, XM, T1
   760  	VXOR   XH, XH3, XH
   761  	VXOR   XL, T0, XL
   762  	VXOR   XH, T1, XH
   763  
   764  	VSLDOI $8, XL, XL, XL
   765  	VXOR   XL, T2, XL
   766  
   767  	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
   768  	VPMSUMD XL, XC2, XL
   769  	VXOR    T1, XH, T1
   770  	VXOR    XL, T1, XL
   771  
   772  	ADDCCC $4, LEN, LEN
   773  	BEQ    done_4x
   774  
   775  	LXVD2X (INP)(R0), VIN0
   776  	CMPU   LEN, $2
   777  	MOVD   $-4, LEN
   778  	BLT    one
   779  	LXVD2X (INP)(R8), VIN1
   780  	BEQ    two
   781  
   782  three:
   783  	LXVD2X (INP)(R9), VIN2
   784  #ifdef GOARCH_ppc64le
   785  	VPERM  IN0, IN0, LEMASK, IN0
   786  	VPERM  IN1, IN1, LEMASK, IN1
   787  	VPERM  IN2, IN2, LEMASK, IN2
   788  #endif
   789  
   790  	VXOR IN0, XL, XH
   791  	VOR  H3L, H3L, H4L
   792  	VOR  H3, H3, H4
   793  	VOR  H3H, H3H, H4H
   794  
   795  	VPERM   IN1, IN2, LOPERM, T0
   796  	VPERM   IN1, IN2, HIPERM, T1
   797  	VPMSUMD IN1, H2, XM2         // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
   798  	VPMSUMD IN2, H, XM3          // H.hi·Xi+2.lo  +H.lo·Xi+2.hi
   799  	VPMSUMD T0, H21L, XL3        // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
   800  	VPMSUMD T1, H21H, XH3        // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
   801  
   802  	VXOR XM3, XM2, XM3
   803  	JMP  tail_4x
   804  
   805  two:
   806  #ifdef GOARCH_ppc64le
   807  	VPERM IN0, IN0, LEMASK, IN0
   808  	VPERM IN1, IN1, LEMASK, IN1
   809  #endif
   810  
   811  	VXOR  IN, XL, XH
   812  	VPERM ZERO, IN1, LOPERM, T0
   813  	VPERM ZERO, IN1, HIPERM, T1
   814  
   815  	VSLDOI $8, ZERO, H2, H4L
   816  	VOR    H2, H2, H4
   817  	VSLDOI $8, H2, ZERO, H4H
   818  
   819  	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
   820  	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+2.hi
   821  	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi
   822  
   823  	JMP tail_4x
   824  
   825  one:
   826  #ifdef GOARCH_ppc64le
   827  	VPERM IN0, IN0, LEMASK, IN0
   828  #endif
   829  
   830  	VSLDOI $8, ZERO, H, H4L
   831  	VOR    H, H, H4
   832  	VSLDOI $8, H, ZERO, H4H
   833  
   834  	VXOR IN0, XL, XH
   835  	VXOR XL3, XL3, XL3
   836  	VXOR XM3, XM3, XM3
   837  	VXOR XH3, XH3, XH3
   838  
   839  	JMP tail_4x
   840  
   841  done_4x:
   842  #ifdef GOARCH_ppc64le
   843  	VPERM   XL, XL, LEMASK, XL
   844  #endif
   845  	STXVD2X VXL, (XIP+R0)      // write out Xi
   846  	RET
   847  
   848  #define BLK_INP    R3
   849  #define BLK_OUT    R4
   850  #define BLK_KEY    R5
   851  #define KEY_LEN    R6
   852  #define BLK_IDX    R7
   853  #define IDX        R8
   854  #define IN_LEN     R9
   855  #define COUNTER    R10
   856  #define CONPTR     R14
   857  #define MASK       V5
   858  
   859  // Implementation of the counterCrypt function in assembler.
   860  // The original loop is unrolled to allow multiple encryption
   861  // streams to be processed in parallel, which is achieved by
   862  // interleaving the vcipher instructions from each stream. This is
   863  // also referred to as stitching, and provides a significant
   864  // performance improvement. Some macros are defined which enable
   865  // execution for big or little endian as well as different ISA targets.
   866  //func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
   867  //func counterCryptASM(xr, out, in, counter, key)
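        // Functionally this is plain CTR mode. In Go-like pseudocode (a sketch only;
        // encryptBlock and inc32 are illustrative names, not functions in this package):
        //
        //     for each full 16-byte block i {
        //         keystream := encryptBlock(key, counter)
        //         counter = inc32(counter)
        //         out[i] = in[i] XOR keystream
        //     }
        //     // a trailing partial block is XORed with the matching prefix
        //     // of one more keystream block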
   868  TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
   869  	MOVD	xr(FP), KEY_LEN
   870  	MOVD    out+8(FP), BLK_OUT
   871  	MOVD    out_len+16(FP), R8
   872  	MOVD    in+32(FP), BLK_INP
   873  	MOVD    in_len+40(FP), IN_LEN
   874  	MOVD    counter+56(FP), COUNTER
   875  	MOVD    key+64(FP), BLK_KEY
   876  
   877  // Set up permute string when needed.
   878  #ifdef NEEDS_ESPERM
   879  	MOVD    $·rcon(SB), R14
   880  	LVX     (R14), ESPERM   // Permute value for P8_ macros.
   881  #endif
   882  	SETUP_COUNTER		// V30 Counter V31 BE {0, 0, 0, 1}
   883  	LOAD_KEYS(BLK_KEY, KEY_LEN)	// VS1 - VS10/12/14 based on keysize
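        // Dispatch on the remaining input length: 128-byte chunks are encrypted with
        // 8 stitched streams, a 64-byte chunk with 4 streams, then single 16-byte
        // blocks, and finally any partial block.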
   884  	CMP     IN_LEN, $128
   885  	BLT	block64
   886  block128_loop:
   887  	// Do 8 encryptions in parallel by setting
   888  	// input values in V15-V22 and executing
   889  	// vcipher on the updated value and the keys.
   890  	GEN_VCIPHER_8_INPUTS
   891  	VCIPHER_8X1_KEY(VS1)
   892  	VCIPHER_8X1_KEY(VS2)
   893  	VCIPHER_8X1_KEY(VS3)
   894  	VCIPHER_8X1_KEY(VS4)
   895  	VCIPHER_8X1_KEY(VS5)
   896  	VCIPHER_8X1_KEY(VS6)
   897  	VCIPHER_8X1_KEY(VS7)
   898  	VCIPHER_8X1_KEY(VS8)
   899  	VCIPHER_8X1_KEY(VS9)
   900  	// Additional encryptions are done based on
   901  	// the key length, with the last key moved
   902  	// to V23 for use with VCIPHERLAST.
   903  	// CR2 = CMP key_len, $12
   904  	XXLOR VS10, VS10, V23
   905  	BLT	CR2, block128_last // key_len = 10
   906  	VCIPHER_8X1_KEY(VS10)
   907  	VCIPHER_8X1_KEY(VS11)
   908  	XXLOR VS12,VS12,V23
   909  	BEQ	CR2, block128_last // key_len = 12
   910  	VCIPHER_8X1_KEY(VS12)
   911  	VCIPHER_8X1_KEY(VS13)
   912  	XXLOR VS14,VS14,V23	// key_len = 14
   913  block128_last:
   914  	// vcipher encryptions are in V15-V22 at this
   915  	// point with vcipherlast remaining to be done.
   916  	// Load input block into V1-V8, setting index offsets
   917  	// in R16-R22 to use with the STORE.
   918  	LOAD_INPUT_BLOCK128(BLK_INP)
   919  	// Do VCIPHERLAST on the last key for each encryption
   920  	// stream and XOR the result with the corresponding
   921  	// value from the input block.
   922  	VCIPHERLAST8_XOR_INPUT
   923  	// Store the results (8*16) and update BLK_OUT by 128.
   924  	STORE_OUTPUT_BLOCK128(BLK_OUT)
   925  	ADD	$-128, IN_LEN	// input size
   926  	CMP     IN_LEN, $128	// check if >= blocksize
   927  	BGE	block128_loop	// next input block
   928  	CMP	IN_LEN, $0
   929  	BEQ	done
   930  block64:
   931  	CMP	IN_LEN, $64	// Check if >= 64
   932  	BLT	block16_loop
   933  	// Do 4 encryptions in parallel by setting
   934  	// input values in V15-V18 and executing
   935  	// vcipher on the updated value and the keys.
   936  	GEN_VCIPHER_4_INPUTS
   937  	VCIPHER_4X1_KEY(VS1)
   938  	VCIPHER_4X1_KEY(VS2)
   939  	VCIPHER_4X1_KEY(VS3)
   940  	VCIPHER_4X1_KEY(VS4)
   941  	VCIPHER_4X1_KEY(VS5)
   942  	VCIPHER_4X1_KEY(VS6)
   943  	VCIPHER_4X1_KEY(VS7)
   944  	VCIPHER_4X1_KEY(VS8)
   945  	VCIPHER_4X1_KEY(VS9)
   946  	// Check key length based on CR2
   947  	// Move last key to V23 for use with later vcipherlast
   948  	XXLOR	VS10, VS10, V23
   949  	BLT	CR2, block64_last	// size = 10
   950  	VCIPHER_4X1_KEY(VS10)		// Encrypt next 2 keys
   951  	VCIPHER_4X1_KEY(VS11)
   952  	XXLOR	VS12, VS12, V23
   953  	BEQ	CR2, block64_last	// size = 12
   954  	VCIPHER_4X1_KEY(VS12)		// Encrypt last 2 keys
   955  	VCIPHER_4X1_KEY(VS13)
   956  	XXLOR	VS14, VS14, V23		// size = 14
   957  block64_last:
   958  	LOAD_INPUT_BLOCK64(BLK_INP)	// Load 64 bytes of input
   959  	// Do VCIPHERLAST on the last key for each encryption
   960  	// stream and XOR the result with the corresponding
   961  	// value from the input block.
   962  	VCIPHERLAST4_XOR_INPUT
   963  	// Store the results (4*16) and update BLK_OUT by 64.
   964  	STORE_OUTPUT_BLOCK64(BLK_OUT)
   965  	ADD	$-64, IN_LEN		// decrement input block length
   966  	CMP	IN_LEN, $0		// check for remaining length
   967  	BEQ	done
   968  block16_loop:
   969  	CMP	IN_LEN, $16		// More input
   970  	BLT	final_block		// If not, then handle partial block
   971  	// Single encryption, no stitching
   972  	GEN_VCIPHER_INPUT		// Generate input value for single encryption
   973  	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 value with 9 keys
   974  	XXLOR	VS10, VS10, V23		// Last key -> V23 for later vcipherlast
   975  	// Key length based on CR2. (LT=10, EQ=12, GT=14)
   976  	BLT	CR2, block16_last	// Finish for key size 10
   977  	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
   978  	XXLOR	VS12, VS12, V23		// Last key -> V23 for later vcipherlast
   979  	BEQ	CR2, block16_last	// Finish for key size 12
   980  	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
   981  	XXLOR	VS14, VS14, V23		// Last key -> V23 for vcipherlast with key size 14
   982  block16_last:
   983  	P8_LXVB16X(BLK_INP, R0, V1)	// Load input
   984  	VCIPHERLAST V15, V23, V15	// Encrypt V15 with last key in V23
   985  	XXLXOR	V15, V1, V1		// XOR with input
   986  	P8_STXVB16X(V1,R0,BLK_OUT)	// Store final encryption value to output
   987  	ADD	$16, BLK_INP		// Increment input pointer
   988  	ADD	$16, BLK_OUT		// Increment output pointer
   989  	ADD	$-16, IN_LEN		// Decrement input length
   990  	BR	block16_loop		// Check for next
   991  final_block:
   992  	CMP	IN_LEN, $0
   993  	BEQ	done
   994  	GEN_VCIPHER_INPUT		// Generate input value for partial encryption
   995  	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 with 9 keys
   996  	XXLOR	VS10, VS10, V23		// Save possible last key
   997  	BLT	CR2, final_block_last
   998  	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with next 2 keys
   999  	XXLOR	VS12, VS12, V23		// Save possible last key
  1000  	BEQ	CR2, final_block_last
  1001  	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
  1002  	XXLOR	VS14, VS14, V23		// Save last key
  1003  final_block_last:
  1004  	VCIPHERLAST V15, V23, V15	// Finish encryption
  1005  #ifdef GOPPC64_power10
  1006  	// set up length
  1007  	SLD	$56, IN_LEN, R17
  1008  	LXVLL	BLK_INP, R17, V25
  1009  	VXOR	V25, V15, V25
  1010  	STXVLL	V25, BLK_OUT, R17
  1011  #else
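        // Pre-POWER10 path: store the final keystream block to a stack temporary and
        // XOR the remaining 8/4/2/1 bytes of input against it piecewise.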
  1012  	ADD	$32, R1, MASK_PTR
  1013  	MOVD	$0, R16
  1014  	P8_STXVB16X(V15, MASK_PTR, R0)
  1015  	CMP	IN_LEN, $8
  1016  	BLT	next4
  1017  	MOVD	0(MASK_PTR), R14
  1018  	MOVD	0(BLK_INP), R15
  1019  	XOR	R14, R15, R14
  1020  	MOVD	R14, 0(BLK_OUT)
  1021  	ADD	$8, R16
  1022  	ADD	$-8, IN_LEN
  1023  next4:
  1024  	CMP	IN_LEN, $4
  1025  	BLT	next2
  1026  	MOVWZ	(BLK_INP)(R16), R15
  1027  	MOVWZ	(MASK_PTR)(R16), R14
  1028  	XOR	R14, R15, R14
  1029  	MOVW	R14, (R16)(BLK_OUT)
  1030  	ADD	$4, R16
  1031  	ADD	$-4, IN_LEN
  1032  next2:
  1033  	CMP	IN_LEN, $2
  1034  	BLT	next1
  1035  	MOVHZ	(BLK_INP)(R16), R15
  1036  	MOVHZ	(MASK_PTR)(R16), R14
  1037  	XOR	R14, R15, R14
  1038  	MOVH	R14, (R16)(BLK_OUT)
  1039  	ADD	$2, R16
  1040  	ADD	$-2, IN_LEN
  1041  next1:
  1042  	CMP	IN_LEN, $1
  1043  	BLT	done
  1044  	MOVBZ	(MASK_PTR)(R16), R14
  1045  	MOVBZ	(BLK_INP)(R16), R15
  1046  	XOR	R14, R15, R14
  1047  	MOVB	R14, (R16)(BLK_OUT)
  1048  #endif
  1049  done:
  1050  	// Save the updated counter value
  1051  	P8_STXVB16X(V30, COUNTER, R0)
  1052  	// Clear the keys
  1053  	XXLXOR	VS0, VS0, VS0
  1054  	XXLXOR	VS1, VS1, VS1
  1055  	XXLXOR	VS2, VS2, VS2
  1056  	XXLXOR	VS3, VS3, VS3
  1057  	XXLXOR	VS4, VS4, VS4
  1058  	XXLXOR	VS5, VS5, VS5
  1059  	XXLXOR	VS6, VS6, VS6
  1060  	XXLXOR	VS7, VS7, VS7
  1061  	XXLXOR	VS8, VS8, VS8
  1062  	XXLXOR	VS9, VS9, VS9
  1063  	XXLXOR	VS10, VS10, VS10
  1064  	XXLXOR	VS11, VS11, VS11
  1065  	XXLXOR	VS12, VS12, VS12
  1066  	XXLXOR	VS13, VS13, VS13
  1067  	XXLXOR	VS14, VS14, VS14
  1068  	RET
  1069  
  1070  
