Text file src/crypto/internal/fips140/aes/aes_ppc64x.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl

// Some function names were changed to be consistent with Go function
// names. For instance, the functions aes_p8_set_{en,de}crypt_key become
// set{En,De}cryptKeyAsm. I also split setEncryptKeyAsm in two parts
// and created a new function (doEncryptKeyAsm). This was necessary to
// avoid overwriting arguments when setDecryptKeyAsm calls setEncryptKeyAsm.
// There were other modifications as well, but the functionality is unchanged.

#include "textflag.h"

// For expandKeyAsm
#define INP     R3
#define BITS    R4
#define OUTENC  R5 // Pointer to next expanded encrypt key
#define PTR     R6
#define CNT     R7
#define ROUNDS  R8
#define OUTDEC  R9  // Pointer to next expanded decrypt key
#define TEMP    R19
#define ZERO    V0
#define IN0     V1
#define IN1     V2
#define KEY     V3
#define RCON    V4
#define MASK    V5
#define TMP     V6
#define STAGE   V7
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11

// For P9 instruction emulation
#define ESPERM  V21  // Endian swapping permute into BE
#define TMP2    V22  // Temporary for P8_STXVB16X

// For {en,de}cryptBlockAsm
#define BLK_INP    R3
#define BLK_OUT    R4
#define BLK_KEY    R5
#define BLK_ROUNDS R6
#define BLK_IDX    R7

DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80
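
// The ·rcon table serves three purposes: bytes 0-15 are the byte
// reversal permute loaded into ESPERM, bytes 16-47 hold the round
// constants consumed (and doubled in place via VADDUWM) through RCON,
// and bytes 48-63 are the rotate-and-splat permute loaded into MASK by
// expandKeyAsm. The trailing 16 bytes are zeros.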

#ifdef GOARCH_ppc64le
#  ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X	VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT)    XXBRD	VA, VT
#define SETUP_ESPERM(rtmp)
#  else
// On POWER8/ppc64le, emulate the POWER9 byte-reversing load/store
// instructions by loading the doublewords with LXVD2X/STXVD2X and
// permuting the bytes of the vector into big-endian order.
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X	(RA+RB), VT \
	VPERM	VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM	VS, VS, ESPERM, TMP2 \
	STXVD2X	TMP2, (RA+RB)

#define XXBRD_ON_LE(VA,VT) \
	VPERM	VA, VA, ESPERM, VT

// Setup byte-swapping permute value in ESPERM for POWER9 instruction
// emulation macros.
#define SETUP_ESPERM(rtmp) \
	MOVD	$·rcon(SB), rtmp \
	LVX	(rtmp), ESPERM
#  endif // defined(GOPPC64_power9)
#else
#define P8_LXVB16X(RA,RB,VT)  LXVD2X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X	VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#define SETUP_ESPERM(rtmp)
#endif // defined(GOARCH_ppc64le)
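
// For example, on ppc64le/POWER8 a P8_LXVB16X expands to an LXVD2X
// followed by a VPERM with ESPERM, which reverses all 16 bytes so that
// the register holds the same value a POWER9 LXVB16X (a true big-endian
// load) would produce. On ppc64 and on POWER9/ppc64le the loads and
// stores are single instructions and no permute is needed.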

// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
	// Load the arguments into registers
	MOVD	nr+0(FP), ROUNDS
	MOVD	key+8(FP), INP
	MOVD	enc+16(FP), OUTENC
	MOVD	dec+24(FP), OUTDEC

#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
	LVX	(PTR), ESPERM
	ADD	$0x10, PTR
#else
	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	// Load the key from memory and write it aligned into a VR
	P8_LXVB16X(INP, R0, IN0)
	ADD	$0x10, INP, INP
	MOVD	$0x20, TEMP

	CMPW	ROUNDS, $12
	LVX	(PTR)(R0), RCON    // lvx   4,0,6      Load first 16 bytes into RCON
	LVX	(PTR)(TEMP), MASK
	ADD	$0x10, PTR, PTR    // addi  6,6,0x10   PTR to next 16 bytes of RCON
	MOVD	$8, CNT            // li    7,8        CNT = 8
	VXOR	ZERO, ZERO, ZERO   // vxor  0,0,0      Zero to be zero :)
	MOVD	CNT, CTR           // mtctr 7          Set the counter to 8 (rounds)

	// The expanded decrypt key is the expanded encrypt key stored in reverse order.
	// Move OUTDEC to the last key location, and store in descending order.
	ADD	$160, OUTDEC, OUTDEC
	BLT	loop128 // ROUNDS < 12: AES-128
	ADD	$32, OUTDEC, OUTDEC
	BEQ	l192    // ROUNDS == 12: AES-192
	ADD	$32, OUTDEC, OUTDEC
	JMP	l256    // ROUNDS == 14: AES-256

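	// Each iteration of loop128 derives one round key. The VPERM with
	// MASK rotates the last word of the previous key and splats it
	// across the vector, and VCIPHERLAST with RCON then performs
	// SubWord and adds the round constant: with all four words equal,
	// VCIPHERLAST's ShiftRows step has no effect, leaving just
	// SubBytes plus the xor with RCON.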
loop128:
	// Key schedule (Round 1 to 8)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5         Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3
	BDNZ	loop128

	LVX	(PTR)(R0), RCON // lvx 4,0,6     Last two round keys

	// Key schedule (Round 9)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 10)
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3

	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 11)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)

	RET

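	// l192 expands a 24 byte key. The key spans a vector and a half
	// (IN0 plus the upper half of IN1), so each loop192 iteration
	// emits three round keys, splicing the halves together through
	// STAGE with VSLDOI.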
l192:
	LXSDX	(INP+R0), IN1                    // Load next 8 bytes into upper half of VSR.
	XXBRD_ON_LE(IN1, IN1)                    // and convert to BE ordering on LE hosts.
	MOVD	$4, CNT                          // li 7,4
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	VSPLTISB	$8, KEY                  // vspltisb 3,8
	MOVD	CNT, CTR                         // mtctr 7
	VSUBUBM	MASK, KEY, MASK                  // vsububm 5,5,3

loop192:
	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	VCIPHERLAST	KEY, RCON, KEY      // vcipherlast 3,3,4

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6

	VSLDOI	$8, ZERO, IN1, STAGE  // vsldoi 7,0,2,8
	VSPLTW	$3, IN0, TMP          // vspltw 6,1,3
	VXOR	TMP, IN1, TMP         // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1   // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON      // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1         // vxor 2,2,6
	VXOR	IN0, KEY, IN0         // vxor 1,1,3
	VXOR	IN1, KEY, IN1         // vxor 2,2,3
	VSLDOI	$8, STAGE, IN0, STAGE // vsldoi 7,7,1,8

	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSLDOI	$8, IN0, IN1, STAGE              // vsldoi 7,1,2,8
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSPLTW	$3, IN0, TMP                     // vspltw 6,1,3
	VXOR	TMP, IN1, TMP                    // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1              // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1                    // vxor 2,2,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	VXOR	IN1, KEY, IN1                    // vxor 2,2,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BDNZ	loop192

	RET

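	// l256 expands a 32 byte key held in IN0 and IN1. Even steps use
	// the same VPERM/VCIPHERLAST sequence as the 128-bit path; the odd
	// steps need SubWord without a rotation or round constant, which
	// the VSPLTW plus VSBOX pair below provides.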
l256:
	P8_LXVB16X(INP, R0, IN1)
	MOVD	$7, CNT                          // li 7,7
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	MOVD	CNT, CTR                         // mtctr 7

loop256:
	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN1, (R0+OUTENC)
	STXVD2X	IN1, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BDZ	done

	VSPLTW	$3, IN0, KEY        // vspltw 3,1,3
	VSLDOI	$12, ZERO, IN1, TMP // vsldoi 6,0,2,12
	VSBOX	KEY, KEY            // vsbox 3,3

	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6

	VXOR	IN1, KEY, IN1 // vxor 2,2,3
	JMP	loop256       // b .Loop256

done:
	RET

// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12
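	// R6-R12 now hold the byte offsets of the next seven round keys
	// relative to the key pointer R5; after the first 28 key words
	// are consumed, R5 is advanced by 112 so the same offsets reach
	// the remaining keys.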

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the ciphertext.
	// Load xk[0:3] and xor with text
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Lenc_tail // Key size 10?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Lenc_tail // Key size 12?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Lenc_tail:
	// Cipher the last two keys such that key information is
	// cleared from V1 and V2.
	VCIPHER		V0, V1, V1
	VCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Trigger a segfault; this should never happen. Only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12
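	// As in encryptBlockAsm, R6-R12 hold the round key offsets, and
	// R5 is advanced by 112 partway through to reach the later keys.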

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the plaintext.
	// Load xk[0:3] and xor with ciphertext
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Ldec_tail // Key size 10?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Ldec_tail // Key size 12?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Ldec_tail:
	// Cipher the last two keys such that key information is
	// cleared from V1 and V2.
	VNCIPHER	V0, V1, V1
	VNCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Trigger a segfault; this should never happen. Only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// Remove the defines above so the names can be redefined here.
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP

#define INP R3
#define OUTP R4
#define LEN R5
#define KEYP R6
#define ROUNDS R7
#define IVP R8
#define ENC R9

#define INOUT V2
#define TMP V3
#define IVEC V4

// Load the crypt key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// R12,R14-R21 are scratch registers.
// For a 10 round key, V6, V11-V20 hold the expanded key.
// For a 12 round key, V6, V9-V20 hold the expanded key.
// For a 14 round key, V6, V7-V20 hold the expanded key.
#define LOAD_KEY(Rkeyp) \
	MOVD	$16, R12 \
	MOVD	$32, R14 \
	MOVD	$48, R15 \
	MOVD	$64, R16 \
	MOVD	$80, R17 \
	MOVD	$96, R18 \
	MOVD	$112, R19 \
	MOVD	$128, R20 \
	MOVD	$144, R21 \
	LXVD2X	(R0+Rkeyp), V6 \
	ADD	$16, Rkeyp \
	BEQ	CR1, L_start10 \
	BEQ	CR2, L_start12 \
	LXVD2X	(R0+Rkeyp), V7 \
	LXVD2X	(R12+Rkeyp), V8 \
	ADD	$32, Rkeyp \
	L_start12: \
	LXVD2X	(R0+Rkeyp), V9 \
	LXVD2X	(R12+Rkeyp), V10 \
	ADD	$32, Rkeyp \
	L_start10: \
	LXVD2X	(R0+Rkeyp), V11 \
	LXVD2X	(R12+Rkeyp), V12 \
	LXVD2X	(R14+Rkeyp), V13 \
	LXVD2X	(R15+Rkeyp), V14 \
	LXVD2X	(R16+Rkeyp), V15 \
	LXVD2X	(R17+Rkeyp), V16 \
	LXVD2X	(R18+Rkeyp), V17 \
	LXVD2X	(R19+Rkeyp), V18 \
	LXVD2X	(R20+Rkeyp), V19 \
	LXVD2X	(R21+Rkeyp), V20
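
// LOAD_KEY branches into the middle of its own load sequence: the 10
// and 12 round cases skip the leading loads, so the last ten round
// keys always land in V11-V20 regardless of key size.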

// Perform the AES cipher operation for key sizes 10/12/14 using the keys
// loaded by LOAD_KEY, and the key size information held in CR1EQ/CR2EQ.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly better encryption
// performance V6 and IVEC can be swapped (xor is both associative and
// commutative) during encryption, turning:
//
//	VXOR INOUT, IVEC, INOUT
//	VXOR INOUT, V6, INOUT
//
// into
//
//	VXOR INOUT, V6, INOUT
//	VXOR INOUT, IVEC, INOUT
//
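// The label10 and label12 arguments give each expansion its own branch
// targets: Go assembler labels are local to a function, and
// cryptBlocksChain expands this macro twice within a single function.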
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
	VXOR	Vin, Vxor, Vout \
	BEQ	CR1, label10 \
	BEQ	CR2, label12 \
	vcipher	Vout, V7, Vout \
	vcipher	Vout, V8, Vout \
	label12: \
	vcipher	Vout, V9, Vout \
	vcipher	Vout, V10, Vout \
	label10: \
	vcipher	Vout, V11, Vout \
	vcipher	Vout, V12, Vout \
	vcipher	Vout, V13, Vout \
	vcipher	Vout, V14, Vout \
	vcipher	Vout, V15, Vout \
	vcipher	Vout, V16, Vout \
	vcipher	Vout, V17, Vout \
	vcipher	Vout, V18, Vout \
	vcipher	Vout, V19, Vout \
	vciphel	Vout, V20, Vout
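// Zeroize the expanded key material held in V6-V20 before returning,
// so round keys do not linger in vector registers after the call.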
#define CLEAR_KEYS() \
	VXOR	V6, V6, V6 \
	VXOR	V7, V7, V7 \
	VXOR	V8, V8, V8 \
	VXOR	V9, V9, V9 \
	VXOR	V10, V10, V10 \
	VXOR	V11, V11, V11 \
	VXOR	V12, V12, V12 \
	VXOR	V13, V13, V13 \
	VXOR	V14, V14, V14 \
	VXOR	V15, V15, V15 \
	VXOR	V16, V16, V16 \
	VXOR	V17, V17, V17 \
	VXOR	V18, V18, V18 \
	VXOR	V19, V19, V19 \
	VXOR	V20, V20, V20

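// cryptBlocksChain implements AES-CBC. On encrypt, each plaintext block
// is xor'ed with the previous ciphertext block (initially the IV)
// before being ciphered; on decrypt, each deciphered block is xor'ed
// with the previous ciphertext block afterwards. The last ciphertext
// block is written back through IVP so chaining can continue across
// calls.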
// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
	MOVD	src+0(FP), INP
	MOVD	dst+8(FP), OUTP
	MOVD	length+16(FP), LEN
	MOVD	key+24(FP), KEYP
	MOVD	iv+32(FP), IVP
	MOVD	enc+40(FP), ENC
	MOVD	nr+48(FP), ROUNDS

	SETUP_ESPERM(R11)

	// Assume len > 0 && len % blockSize == 0.
	CMPW	ENC, $0
	P8_LXVB16X(IVP, R0, IVEC)
	CMPU	ROUNDS, $10, CR1
	CMPU	ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.

	// Setup key in VSRs, and set loop count in CTR.
	LOAD_KEY(KEYP)
	SRD	$4, LEN
	MOVD	LEN, CTR

	BEQ	Lcbc_dec

	PCALIGN $16
Lcbc_enc:
	P8_LXVB16X(INP, R0, INOUT)
	ADD	$16, INP
	VXOR	INOUT, V6, INOUT
	CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
	VOR	INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_enc

	P8_STXVB16X(INOUT, IVP, R0)
	CLEAR_KEYS()
	RET

	PCALIGN $16
Lcbc_dec:
	P8_LXVB16X(INP, R0, TMP)
	ADD	$16, INP
	CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
	VXOR	INOUT, IVEC, INOUT
	VOR	TMP, TMP, IVEC // TMP is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_dec

	P8_STXVB16X(IVEC, IVP, R0)
	CLEAR_KEYS()
	RET

#define DO1_CIPHER(iv0, keyv, key, op) \
	LXVD2X	(key), keyv   \
	ADD	$16, key      \
	op	iv0, keyv, iv0

#define DO2_CIPHER(iv0, iv1, keyv, key, op) \
	DO1_CIPHER(iv0, keyv, key, op) \
	op	iv1, keyv, iv1

#define DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	DO2_CIPHER(iv0, iv1, keyv, key, op) \
	op	iv2, keyv, iv2              \
	op	iv3, keyv, iv3

#define DO8_CIPHER(iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7, keyv, key, op) \
	DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	op	iv4, keyv, iv4                        \
	op	iv5, keyv, iv5                        \
	op	iv6, keyv, iv6                        \
	op	iv7, keyv, iv7

#define XOR_STORE(src, iv, dstp, dstpoff) \
	XXLXOR    src, iv, V8 \
	P8_STXVB16X(V8,dstp,dstpoff)
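
// The ctrBlocksNAsm functions below implement AES-CTR for N blocks:
// the counters IV+0 .. IV+N-1 are encrypted to form the keystream,
// which is xor'ed with the source. In rough pseudocode:
//
//	for i := 0; i < N; i++ {
//		dst[i] = src[i] ^ encrypt(xk, iv+i)
//	}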

// func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks1Asm(SB), NOSPLIT|NOFRAME, $0

#define CTRBLOCK_PROLOGUE \
	MOVD	nr+0(FP), R3     \
	MOVD	xk+8(FP), R4     \
	MOVD	dst+16(FP), R5   \
	MOVD	src+24(FP), R6   \
	MOVD	ivlo+32(FP), R8  \
	MOVD	ivhi+40(FP), R9  \
	CMP	R3, $12, CR1     \
	MTVSRD	R8, V0		 \
	MTVSRD	R9, V1		 \
	XXPERMDI V1, V0, $0, V0	 \
	SETUP_ESPERM(R8)
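
// The prologue compares nr against 12 into CR1 for key size dispatch
// and assembles the 128-bit counter: MTVSRD places each 64-bit half in
// the upper doubleword of a VSR, and XXPERMDI with $0 merges the two
// upper doublewords so that V0 = ivhi || ivlo.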

	CTRBLOCK_PROLOGUE

	DO1_CIPHER(V0,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_12:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	RET

// func ctrBlocks2Asm(nr int, xk *[60]uint32, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks2Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// V1 = IV + 1 (computed as IV - (-1))

	DO2_CIPHER(V0,V1,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_12:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)

	RET

// func ctrBlocks4Asm(nr int, xk *[60]uint32, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks4Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (computed as IV - (-1))
	VSUBUQM V1, V8, V2
	VSUBUQM V2, V8, V3

	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_12:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$32, R9
	P8_LXVB16X(R6,R9,V11)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$48, R10
	P8_LXVB16X(R6,R10,V12)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)

	RET

// func ctrBlocks8Asm(nr int, xk *[60]uint32, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks8Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (computed as IV - (-1))
	VADDUQM V8, V8, V9	// V9 is -2

	VSUBUQM V0, V9, V2
	VSUBUQM V1, V9, V3
	VSUBUQM V2, V9, V4
	VSUBUQM V3, V9, V5
	VSUBUQM V4, V9, V6
	VSUBUQM V5, V9, V7
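
	// V0-V7 now hold the counters IV+0 through IV+7. The increments
	// are done by subtracting -1 and -2: XXLEQV materializes -1 in a
	// single instruction, and V9 = -1 + -1 = -2.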

	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_12:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$32, R9
	P8_LXVB16X(R6,R9,V11)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$48, R10
	P8_LXVB16X(R6,R10,V12)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$64, R11
	P8_LXVB16X(R6,R11,V13)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$80, R12
	P8_LXVB16X(R6,R12,V14)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$96, R14
	P8_LXVB16X(R6,R14,V15)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$112, R15
	P8_LXVB16X(R6,R15,V16)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)
	XOR_STORE(V13,V4,R5,R11)
	XOR_STORE(V14,V5,R5,R12)
	XOR_STORE(V15,V6,R5,R14)
	XOR_STORE(V16,V7,R5,R15)

	RET
