// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Original code can be found at the link below:
// https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl

// Some function names were changed to be consistent with Go function
// names. For instance, the functions aes_p8_set_{en,de}crypt_key became
// set{En,De}cryptKeyAsm. setEncryptKeyAsm was also split in two, and a
// new section (doEncryptKeyAsm) was created, to avoid overwriting
// arguments when setDecryptKeyAsm calls setEncryptKeyAsm. Other
// modifications were made as well, but the functionality is unchanged.

#include "textflag.h"

// For expandKeyAsm
#define INP     R3
#define BITS    R4
#define OUTENC  R5 // Pointer to next expanded encrypt key
#define PTR     R6
#define CNT     R7
#define ROUNDS  R8
#define OUTDEC  R9  // Pointer to next expanded decrypt key
#define TEMP    R19
#define ZERO    V0
#define IN0     V1
#define IN1     V2
#define KEY     V3
#define RCON    V4
#define MASK    V5
#define TMP     V6
#define STAGE   V7
#define OUTPERM V8
#define OUTMASK V9
#define OUTHEAD V10
#define OUTTAIL V11

// For P9 instruction emulation
#define ESPERM  V21  // Endian swapping permute into BE
#define TMP2    V22  // Temporary for P8_STXVB16X

// For {en,de}cryptBlockAsm
#define BLK_INP    R3
#define BLK_OUT    R4
#define BLK_KEY    R5
#define BLK_ROUNDS R6
#define BLK_IDX    R7

DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
GLOBL ·rcon(SB), RODATA, $80

#ifdef GOARCH_ppc64le
#  ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X	VS, (RA+RB)
#define XXBRD_ON_LE(VA,VT)    XXBRD	VA, VT
#define SETUP_ESPERM(rtmp)
#  else
// On POWER8/ppc64le, emulate the POWER9 byte-reversed loads and stores
// by loading unaligned doublewords and byte-swapping each doubleword.
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X	(RA+RB), VT \
	VPERM	VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM	VS, VS, ESPERM, TMP2 \
	STXVD2X	TMP2, (RA+RB)

#define XXBRD_ON_LE(VA,VT) \
	VPERM	VA, VA, ESPERM, VT

// Setup byte-swapping permute value in ESPERM for POWER9 instruction
// emulation macros.
#define SETUP_ESPERM(rtmp) \
	MOVD	$·rcon(SB), rtmp \
	LVX	(rtmp), ESPERM
#  endif // defined(GOPPC64_power9)
#else
#define P8_LXVB16X(RA,RB,VT)  LXVD2X	(RA+RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVD2X	VS, (RA+RB)
#define XXBRD_ON_LE(VA, VT)
#define SETUP_ESPERM(rtmp)
#endif // defined(GOARCH_ppc64le)
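// As an illustration (not part of the build), the net effect of P8_LXVB16X
// in all of the cases above is a 16-byte big-endian load, equivalent to this
// hypothetical Go helper (encoding/binary assumed):
//
//	func load16BE(p *[16]byte) (hi, lo uint64) {
//		hi = binary.BigEndian.Uint64(p[0:8])
//		lo = binary.BigEndian.Uint64(p[8:16])
//		return hi, lo
//	}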

// func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
	// Load the arguments into registers
	MOVD	nr+0(FP), ROUNDS
	MOVD	key+8(FP), INP
	MOVD	enc+16(FP), OUTENC
	MOVD	dec+24(FP), OUTDEC

#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
	LVX	(PTR), ESPERM
	ADD	$0x10, PTR
#else
	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
#endif

	// Load the key from memory into a VR, in BE byte order
	P8_LXVB16X(INP, R0, IN0)
	ADD	$0x10, INP, INP
	MOVD	$0x20, TEMP

	CMPW	ROUNDS, $12
	LVX	(PTR)(R0), RCON    // lvx   4,0,6      Load first 16 bytes into RCON
	LVX	(PTR)(TEMP), MASK
	ADD	$0x10, PTR, PTR    // addi  6,6,0x10   PTR to next 16 bytes of RCON
	MOVD	$8, CNT            // li    7,8        CNT = 8
	VXOR	ZERO, ZERO, ZERO   // vxor  0,0,0      Zero to be zero :)
	MOVD	CNT, CTR           // mtctr 7          Set the counter to 8 (rounds)

	// The expanded decrypt key is the expanded encrypt key stored in reverse order.
	// Move OUTDEC to the last key location, and store in descending order.
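	// Conceptually (a sketch, not the literal code): with n = (rounds+1)*16
	// bytes of expanded key, the loops below produce
	//
	//	for i := 0; i < n; i += 16 {
	//		copy(dec[i:i+16], enc[n-16-i:n-i])
	//	}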
	ADD	$160, OUTDEC, OUTDEC
	BLT	loop128
	ADD	$32, OUTDEC, OUTDEC
	BEQ	l192
	ADD	$32, OUTDEC, OUTDEC
	JMP	l256

loop128:
	// Key schedule (Round 1 to 8)
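	// In outline, each iteration derives the next four words of the
	// schedule (FIPS 197, AES-128): VPERM+VCIPHERLAST compute the
	// rotate/substitute/rcon step, and the VSLDOI/VXOR chain computes
	// the running xors:
	//
	//	t       = SubWord(RotWord(w[4i+3])) ^ rcon[i]
	//	w[4i+4] = w[4i]   ^ t
	//	w[4i+5] = w[4i+1] ^ w[4i+4]
	//	w[4i+6] = w[4i+2] ^ w[4i+5]
	//	w[4i+7] = w[4i+3] ^ w[4i+6]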
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5         Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3
	BDNZ	loop128

	LVX	(PTR)(R0), RCON // lvx 4,0,6     Load RCON for the last two round keys

	// Key schedule (Round 9)
	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 10)
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0       // vxor 1,1,3

	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	// Key schedule (Round 11)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)

	RET

l192:
	LXSDX	(INP+R0), IN1                    // Load next 8 bytes into upper half of VSR.
	XXBRD_ON_LE(IN1, IN1)                    // and convert to BE ordering on LE hosts.
	MOVD	$4, CNT                          // li 7,4
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	VSPLTISB	$8, KEY                  // vspltisb 3,8
	MOVD	CNT, CTR                         // mtctr 7
	VSUBUBM	MASK, KEY, MASK                  // vsububm 5,5,3
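	// Subtracting 8 from each byte of MASK shifts the rotate-and-splat
	// permute so it selects the last loaded key word, which for 192-bit
	// keys sits in the upper half of IN1.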

loop192:
	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
	VCIPHERLAST	KEY, RCON, KEY      // vcipherlast 3,3,4

	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0       // vxor 1,1,6

	VSLDOI	$8, ZERO, IN1, STAGE  // vsldoi 7,0,2,8
	VSPLTW	$3, IN0, TMP          // vspltw 6,1,3
	VXOR	TMP, IN1, TMP         // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1   // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON      // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1         // vxor 2,2,6
	VXOR	IN0, KEY, IN0         // vxor 1,1,3
	VXOR	IN1, KEY, IN1         // vxor 2,2,3
	VSLDOI	$8, STAGE, IN0, STAGE // vsldoi 7,7,1,8

	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSLDOI	$8, IN0, IN1, STAGE              // vsldoi 7,1,2,8
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	STXVD2X	STAGE, (R0+OUTENC)
	STXVD2X	STAGE, (R0+OUTDEC)
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VSPLTW	$3, IN0, TMP                     // vspltw 6,1,3
	VXOR	TMP, IN1, TMP                    // vxor 6,6,2
	VSLDOI	$12, ZERO, IN1, IN1              // vsldoi 2,0,2,12
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN1, TMP, IN1                    // vxor 2,2,6
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	VXOR	IN1, KEY, IN1                    // vxor 2,2,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BDNZ	loop192

	RET

l256:
	P8_LXVB16X(INP, R0, IN1)
	MOVD	$7, CNT                          // li 7,7
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	MOVD	CNT, CTR                         // mtctr 7

loop256:
	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
	STXVD2X	IN1, (R0+OUTENC)
	STXVD2X	IN1, (R0+OUTDEC)
	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC

	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
	STXVD2X	IN0, (R0+OUTENC)
	STXVD2X	IN0, (R0+OUTDEC)
	ADD	$16, OUTENC, OUTENC
	ADD	$-16, OUTDEC, OUTDEC
	BDZ	done

	VSPLTW	$3, IN0, KEY        // vspltw 3,1,3
	VSLDOI	$12, ZERO, IN1, TMP // vsldoi 6,0,2,12
	VSBOX	KEY, KEY            // vsbox 3,3

	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6
	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
	VXOR	IN1, TMP, IN1       // vxor 2,2,6

	VXOR	IN1, KEY, IN1 // vxor 2,2,3
	JMP	loop256       // b .Loop256

done:
	RET

// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3
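	// nr is 10, 12, or 14 for AES-128, AES-192, or AES-256 respectively,
	// so exactly one of CR1EQ, CR2EQ, CR3EQ is set.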

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12
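	// A sketch (not the literal code) of the straight-line rounds below,
	// with vcipher/vcipherlast standing in for the instructions of the
	// same name:
	//
	//	state := src ^ xk[0:4]                       // initial AddRoundKey
	//	for r := 1; r < nr; r++ {
	//		state = vcipher(state, xk[4*r:4*r+4])    // one full round
	//	}
	//	dst := vcipherlast(state, xk[4*nr:4*nr+4])   // final round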

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the ciphertext.
	// Load xk[0:3] and xor with text
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Lenc_tail // Key size 10?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Lenc_tail // Key size 12?
	VCIPHER	V0, V1, V0
	VCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Lenc_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VCIPHER		V0, V1, V1
	VCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Segfault; this should never happen, as only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
	MOVD	nr+0(FP), R6   // Round count/Key size
	MOVD	xk+8(FP), R5   // Key pointer
	MOVD	dst+16(FP), R3 // Dest pointer
	MOVD	src+24(FP), R4 // Src pointer
	SETUP_ESPERM(R7)

	// Set CR{1,2,3}EQ to hold the key size information.
	CMPU	R6, $10, CR1
	CMPU	R6, $12, CR2
	CMPU	R6, $14, CR3

	MOVD	$16, R6
	MOVD	$32, R7
	MOVD	$48, R8
	MOVD	$64, R9
	MOVD	$80, R10
	MOVD	$96, R11
	MOVD	$112, R12
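	// Decryption reuses the encryption key schedule, stored in reverse
	// order by expandKeyAsm; VNCIPHER's round structure allows those
	// round keys to be used as-is, with no InvMixColumns pass over them.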

	// Load text in BE order
	P8_LXVB16X(R4, R0, V0)

	// V1, V2 will hold keys, V0 is a temp.
	// At completion, V2 will hold the plaintext.
	// Load xk[0:3] and xor with ciphertext
	LXVD2X	(R0+R5), V1
	VXOR	V0, V1, V0

	// Load xk[4:11] and cipher
	LXVD2X	(R6+R5), V1
	LXVD2X	(R7+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[12:19] and cipher
	LXVD2X	(R8+R5), V1
	LXVD2X	(R9+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[20:27] and cipher
	LXVD2X	(R10+R5), V1
	LXVD2X	(R11+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Increment xk pointer to reuse constant offsets in R6-R12.
	ADD	$112, R5

	// Load xk[28:35] and cipher
	LXVD2X	(R0+R5), V1
	LXVD2X	(R6+R5), V2
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[36:43] and cipher
	LXVD2X	(R7+R5), V1
	LXVD2X	(R8+R5), V2
	BEQ	CR1, Ldec_tail // Key size 10?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[44:51] and cipher
	LXVD2X	(R9+R5), V1
	LXVD2X	(R10+R5), V2
	BEQ	CR2, Ldec_tail // Key size 12?
	VNCIPHER	V0, V1, V0
	VNCIPHER	V0, V2, V0

	// Load xk[52:59] and cipher
	LXVD2X	(R11+R5), V1
	LXVD2X	(R12+R5), V2
	BNE	CR3, Linvalid_key_len // Not key size 14?
	// Fallthrough to final cipher

Ldec_tail:
	// Cipher last two keys such that key information is
	// cleared from V1 and V2.
	VNCIPHER	V0, V1, V1
	VNCIPHERLAST	V1, V2, V2

	// Store the result in BE order.
	P8_STXVB16X(V2, R3, R0)
	RET

Linvalid_key_len:
	// Segfault; this should never happen, as only 3 key sizes are created/used.
	MOVD	R0, 0(R0)
	RET

// Undefine the macros above so the names can be redefined below
#undef INP
#undef OUTENC
#undef ROUNDS
#undef KEY
#undef TMP

#define INP R3
#define OUTP R4
#define LEN R5
#define KEYP R6
#define ROUNDS R7
#define IVP R8
#define ENC R9

#define INOUT V2
#define TMP V3
#define IVEC V4

// Load the expanded cipher key into VSRs.
//
// The expanded key is stored and loaded using
// STXVD2X/LXVD2X. The in-memory byte ordering
// depends on the endianness of the machine. The
// expanded keys are generated by expandKeyAsm above.
//
// Rkeyp holds the key pointer. It is clobbered. Once
// the expanded keys are loaded, it is not needed.
//
// R12,R14-R21 are scratch registers.
// For a 10-round (128-bit) key, V6, V11-V20 hold the expanded key.
// For a 12-round (192-bit) key, V6, V9-V20 hold the expanded key.
// For a 14-round (256-bit) key, V6, V7-V20 hold the expanded key.
#define LOAD_KEY(Rkeyp) \
	MOVD	$16, R12 \
	MOVD	$32, R14 \
	MOVD	$48, R15 \
	MOVD	$64, R16 \
	MOVD	$80, R17 \
	MOVD	$96, R18 \
	MOVD	$112, R19 \
	MOVD	$128, R20 \
	MOVD	$144, R21 \
	LXVD2X	(R0+Rkeyp), V6 \
	ADD	$16, Rkeyp \
	BEQ	CR1, L_start10 \
	BEQ	CR2, L_start12 \
	LXVD2X	(R0+Rkeyp), V7 \
	LXVD2X	(R12+Rkeyp), V8 \
	ADD	$32, Rkeyp \
	L_start12: \
	LXVD2X	(R0+Rkeyp), V9 \
	LXVD2X	(R12+Rkeyp), V10 \
	ADD	$32, Rkeyp \
	L_start10: \
	LXVD2X	(R0+Rkeyp), V11 \
	LXVD2X	(R12+Rkeyp), V12 \
	LXVD2X	(R14+Rkeyp), V13 \
	LXVD2X	(R15+Rkeyp), V14 \
	LXVD2X	(R16+Rkeyp), V15 \
	LXVD2X	(R17+Rkeyp), V16 \
	LXVD2X	(R18+Rkeyp), V17 \
	LXVD2X	(R19+Rkeyp), V18 \
	LXVD2X	(R20+Rkeyp), V19 \
	LXVD2X	(R21+Rkeyp), V20

// Perform aes cipher operation for keysize 10/12/14 using the keys
// loaded by LOAD_KEY, and key size information held in CR1EQ/CR2EQ.
//
// Vxor is ideally V6 (Key[0-3]), but for slightly improved encryption
// performance V6 and IVEC can be swapped (xor is both associative and
// commutative) during encryption, turning:
//
//	VXOR INOUT, IVEC, INOUT
//	VXOR INOUT, V6, INOUT
//
//	into
//
//	VXOR INOUT, V6, INOUT
//	VXOR INOUT, IVEC, INOUT
//
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
	VXOR	Vin, Vxor, Vout \
	BEQ	CR1, label10 \
	BEQ	CR2, label12 \
	vcipher	Vout, V7, Vout \
	vcipher	Vout, V8, Vout \
	label12: \
	vcipher	Vout, V9, Vout \
	vcipher	Vout, V10, Vout \
	label10: \
	vcipher	Vout, V11, Vout \
	vcipher	Vout, V12, Vout \
	vcipher	Vout, V13, Vout \
	vcipher	Vout, V14, Vout \
	vcipher	Vout, V15, Vout \
	vcipher	Vout, V16, Vout \
	vcipher	Vout, V17, Vout \
	vcipher	Vout, V18, Vout \
	vcipher	Vout, V19, Vout \
	vciphel	Vout, V20, Vout

#define CLEAR_KEYS() \
	VXOR	V6, V6, V6 \
	VXOR	V7, V7, V7 \
	VXOR	V8, V8, V8 \
	VXOR	V9, V9, V9 \
	VXOR	V10, V10, V10 \
	VXOR	V11, V11, V11 \
	VXOR	V12, V12, V12 \
	VXOR	V13, V13, V13 \
	VXOR	V14, V14, V14 \
	VXOR	V15, V15, V15 \
	VXOR	V16, V16, V16 \
	VXOR	V17, V17, V17 \
	VXOR	V18, V18, V18 \
	VXOR	V19, V19, V19 \
	VXOR	V20, V20, V20
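// A minimal Go sketch (not part of this file) of the chaining performed by
// cryptBlocksChain below. encryptBlock and decryptBlock are hypothetical
// stand-ins for the CIPHER_BLOCK macro, and subtle is crypto/subtle:
//
//	func cbc(dst, src, iv []byte, enc bool) {
//		prev := iv
//		for i := 0; i < len(src); i += 16 {
//			if enc {
//				var x [16]byte
//				subtle.XORBytes(x[:], src[i:i+16], prev) // p ^ c[i-1]
//				encryptBlock(dst[i:i+16], x[:])          // c[i] = E(p ^ c[i-1])
//				prev = dst[i : i+16]
//			} else {
//				var d [16]byte
//				decryptBlock(d[:], src[i:i+16])          // D(c[i])
//				subtle.XORBytes(dst[i:i+16], d[:], prev) // p = D(c[i]) ^ c[i-1]
//				prev = src[i : i+16]
//			}
//		}
//	}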

// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
	MOVD	src+0(FP), INP
	MOVD	dst+8(FP), OUTP
	MOVD	length+16(FP), LEN
	MOVD	key+24(FP), KEYP
	MOVD	iv+32(FP), IVP
	MOVD	enc+40(FP), ENC
	MOVD	nr+48(FP), ROUNDS

	SETUP_ESPERM(R11)

	// Assume len > 0 && len % blockSize == 0.
	CMPW	ENC, $0
	P8_LXVB16X(IVP, R0, IVEC)
	CMPU	ROUNDS, $10, CR1
	CMPU	ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.

	// Setup key in VSRs, and set loop count in CTR.
	LOAD_KEY(KEYP)
	SRD	$4, LEN
	MOVD	LEN, CTR

	BEQ	Lcbc_dec

	PCALIGN $16
Lcbc_enc:
	P8_LXVB16X(INP, R0, INOUT)
	ADD	$16, INP
	VXOR	INOUT, V6, INOUT
	CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
	VOR	INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_enc

	P8_STXVB16X(INOUT, IVP, R0)
	CLEAR_KEYS()
	RET

	PCALIGN $16
Lcbc_dec:
	P8_LXVB16X(INP, R0, TMP)
	ADD	$16, INP
	CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
	VXOR	INOUT, IVEC, INOUT
	VOR	TMP, TMP, IVEC // TMP is IVEC for next block.
	P8_STXVB16X(INOUT, OUTP, R0)
	ADD	$16, OUTP
	BDNZ	Lcbc_dec

	P8_STXVB16X(IVEC, IVP, R0)
	CLEAR_KEYS()
	RET

#define DO1_CIPHER(iv0, keyv, key, op) \
	LXVD2X	(key), keyv   \
	ADD	$16, key      \
	op	iv0, keyv, iv0

#define DO2_CIPHER(iv0, iv1, keyv, key, op) \
	DO1_CIPHER(iv0, keyv, key, op) \
	op	iv1, keyv, iv1

#define DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	DO2_CIPHER(iv0, iv1, keyv, key, op) \
	op	iv2, keyv, iv2              \
	op	iv3, keyv, iv3

#define DO8_CIPHER(iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7, keyv, key, op) \
	DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
	op	iv4, keyv, iv4                        \
	op	iv5, keyv, iv5                        \
	op	iv6, keyv, iv6                        \
	op	iv7, keyv, iv7

#define XOR_STORE(src, iv, dstp, dstpoff) \
	XXLXOR    src, iv, V8 \
	P8_STXVB16X(V8,dstp,dstpoff)
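// A sketch (not part of the build) of what the ctrBlocks{1,2,4,8}Asm routines
// below compute. encryptBlock and add128 are hypothetical helpers standing in
// for the DO*_CIPHER rounds and the 128-bit counter arithmetic:
//
//	for i := 0; i < n; i++ {
//		ctr := add128(iv, uint64(i))  // counter for block i
//		ks := encryptBlock(xk, ctr)   // keystream block E(IV+i)
//		subtle.XORBytes(dst[16*i:16*i+16], src[16*i:16*i+16], ks[:])
//	}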

// func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks1Asm(SB), NOSPLIT|NOFRAME, $0

#define CTRBLOCK_PROLOGUE \
	MOVD	nr+0(FP), R3     \
	MOVD	xk+8(FP), R4     \
	MOVD	dst+16(FP), R5   \
	MOVD	src+24(FP), R6   \
	MOVD	ivlo+32(FP), R8  \
	MOVD	ivhi+40(FP), R9  \
	CMP	R3, $12, CR1     \
	MTVSRD	R8, V0		 \
	MTVSRD	R9, V1		 \
	XXPERMDI V1, V0, $0, V0	 \
	SETUP_ESPERM(R8)
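	// MTVSRD places each GPR value into the most-significant doubleword
	// of its target, and XXPERMDI with selector $0 concatenates those
	// doublewords, so the prologue leaves V0 = ivhi || ivlo, the 128-bit
	// counter.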

	CTRBLOCK_PROLOGUE

	DO1_CIPHER(V0,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_12:
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHER)

	DO1_CIPHER(V0,V8,R4,VCIPHER)
	DO1_CIPHER(V0,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	RET

// func ctrBlocks2Asm(nr int, xk *[60]uint32, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks2Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
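	// XXLEQV of a register with itself yields all ones, i.e. -1 mod 2^128;
	// since x - (-1) = x + 1, VSUBUQM serves as a quadword increment.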

	DO2_CIPHER(V0,V1,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_12:
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
	DO2_CIPHER(V0,V1,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)

	RET

// func ctrBlocks4Asm(nr int, xk *[60]uint32, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks4Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
	VSUBUQM V1, V8, V2
	VSUBUQM V2, V8, V3

	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_12:
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$32, R9
	P8_LXVB16X(R6,R9,V11)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	MOVD	$48, R10
	P8_LXVB16X(R6,R10,V12)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)

	RET

// func ctrBlocks8Asm(nr int, xk *[60]uint32, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64)
TEXT ·ctrBlocks8Asm(SB), NOSPLIT|NOFRAME, $0
	CTRBLOCK_PROLOGUE

	XXLEQV  V8, V8, V8	// V8 is -1
	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
	VADDUQM V8, V8, V9	// V9 is -2

	VSUBUQM V0, V9, V2
	VSUBUQM V1, V9, V3
	VSUBUQM V2, V9, V4
	VSUBUQM V3, V9, V5
	VSUBUQM V4, V9, V6
	VSUBUQM V5, V9, V7
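	// With V9 = -2, each VSUBUQM above adds 2, so V2-V7 hold IV+2 through
	// IV+7, completing eight consecutive counters alongside V0 and V1.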

	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VXOR)

	BEQ	CR1, key_12
	BLT	CR1, key_10
key_14:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_12:
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
key_10:
	P8_LXVB16X(R6,R0,V9)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$16, R8
	P8_LXVB16X(R6,R8,V10)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$32, R9
	P8_LXVB16X(R6,R9,V11)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$48, R10
	P8_LXVB16X(R6,R10,V12)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$64, R11
	P8_LXVB16X(R6,R11,V13)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$80, R12
	P8_LXVB16X(R6,R12,V14)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$96, R14
	P8_LXVB16X(R6,R14,V15)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	MOVD	$112, R15
	P8_LXVB16X(R6,R15,V16)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHERLAST)

	XOR_STORE(V9,V0,R5,R0)
	XOR_STORE(V10,V1,R5,R8)
	XOR_STORE(V11,V2,R5,R9)
	XOR_STORE(V12,V3,R5,R10)
	XOR_STORE(V13,V4,R5,R11)
	XOR_STORE(V14,V5,R5,R12)
	XOR_STORE(V15,V6,R5,R14)
	XOR_STORE(V16,V7,R5,R15)

	RET