Text file src/crypto/aes/gcm_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

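// ACC0, ACC1 and ACCM hold the three halves of an unreduced 256-bit
// Karatsuba product: high, low and middle term. reduce() folds the middle
// term into the two halves, then performs the reduction modulo the GHASH
// polynomial with two carry-less multiplications by POLY. For reference,
// a hedged bit-serial Go sketch of the whole field multiplication in the
// NIST SP 800-38D bit order (this file byte-reverses its operands so that
// plain VPMULL can be used instead of that reflected bit order):
//
//	func ghashMul(xHi, xLo, yHi, yLo uint64) (zHi, zLo uint64) {
//		vHi, vLo := xHi, xLo
//		for i := 0; i < 128; i++ {
//			var bit uint64
//			if i < 64 {
//				bit = (yHi >> (63 - i)) & 1
//			} else {
//				bit = (yLo >> (127 - i)) & 1
//			}
//			if bit == 1 { // Z ^= V for each set bit of Y
//				zHi ^= vHi
//				zLo ^= vLo
//			}
//			lsb := vLo & 1
//			vLo = vLo>>1 | vHi<<63 // V >>= 1
//			vHi >>= 1
//			if lsb == 1 { // wrap with the reflected polynomial
//				vHi ^= 0xe100000000000000
//			}
//		}
//		return
//	}
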
// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

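	// Per the GCM spec the final GHASH block is [len(A)]64 || [len(C)]64,
	// both in bits, hence the shifts by 3 above. A hedged sketch with
	// assumed names (the block is built here directly in the byte-reversed
	// domain the multiplication uses, so no VREV64 is needed first):
	//
	//	var lenBlock [16]byte
	//	binary.BigEndian.PutUint64(lenBlock[0:8], dLen<<3)  // AAD bits
	//	binary.BigEndian.PutUint64(lenBlock[8:16], pLen<<3) // ciphertext bits
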
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
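	// NR holds len(ks) = 4*(rounds+1): 44, 52 or 60 words for AES-128,
	// -192 and -256. Bit 4 is set only for 52 and 60 and, once bit 4 is
	// known set, bit 3 distinguishes 60 from 52, which is all the TBZ
	// tests below need. The assumed relation, as a one-line sketch:
	//
	//	func rounds(ksLen int) int { return ksLen/4 - 1 } // 44→10, 52→12, 60→14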
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

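	// Viewed in the plain polynomial basis, this step is a doubling in
	// GF(2^128): shift left one bit and, if a bit fell off the top, fold
	// x^7 + x^2 + x + 1 back in. A minimal Go sketch of that view (the
	// 0xC2... constant in POLY is taken here to be the bit-reflected form
	// that the byte-reversed representation calls for):
	//
	//	func doubleH(hi, lo uint64) (uint64, uint64) {
	//		carry := hi >> 63   // bit shifted out of the top
	//		hi = hi<<1 | lo>>63 // 128-bit shift left by one
	//		lo <<= 1
	//		lo ^= carry * 0x87 // x^7 + x^2 + x + 1
	//		return hi, lo
	//	}
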
	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

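	// The table is filled back to front: the loop below computes H^2 .. H^8
	// and stores each power next to its Karatsuba fold (hi ^ lo), so
	// productTable reads [H^8, H^8', ..., H^1, H^1'] and the eight-block
	// loops can walk it forward with VLD1.P. The fold turns a 128x128
	// carry-less multiply into three 64x64 ones; a hedged Go sketch with a
	// bit-serial stand-in for VPMULL:
	//
	//	func clmul64(a, b uint64) (hi, lo uint64) {
	//		for i := uint(0); i < 64; i++ {
	//			if b&(1<<i) != 0 {
	//				lo ^= a << i
	//				hi ^= a >> (64 - i) // 0 when i == 0
	//			}
	//		}
	//		return
	//	}
	//
	//	// The three unreduced halves, as kept in ACC0/ACC1/ACCM.
	//	func karatsuba(aHi, aLo, bHi, bLo uint64) (acc0, acc1, accM [2]uint64) {
	//		acc0[0], acc0[1] = clmul64(aHi, bHi)         // VPMULL2
	//		acc1[0], acc1[1] = clmul64(aLo, bLo)         // VPMULL
	//		accM[0], accM[1] = clmul64(aHi^aLo, bHi^bLo) // uses the fold
	//		return
	//	}
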
	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

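// mulRound folds one more block into the accumulators using the next
// (descending) power of H from the table, so eight blocks share a single
// reduce(). In field terms, with running tag Y and blocks X1..X8:
//
//	Y' = (Y ^ X1)*H^8 ^ X2*H^7 ^ ... ^ X8*H^1
//
// A hedged sketch built on the reference ghashMul above (unlike the
// assembly, it reduces every product instead of deferring the reduction):
//
//	func ghash8(y [2]uint64, x [8][2]uint64, hPow [8][2]uint64) (out [2]uint64) {
//		x[0][0] ^= y[0] // hPow[0] = H^8 ... hPow[7] = H^1
//		x[0][1] ^= y[1]
//		for i := range x {
//			hi, lo := ghashMul(x[i][0], x[i][1], hPow[i][0], hPow[i][1])
//			out[0] ^= hi
//			out[1] ^= lo
//		}
//		return
//	}
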
	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

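// The 13-byte fast path matches the AAD of a TLS 1.2 record: an 8-byte
// sequence number, a 1-byte record type, a 2-byte version and a 2-byte
// length, loaded below as 8+4+1 bytes. A sketch of the assumed layout:
//
//	var b [16]byte
//	copy(b[:13], aad) // seq(8) || type(1) || version(2) || length(2)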
dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
		CMP	$128, autLen
		BLT	startSinglesLoop
		SUB	$128, autLen

		VLD1.P	32(aut), [B0.B16, B1.B16]

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		VLD1.P  32(aut), [B2.B16, B3.B16]
		mulRound(B2)
		mulRound(B3)
		VLD1.P  32(aut), [B4.B16, B5.B16]
		mulRound(B4)
		mulRound(B5)
		VLD1.P  32(aut), [B6.B16, B7.B16]
		mulRound(B6)
		mulRound(B7)

		MOVD	pTblSave, pTbl
		reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

		CMP	$16, autLen
		BLT	dataEnd
		SUB	$16, autLen

		VLD1.P	16(aut), [B0.B16]
dataMul:
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

dataLoadLoop:
		MOVB.W	-1(aut), H0
		VEXT	$15, B0.B16, ZERO.B16, B0.B16
		VMOV	H0, B0.B[0]
		SUBS	$1, autLen
		BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks); the TBZ tests below derive the round count from it
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
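	// Keeping the counter byte-swapped (VREV32) lets it be stepped with a
	// plain vector add of INC = (0, 0, 0, 1); each block swaps it back
	// before encryption. The equivalent scalar step, as a sketch using
	// encoding/binary:
	//
	//	ctr := binary.BigEndian.Uint32(block[12:])
	//	binary.BigEndian.PutUint32(block[12:], ctr+1)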
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, B0.B16
		VEOR	KLAST.B16, B1.B16, B1.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16
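		// AESE performs each round's AddRoundKey before SubBytes and
		// ShiftRows, so the final AddRoundKey belongs to no AESE and is
		// applied above as a plain VEOR with the last round key.
		// Schematically:
		//
		//	state = aese(state, k[rounds-1]) // last round, no AESMC
		//	state ^= kLast                   // final AddRoundKey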

		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P  [B6.B16, B7.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16
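	// For AES-192/256 the extra round keys live in B1..B4, which are free
	// in the single-block loops, and KLAST is advanced to the true last key.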

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16
encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

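	// Gather the 1..15 trailing bytes of src into T0, loading from the end
	// in 8/4/2/1-byte chunks according to the bits of srcPtrLen, while
	// building a matching all-ones byte mask in T3 (H1 is ~0). After the
	// block is encrypted, the mask zeroes the keystream bytes beyond the
	// message, as if by:
	//
	//	for i := n; i < 16; i++ {
	//		block[i] = 0 // effect of the VAND with T3 at tailLast
	//	}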
	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
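	// encReduce stores a full 16 bytes, spilling past the message into
	// dst's tag space (presumably safe for the same reason as the
	// decrypt-side load below: the destination has room for the tag, and
	// the masked-off zero bytes are overwritten when the tag is written).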
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks); the TBZ tests below derive the round count from it
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, T1.B16
		VEOR	KLAST.B16, B1.B16, T2.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

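		// Decryption authenticates the ciphertext, so the input block is
		// byte-reversed into B5 for GHASH before being decrypted in place.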
		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16

		VST1.P	[B0.B16], 16(dstPtr)

		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

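	// Unlike the encrypt tail, the decrypt tail stores exactly srcPtrLen
	// bytes (the 8/4/2/1-byte stores below) and instead masks the loaded
	// ciphertext in B5 so that only real bytes enter GHASH.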
	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET
