     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // AVX2 version by Intel, same algorithm as code in Linux kernel:
     6  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
     7  // Authors:
     8  // Ilya Albrekht <ilya.albrekht@intel.com>
     9  // Maxim Locktyukhin <maxim.locktyukhin@intel.com>
    10  // Ronen Zohar <ronen.zohar@intel.com>
    11  // Chandramouli Narayanan <mouli@linux.intel.com>
    12  
    13  //go:build !purego
    14  
    15  #include "textflag.h"
    16  
    17  // SHA-1 block routine. See sha1block.go for Go equivalent.
    18  //
    19  // There are 80 rounds of 4 types:
    20  //   - rounds 0-15 are type 1 and load data (ROUND1 macro).
    21  //   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
    22  //   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
    23  //   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
    24  //   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
    25  //
    26  // Each round loads or shuffles the data, then computes a per-round
    27  // function of b, c, d, and then mixes the result into and rotates the
    28  // five registers a, b, c, d, e holding the intermediate results.
    29  //
    30  // The register rotation is implemented by rotating the arguments to
    31  // the round macros instead of by explicit move instructions.
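     //
     // For reference, a rough Go sketch of one type-1 round and the register
     // rotation (an illustration of the structure only, not the code in
     // sha1block.go; w, p and rol are assumed helpers):
     //
     //	w[i] = binary.BigEndian.Uint32(p[i*4:])      // LOAD
     //	f := d ^ (b & (c ^ d))                       // FUNC1: Ch(b, c, d)
     //	t := rol(a, 5) + f + e + w[i] + 0x5A827999   // MIX
     //	a, b, c, d, e = t, a, rol(b, 30), c, d       // "rotate" the registers
     //
     // Rounds 20-39 and 60-79 use b^c^d instead (FUNC2/FUNC4), and rounds
     // 40-59 use the majority function (b&c)|(b&d)|(c&d) (FUNC3), each with
     // its own round constant.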
    32  
    33  #define LOAD(index) \
    34  	MOVL	(index*4)(SI), R10; \
    35  	BSWAPL	R10; \
    36  	MOVL	R10, (index*4)(SP)
    37  
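         // SHUFFLE keeps only a 16-entry window of the message schedule on the
         // stack and updates it in place. A rough Go sketch of the same
         // recurrence (w being a hypothetical [16]uint32 window):
         //
         //	w[i&0xf] = rol(w[i&0xf] ^ w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf], 1)
         //
         // Since w[i&0xf] still holds w[i-16] when this runs, it is exactly
         // w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 from the spec.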
    38  #define SHUFFLE(index) \
    39  	MOVL	(((index)&0xf)*4)(SP), R10; \
    40  	XORL	(((index-3)&0xf)*4)(SP), R10; \
    41  	XORL	(((index-8)&0xf)*4)(SP), R10; \
    42  	XORL	(((index-14)&0xf)*4)(SP), R10; \
    43  	ROLL	$1, R10; \
    44  	MOVL	R10, (((index)&0xf)*4)(SP)
    45  
    46  #define FUNC1(a, b, c, d, e) \
    47  	MOVL	d, R9; \
    48  	XORL	c, R9; \
    49  	ANDL	b, R9; \
    50  	XORL	d, R9
    51  
    52  #define FUNC2(a, b, c, d, e) \
    53  	MOVL	b, R9; \
    54  	XORL	c, R9; \
    55  	XORL	d, R9
    56  
    57  #define FUNC3(a, b, c, d, e) \
    58  	MOVL	b, R8; \
    59  	ORL	c, R8; \
    60  	ANDL	d, R8; \
    61  	MOVL	b, R9; \
    62  	ANDL	c, R9; \
    63  	ORL	R8, R9
    64  
    65  #define FUNC4 FUNC2
    66  
    67  #define MIX(a, b, c, d, e, const) \
    68  	ROLL	$30, b; \
    69  	ADDL	R9, e; \
    70  	MOVL	a, R8; \
    71  	ROLL	$5, R8; \
    72  	LEAL	const(e)(R10*1), e; \
    73  	ADDL	R8, e
    74  
    75  #define ROUND1(a, b, c, d, e, index) \
    76  	LOAD(index); \
    77  	FUNC1(a, b, c, d, e); \
    78  	MIX(a, b, c, d, e, 0x5A827999)
    79  
    80  #define ROUND1x(a, b, c, d, e, index) \
    81  	SHUFFLE(index); \
    82  	FUNC1(a, b, c, d, e); \
    83  	MIX(a, b, c, d, e, 0x5A827999)
    84  
    85  #define ROUND2(a, b, c, d, e, index) \
    86  	SHUFFLE(index); \
    87  	FUNC2(a, b, c, d, e); \
    88  	MIX(a, b, c, d, e, 0x6ED9EBA1)
    89  
    90  #define ROUND3(a, b, c, d, e, index) \
    91  	SHUFFLE(index); \
    92  	FUNC3(a, b, c, d, e); \
    93  	MIX(a, b, c, d, e, 0x8F1BBCDC)
    94  
    95  #define ROUND4(a, b, c, d, e, index) \
    96  	SHUFFLE(index); \
    97  	FUNC4(a, b, c, d, e); \
    98  	MIX(a, b, c, d, e, 0xCA62C1D6)
    99  
   100  TEXT ·blockAMD64(SB),NOSPLIT,$64-32
   101  	MOVQ	dig+0(FP),	BP
   102  	MOVQ	p_base+8(FP),	SI
   103  	MOVQ	p_len+16(FP),	DX
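         	// Round the length down to a multiple of the 64-byte block size;
         	// DI (computed below) then points just past the last full block.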
   104  	SHRQ	$6,		DX
   105  	SHLQ	$6,		DX
   106  
   107  	LEAQ	(SI)(DX*1),	DI
   108  	MOVL	(0*4)(BP),	AX
   109  	MOVL	(1*4)(BP),	BX
   110  	MOVL	(2*4)(BP),	CX
   111  	MOVL	(3*4)(BP),	DX
   112  	MOVL	(4*4)(BP),	BP
   113  
   114  	CMPQ	SI,		DI
   115  	JEQ	end
   116  
   117  loop:
   118  	MOVL	AX,	R11
   119  	MOVL	BX,	R12
   120  	MOVL	CX,	R13
   121  	MOVL	DX,	R14
   122  	MOVL	BP,	R15
   123  
   124  	ROUND1(AX, BX, CX, DX, BP, 0)
   125  	ROUND1(BP, AX, BX, CX, DX, 1)
   126  	ROUND1(DX, BP, AX, BX, CX, 2)
   127  	ROUND1(CX, DX, BP, AX, BX, 3)
   128  	ROUND1(BX, CX, DX, BP, AX, 4)
   129  	ROUND1(AX, BX, CX, DX, BP, 5)
   130  	ROUND1(BP, AX, BX, CX, DX, 6)
   131  	ROUND1(DX, BP, AX, BX, CX, 7)
   132  	ROUND1(CX, DX, BP, AX, BX, 8)
   133  	ROUND1(BX, CX, DX, BP, AX, 9)
   134  	ROUND1(AX, BX, CX, DX, BP, 10)
   135  	ROUND1(BP, AX, BX, CX, DX, 11)
   136  	ROUND1(DX, BP, AX, BX, CX, 12)
   137  	ROUND1(CX, DX, BP, AX, BX, 13)
   138  	ROUND1(BX, CX, DX, BP, AX, 14)
   139  	ROUND1(AX, BX, CX, DX, BP, 15)
   140  
   141  	ROUND1x(BP, AX, BX, CX, DX, 16)
   142  	ROUND1x(DX, BP, AX, BX, CX, 17)
   143  	ROUND1x(CX, DX, BP, AX, BX, 18)
   144  	ROUND1x(BX, CX, DX, BP, AX, 19)
   145  
   146  	ROUND2(AX, BX, CX, DX, BP, 20)
   147  	ROUND2(BP, AX, BX, CX, DX, 21)
   148  	ROUND2(DX, BP, AX, BX, CX, 22)
   149  	ROUND2(CX, DX, BP, AX, BX, 23)
   150  	ROUND2(BX, CX, DX, BP, AX, 24)
   151  	ROUND2(AX, BX, CX, DX, BP, 25)
   152  	ROUND2(BP, AX, BX, CX, DX, 26)
   153  	ROUND2(DX, BP, AX, BX, CX, 27)
   154  	ROUND2(CX, DX, BP, AX, BX, 28)
   155  	ROUND2(BX, CX, DX, BP, AX, 29)
   156  	ROUND2(AX, BX, CX, DX, BP, 30)
   157  	ROUND2(BP, AX, BX, CX, DX, 31)
   158  	ROUND2(DX, BP, AX, BX, CX, 32)
   159  	ROUND2(CX, DX, BP, AX, BX, 33)
   160  	ROUND2(BX, CX, DX, BP, AX, 34)
   161  	ROUND2(AX, BX, CX, DX, BP, 35)
   162  	ROUND2(BP, AX, BX, CX, DX, 36)
   163  	ROUND2(DX, BP, AX, BX, CX, 37)
   164  	ROUND2(CX, DX, BP, AX, BX, 38)
   165  	ROUND2(BX, CX, DX, BP, AX, 39)
   166  
   167  	ROUND3(AX, BX, CX, DX, BP, 40)
   168  	ROUND3(BP, AX, BX, CX, DX, 41)
   169  	ROUND3(DX, BP, AX, BX, CX, 42)
   170  	ROUND3(CX, DX, BP, AX, BX, 43)
   171  	ROUND3(BX, CX, DX, BP, AX, 44)
   172  	ROUND3(AX, BX, CX, DX, BP, 45)
   173  	ROUND3(BP, AX, BX, CX, DX, 46)
   174  	ROUND3(DX, BP, AX, BX, CX, 47)
   175  	ROUND3(CX, DX, BP, AX, BX, 48)
   176  	ROUND3(BX, CX, DX, BP, AX, 49)
   177  	ROUND3(AX, BX, CX, DX, BP, 50)
   178  	ROUND3(BP, AX, BX, CX, DX, 51)
   179  	ROUND3(DX, BP, AX, BX, CX, 52)
   180  	ROUND3(CX, DX, BP, AX, BX, 53)
   181  	ROUND3(BX, CX, DX, BP, AX, 54)
   182  	ROUND3(AX, BX, CX, DX, BP, 55)
   183  	ROUND3(BP, AX, BX, CX, DX, 56)
   184  	ROUND3(DX, BP, AX, BX, CX, 57)
   185  	ROUND3(CX, DX, BP, AX, BX, 58)
   186  	ROUND3(BX, CX, DX, BP, AX, 59)
   187  
   188  	ROUND4(AX, BX, CX, DX, BP, 60)
   189  	ROUND4(BP, AX, BX, CX, DX, 61)
   190  	ROUND4(DX, BP, AX, BX, CX, 62)
   191  	ROUND4(CX, DX, BP, AX, BX, 63)
   192  	ROUND4(BX, CX, DX, BP, AX, 64)
   193  	ROUND4(AX, BX, CX, DX, BP, 65)
   194  	ROUND4(BP, AX, BX, CX, DX, 66)
   195  	ROUND4(DX, BP, AX, BX, CX, 67)
   196  	ROUND4(CX, DX, BP, AX, BX, 68)
   197  	ROUND4(BX, CX, DX, BP, AX, 69)
   198  	ROUND4(AX, BX, CX, DX, BP, 70)
   199  	ROUND4(BP, AX, BX, CX, DX, 71)
   200  	ROUND4(DX, BP, AX, BX, CX, 72)
   201  	ROUND4(CX, DX, BP, AX, BX, 73)
   202  	ROUND4(BX, CX, DX, BP, AX, 74)
   203  	ROUND4(AX, BX, CX, DX, BP, 75)
   204  	ROUND4(BP, AX, BX, CX, DX, 76)
   205  	ROUND4(DX, BP, AX, BX, CX, 77)
   206  	ROUND4(CX, DX, BP, AX, BX, 78)
   207  	ROUND4(BX, CX, DX, BP, AX, 79)
   208  
   209  	ADDL	R11, AX
   210  	ADDL	R12, BX
   211  	ADDL	R13, CX
   212  	ADDL	R14, DX
   213  	ADDL	R15, BP
   214  
   215  	ADDQ	$64, SI
   216  	CMPQ	SI, DI
   217  	JB	loop
   218  
   219  end:
   220  	MOVQ	dig+0(FP), DI
   221  	MOVL	AX, (0*4)(DI)
   222  	MOVL	BX, (1*4)(DI)
   223  	MOVL	CX, (2*4)(DI)
   224  	MOVL	DX, (3*4)(DI)
   225  	MOVL	BP, (4*4)(DI)
   226  	RET
   227  
   228  
   229  // This is the implementation using AVX2, BMI1 and BMI2. It is based on:
   230  // "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
   231  // From http://software.intel.com/en-us/articles
   232  // (look for improving-the-performance-of-the-secure-hash-algorithm-1)
    233  // This implementation is 2x unrolled, and interleaves the vector
    234  // instructions used to precompute W with the scalar computation of the
    235  // current round, for optimal scheduling.
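         //
         // Very roughly, the control flow of the CALC macro below looks like the
         // following Go-style sketch (precalcWK, doRounds, updateHash and the two
         // buffers are illustrative names; in the real code the vector and scalar
         // work is interleaved instruction by instruction, not called in sequence):
         //
         //	precalcWK(buf0, first two blocks)
         //	for {
         //		doRounds(buf0, block n)     // CALC_0..CALC_79, interleaved with
         //		                            // part of precalcWK(buf1, next two blocks)
         //		updateHash()
         //		if no more blocks { break }
         //		doRounds(buf0, block n+1)   // CALC_80..CALC_159 plus the rest of the precalc
         //		updateHash()
         //		buf0, buf1 = buf1, buf0     // XCHGQ R15, R14
         //	}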
   236  
   237  // Trivial helper macros.
   238  #define UPDATE_HASH(A,TB,C,D,E) \
   239  	ADDL	(R9), A \
   240  	MOVL	A, (R9) \
   241  	ADDL	4(R9), TB \
   242  	MOVL	TB, 4(R9) \
   243  	ADDL	8(R9), C \
   244  	MOVL	C, 8(R9) \
   245  	ADDL	12(R9), D \
   246  	MOVL	D, 12(R9) \
   247  	ADDL	16(R9), E \
   248  	MOVL	E, 16(R9)
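
         // In Go terms, UPDATE_HASH is roughly (dig being the [5]uint32 state
         // that R9 points at, and A, TB, C, D, E the working variables):
         //
         //	A += dig[0]; dig[0] = A
         //	TB += dig[1]; dig[1] = TB
         //	C += dig[2]; dig[2] = C
         //	D += dig[3]; dig[3] = D
         //	E += dig[4]; dig[4] = E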
   249  
   250  
   251  
    252  // Helper macros for PRECALC, which precomputes the W+K values used by the rounds
   253  #define PRECALC_0(OFFSET) \
   254  	VMOVDQU   OFFSET(R10),X0
   255  
   256  #define PRECALC_1(OFFSET) \
   257  	VINSERTI128 $1, OFFSET(R13), Y0, Y0
   258  
   259  #define PRECALC_2(YREG) \
   260  	VPSHUFB Y10, Y0, YREG
   261  
   262  #define PRECALC_4(YREG,K_OFFSET) \
   263  	VPADDD K_OFFSET(R8), YREG, Y0
   264  
   265  #define PRECALC_7(OFFSET) \
   266  	VMOVDQU Y0, (OFFSET*2)(R14)
   267  
   268  
   269  // Message scheduling pre-compute for rounds 0-15
   270  // R13 is a pointer to even 64-byte block
   271  // R10 is a pointer to odd 64-byte block
   272  // R14 is a pointer to temp buffer
   273  // X0 is used as temp register
   274  // YREG is clobbered as part of computation
   275  // OFFSET chooses 16 byte chunk within a block
   276  // R8 is a pointer to constants block
   277  // K_OFFSET chooses K constants relevant to this round
    278  // Y10 holds the byte-order swap mask
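         //
         // A rough per-block Go sketch of what one PRECALC_00_15 invocation
         // computes (block and wk are illustrative names; the real code handles
         // the two blocks together, one per 128-bit lane of Y0):
         //
         //	for j := 0; j < 4; j++ {
         //		w := binary.BigEndian.Uint32(block[OFFSET+4*j:]) // VPSHUFB byte swap
         //		wk[j] = w + 0x5A827999                           // VPADDD with K
         //	}
         //
         // wk is then stored to the temp buffer so each scalar round can use a
         // single ADDL from memory.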
   279  #define PRECALC_00_15(OFFSET,YREG) \
   280  	PRECALC_0(OFFSET) \
   281  	PRECALC_1(OFFSET) \
   282  	PRECALC_2(YREG) \
   283  	PRECALC_4(YREG,0x0) \
   284  	PRECALC_7(OFFSET)
   285  
   286  
   287  // Helper macros for PRECALC_16_31
   288  #define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   289  	VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \  // w[i-14]
   290  	VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
   291  
   292  #define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   293  	VPXOR  REG_SUB_8, REG, REG \
   294  	VPXOR  REG_SUB_16, Y0, Y0
   295  
   296  #define PRECALC_18(REG) \
   297  	VPXOR Y0, REG, REG \
   298  	VPSLLDQ $12, REG, Y9
   299  
   300  #define PRECALC_19(REG) \
   301  	VPSLLD $1, REG, Y0 \
   302  	VPSRLD $31, REG, REG
   303  
   304  #define PRECALC_20(REG) \
   305  	VPOR REG, Y0, Y0 \
   306  	VPSLLD $2, Y9,  REG
   307  
   308  #define PRECALC_21(REG) \
   309  	VPSRLD $30, Y9, Y9 \
   310  	VPXOR REG, Y0, Y0
   311  
   312  #define PRECALC_23(REG,K_OFFSET,OFFSET) \
   313  	VPXOR Y9, Y0, REG \
   314  	VPADDD K_OFFSET(R8), REG, Y0 \
   315  	VMOVDQU Y0, (OFFSET)(R14)
   316  
   317  // Message scheduling pre-compute for rounds 16-31
    318  // calculating last 32 w[i] values in 8 YMM registers
   319  // pre-calculate K+w[i] values and store to mem
   320  // for later load by ALU add instruction.
   321  // "brute force" vectorization for rounds 16-31 only
   322  // due to w[i]->w[i-3] dependency.
   323  // clobbers 5 input ymm registers REG_SUB*
    324  // uses Y0 and Y9 as temp registers
   325  // As always, R8 is a pointer to constants block
   326  // and R14 is a pointer to temp buffer
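         //
         // Naming: REG receives w[i..i+3] (for both blocks, one per 128-bit lane)
         // and REG_SUB_N is the register currently holding w[i-N..i-N+3], so e.g.
         // REG_SUB_16 holds w[i-16..i-13].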
   327  #define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
   328  	PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
   329  	PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
   330  	PRECALC_18(REG) \
   331  	PRECALC_19(REG) \
   332  	PRECALC_20(REG) \
   333  	PRECALC_21(REG) \
   334  	PRECALC_23(REG,K_OFFSET,OFFSET)
   335  
   336  
   337  // Helper macros for PRECALC_32_79
   338  #define PRECALC_32(REG_SUB_8,REG_SUB_4) \
   339  	VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
   340  
   341  #define PRECALC_33(REG_SUB_28,REG) \
   342  	VPXOR REG_SUB_28, REG, REG
   343  
   344  #define PRECALC_34(REG_SUB_16) \
   345  	VPXOR REG_SUB_16, Y0, Y0
   346  
   347  #define PRECALC_35(REG) \
   348  	VPXOR Y0, REG, REG
   349  
   350  #define PRECALC_36(REG) \
   351  	VPSLLD $2, REG, Y0
   352  
   353  #define PRECALC_37(REG) \
   354  	VPSRLD $30, REG, REG \
   355  	VPOR REG, Y0, REG
   356  
   357  #define PRECALC_39(REG,K_OFFSET,OFFSET) \
   358  	VPADDD K_OFFSET(R8), REG, Y0 \
   359  	VMOVDQU Y0, (OFFSET)(R14)
   360  
   361  // Message scheduling pre-compute for rounds 32-79
   362  // In SHA-1 specification we have:
   363  // w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
   364  // Which is the same as:
   365  // w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
   366  // This allows for more efficient vectorization,
   367  // since w[i]->w[i-3] dependency is broken
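         // (The second form follows from the first: rotate
         // w[i] = (w[i-3]^w[i-8]^w[i-14]^w[i-16]) rol 1 right by one bit and
         // substitute the same recurrence for w[i-3], w[i-8], w[i-14] and w[i-16];
         // the w[i-11], w[i-17], w[i-19], w[i-22], w[i-24] and w[i-30] terms then
         // cancel in pairs, leaving w[i-6]^w[i-16]^w[i-28]^w[i-32] rotated left by
         // two. It holds for i >= 32, which is all this macro is used for.)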
   368  #define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
   369  	PRECALC_32(REG_SUB_8,REG_SUB_4) \
   370  	PRECALC_33(REG_SUB_28,REG) \
   371  	PRECALC_34(REG_SUB_16) \
   372  	PRECALC_35(REG) \
   373  	PRECALC_36(REG) \
   374  	PRECALC_37(REG) \
   375  	PRECALC_39(REG,K_OFFSET,OFFSET)
   376  
   377  #define PRECALC \
   378  	PRECALC_00_15(0,Y15) \
   379  	PRECALC_00_15(0x10,Y14) \
   380  	PRECALC_00_15(0x20,Y13) \
   381  	PRECALC_00_15(0x30,Y12) \
   382  	PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
   383  	PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
   384  	PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
   385  	PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
   386  	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
   387  	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
   388  	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
   389  	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
   390  	PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
   391  	PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
   392  	PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
   393  	PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
   394  	PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
   395  	PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
   396  	PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
   397  	PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
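
         // One full PRECALC pass fills 2*4*80 = 640 bytes of W+K values, i.e. the
         // complete schedule for a pair of blocks, at offsets 0x0-0x27f of the
         // temp buffer pointed to by R14.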
   398  
   399  // Macros calculating individual rounds have general form
   400  // CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
   401  // CALC_ROUND_{PRE,POST} macros follow
   402  
   403  #define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
   404  	ADDL OFFSET(R15),REG_E \
   405  	ANDNL REG_C,REG_A,BP \
   406  	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   407  	RORXL $0x1b, REG_A, R12 \
   408  	RORXL $2, REG_A, REG_B         // for next round
   409  
   410  // Calculate F for the next round
   411  #define CALC_F1_POST(REG_A,REG_B,REG_E) \
   412  	ANDL REG_B,REG_A \             // b&c
   413  	XORL BP, REG_A \               // F1 = (b&c) ^ (~b&d)
    414  	LEAL (REG_E)(R12*1), REG_E     // E += A rol 5
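
         // CALC_F1_PRE and CALC_F1_POST together perform one scalar round, but
         // rearranged so that the additions into e use only values that are
         // already available: the precomputed W+K word, the F value produced
         // during the previous round, and a rol 5 of a. The ANDN/AND/XOR sequence
         // meanwhile computes F for the following round. A rough Go sketch of the
         // per-round dependency pattern (f and wk are illustrative names):
         //
         //	e += wk[i] + f + rol(a, 5) // finish round i; f came from round i-1
         //	f = Ch(...)                // F1 for round i+1, from registers that the
         //	                           // previous rounds already rotated into place
         //	b = rol(a, 30)             // rotated copy, consumed by a later round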
   415  
   416  
   417  // Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
   418  #define CALC_0 \
   419  	MOVL SI, BX \ // Precalculating first round
   420  	RORXL $2, SI, SI \
   421  	ANDNL AX, BX, BP \
   422  	ANDL DI, BX \
   423  	XORL BP, BX \
   424  	CALC_F1_PRE(0x0,CX,BX,DI,DX) \
   425  	PRECALC_0(0x80) \
   426  	CALC_F1_POST(CX,SI,DX)
   427  
   428  #define CALC_1 \
   429  	CALC_F1_PRE(0x4,DX,CX,SI,AX) \
   430  	PRECALC_1(0x80) \
   431  	CALC_F1_POST(DX,BX,AX)
   432  
   433  #define CALC_2 \
   434  	CALC_F1_PRE(0x8,AX,DX,BX,DI) \
   435  	PRECALC_2(Y15) \
   436  	CALC_F1_POST(AX,CX,DI)
   437  
   438  #define CALC_3 \
   439  	CALC_F1_PRE(0xc,DI,AX,CX,SI) \
   440  	CALC_F1_POST(DI,DX,SI)
   441  
   442  #define CALC_4 \
   443  	CALC_F1_PRE(0x20,SI,DI,DX,BX) \
   444  	PRECALC_4(Y15,0x0) \
   445  	CALC_F1_POST(SI,AX,BX)
   446  
   447  #define CALC_5 \
   448  	CALC_F1_PRE(0x24,BX,SI,AX,CX) \
   449  	CALC_F1_POST(BX,DI,CX)
   450  
   451  #define CALC_6 \
   452  	CALC_F1_PRE(0x28,CX,BX,DI,DX) \
   453  	CALC_F1_POST(CX,SI,DX)
   454  
   455  #define CALC_7 \
   456  	CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
   457  	PRECALC_7(0x0) \
   458  	CALC_F1_POST(DX,BX,AX)
   459  
   460  #define CALC_8 \
   461  	CALC_F1_PRE(0x40,AX,DX,BX,DI) \
   462  	PRECALC_0(0x90) \
   463  	CALC_F1_POST(AX,CX,DI)
   464  
   465  #define CALC_9 \
   466  	CALC_F1_PRE(0x44,DI,AX,CX,SI) \
   467  	PRECALC_1(0x90) \
   468  	CALC_F1_POST(DI,DX,SI)
   469  
   470  #define CALC_10 \
   471  	CALC_F1_PRE(0x48,SI,DI,DX,BX) \
   472  	PRECALC_2(Y14) \
   473  	CALC_F1_POST(SI,AX,BX)
   474  
   475  #define CALC_11 \
   476  	CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
   477  	CALC_F1_POST(BX,DI,CX)
   478  
   479  #define CALC_12 \
   480  	CALC_F1_PRE(0x60,CX,BX,DI,DX) \
   481  	PRECALC_4(Y14,0x0) \
   482  	CALC_F1_POST(CX,SI,DX)
   483  
   484  #define CALC_13 \
   485  	CALC_F1_PRE(0x64,DX,CX,SI,AX) \
   486  	CALC_F1_POST(DX,BX,AX)
   487  
   488  #define CALC_14 \
   489  	CALC_F1_PRE(0x68,AX,DX,BX,DI) \
   490  	CALC_F1_POST(AX,CX,DI)
   491  
   492  #define CALC_15 \
   493  	CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
   494  	PRECALC_7(0x10) \
   495  	CALC_F1_POST(DI,DX,SI)
   496  
   497  #define CALC_16 \
   498  	CALC_F1_PRE(0x80,SI,DI,DX,BX) \
   499  	PRECALC_0(0xa0) \
   500  	CALC_F1_POST(SI,AX,BX)
   501  
   502  #define CALC_17 \
   503  	CALC_F1_PRE(0x84,BX,SI,AX,CX) \
   504  	PRECALC_1(0xa0) \
   505  	CALC_F1_POST(BX,DI,CX)
   506  
   507  #define CALC_18 \
   508  	CALC_F1_PRE(0x88,CX,BX,DI,DX) \
   509  	PRECALC_2(Y13) \
   510  	CALC_F1_POST(CX,SI,DX)
   511  
   512  
   513  #define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
   514  	ADDL OFFSET(R15),REG_E \
   515  	LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
   516  	RORXL $0x1b, REG_A, R12 \
   517  	RORXL $2, REG_A, REG_B         // for next round
   518  
   519  #define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
   520  	XORL REG_B, REG_A \
   521  	ADDL R12, REG_E \
   522  	XORL REG_C, REG_A
   523  
   524  #define CALC_19 \
   525  	CALC_F2_PRE(0x8c,DX,CX,AX) \
   526  	CALC_F2_POST(DX,BX,SI,AX)
   527  
   528  #define CALC_20 \
   529  	CALC_F2_PRE(0xa0,AX,DX,DI) \
   530  	PRECALC_4(Y13,0x0) \
   531  	CALC_F2_POST(AX,CX,BX,DI)
   532  
   533  #define CALC_21 \
   534  	CALC_F2_PRE(0xa4,DI,AX,SI) \
   535  	CALC_F2_POST(DI,DX,CX,SI)
   536  
   537  #define CALC_22 \
   538  	CALC_F2_PRE(0xa8,SI,DI,BX) \
   539  	CALC_F2_POST(SI,AX,DX,BX)
   540  
   541  #define CALC_23 \
   542  	CALC_F2_PRE(0xac,BX,SI,CX) \
   543  	PRECALC_7(0x20) \
   544  	CALC_F2_POST(BX,DI,AX,CX)
   545  
   546  #define CALC_24 \
   547  	CALC_F2_PRE(0xc0,CX,BX,DX) \
   548  	PRECALC_0(0xb0) \
   549  	CALC_F2_POST(CX,SI,DI,DX)
   550  
   551  #define CALC_25 \
   552  	CALC_F2_PRE(0xc4,DX,CX,AX) \
   553  	PRECALC_1(0xb0) \
   554  	CALC_F2_POST(DX,BX,SI,AX)
   555  
   556  #define CALC_26 \
   557  	CALC_F2_PRE(0xc8,AX,DX,DI) \
   558  	PRECALC_2(Y12) \
   559  	CALC_F2_POST(AX,CX,BX,DI)
   560  
   561  #define CALC_27 \
   562  	CALC_F2_PRE(0xcc,DI,AX,SI) \
   563  	CALC_F2_POST(DI,DX,CX,SI)
   564  
   565  #define CALC_28 \
   566  	CALC_F2_PRE(0xe0,SI,DI,BX) \
   567  	PRECALC_4(Y12,0x0) \
   568  	CALC_F2_POST(SI,AX,DX,BX)
   569  
   570  #define CALC_29 \
   571  	CALC_F2_PRE(0xe4,BX,SI,CX) \
   572  	CALC_F2_POST(BX,DI,AX,CX)
   573  
   574  #define CALC_30 \
   575  	CALC_F2_PRE(0xe8,CX,BX,DX) \
   576  	CALC_F2_POST(CX,SI,DI,DX)
   577  
   578  #define CALC_31 \
   579  	CALC_F2_PRE(0xec,DX,CX,AX) \
   580  	PRECALC_7(0x30) \
   581  	CALC_F2_POST(DX,BX,SI,AX)
   582  
   583  #define CALC_32 \
   584  	CALC_F2_PRE(0x100,AX,DX,DI) \
   585  	PRECALC_16(Y15,Y14,Y12,Y8) \
   586  	CALC_F2_POST(AX,CX,BX,DI)
   587  
   588  #define CALC_33 \
   589  	CALC_F2_PRE(0x104,DI,AX,SI) \
   590  	PRECALC_17(Y15,Y13,Y8) \
   591  	CALC_F2_POST(DI,DX,CX,SI)
   592  
   593  #define CALC_34 \
   594  	CALC_F2_PRE(0x108,SI,DI,BX) \
   595  	PRECALC_18(Y8) \
   596  	CALC_F2_POST(SI,AX,DX,BX)
   597  
   598  #define CALC_35 \
   599  	CALC_F2_PRE(0x10c,BX,SI,CX) \
   600  	PRECALC_19(Y8) \
   601  	CALC_F2_POST(BX,DI,AX,CX)
   602  
   603  #define CALC_36 \
   604  	CALC_F2_PRE(0x120,CX,BX,DX) \
   605  	PRECALC_20(Y8) \
   606  	CALC_F2_POST(CX,SI,DI,DX)
   607  
   608  #define CALC_37 \
   609  	CALC_F2_PRE(0x124,DX,CX,AX) \
   610  	PRECALC_21(Y8) \
   611  	CALC_F2_POST(DX,BX,SI,AX)
   612  
   613  #define CALC_38 \
   614  	CALC_F2_PRE(0x128,AX,DX,DI) \
   615  	CALC_F2_POST(AX,CX,BX,DI)
   616  
   617  
   618  #define CALC_F3_PRE(OFFSET,REG_E) \
   619  	ADDL OFFSET(R15),REG_E
   620  
   621  #define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
   622  	LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
   623  	MOVL REG_B, BP \
   624  	ORL  REG_A, BP \
   625  	RORXL $0x1b, REG_A, R12 \
   626  	RORXL $2, REG_A, REG_TB \
   627  	ANDL REG_C, BP \		// Calculate F for the next round
   628  	ANDL REG_B, REG_A \
   629  	ORL  BP, REG_A \
   630  	ADDL R12, REG_E
   631  
   632  #define CALC_39 \
   633  	CALC_F3_PRE(0x12c,SI) \
   634  	PRECALC_23(Y8,0x0,0x80) \
   635  	CALC_F3_POST(DI,DX,CX,SI,AX)
   636  
   637  #define CALC_40 \
   638  	CALC_F3_PRE(0x140,BX) \
   639  	PRECALC_16(Y14,Y13,Y8,Y7) \
   640  	CALC_F3_POST(SI,AX,DX,BX,DI)
   641  
   642  #define CALC_41 \
   643  	CALC_F3_PRE(0x144,CX) \
   644  	PRECALC_17(Y14,Y12,Y7) \
   645  	CALC_F3_POST(BX,DI,AX,CX,SI)
   646  
   647  #define CALC_42 \
   648  	CALC_F3_PRE(0x148,DX) \
   649  	PRECALC_18(Y7) \
   650  	CALC_F3_POST(CX,SI,DI,DX,BX)
   651  
   652  #define CALC_43 \
   653  	CALC_F3_PRE(0x14c,AX) \
   654  	PRECALC_19(Y7) \
   655  	CALC_F3_POST(DX,BX,SI,AX,CX)
   656  
   657  #define CALC_44 \
   658  	CALC_F3_PRE(0x160,DI) \
   659  	PRECALC_20(Y7) \
   660  	CALC_F3_POST(AX,CX,BX,DI,DX)
   661  
   662  #define CALC_45 \
   663  	CALC_F3_PRE(0x164,SI) \
   664  	PRECALC_21(Y7) \
   665  	CALC_F3_POST(DI,DX,CX,SI,AX)
   666  
   667  #define CALC_46 \
   668  	CALC_F3_PRE(0x168,BX) \
   669  	CALC_F3_POST(SI,AX,DX,BX,DI)
   670  
   671  #define CALC_47 \
   672  	CALC_F3_PRE(0x16c,CX) \
   673  	VPXOR Y9, Y0, Y7 \
   674  	VPADDD 0x20(R8), Y7, Y0 \
   675  	VMOVDQU Y0, 0xa0(R14) \
   676  	CALC_F3_POST(BX,DI,AX,CX,SI)
   677  
   678  #define CALC_48 \
   679  	CALC_F3_PRE(0x180,DX) \
   680  	PRECALC_16(Y13,Y12,Y7,Y5) \
   681  	CALC_F3_POST(CX,SI,DI,DX,BX)
   682  
   683  #define CALC_49 \
   684  	CALC_F3_PRE(0x184,AX) \
   685  	PRECALC_17(Y13,Y8,Y5) \
   686  	CALC_F3_POST(DX,BX,SI,AX,CX)
   687  
   688  #define CALC_50 \
   689  	CALC_F3_PRE(0x188,DI) \
   690  	PRECALC_18(Y5) \
   691  	CALC_F3_POST(AX,CX,BX,DI,DX)
   692  
   693  #define CALC_51 \
   694  	CALC_F3_PRE(0x18c,SI) \
   695  	PRECALC_19(Y5) \
   696  	CALC_F3_POST(DI,DX,CX,SI,AX)
   697  
   698  #define CALC_52 \
   699  	CALC_F3_PRE(0x1a0,BX) \
   700  	PRECALC_20(Y5) \
   701  	CALC_F3_POST(SI,AX,DX,BX,DI)
   702  
   703  #define CALC_53 \
   704  	CALC_F3_PRE(0x1a4,CX) \
   705  	PRECALC_21(Y5) \
   706  	CALC_F3_POST(BX,DI,AX,CX,SI)
   707  
   708  #define CALC_54 \
   709  	CALC_F3_PRE(0x1a8,DX) \
   710  	CALC_F3_POST(CX,SI,DI,DX,BX)
   711  
   712  #define CALC_55 \
   713  	CALC_F3_PRE(0x1ac,AX) \
   714  	PRECALC_23(Y5,0x20,0xc0) \
   715  	CALC_F3_POST(DX,BX,SI,AX,CX)
   716  
   717  #define CALC_56 \
   718  	CALC_F3_PRE(0x1c0,DI) \
   719  	PRECALC_16(Y12,Y8,Y5,Y3) \
   720  	CALC_F3_POST(AX,CX,BX,DI,DX)
   721  
   722  #define CALC_57 \
   723  	CALC_F3_PRE(0x1c4,SI) \
   724  	PRECALC_17(Y12,Y7,Y3) \
   725  	CALC_F3_POST(DI,DX,CX,SI,AX)
   726  
   727  #define CALC_58 \
   728  	CALC_F3_PRE(0x1c8,BX) \
   729  	PRECALC_18(Y3) \
   730  	CALC_F3_POST(SI,AX,DX,BX,DI)
   731  
   732  #define CALC_59 \
   733  	CALC_F2_PRE(0x1cc,BX,SI,CX) \
   734  	PRECALC_19(Y3) \
   735  	CALC_F2_POST(BX,DI,AX,CX)
   736  
   737  #define CALC_60 \
   738  	CALC_F2_PRE(0x1e0,CX,BX,DX) \
   739  	PRECALC_20(Y3) \
   740  	CALC_F2_POST(CX,SI,DI,DX)
   741  
   742  #define CALC_61 \
   743  	CALC_F2_PRE(0x1e4,DX,CX,AX) \
   744  	PRECALC_21(Y3) \
   745  	CALC_F2_POST(DX,BX,SI,AX)
   746  
   747  #define CALC_62 \
   748  	CALC_F2_PRE(0x1e8,AX,DX,DI) \
   749  	CALC_F2_POST(AX,CX,BX,DI)
   750  
   751  #define CALC_63 \
   752  	CALC_F2_PRE(0x1ec,DI,AX,SI) \
   753  	PRECALC_23(Y3,0x20,0xe0) \
   754  	CALC_F2_POST(DI,DX,CX,SI)
   755  
   756  #define CALC_64 \
   757  	CALC_F2_PRE(0x200,SI,DI,BX) \
   758  	PRECALC_32(Y5,Y3) \
   759  	CALC_F2_POST(SI,AX,DX,BX)
   760  
   761  #define CALC_65 \
   762  	CALC_F2_PRE(0x204,BX,SI,CX) \
   763  	PRECALC_33(Y14,Y15) \
   764  	CALC_F2_POST(BX,DI,AX,CX)
   765  
   766  #define CALC_66 \
   767  	CALC_F2_PRE(0x208,CX,BX,DX) \
   768  	PRECALC_34(Y8) \
   769  	CALC_F2_POST(CX,SI,DI,DX)
   770  
   771  #define CALC_67 \
   772  	CALC_F2_PRE(0x20c,DX,CX,AX) \
   773  	PRECALC_35(Y15) \
   774  	CALC_F2_POST(DX,BX,SI,AX)
   775  
   776  #define CALC_68 \
   777  	CALC_F2_PRE(0x220,AX,DX,DI) \
   778  	PRECALC_36(Y15) \
   779  	CALC_F2_POST(AX,CX,BX,DI)
   780  
   781  #define CALC_69 \
   782  	CALC_F2_PRE(0x224,DI,AX,SI) \
   783  	PRECALC_37(Y15) \
   784  	CALC_F2_POST(DI,DX,CX,SI)
   785  
   786  #define CALC_70 \
   787  	CALC_F2_PRE(0x228,SI,DI,BX) \
   788  	CALC_F2_POST(SI,AX,DX,BX)
   789  
   790  #define CALC_71 \
   791  	CALC_F2_PRE(0x22c,BX,SI,CX) \
   792  	PRECALC_39(Y15,0x20,0x100) \
   793  	CALC_F2_POST(BX,DI,AX,CX)
   794  
   795  #define CALC_72 \
   796  	CALC_F2_PRE(0x240,CX,BX,DX) \
   797  	PRECALC_32(Y3,Y15) \
   798  	CALC_F2_POST(CX,SI,DI,DX)
   799  
   800  #define CALC_73 \
   801  	CALC_F2_PRE(0x244,DX,CX,AX) \
   802  	PRECALC_33(Y13,Y14) \
   803  	CALC_F2_POST(DX,BX,SI,AX)
   804  
   805  #define CALC_74 \
   806  	CALC_F2_PRE(0x248,AX,DX,DI) \
   807  	PRECALC_34(Y7) \
   808  	CALC_F2_POST(AX,CX,BX,DI)
   809  
   810  #define CALC_75 \
   811  	CALC_F2_PRE(0x24c,DI,AX,SI) \
   812  	PRECALC_35(Y14) \
   813  	CALC_F2_POST(DI,DX,CX,SI)
   814  
   815  #define CALC_76 \
   816  	CALC_F2_PRE(0x260,SI,DI,BX) \
   817  	PRECALC_36(Y14) \
   818  	CALC_F2_POST(SI,AX,DX,BX)
   819  
   820  #define CALC_77 \
   821  	CALC_F2_PRE(0x264,BX,SI,CX) \
   822  	PRECALC_37(Y14) \
   823  	CALC_F2_POST(BX,DI,AX,CX)
   824  
   825  #define CALC_78 \
   826  	CALC_F2_PRE(0x268,CX,BX,DX) \
   827  	CALC_F2_POST(CX,SI,DI,DX)
   828  
   829  #define CALC_79 \
   830  	ADDL 0x26c(R15), AX \
   831  	LEAL (AX)(CX*1), AX \
   832  	RORXL $0x1b, DX, R12 \
   833  	PRECALC_39(Y14,0x20,0x120) \
   834  	ADDL R12, AX
   835  
   836  // Similar to CALC_0
   837  #define CALC_80 \
   838  	MOVL CX, DX \
   839  	RORXL $2, CX, CX \
   840  	ANDNL SI, DX, BP \
   841  	ANDL BX, DX \
   842  	XORL BP, DX \
   843  	CALC_F1_PRE(0x10,AX,DX,BX,DI) \
   844  	PRECALC_32(Y15,Y14) \
   845  	CALC_F1_POST(AX,CX,DI)
   846  
   847  #define CALC_81 \
   848  	CALC_F1_PRE(0x14,DI,AX,CX,SI) \
   849  	PRECALC_33(Y12,Y13) \
   850  	CALC_F1_POST(DI,DX,SI)
   851  
   852  #define CALC_82 \
   853  	CALC_F1_PRE(0x18,SI,DI,DX,BX) \
   854  	PRECALC_34(Y5) \
   855  	CALC_F1_POST(SI,AX,BX)
   856  
   857  #define CALC_83 \
   858  	CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
   859  	PRECALC_35(Y13) \
   860  	CALC_F1_POST(BX,DI,CX)
   861  
   862  #define CALC_84 \
   863  	CALC_F1_PRE(0x30,CX,BX,DI,DX) \
   864  	PRECALC_36(Y13) \
   865  	CALC_F1_POST(CX,SI,DX)
   866  
   867  #define CALC_85 \
   868  	CALC_F1_PRE(0x34,DX,CX,SI,AX) \
   869  	PRECALC_37(Y13) \
   870  	CALC_F1_POST(DX,BX,AX)
   871  
   872  #define CALC_86 \
   873  	CALC_F1_PRE(0x38,AX,DX,BX,DI) \
   874  	CALC_F1_POST(AX,CX,DI)
   875  
   876  #define CALC_87 \
   877  	CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
   878  	PRECALC_39(Y13,0x40,0x140) \
   879  	CALC_F1_POST(DI,DX,SI)
   880  
   881  #define CALC_88 \
   882  	CALC_F1_PRE(0x50,SI,DI,DX,BX) \
   883  	PRECALC_32(Y14,Y13) \
   884  	CALC_F1_POST(SI,AX,BX)
   885  
   886  #define CALC_89 \
   887  	CALC_F1_PRE(0x54,BX,SI,AX,CX) \
   888  	PRECALC_33(Y8,Y12) \
   889  	CALC_F1_POST(BX,DI,CX)
   890  
   891  #define CALC_90 \
   892  	CALC_F1_PRE(0x58,CX,BX,DI,DX) \
   893  	PRECALC_34(Y3) \
   894  	CALC_F1_POST(CX,SI,DX)
   895  
   896  #define CALC_91 \
   897  	CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
   898  	PRECALC_35(Y12) \
   899  	CALC_F1_POST(DX,BX,AX)
   900  
   901  #define CALC_92 \
   902  	CALC_F1_PRE(0x70,AX,DX,BX,DI) \
   903  	PRECALC_36(Y12) \
   904  	CALC_F1_POST(AX,CX,DI)
   905  
   906  #define CALC_93 \
   907  	CALC_F1_PRE(0x74,DI,AX,CX,SI) \
   908  	PRECALC_37(Y12) \
   909  	CALC_F1_POST(DI,DX,SI)
   910  
   911  #define CALC_94 \
   912  	CALC_F1_PRE(0x78,SI,DI,DX,BX) \
   913  	CALC_F1_POST(SI,AX,BX)
   914  
   915  #define CALC_95 \
   916  	CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
   917  	PRECALC_39(Y12,0x40,0x160) \
   918  	CALC_F1_POST(BX,DI,CX)
   919  
   920  #define CALC_96 \
   921  	CALC_F1_PRE(0x90,CX,BX,DI,DX) \
   922  	PRECALC_32(Y13,Y12) \
   923  	CALC_F1_POST(CX,SI,DX)
   924  
   925  #define CALC_97 \
   926  	CALC_F1_PRE(0x94,DX,CX,SI,AX) \
   927  	PRECALC_33(Y7,Y8) \
   928  	CALC_F1_POST(DX,BX,AX)
   929  
   930  #define CALC_98 \
   931  	CALC_F1_PRE(0x98,AX,DX,BX,DI) \
   932  	PRECALC_34(Y15) \
   933  	CALC_F1_POST(AX,CX,DI)
   934  
   935  #define CALC_99 \
   936  	CALC_F2_PRE(0x9c,DI,AX,SI) \
   937  	PRECALC_35(Y8) \
   938  	CALC_F2_POST(DI,DX,CX,SI)
   939  
   940  #define CALC_100 \
   941  	CALC_F2_PRE(0xb0,SI,DI,BX) \
   942  	PRECALC_36(Y8) \
   943  	CALC_F2_POST(SI,AX,DX,BX)
   944  
   945  #define CALC_101 \
   946  	CALC_F2_PRE(0xb4,BX,SI,CX) \
   947  	PRECALC_37(Y8) \
   948  	CALC_F2_POST(BX,DI,AX,CX)
   949  
   950  #define CALC_102 \
   951  	CALC_F2_PRE(0xb8,CX,BX,DX) \
   952  	CALC_F2_POST(CX,SI,DI,DX)
   953  
   954  #define CALC_103 \
   955  	CALC_F2_PRE(0xbc,DX,CX,AX) \
   956  	PRECALC_39(Y8,0x40,0x180) \
   957  	CALC_F2_POST(DX,BX,SI,AX)
   958  
   959  #define CALC_104 \
   960  	CALC_F2_PRE(0xd0,AX,DX,DI) \
   961  	PRECALC_32(Y12,Y8) \
   962  	CALC_F2_POST(AX,CX,BX,DI)
   963  
   964  #define CALC_105 \
   965  	CALC_F2_PRE(0xd4,DI,AX,SI) \
   966  	PRECALC_33(Y5,Y7) \
   967  	CALC_F2_POST(DI,DX,CX,SI)
   968  
   969  #define CALC_106 \
   970  	CALC_F2_PRE(0xd8,SI,DI,BX) \
   971  	PRECALC_34(Y14) \
   972  	CALC_F2_POST(SI,AX,DX,BX)
   973  
   974  #define CALC_107 \
   975  	CALC_F2_PRE(0xdc,BX,SI,CX) \
   976  	PRECALC_35(Y7) \
   977  	CALC_F2_POST(BX,DI,AX,CX)
   978  
   979  #define CALC_108 \
   980  	CALC_F2_PRE(0xf0,CX,BX,DX) \
   981  	PRECALC_36(Y7) \
   982  	CALC_F2_POST(CX,SI,DI,DX)
   983  
   984  #define CALC_109 \
   985  	CALC_F2_PRE(0xf4,DX,CX,AX) \
   986  	PRECALC_37(Y7) \
   987  	CALC_F2_POST(DX,BX,SI,AX)
   988  
   989  #define CALC_110 \
   990  	CALC_F2_PRE(0xf8,AX,DX,DI) \
   991  	CALC_F2_POST(AX,CX,BX,DI)
   992  
   993  #define CALC_111 \
   994  	CALC_F2_PRE(0xfc,DI,AX,SI) \
   995  	PRECALC_39(Y7,0x40,0x1a0) \
   996  	CALC_F2_POST(DI,DX,CX,SI)
   997  
   998  #define CALC_112 \
   999  	CALC_F2_PRE(0x110,SI,DI,BX) \
  1000  	PRECALC_32(Y8,Y7) \
  1001  	CALC_F2_POST(SI,AX,DX,BX)
  1002  
  1003  #define CALC_113 \
  1004  	CALC_F2_PRE(0x114,BX,SI,CX) \
  1005  	PRECALC_33(Y3,Y5) \
  1006  	CALC_F2_POST(BX,DI,AX,CX)
  1007  
  1008  #define CALC_114 \
  1009  	CALC_F2_PRE(0x118,CX,BX,DX) \
  1010  	PRECALC_34(Y13) \
  1011  	CALC_F2_POST(CX,SI,DI,DX)
  1012  
  1013  #define CALC_115 \
  1014  	CALC_F2_PRE(0x11c,DX,CX,AX) \
  1015  	PRECALC_35(Y5) \
  1016  	CALC_F2_POST(DX,BX,SI,AX)
  1017  
  1018  #define CALC_116 \
  1019  	CALC_F2_PRE(0x130,AX,DX,DI) \
  1020  	PRECALC_36(Y5) \
  1021  	CALC_F2_POST(AX,CX,BX,DI)
  1022  
  1023  #define CALC_117 \
  1024  	CALC_F2_PRE(0x134,DI,AX,SI) \
  1025  	PRECALC_37(Y5) \
  1026  	CALC_F2_POST(DI,DX,CX,SI)
  1027  
  1028  #define CALC_118 \
  1029  	CALC_F2_PRE(0x138,SI,DI,BX) \
  1030  	CALC_F2_POST(SI,AX,DX,BX)
  1031  
  1032  #define CALC_119 \
  1033  	CALC_F3_PRE(0x13c,CX) \
  1034  	PRECALC_39(Y5,0x40,0x1c0) \
  1035  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1036  
  1037  #define CALC_120 \
  1038  	CALC_F3_PRE(0x150,DX) \
  1039  	PRECALC_32(Y7,Y5) \
  1040  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1041  
  1042  #define CALC_121 \
  1043  	CALC_F3_PRE(0x154,AX) \
  1044  	PRECALC_33(Y15,Y3) \
  1045  	CALC_F3_POST(DX,BX,SI,AX,CX)
  1046  
  1047  #define CALC_122 \
  1048  	CALC_F3_PRE(0x158,DI) \
  1049  	PRECALC_34(Y12) \
  1050  	CALC_F3_POST(AX,CX,BX,DI,DX)
  1051  
  1052  #define CALC_123 \
  1053  	CALC_F3_PRE(0x15c,SI) \
  1054  	PRECALC_35(Y3) \
  1055  	CALC_F3_POST(DI,DX,CX,SI,AX)
  1056  
  1057  #define CALC_124 \
  1058  	CALC_F3_PRE(0x170,BX) \
  1059  	PRECALC_36(Y3) \
  1060  	CALC_F3_POST(SI,AX,DX,BX,DI)
  1061  
  1062  #define CALC_125 \
  1063  	CALC_F3_PRE(0x174,CX) \
  1064  	PRECALC_37(Y3) \
  1065  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1066  
  1067  #define CALC_126 \
  1068  	CALC_F3_PRE(0x178,DX) \
  1069  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1070  
  1071  #define CALC_127 \
  1072  	CALC_F3_PRE(0x17c,AX) \
  1073  	PRECALC_39(Y3,0x60,0x1e0) \
  1074  	CALC_F3_POST(DX,BX,SI,AX,CX)
  1075  
  1076  #define CALC_128 \
  1077  	CALC_F3_PRE(0x190,DI) \
  1078  	PRECALC_32(Y5,Y3) \
  1079  	CALC_F3_POST(AX,CX,BX,DI,DX)
  1080  
  1081  #define CALC_129 \
  1082  	CALC_F3_PRE(0x194,SI) \
  1083  	PRECALC_33(Y14,Y15) \
  1084  	CALC_F3_POST(DI,DX,CX,SI,AX)
  1085  
  1086  #define CALC_130 \
  1087  	CALC_F3_PRE(0x198,BX) \
  1088  	PRECALC_34(Y8) \
  1089  	CALC_F3_POST(SI,AX,DX,BX,DI)
  1090  
  1091  #define CALC_131 \
  1092  	CALC_F3_PRE(0x19c,CX) \
  1093  	PRECALC_35(Y15) \
  1094  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1095  
  1096  #define CALC_132 \
  1097  	CALC_F3_PRE(0x1b0,DX) \
  1098  	PRECALC_36(Y15) \
  1099  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1100  
  1101  #define CALC_133 \
  1102  	CALC_F3_PRE(0x1b4,AX) \
  1103  	PRECALC_37(Y15) \
  1104  	CALC_F3_POST(DX,BX,SI,AX,CX)
  1105  
  1106  #define CALC_134 \
  1107  	CALC_F3_PRE(0x1b8,DI) \
  1108  	CALC_F3_POST(AX,CX,BX,DI,DX)
  1109  
  1110  #define CALC_135 \
  1111  	CALC_F3_PRE(0x1bc,SI) \
  1112  	PRECALC_39(Y15,0x60,0x200) \
  1113  	CALC_F3_POST(DI,DX,CX,SI,AX)
  1114  
  1115  #define CALC_136 \
  1116  	CALC_F3_PRE(0x1d0,BX) \
  1117  	PRECALC_32(Y3,Y15) \
  1118  	CALC_F3_POST(SI,AX,DX,BX,DI)
  1119  
  1120  #define CALC_137 \
  1121  	CALC_F3_PRE(0x1d4,CX) \
  1122  	PRECALC_33(Y13,Y14) \
  1123  	CALC_F3_POST(BX,DI,AX,CX,SI)
  1124  
  1125  #define CALC_138 \
  1126  	CALC_F3_PRE(0x1d8,DX) \
  1127  	PRECALC_34(Y7) \
  1128  	CALC_F3_POST(CX,SI,DI,DX,BX)
  1129  
  1130  #define CALC_139 \
  1131  	CALC_F2_PRE(0x1dc,DX,CX,AX) \
  1132  	PRECALC_35(Y14) \
  1133  	CALC_F2_POST(DX,BX,SI,AX)
  1134  
  1135  #define CALC_140 \
  1136  	CALC_F2_PRE(0x1f0,AX,DX,DI) \
  1137  	PRECALC_36(Y14) \
  1138  	CALC_F2_POST(AX,CX,BX,DI)
  1139  
  1140  #define CALC_141 \
  1141  	CALC_F2_PRE(0x1f4,DI,AX,SI) \
  1142  	PRECALC_37(Y14) \
  1143  	CALC_F2_POST(DI,DX,CX,SI)
  1144  
  1145  #define CALC_142 \
  1146  	CALC_F2_PRE(0x1f8,SI,DI,BX) \
  1147  	CALC_F2_POST(SI,AX,DX,BX)
  1148  
  1149  #define CALC_143 \
  1150  	CALC_F2_PRE(0x1fc,BX,SI,CX) \
  1151  	PRECALC_39(Y14,0x60,0x220) \
  1152  	CALC_F2_POST(BX,DI,AX,CX)
  1153  
  1154  #define CALC_144 \
  1155  	CALC_F2_PRE(0x210,CX,BX,DX) \
  1156  	PRECALC_32(Y15,Y14) \
  1157  	CALC_F2_POST(CX,SI,DI,DX)
  1158  
  1159  #define CALC_145 \
  1160  	CALC_F2_PRE(0x214,DX,CX,AX) \
  1161  	PRECALC_33(Y12,Y13) \
  1162  	CALC_F2_POST(DX,BX,SI,AX)
  1163  
  1164  #define CALC_146 \
  1165  	CALC_F2_PRE(0x218,AX,DX,DI) \
  1166  	PRECALC_34(Y5) \
  1167  	CALC_F2_POST(AX,CX,BX,DI)
  1168  
  1169  #define CALC_147 \
  1170  	CALC_F2_PRE(0x21c,DI,AX,SI) \
  1171  	PRECALC_35(Y13) \
  1172  	CALC_F2_POST(DI,DX,CX,SI)
  1173  
  1174  #define CALC_148 \
  1175  	CALC_F2_PRE(0x230,SI,DI,BX) \
  1176  	PRECALC_36(Y13) \
  1177  	CALC_F2_POST(SI,AX,DX,BX)
  1178  
  1179  #define CALC_149 \
  1180  	CALC_F2_PRE(0x234,BX,SI,CX) \
  1181  	PRECALC_37(Y13) \
  1182  	CALC_F2_POST(BX,DI,AX,CX)
  1183  
  1184  #define CALC_150 \
  1185  	CALC_F2_PRE(0x238,CX,BX,DX) \
  1186  	CALC_F2_POST(CX,SI,DI,DX)
  1187  
  1188  #define CALC_151 \
  1189  	CALC_F2_PRE(0x23c,DX,CX,AX) \
  1190  	PRECALC_39(Y13,0x60,0x240) \
  1191  	CALC_F2_POST(DX,BX,SI,AX)
  1192  
  1193  #define CALC_152 \
  1194  	CALC_F2_PRE(0x250,AX,DX,DI) \
  1195  	PRECALC_32(Y14,Y13) \
  1196  	CALC_F2_POST(AX,CX,BX,DI)
  1197  
  1198  #define CALC_153 \
  1199  	CALC_F2_PRE(0x254,DI,AX,SI) \
  1200  	PRECALC_33(Y8,Y12) \
  1201  	CALC_F2_POST(DI,DX,CX,SI)
  1202  
  1203  #define CALC_154 \
  1204  	CALC_F2_PRE(0x258,SI,DI,BX) \
  1205  	PRECALC_34(Y3) \
  1206  	CALC_F2_POST(SI,AX,DX,BX)
  1207  
  1208  #define CALC_155 \
  1209  	CALC_F2_PRE(0x25c,BX,SI,CX) \
  1210  	PRECALC_35(Y12) \
  1211  	CALC_F2_POST(BX,DI,AX,CX)
  1212  
  1213  #define CALC_156 \
  1214  	CALC_F2_PRE(0x270,CX,BX,DX) \
  1215  	PRECALC_36(Y12) \
  1216  	CALC_F2_POST(CX,SI,DI,DX)
  1217  
  1218  #define CALC_157 \
  1219  	CALC_F2_PRE(0x274,DX,CX,AX) \
  1220  	PRECALC_37(Y12) \
  1221  	CALC_F2_POST(DX,BX,SI,AX)
  1222  
  1223  #define CALC_158 \
  1224  	CALC_F2_PRE(0x278,AX,DX,DI) \
  1225  	CALC_F2_POST(AX,CX,BX,DI)
  1226  
  1227  #define CALC_159 \
  1228  	ADDL 0x27c(R15),SI \
  1229  	LEAL (SI)(AX*1), SI \
  1230  	RORXL $0x1b, DI, R12 \
  1231  	PRECALC_39(Y12,0x60,0x260) \
  1232  	ADDL R12, SI
  1233  
  1234  
  1235  
  1236  #define CALC \
  1237  	MOVL	(R9), CX \
  1238  	MOVL	4(R9), SI \
  1239  	MOVL	8(R9), DI \
  1240  	MOVL	12(R9), AX \
  1241  	MOVL	16(R9), DX \
  1242  	MOVQ    SP, R14 \
  1243  	LEAQ    (2*4*80+32)(SP), R15 \
  1244  	PRECALC \ // Precalc WK for first 2 blocks
  1245  	XCHGQ   R15, R14 \
   1246  loop: \  // this loop is unrolled
   1247  	CMPQ    R10, R8 \ // we use the R8 value (set below) as a signal of the last block
  1248  	JNE	begin \
  1249  	VZEROUPPER \
  1250  	RET \
  1251  begin: \
  1252  	CALC_0 \
  1253  	CALC_1 \
  1254  	CALC_2 \
  1255  	CALC_3 \
  1256  	CALC_4 \
  1257  	CALC_5 \
  1258  	CALC_6 \
  1259  	CALC_7 \
  1260  	CALC_8 \
  1261  	CALC_9 \
  1262  	CALC_10 \
  1263  	CALC_11 \
  1264  	CALC_12 \
  1265  	CALC_13 \
  1266  	CALC_14 \
  1267  	CALC_15 \
  1268  	CALC_16 \
  1269  	CALC_17 \
  1270  	CALC_18 \
  1271  	CALC_19 \
  1272  	CALC_20 \
  1273  	CALC_21 \
  1274  	CALC_22 \
  1275  	CALC_23 \
  1276  	CALC_24 \
  1277  	CALC_25 \
  1278  	CALC_26 \
  1279  	CALC_27 \
  1280  	CALC_28 \
  1281  	CALC_29 \
  1282  	CALC_30 \
  1283  	CALC_31 \
  1284  	CALC_32 \
  1285  	CALC_33 \
  1286  	CALC_34 \
  1287  	CALC_35 \
  1288  	CALC_36 \
  1289  	CALC_37 \
  1290  	CALC_38 \
  1291  	CALC_39 \
  1292  	CALC_40 \
  1293  	CALC_41 \
  1294  	CALC_42 \
  1295  	CALC_43 \
  1296  	CALC_44 \
  1297  	CALC_45 \
  1298  	CALC_46 \
  1299  	CALC_47 \
  1300  	CALC_48 \
  1301  	CALC_49 \
  1302  	CALC_50 \
  1303  	CALC_51 \
  1304  	CALC_52 \
  1305  	CALC_53 \
  1306  	CALC_54 \
  1307  	CALC_55 \
  1308  	CALC_56 \
  1309  	CALC_57 \
  1310  	CALC_58 \
  1311  	CALC_59 \
  1312  	ADDQ $128, R10 \ // move to next even-64-byte block
  1313  	CMPQ R10, R11 \ // is current block the last one?
  1314  	CMOVQCC R8, R10 \ // signal the last iteration smartly
  1315  	CALC_60 \
  1316  	CALC_61 \
  1317  	CALC_62 \
  1318  	CALC_63 \
  1319  	CALC_64 \
  1320  	CALC_65 \
  1321  	CALC_66 \
  1322  	CALC_67 \
  1323  	CALC_68 \
  1324  	CALC_69 \
  1325  	CALC_70 \
  1326  	CALC_71 \
  1327  	CALC_72 \
  1328  	CALC_73 \
  1329  	CALC_74 \
  1330  	CALC_75 \
  1331  	CALC_76 \
  1332  	CALC_77 \
  1333  	CALC_78 \
  1334  	CALC_79 \
  1335  	UPDATE_HASH(AX,DX,BX,SI,DI) \
  1336  	CMPQ R10, R8 \ // is current block the last one?
   1337  	JE loop \
  1338  	MOVL DX, CX \
  1339  	CALC_80 \
  1340  	CALC_81 \
  1341  	CALC_82 \
  1342  	CALC_83 \
  1343  	CALC_84 \
  1344  	CALC_85 \
  1345  	CALC_86 \
  1346  	CALC_87 \
  1347  	CALC_88 \
  1348  	CALC_89 \
  1349  	CALC_90 \
  1350  	CALC_91 \
  1351  	CALC_92 \
  1352  	CALC_93 \
  1353  	CALC_94 \
  1354  	CALC_95 \
  1355  	CALC_96 \
  1356  	CALC_97 \
  1357  	CALC_98 \
  1358  	CALC_99 \
  1359  	CALC_100 \
  1360  	CALC_101 \
  1361  	CALC_102 \
  1362  	CALC_103 \
  1363  	CALC_104 \
  1364  	CALC_105 \
  1365  	CALC_106 \
  1366  	CALC_107 \
  1367  	CALC_108 \
  1368  	CALC_109 \
  1369  	CALC_110 \
  1370  	CALC_111 \
  1371  	CALC_112 \
  1372  	CALC_113 \
  1373  	CALC_114 \
  1374  	CALC_115 \
  1375  	CALC_116 \
  1376  	CALC_117 \
  1377  	CALC_118 \
  1378  	CALC_119 \
  1379  	CALC_120 \
  1380  	CALC_121 \
  1381  	CALC_122 \
  1382  	CALC_123 \
  1383  	CALC_124 \
  1384  	CALC_125 \
  1385  	CALC_126 \
  1386  	CALC_127 \
  1387  	CALC_128 \
  1388  	CALC_129 \
  1389  	CALC_130 \
  1390  	CALC_131 \
  1391  	CALC_132 \
  1392  	CALC_133 \
  1393  	CALC_134 \
  1394  	CALC_135 \
  1395  	CALC_136 \
  1396  	CALC_137 \
  1397  	CALC_138 \
  1398  	CALC_139 \
   1399  	ADDQ $128, R13 \ // move to next even-64-byte block
   1400  	CMPQ R13, R11 \ // is current block the last one?
  1401  	CMOVQCC R8, R10 \
  1402  	CALC_140 \
  1403  	CALC_141 \
  1404  	CALC_142 \
  1405  	CALC_143 \
  1406  	CALC_144 \
  1407  	CALC_145 \
  1408  	CALC_146 \
  1409  	CALC_147 \
  1410  	CALC_148 \
  1411  	CALC_149 \
  1412  	CALC_150 \
  1413  	CALC_151 \
  1414  	CALC_152 \
  1415  	CALC_153 \
  1416  	CALC_154 \
  1417  	CALC_155 \
  1418  	CALC_156 \
  1419  	CALC_157 \
  1420  	CALC_158 \
  1421  	CALC_159 \
  1422  	UPDATE_HASH(SI,DI,DX,CX,BX) \
   1423  	MOVL	SI, R12 \ // Reset state for AVX2 reg permutation
  1424  	MOVL	DI, SI \
  1425  	MOVL	DX, DI \
  1426  	MOVL	BX, DX \
  1427  	MOVL	CX, AX \
  1428  	MOVL	R12, CX \
  1429  	XCHGQ   R15, R14 \
  1430  	JMP     loop
  1431  
  1432  
  1433  
  1434  TEXT ·blockAVX2(SB),$1408-32
  1435  
  1436  	MOVQ	dig+0(FP),	DI
  1437  	MOVQ	p_base+8(FP),	SI
  1438  	MOVQ	p_len+16(FP),	DX
  1439  	SHRQ	$6,		DX
  1440  	SHLQ	$6,		DX
  1441  
  1442  	MOVQ	$K_XMM_AR<>(SB), R8
  1443  
  1444  	MOVQ	DI, R9
  1445  	MOVQ	SI, R10
  1446  	LEAQ	64(SI), R13
  1447  
  1448  	ADDQ	SI, DX
  1449  	ADDQ	$64, DX
  1450  	MOVQ	DX, R11
  1451  
  1452  	CMPQ	R13, R11
  1453  	CMOVQCC	R8, R13
  1454  
  1455  	VMOVDQU	BSWAP_SHUFB_CTL<>(SB), Y10
  1456  
  1457  	CALC // RET is inside macros
  1458  
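         // The round constants, each replicated eight times so that a single
         // VPADDD adds K to four schedule words of both blocks at once.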
  1459  DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
  1460  DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
  1461  DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
  1462  DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
  1463  DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
  1464  DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
  1465  DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
  1466  DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
  1467  DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
  1468  DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
  1469  DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
  1470  DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
  1471  DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
  1472  DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
  1473  DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
  1474  DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
  1475  DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
  1476  DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
  1477  DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
  1478  DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
  1479  DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
  1480  DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
  1481  DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
  1482  DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
  1483  DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
  1484  DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
  1485  DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
  1486  DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
  1487  DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
  1488  DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
  1489  DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
  1490  DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
  1491  GLOBL K_XMM_AR<>(SB),RODATA,$128
  1492  
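         // VPSHUFB control mask that byte-swaps each 32-bit word (big-endian
         // message words to host order), repeated for both 128-bit lanes.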
  1493  DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
  1494  DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
  1495  DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
  1496  DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
  1497  DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
  1498  DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
  1499  DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
  1500  DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
  1501  GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
  1502  
