src/crypto/sha512/sha512block_amd64.s

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
     9  // SHA512 block routine. See sha512block.go for Go equivalent.
    10  //
    11  // The algorithm is detailed in FIPS 180-4:
    12  //
    13  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    14  //
    15  // Wt = Mt; for 0 <= t <= 15
     16  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    17  //
    18  // a = H0
    19  // b = H1
    20  // c = H2
    21  // d = H3
    22  // e = H4
    23  // f = H5
    24  // g = H6
    25  // h = H7
    26  //
    27  // for t = 0 to 79 {
    28  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    29  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    30  //    h = g
    31  //    g = f
    32  //    f = e
    33  //    e = d + T1
    34  //    d = c
    35  //    c = b
    36  //    b = a
    37  //    a = T1 + T2
    38  // }
    39  //
    40  // H0 = a + H0
    41  // H1 = b + H1
    42  // H2 = c + H2
    43  // H3 = d + H3
    44  // H4 = e + H4
    45  // H5 = f + H5
    46  // H6 = g + H6
    47  // H7 = h + H7
    48  
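         // For reference, the pseudocode above corresponds roughly to the Go
         // sketch below. This is illustrative only (the maintained pure-Go
         // implementation lives in sha512block.go); blockSketch is a
         // hypothetical name, and the code assumes the package's _K round
         // constant table plus encoding/binary and math/bits from the
         // standard library.
         //
         //	func blockSketch(dig *[8]uint64, p []byte) {
         //		for ; len(p) >= 128; p = p[128:] {
         //			var w [80]uint64
         //			for t := 0; t < 16; t++ {
         //				w[t] = binary.BigEndian.Uint64(p[t*8:])
         //			}
         //			for t := 16; t < 80; t++ {
         //				v2, v15 := w[t-2], w[t-15]
         //				s1 := bits.RotateLeft64(v2, -19) ^ bits.RotateLeft64(v2, -61) ^ (v2 >> 6)
         //				s0 := bits.RotateLeft64(v15, -1) ^ bits.RotateLeft64(v15, -8) ^ (v15 >> 7)
         //				w[t] = s1 + w[t-7] + s0 + w[t-16]
         //			}
         //			a, b, c, d, e, f, g, h := dig[0], dig[1], dig[2], dig[3], dig[4], dig[5], dig[6], dig[7]
         //			for t := 0; t < 80; t++ {
         //				t1 := h + (bits.RotateLeft64(e, -14) ^ bits.RotateLeft64(e, -18) ^ bits.RotateLeft64(e, -41)) +
         //					((e & f) ^ (^e & g)) + _K[t] + w[t]
         //				t2 := (bits.RotateLeft64(a, -28) ^ bits.RotateLeft64(a, -34) ^ bits.RotateLeft64(a, -39)) +
         //					((a & b) ^ (a & c) ^ (b & c))
         //				h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
         //			}
         //			dig[0] += a; dig[1] += b; dig[2] += c; dig[3] += d
         //			dig[4] += e; dig[5] += f; dig[6] += g; dig[7] += h
         //		}
         //	}
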
    49  // Wt = Mt; for 0 <= t <= 15
    50  #define MSGSCHEDULE0(index) \
    51  	MOVQ	(index*8)(SI), AX; \
    52  	BSWAPQ	AX; \
    53  	MOVQ	AX, (index*8)(BP)
    54  
    55  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    56  //   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
    57  //   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
    58  #define MSGSCHEDULE1(index) \
    59  	MOVQ	((index-2)*8)(BP), AX; \
    60  	MOVQ	AX, CX; \
    61  	RORQ	$19, AX; \
    62  	MOVQ	CX, DX; \
    63  	RORQ	$61, CX; \
    64  	SHRQ	$6, DX; \
    65  	MOVQ	((index-15)*8)(BP), BX; \
    66  	XORQ	CX, AX; \
    67  	MOVQ	BX, CX; \
    68  	XORQ	DX, AX; \
    69  	RORQ	$1, BX; \
    70  	MOVQ	CX, DX; \
    71  	SHRQ	$7, DX; \
    72  	RORQ	$8, CX; \
    73  	ADDQ	((index-7)*8)(BP), AX; \
    74  	XORQ	CX, BX; \
    75  	XORQ	DX, BX; \
    76  	ADDQ	((index-16)*8)(BP), BX; \
    77  	ADDQ	BX, AX; \
    78  	MOVQ	AX, ((index)*8)(BP)
    79  
    80  // Calculate T1 in AX - uses AX, CX and DX registers.
    81  // h is also used as an accumulator. Wt is passed in AX.
    82  //   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
    83  //     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
    84  //     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
    85  #define SHA512T1(const, e, f, g, h) \
    86  	MOVQ	$const, DX; \
    87  	ADDQ	AX, h; \
    88  	MOVQ	e, AX; \
    89  	ADDQ	DX, h; \
    90  	MOVQ	e, CX; \
    91  	RORQ	$14, AX; \
    92  	MOVQ	e, DX; \
    93  	RORQ	$18, CX; \
    94  	XORQ	CX, AX; \
    95  	MOVQ	e, CX; \
    96  	RORQ	$41, DX; \
    97  	ANDQ	f, CX; \
    98  	XORQ	AX, DX; \
    99  	MOVQ	e, AX; \
   100  	NOTQ	AX; \
   101  	ADDQ	DX, h; \
   102  	ANDQ	g, AX; \
   103  	XORQ	CX, AX; \
   104  	ADDQ	h, AX
   105  
   106  // Calculate T2 in BX - uses BX, CX, DX and DI registers.
   107  //   T2 = BIGSIGMA0(a) + Maj(a, b, c)
   108  //     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
   109  //     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
   110  #define SHA512T2(a, b, c) \
   111  	MOVQ	a, DI; \
   112  	MOVQ	c, BX; \
   113  	RORQ	$28, DI; \
   114  	MOVQ	a, DX; \
   115  	ANDQ	b, BX; \
   116  	RORQ	$34, DX; \
   117  	MOVQ	a, CX; \
   118  	ANDQ	c, CX; \
   119  	XORQ	DX, DI; \
   120  	XORQ	CX, BX; \
   121  	MOVQ	a, DX; \
   122  	MOVQ	b, CX; \
   123  	RORQ	$39, DX; \
   124  	ANDQ	a, CX; \
   125  	XORQ	CX, BX; \
   126  	XORQ	DX, DI; \
   127  	ADDQ	DI, BX
   128  
   129  // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
   130  // The values for e and a are stored in d and h, ready for rotation.
   131  #define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
   132  	SHA512T1(const, e, f, g, h); \
   133  	SHA512T2(a, b, c); \
   134  	MOVQ	BX, h; \
   135  	ADDQ	AX, d; \
   136  	ADDQ	AX, h
   137  
   138  #define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
   139  	MSGSCHEDULE0(index); \
   140  	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   141  
   142  #define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
   143  	MSGSCHEDULE1(index); \
   144  	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
   145  
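         // Note that the round macros never shuffle values between registers:
         // each SHA512ROUND0/SHA512ROUND1 call below simply rotates its
         // register arguments (a..h) by one position, so the "h = g; g = f;
         // ..." moves from the pseudocode are free, and after every 8 rounds
         // the arguments are back in their original order.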
   146  TEXT ·blockAMD64(SB),0,$648-32
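         	// The 648-byte frame holds the 80-entry message schedule W
         	// (80*8 = 640 bytes, addressed via BP inside the loop) followed
         	// by the saved end-of-input pointer at 640(SP).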
   147  	MOVQ	p_base+8(FP), SI
   148  	MOVQ	p_len+16(FP), DX
   149  	SHRQ	$7, DX
   150  	SHLQ	$7, DX
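         	// DX is now p_len rounded down to a multiple of 128, the
         	// SHA-512 block size.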
   151  
   152  	LEAQ	(SI)(DX*1), DI
   153  	MOVQ	DI, 640(SP)
   154  	CMPQ	SI, DI
   155  	JEQ	end
   156  
   157  	MOVQ	dig+0(FP), BP
   158  	MOVQ	(0*8)(BP), R8		// a = H0
   159  	MOVQ	(1*8)(BP), R9		// b = H1
   160  	MOVQ	(2*8)(BP), R10		// c = H2
   161  	MOVQ	(3*8)(BP), R11		// d = H3
   162  	MOVQ	(4*8)(BP), R12		// e = H4
   163  	MOVQ	(5*8)(BP), R13		// f = H5
   164  	MOVQ	(6*8)(BP), R14		// g = H6
   165  	MOVQ	(7*8)(BP), R15		// h = H7
   166  
   167  loop:
   168  	MOVQ	SP, BP			// message schedule
   169  
   170  	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
   171  	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
   172  	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
   173  	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
   174  	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
   175  	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
   176  	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
   177  	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
   178  	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
   179  	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
   180  	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
   181  	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
   182  	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
   183  	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
   184  	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
   185  	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
   186  
   187  	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
   188  	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
   189  	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
   190  	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
   191  	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
   192  	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
   193  	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
   194  	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
   195  	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
   196  	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
   197  	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
   198  	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
   199  	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
   200  	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
   201  	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
   202  	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
   203  	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
   204  	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
   205  	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
   206  	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
   207  	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
   208  	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
   209  	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
   210  	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
   211  	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
   212  	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
   213  	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
   214  	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
   215  	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
   216  	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
   217  	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
   218  	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
   219  	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
   220  	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
   221  	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
   222  	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
   223  	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
   224  	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
   225  	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
   226  	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
   227  	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
   228  	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
   229  	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
   230  	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
   231  	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
   232  	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
   233  	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
   234  	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
   235  	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
   236  	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
   237  	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
   238  	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
   239  	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
   240  	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
   241  	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
   242  	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
   243  	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
   244  	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
   245  	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
   246  	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
   247  	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
   248  	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
   249  	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
   250  	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
   251  
   252  	MOVQ	dig+0(FP), BP
   253  	ADDQ	(0*8)(BP), R8	// H0 = a + H0
   254  	MOVQ	R8, (0*8)(BP)
   255  	ADDQ	(1*8)(BP), R9	// H1 = b + H1
   256  	MOVQ	R9, (1*8)(BP)
   257  	ADDQ	(2*8)(BP), R10	// H2 = c + H2
   258  	MOVQ	R10, (2*8)(BP)
   259  	ADDQ	(3*8)(BP), R11	// H3 = d + H3
   260  	MOVQ	R11, (3*8)(BP)
   261  	ADDQ	(4*8)(BP), R12	// H4 = e + H4
   262  	MOVQ	R12, (4*8)(BP)
   263  	ADDQ	(5*8)(BP), R13	// H5 = f + H5
   264  	MOVQ	R13, (5*8)(BP)
   265  	ADDQ	(6*8)(BP), R14	// H6 = g + H6
   266  	MOVQ	R14, (6*8)(BP)
   267  	ADDQ	(7*8)(BP), R15	// H7 = h + H7
   268  	MOVQ	R15, (7*8)(BP)
   269  
   270  	ADDQ	$128, SI
   271  	CMPQ	SI, 640(SP)
   272  	JB	loop
   273  
   274  end:
   275  	RET
   276  
   277  // Version below is based on "Fast SHA512 Implementations on Intel
   278  // Architecture Processors" White-paper
   279  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
   280  // AVX2 version by Intel, same algorithm in Linux kernel:
   281  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
   282  
   283  // James Guilford <james.guilford@intel.com>
   284  // Kirk Yap <kirk.s.yap@intel.com>
   285  // Tim Chen <tim.c.chen@linux.intel.com>
   286  // David Cote <david.m.cote@intel.com>
   287  // Aleksey Sidorov <aleksey.sidorov@intel.com>
   288  
   289  #define YFER_SIZE (4*8)
   290  #define SRND_SIZE (1*8)
   291  #define INP_SIZE (1*8)
   292  
   293  #define frame_YFER (0)
   294  #define frame_SRND (frame_YFER + YFER_SIZE)
   295  #define frame_INP (frame_SRND + SRND_SIZE)
   296  #define frame_INPEND (frame_INP + INP_SIZE)
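
         // Stack frame layout for blockAVX2 (56 bytes): a 32-byte slot holding
         // the Wt+Kt values for the current group of rounds (frame_YFER), an
         // 8-byte group counter (frame_SRND), the current input pointer
         // (frame_INP), and the end-of-input pointer (frame_INPEND).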
   297  
   298  #define addm(p1, p2) \
   299  	ADDQ p1, p2; \
   300  	MOVQ p2, p1
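
         // addm(p1, p2) adds p1 into p2 and stores the sum back to p1; it is
         // used below to fold the working registers into the digest in memory.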
   301  
   302  #define COPY_YMM_AND_BSWAP(p1, p2, p3) \
   303  	VMOVDQU p2, p1;    \
   304  	VPSHUFB p3, p1, p1
   305  
   306  #define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
   307  	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
   308  	VPALIGNR   $RVAL, YSRC2, YDST, YDST
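
         // In effect, MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) treats YSRC1:YSRC2
         // (YSRC1 as the upper half) as a single 512-bit value and extracts the
         // 256-bit window starting RVAL bytes into YSRC2; with RVAL = 8 this
         // produces four consecutive schedule qwords that straddle the two
         // source registers.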
   309  
   310  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
   311  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
   312  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
   313  DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
   314  
   315  GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
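
         // Used with VPSHUFB, PSHUFFLE_BYTE_FLIP_MASK reverses the byte order
         // within each 64-bit lane, converting big-endian message words to host
         // order (the vector analogue of the scalar BSWAPQ above).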
   316  
   317  DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
   318  DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
   319  DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
   320  DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
   321  
   322  GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
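
         // MASK_YMM_LO zeroes the low 128-bit lane and keeps the high lane when
         // ANDed with a YMM register.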
   323  
   324  TEXT ·blockAVX2(SB), NOSPLIT, $56-32
   325  	MOVQ dig+0(FP), SI
   326  	MOVQ p_base+8(FP), DI
   327  	MOVQ p_len+16(FP), DX
   328  
   329  	SHRQ $7, DX
   330  	SHLQ $7, DX
   331  
   332  	JZ   done_hash
   333  	ADDQ DI, DX
   334  	MOVQ DX, frame_INPEND(SP)
   335  
   336  	MOVQ (0*8)(SI), AX
   337  	MOVQ (1*8)(SI), BX
   338  	MOVQ (2*8)(SI), CX
   339  	MOVQ (3*8)(SI), R8
   340  	MOVQ (4*8)(SI), DX
   341  	MOVQ (5*8)(SI), R9
   342  	MOVQ (6*8)(SI), R10
   343  	MOVQ (7*8)(SI), R11
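         	// The working variables a..h live in AX, BX, CX, R8, DX, R9,
         	// R10 and R11 respectively; their roles rotate from round to
         	// round in the unrolled code below and return to this order
         	// after every 8 rounds.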
   344  
   345  	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
   346  
   347  loop0:
   348  	MOVQ ·_K+0(SB), BP
   349  
    350  	// byte swap the first 16 qwords of the block into Y4..Y7
   351  	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
   352  	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
   353  	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
   354  	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
   355  
   356  	MOVQ DI, frame_INP(SP)
   357  
    358  	// schedule the remaining 64 message qwords while performing the first 64 rounds, in 4 iterations of 16 rounds each
   359  	MOVQ $4, frame_SRND(SP)
   360  
   361  loop1:
   362  	VPADDQ  (BP), Y4, Y0
   363  	VMOVDQU Y0, frame_YFER(SP)
   364  
   365  	MY_VPALIGNR(Y0, Y7, Y6, 8)
   366  
   367  	VPADDQ Y4, Y0, Y0
   368  
   369  	MY_VPALIGNR(Y1, Y5, Y4, 8)
   370  
   371  	VPSRLQ $1, Y1, Y2
   372  	VPSLLQ $(64-1), Y1, Y3
   373  	VPOR   Y2, Y3, Y3
   374  
   375  	VPSRLQ $7, Y1, Y8
   376  
   377  	MOVQ  AX, DI
   378  	RORXQ $41, DX, R13
   379  	RORXQ $18, DX, R14
   380  	ADDQ  frame_YFER(SP), R11
   381  	ORQ   CX, DI
   382  	MOVQ  R9, R15
   383  	RORXQ $34, AX, R12
   384  
   385  	XORQ  R14, R13
   386  	XORQ  R10, R15
   387  	RORXQ $14, DX, R14
   388  
   389  	ANDQ  DX, R15
   390  	XORQ  R14, R13
   391  	RORXQ $39, AX, R14
   392  	ADDQ  R11, R8
   393  
   394  	ANDQ  BX, DI
   395  	XORQ  R12, R14
   396  	RORXQ $28, AX, R12
   397  
   398  	XORQ R10, R15
   399  	XORQ R12, R14
   400  	MOVQ AX, R12
   401  	ANDQ CX, R12
   402  
   403  	ADDQ R13, R15
   404  	ORQ  R12, DI
   405  	ADDQ R14, R11
   406  
   407  	ADDQ R15, R8
   408  
   409  	ADDQ R15, R11
   410  	ADDQ DI, R11
   411  
   412  	VPSRLQ $8, Y1, Y2
   413  	VPSLLQ $(64-8), Y1, Y1
   414  	VPOR   Y2, Y1, Y1
   415  
   416  	VPXOR Y8, Y3, Y3
   417  	VPXOR Y1, Y3, Y1
   418  
   419  	VPADDQ Y1, Y0, Y0
   420  
   421  	VPERM2F128 $0x0, Y0, Y0, Y4
   422  
   423  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   424  
   425  	VPERM2F128 $0x11, Y7, Y7, Y2
   426  	VPSRLQ     $6, Y2, Y8
   427  
   428  	MOVQ  R11, DI
   429  	RORXQ $41, R8, R13
   430  	RORXQ $18, R8, R14
   431  	ADDQ  1*8+frame_YFER(SP), R10
   432  	ORQ   BX, DI
   433  
   434  	MOVQ  DX, R15
   435  	RORXQ $34, R11, R12
   436  	XORQ  R14, R13
   437  	XORQ  R9, R15
   438  
   439  	RORXQ $14, R8, R14
   440  	XORQ  R14, R13
   441  	RORXQ $39, R11, R14
   442  	ANDQ  R8, R15
   443  	ADDQ  R10, CX
   444  
   445  	ANDQ AX, DI
   446  	XORQ R12, R14
   447  
   448  	RORXQ $28, R11, R12
   449  	XORQ  R9, R15
   450  
   451  	XORQ R12, R14
   452  	MOVQ R11, R12
   453  	ANDQ BX, R12
   454  	ADDQ R13, R15
   455  
   456  	ORQ  R12, DI
   457  	ADDQ R14, R10
   458  
   459  	ADDQ R15, CX
   460  	ADDQ R15, R10
   461  	ADDQ DI, R10
   462  
   463  	VPSRLQ $19, Y2, Y3
   464  	VPSLLQ $(64-19), Y2, Y1
   465  	VPOR   Y1, Y3, Y3
   466  	VPXOR  Y3, Y8, Y8
   467  	VPSRLQ $61, Y2, Y3
   468  	VPSLLQ $(64-61), Y2, Y1
   469  	VPOR   Y1, Y3, Y3
   470  	VPXOR  Y3, Y8, Y8
   471  
   472  	VPADDQ Y8, Y4, Y4
   473  
   474  	VPSRLQ $6, Y4, Y8
   475  
   476  	MOVQ  R10, DI
   477  	RORXQ $41, CX, R13
   478  	ADDQ  2*8+frame_YFER(SP), R9
   479  
   480  	RORXQ $18, CX, R14
   481  	ORQ   AX, DI
   482  	MOVQ  R8, R15
   483  	XORQ  DX, R15
   484  
   485  	RORXQ $34, R10, R12
   486  	XORQ  R14, R13
   487  	ANDQ  CX, R15
   488  
   489  	RORXQ $14, CX, R14
   490  	ADDQ  R9, BX
   491  	ANDQ  R11, DI
   492  
   493  	XORQ  R14, R13
   494  	RORXQ $39, R10, R14
   495  	XORQ  DX, R15
   496  
   497  	XORQ  R12, R14
   498  	RORXQ $28, R10, R12
   499  
   500  	XORQ R12, R14
   501  	MOVQ R10, R12
   502  	ANDQ AX, R12
   503  	ADDQ R13, R15
   504  
   505  	ORQ  R12, DI
   506  	ADDQ R14, R9
   507  	ADDQ R15, BX
   508  	ADDQ R15, R9
   509  
   510  	ADDQ DI, R9
   511  
   512  	VPSRLQ $19, Y4, Y3
   513  	VPSLLQ $(64-19), Y4, Y1
   514  	VPOR   Y1, Y3, Y3
   515  	VPXOR  Y3, Y8, Y8
   516  	VPSRLQ $61, Y4, Y3
   517  	VPSLLQ $(64-61), Y4, Y1
   518  	VPOR   Y1, Y3, Y3
   519  	VPXOR  Y3, Y8, Y8
   520  
   521  	VPADDQ Y8, Y0, Y2
   522  
   523  	VPBLENDD $0xF0, Y2, Y4, Y4
   524  
   525  	MOVQ  R9, DI
   526  	RORXQ $41, BX, R13
   527  	RORXQ $18, BX, R14
   528  	ADDQ  3*8+frame_YFER(SP), DX
   529  	ORQ   R11, DI
   530  
   531  	MOVQ  CX, R15
   532  	RORXQ $34, R9, R12
   533  	XORQ  R14, R13
   534  	XORQ  R8, R15
   535  
   536  	RORXQ $14, BX, R14
   537  	ANDQ  BX, R15
   538  	ADDQ  DX, AX
   539  	ANDQ  R10, DI
   540  
   541  	XORQ R14, R13
   542  	XORQ R8, R15
   543  
   544  	RORXQ $39, R9, R14
   545  	ADDQ  R13, R15
   546  
   547  	XORQ R12, R14
   548  	ADDQ R15, AX
   549  
   550  	RORXQ $28, R9, R12
   551  
   552  	XORQ R12, R14
   553  	MOVQ R9, R12
   554  	ANDQ R11, R12
   555  	ORQ  R12, DI
   556  
   557  	ADDQ R14, DX
   558  	ADDQ R15, DX
   559  	ADDQ DI, DX
   560  
   561  	VPADDQ  1*32(BP), Y5, Y0
   562  	VMOVDQU Y0, frame_YFER(SP)
   563  
   564  	MY_VPALIGNR(Y0, Y4, Y7, 8)
   565  
   566  	VPADDQ Y5, Y0, Y0
   567  
   568  	MY_VPALIGNR(Y1, Y6, Y5, 8)
   569  
   570  	VPSRLQ $1, Y1, Y2
   571  	VPSLLQ $(64-1), Y1, Y3
   572  	VPOR   Y2, Y3, Y3
   573  
   574  	VPSRLQ $7, Y1, Y8
   575  
   576  	MOVQ  DX, DI
   577  	RORXQ $41, AX, R13
   578  	RORXQ $18, AX, R14
   579  	ADDQ  frame_YFER(SP), R8
   580  	ORQ   R10, DI
   581  	MOVQ  BX, R15
   582  	RORXQ $34, DX, R12
   583  
   584  	XORQ  R14, R13
   585  	XORQ  CX, R15
   586  	RORXQ $14, AX, R14
   587  
   588  	ANDQ  AX, R15
   589  	XORQ  R14, R13
   590  	RORXQ $39, DX, R14
   591  	ADDQ  R8, R11
   592  
   593  	ANDQ  R9, DI
   594  	XORQ  R12, R14
   595  	RORXQ $28, DX, R12
   596  
   597  	XORQ CX, R15
   598  	XORQ R12, R14
   599  	MOVQ DX, R12
   600  	ANDQ R10, R12
   601  
   602  	ADDQ R13, R15
   603  	ORQ  R12, DI
   604  	ADDQ R14, R8
   605  
   606  	ADDQ R15, R11
   607  
   608  	ADDQ R15, R8
   609  	ADDQ DI, R8
   610  
   611  	VPSRLQ $8, Y1, Y2
   612  	VPSLLQ $(64-8), Y1, Y1
   613  	VPOR   Y2, Y1, Y1
   614  
   615  	VPXOR Y8, Y3, Y3
   616  	VPXOR Y1, Y3, Y1
   617  
   618  	VPADDQ Y1, Y0, Y0
   619  
   620  	VPERM2F128 $0x0, Y0, Y0, Y5
   621  
   622  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   623  
   624  	VPERM2F128 $0x11, Y4, Y4, Y2
   625  	VPSRLQ     $6, Y2, Y8
   626  
   627  	MOVQ  R8, DI
   628  	RORXQ $41, R11, R13
   629  	RORXQ $18, R11, R14
   630  	ADDQ  1*8+frame_YFER(SP), CX
   631  	ORQ   R9, DI
   632  
   633  	MOVQ  AX, R15
   634  	RORXQ $34, R8, R12
   635  	XORQ  R14, R13
   636  	XORQ  BX, R15
   637  
   638  	RORXQ $14, R11, R14
   639  	XORQ  R14, R13
   640  	RORXQ $39, R8, R14
   641  	ANDQ  R11, R15
   642  	ADDQ  CX, R10
   643  
   644  	ANDQ DX, DI
   645  	XORQ R12, R14
   646  
   647  	RORXQ $28, R8, R12
   648  	XORQ  BX, R15
   649  
   650  	XORQ R12, R14
   651  	MOVQ R8, R12
   652  	ANDQ R9, R12
   653  	ADDQ R13, R15
   654  
   655  	ORQ  R12, DI
   656  	ADDQ R14, CX
   657  
   658  	ADDQ R15, R10
   659  	ADDQ R15, CX
   660  	ADDQ DI, CX
   661  
   662  	VPSRLQ $19, Y2, Y3
   663  	VPSLLQ $(64-19), Y2, Y1
   664  	VPOR   Y1, Y3, Y3
   665  	VPXOR  Y3, Y8, Y8
   666  	VPSRLQ $61, Y2, Y3
   667  	VPSLLQ $(64-61), Y2, Y1
   668  	VPOR   Y1, Y3, Y3
   669  	VPXOR  Y3, Y8, Y8
   670  
   671  	VPADDQ Y8, Y5, Y5
   672  
   673  	VPSRLQ $6, Y5, Y8
   674  
   675  	MOVQ  CX, DI
   676  	RORXQ $41, R10, R13
   677  	ADDQ  2*8+frame_YFER(SP), BX
   678  
   679  	RORXQ $18, R10, R14
   680  	ORQ   DX, DI
   681  	MOVQ  R11, R15
   682  	XORQ  AX, R15
   683  
   684  	RORXQ $34, CX, R12
   685  	XORQ  R14, R13
   686  	ANDQ  R10, R15
   687  
   688  	RORXQ $14, R10, R14
   689  	ADDQ  BX, R9
   690  	ANDQ  R8, DI
   691  
   692  	XORQ  R14, R13
   693  	RORXQ $39, CX, R14
   694  	XORQ  AX, R15
   695  
   696  	XORQ  R12, R14
   697  	RORXQ $28, CX, R12
   698  
   699  	XORQ R12, R14
   700  	MOVQ CX, R12
   701  	ANDQ DX, R12
   702  	ADDQ R13, R15
   703  
   704  	ORQ  R12, DI
   705  	ADDQ R14, BX
   706  	ADDQ R15, R9
   707  	ADDQ R15, BX
   708  
   709  	ADDQ DI, BX
   710  
   711  	VPSRLQ $19, Y5, Y3
   712  	VPSLLQ $(64-19), Y5, Y1
   713  	VPOR   Y1, Y3, Y3
   714  	VPXOR  Y3, Y8, Y8
   715  	VPSRLQ $61, Y5, Y3
   716  	VPSLLQ $(64-61), Y5, Y1
   717  	VPOR   Y1, Y3, Y3
   718  	VPXOR  Y3, Y8, Y8
   719  
   720  	VPADDQ Y8, Y0, Y2
   721  
   722  	VPBLENDD $0xF0, Y2, Y5, Y5
   723  
   724  	MOVQ  BX, DI
   725  	RORXQ $41, R9, R13
   726  	RORXQ $18, R9, R14
   727  	ADDQ  3*8+frame_YFER(SP), AX
   728  	ORQ   R8, DI
   729  
   730  	MOVQ  R10, R15
   731  	RORXQ $34, BX, R12
   732  	XORQ  R14, R13
   733  	XORQ  R11, R15
   734  
   735  	RORXQ $14, R9, R14
   736  	ANDQ  R9, R15
   737  	ADDQ  AX, DX
   738  	ANDQ  CX, DI
   739  
   740  	XORQ R14, R13
   741  	XORQ R11, R15
   742  
   743  	RORXQ $39, BX, R14
   744  	ADDQ  R13, R15
   745  
   746  	XORQ R12, R14
   747  	ADDQ R15, DX
   748  
   749  	RORXQ $28, BX, R12
   750  
   751  	XORQ R12, R14
   752  	MOVQ BX, R12
   753  	ANDQ R8, R12
   754  	ORQ  R12, DI
   755  
   756  	ADDQ R14, AX
   757  	ADDQ R15, AX
   758  	ADDQ DI, AX
   759  
   760  	VPADDQ  2*32(BP), Y6, Y0
   761  	VMOVDQU Y0, frame_YFER(SP)
   762  
   763  	MY_VPALIGNR(Y0, Y5, Y4, 8)
   764  
   765  	VPADDQ Y6, Y0, Y0
   766  
   767  	MY_VPALIGNR(Y1, Y7, Y6, 8)
   768  
   769  	VPSRLQ $1, Y1, Y2
   770  	VPSLLQ $(64-1), Y1, Y3
   771  	VPOR   Y2, Y3, Y3
   772  
   773  	VPSRLQ $7, Y1, Y8
   774  
   775  	MOVQ  AX, DI
   776  	RORXQ $41, DX, R13
   777  	RORXQ $18, DX, R14
   778  	ADDQ  frame_YFER(SP), R11
   779  	ORQ   CX, DI
   780  	MOVQ  R9, R15
   781  	RORXQ $34, AX, R12
   782  
   783  	XORQ  R14, R13
   784  	XORQ  R10, R15
   785  	RORXQ $14, DX, R14
   786  
   787  	ANDQ  DX, R15
   788  	XORQ  R14, R13
   789  	RORXQ $39, AX, R14
   790  	ADDQ  R11, R8
   791  
   792  	ANDQ  BX, DI
   793  	XORQ  R12, R14
   794  	RORXQ $28, AX, R12
   795  
   796  	XORQ R10, R15
   797  	XORQ R12, R14
   798  	MOVQ AX, R12
   799  	ANDQ CX, R12
   800  
   801  	ADDQ R13, R15
   802  	ORQ  R12, DI
   803  	ADDQ R14, R11
   804  
   805  	ADDQ R15, R8
   806  
   807  	ADDQ R15, R11
   808  	ADDQ DI, R11
   809  
   810  	VPSRLQ $8, Y1, Y2
   811  	VPSLLQ $(64-8), Y1, Y1
   812  	VPOR   Y2, Y1, Y1
   813  
   814  	VPXOR Y8, Y3, Y3
   815  	VPXOR Y1, Y3, Y1
   816  
   817  	VPADDQ Y1, Y0, Y0
   818  
   819  	VPERM2F128 $0x0, Y0, Y0, Y6
   820  
   821  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
   822  
   823  	VPERM2F128 $0x11, Y5, Y5, Y2
   824  	VPSRLQ     $6, Y2, Y8
   825  
   826  	MOVQ  R11, DI
   827  	RORXQ $41, R8, R13
   828  	RORXQ $18, R8, R14
   829  	ADDQ  1*8+frame_YFER(SP), R10
   830  	ORQ   BX, DI
   831  
   832  	MOVQ  DX, R15
   833  	RORXQ $34, R11, R12
   834  	XORQ  R14, R13
   835  	XORQ  R9, R15
   836  
   837  	RORXQ $14, R8, R14
   838  	XORQ  R14, R13
   839  	RORXQ $39, R11, R14
   840  	ANDQ  R8, R15
   841  	ADDQ  R10, CX
   842  
   843  	ANDQ AX, DI
   844  	XORQ R12, R14
   845  
   846  	RORXQ $28, R11, R12
   847  	XORQ  R9, R15
   848  
   849  	XORQ R12, R14
   850  	MOVQ R11, R12
   851  	ANDQ BX, R12
   852  	ADDQ R13, R15
   853  
   854  	ORQ  R12, DI
   855  	ADDQ R14, R10
   856  
   857  	ADDQ R15, CX
   858  	ADDQ R15, R10
   859  	ADDQ DI, R10
   860  
   861  	VPSRLQ $19, Y2, Y3
   862  	VPSLLQ $(64-19), Y2, Y1
   863  	VPOR   Y1, Y3, Y3
   864  	VPXOR  Y3, Y8, Y8
   865  	VPSRLQ $61, Y2, Y3
   866  	VPSLLQ $(64-61), Y2, Y1
   867  	VPOR   Y1, Y3, Y3
   868  	VPXOR  Y3, Y8, Y8
   869  
   870  	VPADDQ Y8, Y6, Y6
   871  
   872  	VPSRLQ $6, Y6, Y8
   873  
   874  	MOVQ  R10, DI
   875  	RORXQ $41, CX, R13
   876  	ADDQ  2*8+frame_YFER(SP), R9
   877  
   878  	RORXQ $18, CX, R14
   879  	ORQ   AX, DI
   880  	MOVQ  R8, R15
   881  	XORQ  DX, R15
   882  
   883  	RORXQ $34, R10, R12
   884  	XORQ  R14, R13
   885  	ANDQ  CX, R15
   886  
   887  	RORXQ $14, CX, R14
   888  	ADDQ  R9, BX
   889  	ANDQ  R11, DI
   890  
   891  	XORQ  R14, R13
   892  	RORXQ $39, R10, R14
   893  	XORQ  DX, R15
   894  
   895  	XORQ  R12, R14
   896  	RORXQ $28, R10, R12
   897  
   898  	XORQ R12, R14
   899  	MOVQ R10, R12
   900  	ANDQ AX, R12
   901  	ADDQ R13, R15
   902  
   903  	ORQ  R12, DI
   904  	ADDQ R14, R9
   905  	ADDQ R15, BX
   906  	ADDQ R15, R9
   907  
   908  	ADDQ DI, R9
   909  
   910  	VPSRLQ $19, Y6, Y3
   911  	VPSLLQ $(64-19), Y6, Y1
   912  	VPOR   Y1, Y3, Y3
   913  	VPXOR  Y3, Y8, Y8
   914  	VPSRLQ $61, Y6, Y3
   915  	VPSLLQ $(64-61), Y6, Y1
   916  	VPOR   Y1, Y3, Y3
   917  	VPXOR  Y3, Y8, Y8
   918  
   919  	VPADDQ Y8, Y0, Y2
   920  
   921  	VPBLENDD $0xF0, Y2, Y6, Y6
   922  
   923  	MOVQ  R9, DI
   924  	RORXQ $41, BX, R13
   925  	RORXQ $18, BX, R14
   926  	ADDQ  3*8+frame_YFER(SP), DX
   927  	ORQ   R11, DI
   928  
   929  	MOVQ  CX, R15
   930  	RORXQ $34, R9, R12
   931  	XORQ  R14, R13
   932  	XORQ  R8, R15
   933  
   934  	RORXQ $14, BX, R14
   935  	ANDQ  BX, R15
   936  	ADDQ  DX, AX
   937  	ANDQ  R10, DI
   938  
   939  	XORQ R14, R13
   940  	XORQ R8, R15
   941  
   942  	RORXQ $39, R9, R14
   943  	ADDQ  R13, R15
   944  
   945  	XORQ R12, R14
   946  	ADDQ R15, AX
   947  
   948  	RORXQ $28, R9, R12
   949  
   950  	XORQ R12, R14
   951  	MOVQ R9, R12
   952  	ANDQ R11, R12
   953  	ORQ  R12, DI
   954  
   955  	ADDQ R14, DX
   956  	ADDQ R15, DX
   957  	ADDQ DI, DX
   958  
   959  	VPADDQ  3*32(BP), Y7, Y0
   960  	VMOVDQU Y0, frame_YFER(SP)
   961  	ADDQ    $(4*32), BP
   962  
   963  	MY_VPALIGNR(Y0, Y6, Y5, 8)
   964  
   965  	VPADDQ Y7, Y0, Y0
   966  
   967  	MY_VPALIGNR(Y1, Y4, Y7, 8)
   968  
   969  	VPSRLQ $1, Y1, Y2
   970  	VPSLLQ $(64-1), Y1, Y3
   971  	VPOR   Y2, Y3, Y3
   972  
   973  	VPSRLQ $7, Y1, Y8
   974  
   975  	MOVQ  DX, DI
   976  	RORXQ $41, AX, R13
   977  	RORXQ $18, AX, R14
   978  	ADDQ  frame_YFER(SP), R8
   979  	ORQ   R10, DI
   980  	MOVQ  BX, R15
   981  	RORXQ $34, DX, R12
   982  
   983  	XORQ  R14, R13
   984  	XORQ  CX, R15
   985  	RORXQ $14, AX, R14
   986  
   987  	ANDQ  AX, R15
   988  	XORQ  R14, R13
   989  	RORXQ $39, DX, R14
   990  	ADDQ  R8, R11
   991  
   992  	ANDQ  R9, DI
   993  	XORQ  R12, R14
   994  	RORXQ $28, DX, R12
   995  
   996  	XORQ CX, R15
   997  	XORQ R12, R14
   998  	MOVQ DX, R12
   999  	ANDQ R10, R12
  1000  
  1001  	ADDQ R13, R15
  1002  	ORQ  R12, DI
  1003  	ADDQ R14, R8
  1004  
  1005  	ADDQ R15, R11
  1006  
  1007  	ADDQ R15, R8
  1008  	ADDQ DI, R8
  1009  
  1010  	VPSRLQ $8, Y1, Y2
  1011  	VPSLLQ $(64-8), Y1, Y1
  1012  	VPOR   Y2, Y1, Y1
  1013  
  1014  	VPXOR Y8, Y3, Y3
  1015  	VPXOR Y1, Y3, Y1
  1016  
  1017  	VPADDQ Y1, Y0, Y0
  1018  
  1019  	VPERM2F128 $0x0, Y0, Y0, Y7
  1020  
  1021  	VPAND MASK_YMM_LO<>(SB), Y0, Y0
  1022  
  1023  	VPERM2F128 $0x11, Y6, Y6, Y2
  1024  	VPSRLQ     $6, Y2, Y8
  1025  
  1026  	MOVQ  R8, DI
  1027  	RORXQ $41, R11, R13
  1028  	RORXQ $18, R11, R14
  1029  	ADDQ  1*8+frame_YFER(SP), CX
  1030  	ORQ   R9, DI
  1031  
  1032  	MOVQ  AX, R15
  1033  	RORXQ $34, R8, R12
  1034  	XORQ  R14, R13
  1035  	XORQ  BX, R15
  1036  
  1037  	RORXQ $14, R11, R14
  1038  	XORQ  R14, R13
  1039  	RORXQ $39, R8, R14
  1040  	ANDQ  R11, R15
  1041  	ADDQ  CX, R10
  1042  
  1043  	ANDQ DX, DI
  1044  	XORQ R12, R14
  1045  
  1046  	RORXQ $28, R8, R12
  1047  	XORQ  BX, R15
  1048  
  1049  	XORQ R12, R14
  1050  	MOVQ R8, R12
  1051  	ANDQ R9, R12
  1052  	ADDQ R13, R15
  1053  
  1054  	ORQ  R12, DI
  1055  	ADDQ R14, CX
  1056  
  1057  	ADDQ R15, R10
  1058  	ADDQ R15, CX
  1059  	ADDQ DI, CX
  1060  
  1061  	VPSRLQ $19, Y2, Y3
  1062  	VPSLLQ $(64-19), Y2, Y1
  1063  	VPOR   Y1, Y3, Y3
  1064  	VPXOR  Y3, Y8, Y8
  1065  	VPSRLQ $61, Y2, Y3
  1066  	VPSLLQ $(64-61), Y2, Y1
  1067  	VPOR   Y1, Y3, Y3
  1068  	VPXOR  Y3, Y8, Y8
  1069  
  1070  	VPADDQ Y8, Y7, Y7
  1071  
  1072  	VPSRLQ $6, Y7, Y8
  1073  
  1074  	MOVQ  CX, DI
  1075  	RORXQ $41, R10, R13
  1076  	ADDQ  2*8+frame_YFER(SP), BX
  1077  
  1078  	RORXQ $18, R10, R14
  1079  	ORQ   DX, DI
  1080  	MOVQ  R11, R15
  1081  	XORQ  AX, R15
  1082  
  1083  	RORXQ $34, CX, R12
  1084  	XORQ  R14, R13
  1085  	ANDQ  R10, R15
  1086  
  1087  	RORXQ $14, R10, R14
  1088  	ADDQ  BX, R9
  1089  	ANDQ  R8, DI
  1090  
  1091  	XORQ  R14, R13
  1092  	RORXQ $39, CX, R14
  1093  	XORQ  AX, R15
  1094  
  1095  	XORQ  R12, R14
  1096  	RORXQ $28, CX, R12
  1097  
  1098  	XORQ R12, R14
  1099  	MOVQ CX, R12
  1100  	ANDQ DX, R12
  1101  	ADDQ R13, R15
  1102  
  1103  	ORQ  R12, DI
  1104  	ADDQ R14, BX
  1105  	ADDQ R15, R9
  1106  	ADDQ R15, BX
  1107  
  1108  	ADDQ DI, BX
  1109  
  1110  	VPSRLQ $19, Y7, Y3
  1111  	VPSLLQ $(64-19), Y7, Y1
  1112  	VPOR   Y1, Y3, Y3
  1113  	VPXOR  Y3, Y8, Y8
  1114  	VPSRLQ $61, Y7, Y3
  1115  	VPSLLQ $(64-61), Y7, Y1
  1116  	VPOR   Y1, Y3, Y3
  1117  	VPXOR  Y3, Y8, Y8
  1118  
  1119  	VPADDQ Y8, Y0, Y2
  1120  
  1121  	VPBLENDD $0xF0, Y2, Y7, Y7
  1122  
  1123  	MOVQ  BX, DI
  1124  	RORXQ $41, R9, R13
  1125  	RORXQ $18, R9, R14
  1126  	ADDQ  3*8+frame_YFER(SP), AX
  1127  	ORQ   R8, DI
  1128  
  1129  	MOVQ  R10, R15
  1130  	RORXQ $34, BX, R12
  1131  	XORQ  R14, R13
  1132  	XORQ  R11, R15
  1133  
  1134  	RORXQ $14, R9, R14
  1135  	ANDQ  R9, R15
  1136  	ADDQ  AX, DX
  1137  	ANDQ  CX, DI
  1138  
  1139  	XORQ R14, R13
  1140  	XORQ R11, R15
  1141  
  1142  	RORXQ $39, BX, R14
  1143  	ADDQ  R13, R15
  1144  
  1145  	XORQ R12, R14
  1146  	ADDQ R15, DX
  1147  
  1148  	RORXQ $28, BX, R12
  1149  
  1150  	XORQ R12, R14
  1151  	MOVQ BX, R12
  1152  	ANDQ R8, R12
  1153  	ORQ  R12, DI
  1154  
  1155  	ADDQ R14, AX
  1156  	ADDQ R15, AX
  1157  	ADDQ DI, AX
  1158  
  1159  	SUBQ $1, frame_SRND(SP)
  1160  	JNE  loop1
  1161  
  1162  	MOVQ $2, frame_SRND(SP)
  1163  
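         	// The message schedule is complete at this point; loop2 runs the
         	// final 16 rounds (two iterations of 8) on the already-scheduled
         	// words in Y4..Y7, sliding Y6/Y7 down to Y4/Y5 between iterations.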
  1164  loop2:
  1165  	VPADDQ  (BP), Y4, Y0
  1166  	VMOVDQU Y0, frame_YFER(SP)
  1167  
  1168  	MOVQ  R9, R15
  1169  	RORXQ $41, DX, R13
  1170  	RORXQ $18, DX, R14
  1171  	XORQ  R10, R15
  1172  
  1173  	XORQ  R14, R13
  1174  	RORXQ $14, DX, R14
  1175  	ANDQ  DX, R15
  1176  
  1177  	XORQ  R14, R13
  1178  	RORXQ $34, AX, R12
  1179  	XORQ  R10, R15
  1180  	RORXQ $39, AX, R14
  1181  	MOVQ  AX, DI
  1182  
  1183  	XORQ  R12, R14
  1184  	RORXQ $28, AX, R12
  1185  	ADDQ  frame_YFER(SP), R11
  1186  	ORQ   CX, DI
  1187  
  1188  	XORQ R12, R14
  1189  	MOVQ AX, R12
  1190  	ANDQ BX, DI
  1191  	ANDQ CX, R12
  1192  	ADDQ R13, R15
  1193  
  1194  	ADDQ R11, R8
  1195  	ORQ  R12, DI
  1196  	ADDQ R14, R11
  1197  
  1198  	ADDQ R15, R8
  1199  
  1200  	ADDQ  R15, R11
  1201  	MOVQ  DX, R15
  1202  	RORXQ $41, R8, R13
  1203  	RORXQ $18, R8, R14
  1204  	XORQ  R9, R15
  1205  
  1206  	XORQ  R14, R13
  1207  	RORXQ $14, R8, R14
  1208  	ANDQ  R8, R15
  1209  	ADDQ  DI, R11
  1210  
  1211  	XORQ  R14, R13
  1212  	RORXQ $34, R11, R12
  1213  	XORQ  R9, R15
  1214  	RORXQ $39, R11, R14
  1215  	MOVQ  R11, DI
  1216  
  1217  	XORQ  R12, R14
  1218  	RORXQ $28, R11, R12
  1219  	ADDQ  8*1+frame_YFER(SP), R10
  1220  	ORQ   BX, DI
  1221  
  1222  	XORQ R12, R14
  1223  	MOVQ R11, R12
  1224  	ANDQ AX, DI
  1225  	ANDQ BX, R12
  1226  	ADDQ R13, R15
  1227  
  1228  	ADDQ R10, CX
  1229  	ORQ  R12, DI
  1230  	ADDQ R14, R10
  1231  
  1232  	ADDQ R15, CX
  1233  
  1234  	ADDQ  R15, R10
  1235  	MOVQ  R8, R15
  1236  	RORXQ $41, CX, R13
  1237  	RORXQ $18, CX, R14
  1238  	XORQ  DX, R15
  1239  
  1240  	XORQ  R14, R13
  1241  	RORXQ $14, CX, R14
  1242  	ANDQ  CX, R15
  1243  	ADDQ  DI, R10
  1244  
  1245  	XORQ  R14, R13
  1246  	RORXQ $34, R10, R12
  1247  	XORQ  DX, R15
  1248  	RORXQ $39, R10, R14
  1249  	MOVQ  R10, DI
  1250  
  1251  	XORQ  R12, R14
  1252  	RORXQ $28, R10, R12
  1253  	ADDQ  8*2+frame_YFER(SP), R9
  1254  	ORQ   AX, DI
  1255  
  1256  	XORQ R12, R14
  1257  	MOVQ R10, R12
  1258  	ANDQ R11, DI
  1259  	ANDQ AX, R12
  1260  	ADDQ R13, R15
  1261  
  1262  	ADDQ R9, BX
  1263  	ORQ  R12, DI
  1264  	ADDQ R14, R9
  1265  
  1266  	ADDQ R15, BX
  1267  
  1268  	ADDQ  R15, R9
  1269  	MOVQ  CX, R15
  1270  	RORXQ $41, BX, R13
  1271  	RORXQ $18, BX, R14
  1272  	XORQ  R8, R15
  1273  
  1274  	XORQ  R14, R13
  1275  	RORXQ $14, BX, R14
  1276  	ANDQ  BX, R15
  1277  	ADDQ  DI, R9
  1278  
  1279  	XORQ  R14, R13
  1280  	RORXQ $34, R9, R12
  1281  	XORQ  R8, R15
  1282  	RORXQ $39, R9, R14
  1283  	MOVQ  R9, DI
  1284  
  1285  	XORQ  R12, R14
  1286  	RORXQ $28, R9, R12
  1287  	ADDQ  8*3+frame_YFER(SP), DX
  1288  	ORQ   R11, DI
  1289  
  1290  	XORQ R12, R14
  1291  	MOVQ R9, R12
  1292  	ANDQ R10, DI
  1293  	ANDQ R11, R12
  1294  	ADDQ R13, R15
  1295  
  1296  	ADDQ DX, AX
  1297  	ORQ  R12, DI
  1298  	ADDQ R14, DX
  1299  
  1300  	ADDQ R15, AX
  1301  
  1302  	ADDQ R15, DX
  1303  
  1304  	ADDQ DI, DX
  1305  
  1306  	VPADDQ  1*32(BP), Y5, Y0
  1307  	VMOVDQU Y0, frame_YFER(SP)
  1308  	ADDQ    $(2*32), BP
  1309  
  1310  	MOVQ  BX, R15
  1311  	RORXQ $41, AX, R13
  1312  	RORXQ $18, AX, R14
  1313  	XORQ  CX, R15
  1314  
  1315  	XORQ  R14, R13
  1316  	RORXQ $14, AX, R14
  1317  	ANDQ  AX, R15
  1318  
  1319  	XORQ  R14, R13
  1320  	RORXQ $34, DX, R12
  1321  	XORQ  CX, R15
  1322  	RORXQ $39, DX, R14
  1323  	MOVQ  DX, DI
  1324  
  1325  	XORQ  R12, R14
  1326  	RORXQ $28, DX, R12
  1327  	ADDQ  frame_YFER(SP), R8
  1328  	ORQ   R10, DI
  1329  
  1330  	XORQ R12, R14
  1331  	MOVQ DX, R12
  1332  	ANDQ R9, DI
  1333  	ANDQ R10, R12
  1334  	ADDQ R13, R15
  1335  
  1336  	ADDQ R8, R11
  1337  	ORQ  R12, DI
  1338  	ADDQ R14, R8
  1339  
  1340  	ADDQ R15, R11
  1341  
  1342  	ADDQ  R15, R8
  1343  	MOVQ  AX, R15
  1344  	RORXQ $41, R11, R13
  1345  	RORXQ $18, R11, R14
  1346  	XORQ  BX, R15
  1347  
  1348  	XORQ  R14, R13
  1349  	RORXQ $14, R11, R14
  1350  	ANDQ  R11, R15
  1351  	ADDQ  DI, R8
  1352  
  1353  	XORQ  R14, R13
  1354  	RORXQ $34, R8, R12
  1355  	XORQ  BX, R15
  1356  	RORXQ $39, R8, R14
  1357  	MOVQ  R8, DI
  1358  
  1359  	XORQ  R12, R14
  1360  	RORXQ $28, R8, R12
  1361  	ADDQ  8*1+frame_YFER(SP), CX
  1362  	ORQ   R9, DI
  1363  
  1364  	XORQ R12, R14
  1365  	MOVQ R8, R12
  1366  	ANDQ DX, DI
  1367  	ANDQ R9, R12
  1368  	ADDQ R13, R15
  1369  
  1370  	ADDQ CX, R10
  1371  	ORQ  R12, DI
  1372  	ADDQ R14, CX
  1373  
  1374  	ADDQ R15, R10
  1375  
  1376  	ADDQ  R15, CX
  1377  	MOVQ  R11, R15
  1378  	RORXQ $41, R10, R13
  1379  	RORXQ $18, R10, R14
  1380  	XORQ  AX, R15
  1381  
  1382  	XORQ  R14, R13
  1383  	RORXQ $14, R10, R14
  1384  	ANDQ  R10, R15
  1385  	ADDQ  DI, CX
  1386  
  1387  	XORQ  R14, R13
  1388  	RORXQ $34, CX, R12
  1389  	XORQ  AX, R15
  1390  	RORXQ $39, CX, R14
  1391  	MOVQ  CX, DI
  1392  
  1393  	XORQ  R12, R14
  1394  	RORXQ $28, CX, R12
  1395  	ADDQ  8*2+frame_YFER(SP), BX
  1396  	ORQ   DX, DI
  1397  
  1398  	XORQ R12, R14
  1399  	MOVQ CX, R12
  1400  	ANDQ R8, DI
  1401  	ANDQ DX, R12
  1402  	ADDQ R13, R15
  1403  
  1404  	ADDQ BX, R9
  1405  	ORQ  R12, DI
  1406  	ADDQ R14, BX
  1407  
  1408  	ADDQ R15, R9
  1409  
  1410  	ADDQ  R15, BX
  1411  	MOVQ  R10, R15
  1412  	RORXQ $41, R9, R13
  1413  	RORXQ $18, R9, R14
  1414  	XORQ  R11, R15
  1415  
  1416  	XORQ  R14, R13
  1417  	RORXQ $14, R9, R14
  1418  	ANDQ  R9, R15
  1419  	ADDQ  DI, BX
  1420  
  1421  	XORQ  R14, R13
  1422  	RORXQ $34, BX, R12
  1423  	XORQ  R11, R15
  1424  	RORXQ $39, BX, R14
  1425  	MOVQ  BX, DI
  1426  
  1427  	XORQ  R12, R14
  1428  	RORXQ $28, BX, R12
  1429  	ADDQ  8*3+frame_YFER(SP), AX
  1430  	ORQ   R8, DI
  1431  
  1432  	XORQ R12, R14
  1433  	MOVQ BX, R12
  1434  	ANDQ CX, DI
  1435  	ANDQ R8, R12
  1436  	ADDQ R13, R15
  1437  
  1438  	ADDQ AX, DX
  1439  	ORQ  R12, DI
  1440  	ADDQ R14, AX
  1441  
  1442  	ADDQ R15, DX
  1443  
  1444  	ADDQ R15, AX
  1445  
  1446  	ADDQ DI, AX
  1447  
  1448  	VMOVDQU Y6, Y4
  1449  	VMOVDQU Y7, Y5
  1450  
  1451  	SUBQ $1, frame_SRND(SP)
  1452  	JNE  loop2
  1453  
  1454  	addm(8*0(SI),AX)
  1455  	addm(8*1(SI),BX)
  1456  	addm(8*2(SI),CX)
  1457  	addm(8*3(SI),R8)
  1458  	addm(8*4(SI),DX)
  1459  	addm(8*5(SI),R9)
  1460  	addm(8*6(SI),R10)
  1461  	addm(8*7(SI),R11)
  1462  
  1463  	MOVQ frame_INP(SP), DI
  1464  	ADDQ $128, DI
  1465  	CMPQ DI, frame_INPEND(SP)
  1466  	JNE  loop0
  1467  
  1468  done_hash:
  1469  	VZEROUPPER
  1470  	RET
  1471  
