Text file src/crypto/internal/fips140/sha512/sha512block_ppc64x.s

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Based on CRYPTOGAMS code with the following comment:
     6  // # ====================================================================
     7  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     8  // # project. The module is, however, dual licensed under OpenSSL and
     9  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    10  // # details see http://www.openssl.org/~appro/cryptogams/.
    11  // # ====================================================================
    12  
    13  //go:build (ppc64 || ppc64le) && !purego
    14  
    15  #include "textflag.h"
    16  
    17  // SHA512 block routine. See sha512block.go for Go equivalent.
    18  //
    19  // The algorithm is detailed in FIPS 180-4:
    20  //
    21  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    22  //
    23  // Wt = Mt; for 0 <= t <= 15
    24  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    25  //
    26  // a = H0
    27  // b = H1
    28  // c = H2
    29  // d = H3
    30  // e = H4
    31  // f = H5
    32  // g = H6
    33  // h = H7
    34  //
    35  // for t = 0 to 79 {
    36  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    37  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    38  //    h = g
    39  //    g = f
    40  //    f = e
    41  //    e = d + T1
    42  //    d = c
    43  //    c = b
    44  //    b = a
    45  //    a = T1 + T2
    46  // }
    47  //
    48  // H0 = a + H0
    49  // H1 = b + H1
    50  // H2 = c + H2
    51  // H3 = d + H3
    52  // H4 = e + H4
    53  // H5 = f + H5
    54  // H6 = g + H6
    55  // H7 = h + H7
    56  
    57  #define CTX	R3	// dig argument: address the state words are loaded from and stored to
    58  #define INP	R4	// current read position in p
    59  #define END	R5	// INP + rounded length; the block loop stops once INP reaches it
    60  #define TBL	R6	// base for the LVX loads from kcon; reset per block, advanced between round groups
    61  #define CNT	R8	// set to R0 on entry to blockPOWER; not read anywhere in this file
    62  #define LEN	R9	// p_len, rounded down before END is computed
    63  #define TEMP	R12	// scratch: LVSL offset on ppc64le, then the value moved into CTR
    64  
    65  #define TBL_STRT R7 // Pointer to start of kcon table.
    66  // Index registers for the indexed vector loads and stores. R_x010..R_x110 are
        // loaded with the constant in their name at the top of blockPOWER. R_x000 is
        // R0, which is never written in this file; the code relies on it reading as 0.
    67  #define R_x000	R0
    68  #define R_x010	R10
    69  #define R_x020	R25
    70  #define R_x030	R26
    71  #define R_x040	R14
    72  #define R_x050	R15
    73  #define R_x060	R16
    74  #define R_x070	R17
    75  #define R_x080	R18
    76  #define R_x090	R19
    77  #define R_x0a0	R20
    78  #define R_x0b0	R21
    79  #define R_x0c0	R22
    80  #define R_x0d0	R23
    81  #define R_x0e0	R24
    82  #define R_x0f0	R28
    83  #define R_x100	R29
    84  #define R_x110	R27
    85  
    86  
    87  // V0-V7 are A-H
    88  // V8-V23 are used for the message schedule
    89  #define KI	V24	// round constant most recently fetched from kcon
    90  #define FUNC	V25	// holds Ch(e,f,g), then Maj(a,b,c), within a round
    91  #define S0	V26	// BIGSIGMA0(a), then BIGSIGMA0(a) + Maj(a,b,c)
    92  #define S1	V27	// BIGSIGMA1(e)
    93  #define s0	V28	// SIGMA0 term of the message schedule (SHA512ROUND1 only)
    94  #define s1	V29	// SIGMA1 term of the message schedule (SHA512ROUND1 only)
    95  #define LEMASK	V31	// Permutation control register for little endian
    96  
    97  // VPERM is needed on LE to switch the bytes
    98  // VPERMLE expands to VPERM on ppc64le and to nothing on big-endian ppc64.
    99  #ifdef GOARCH_ppc64le
   100  #define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
   101  #else
   102  #define VPERMLE(va,vb,vc,vt)
   103  #endif
   104  
   105  // 2 copies of each Kt, to fill both doublewords of a vector register
        //
        // Layout of ·kcon (one 16-byte entry per LVX load):
        //   0x000-0x4FF  the 80 round constants K0..K79
        //   0x500        an all-zero entry. Each round pre-adds the constant for the
        //                following round, so round 79 adds this entry, i.e. nothing.
        //   0x510        byte-selection vector. Each round also fetches the entry two
        //                ahead of its own, so this is what round 79 leaves in KI;
        //                blockPOWER uses it as the VPERM control when packing the
        //                state for the final store.
   106  DATA  ·kcon+0x000(SB)/8, $0x428a2f98d728ae22
   107  DATA  ·kcon+0x008(SB)/8, $0x428a2f98d728ae22
   108  DATA  ·kcon+0x010(SB)/8, $0x7137449123ef65cd
   109  DATA  ·kcon+0x018(SB)/8, $0x7137449123ef65cd
   110  DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfec4d3b2f
   111  DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfec4d3b2f
   112  DATA  ·kcon+0x030(SB)/8, $0xe9b5dba58189dbbc
   113  DATA  ·kcon+0x038(SB)/8, $0xe9b5dba58189dbbc
   114  DATA  ·kcon+0x040(SB)/8, $0x3956c25bf348b538
   115  DATA  ·kcon+0x048(SB)/8, $0x3956c25bf348b538
   116  DATA  ·kcon+0x050(SB)/8, $0x59f111f1b605d019
   117  DATA  ·kcon+0x058(SB)/8, $0x59f111f1b605d019
   118  DATA  ·kcon+0x060(SB)/8, $0x923f82a4af194f9b
   119  DATA  ·kcon+0x068(SB)/8, $0x923f82a4af194f9b
   120  DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5da6d8118
   121  DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5da6d8118
   122  DATA  ·kcon+0x080(SB)/8, $0xd807aa98a3030242
   123  DATA  ·kcon+0x088(SB)/8, $0xd807aa98a3030242
   124  DATA  ·kcon+0x090(SB)/8, $0x12835b0145706fbe
   125  DATA  ·kcon+0x098(SB)/8, $0x12835b0145706fbe
   126  DATA  ·kcon+0x0A0(SB)/8, $0x243185be4ee4b28c
   127  DATA  ·kcon+0x0A8(SB)/8, $0x243185be4ee4b28c
   128  DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3d5ffb4e2
   129  DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3d5ffb4e2
   130  DATA  ·kcon+0x0C0(SB)/8, $0x72be5d74f27b896f
   131  DATA  ·kcon+0x0C8(SB)/8, $0x72be5d74f27b896f
   132  DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe3b1696b1
   133  DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe3b1696b1
   134  DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a725c71235
   135  DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a725c71235
   136  DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174cf692694
   137  DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174cf692694
   138  DATA  ·kcon+0x100(SB)/8, $0xe49b69c19ef14ad2
   139  DATA  ·kcon+0x108(SB)/8, $0xe49b69c19ef14ad2
   140  DATA  ·kcon+0x110(SB)/8, $0xefbe4786384f25e3
   141  DATA  ·kcon+0x118(SB)/8, $0xefbe4786384f25e3
   142  DATA  ·kcon+0x120(SB)/8, $0x0fc19dc68b8cd5b5
   143  DATA  ·kcon+0x128(SB)/8, $0x0fc19dc68b8cd5b5
   144  DATA  ·kcon+0x130(SB)/8, $0x240ca1cc77ac9c65
   145  DATA  ·kcon+0x138(SB)/8, $0x240ca1cc77ac9c65
   146  DATA  ·kcon+0x140(SB)/8, $0x2de92c6f592b0275
   147  DATA  ·kcon+0x148(SB)/8, $0x2de92c6f592b0275
   148  DATA  ·kcon+0x150(SB)/8, $0x4a7484aa6ea6e483
   149  DATA  ·kcon+0x158(SB)/8, $0x4a7484aa6ea6e483
   150  DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dcbd41fbd4
   151  DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dcbd41fbd4
   152  DATA  ·kcon+0x170(SB)/8, $0x76f988da831153b5
   153  DATA  ·kcon+0x178(SB)/8, $0x76f988da831153b5
   154  DATA  ·kcon+0x180(SB)/8, $0x983e5152ee66dfab
   155  DATA  ·kcon+0x188(SB)/8, $0x983e5152ee66dfab
   156  DATA  ·kcon+0x190(SB)/8, $0xa831c66d2db43210
   157  DATA  ·kcon+0x198(SB)/8, $0xa831c66d2db43210
   158  DATA  ·kcon+0x1A0(SB)/8, $0xb00327c898fb213f
   159  DATA  ·kcon+0x1A8(SB)/8, $0xb00327c898fb213f
   160  DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7beef0ee4
   161  DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7beef0ee4
   162  DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf33da88fc2
   163  DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf33da88fc2
   164  DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147930aa725
   165  DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147930aa725
   166  DATA  ·kcon+0x1E0(SB)/8, $0x06ca6351e003826f
   167  DATA  ·kcon+0x1E8(SB)/8, $0x06ca6351e003826f
   168  DATA  ·kcon+0x1F0(SB)/8, $0x142929670a0e6e70
   169  DATA  ·kcon+0x1F8(SB)/8, $0x142929670a0e6e70
   170  DATA  ·kcon+0x200(SB)/8, $0x27b70a8546d22ffc
   171  DATA  ·kcon+0x208(SB)/8, $0x27b70a8546d22ffc
   172  DATA  ·kcon+0x210(SB)/8, $0x2e1b21385c26c926
   173  DATA  ·kcon+0x218(SB)/8, $0x2e1b21385c26c926
   174  DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc5ac42aed
   175  DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc5ac42aed
   176  DATA  ·kcon+0x230(SB)/8, $0x53380d139d95b3df
   177  DATA  ·kcon+0x238(SB)/8, $0x53380d139d95b3df
   178  DATA  ·kcon+0x240(SB)/8, $0x650a73548baf63de
   179  DATA  ·kcon+0x248(SB)/8, $0x650a73548baf63de
   180  DATA  ·kcon+0x250(SB)/8, $0x766a0abb3c77b2a8
   181  DATA  ·kcon+0x258(SB)/8, $0x766a0abb3c77b2a8
   182  DATA  ·kcon+0x260(SB)/8, $0x81c2c92e47edaee6
   183  DATA  ·kcon+0x268(SB)/8, $0x81c2c92e47edaee6
   184  DATA  ·kcon+0x270(SB)/8, $0x92722c851482353b
   185  DATA  ·kcon+0x278(SB)/8, $0x92722c851482353b
   186  DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a14cf10364
   187  DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a14cf10364
   188  DATA  ·kcon+0x290(SB)/8, $0xa81a664bbc423001
   189  DATA  ·kcon+0x298(SB)/8, $0xa81a664bbc423001
   190  DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70d0f89791
   191  DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70d0f89791
   192  DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a30654be30
   193  DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a30654be30
   194  DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d6ef5218
   195  DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d6ef5218
   196  DATA  ·kcon+0x2D0(SB)/8, $0xd69906245565a910
   197  DATA  ·kcon+0x2D8(SB)/8, $0xd69906245565a910
   198  DATA  ·kcon+0x2E0(SB)/8, $0xf40e35855771202a
   199  DATA  ·kcon+0x2E8(SB)/8, $0xf40e35855771202a
   200  DATA  ·kcon+0x2F0(SB)/8, $0x106aa07032bbd1b8
   201  DATA  ·kcon+0x2F8(SB)/8, $0x106aa07032bbd1b8
   202  DATA  ·kcon+0x300(SB)/8, $0x19a4c116b8d2d0c8
   203  DATA  ·kcon+0x308(SB)/8, $0x19a4c116b8d2d0c8
   204  DATA  ·kcon+0x310(SB)/8, $0x1e376c085141ab53
   205  DATA  ·kcon+0x318(SB)/8, $0x1e376c085141ab53
   206  DATA  ·kcon+0x320(SB)/8, $0x2748774cdf8eeb99
   207  DATA  ·kcon+0x328(SB)/8, $0x2748774cdf8eeb99
   208  DATA  ·kcon+0x330(SB)/8, $0x34b0bcb5e19b48a8
   209  DATA  ·kcon+0x338(SB)/8, $0x34b0bcb5e19b48a8
   210  DATA  ·kcon+0x340(SB)/8, $0x391c0cb3c5c95a63
   211  DATA  ·kcon+0x348(SB)/8, $0x391c0cb3c5c95a63
   212  DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4ae3418acb
   213  DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4ae3418acb
   214  DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f7763e373
   215  DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f7763e373
   216  DATA  ·kcon+0x370(SB)/8, $0x682e6ff3d6b2b8a3
   217  DATA  ·kcon+0x378(SB)/8, $0x682e6ff3d6b2b8a3
   218  DATA  ·kcon+0x380(SB)/8, $0x748f82ee5defb2fc
   219  DATA  ·kcon+0x388(SB)/8, $0x748f82ee5defb2fc
   220  DATA  ·kcon+0x390(SB)/8, $0x78a5636f43172f60
   221  DATA  ·kcon+0x398(SB)/8, $0x78a5636f43172f60
   222  DATA  ·kcon+0x3A0(SB)/8, $0x84c87814a1f0ab72
   223  DATA  ·kcon+0x3A8(SB)/8, $0x84c87814a1f0ab72
   224  DATA  ·kcon+0x3B0(SB)/8, $0x8cc702081a6439ec
   225  DATA  ·kcon+0x3B8(SB)/8, $0x8cc702081a6439ec
   226  DATA  ·kcon+0x3C0(SB)/8, $0x90befffa23631e28
   227  DATA  ·kcon+0x3C8(SB)/8, $0x90befffa23631e28
   228  DATA  ·kcon+0x3D0(SB)/8, $0xa4506cebde82bde9
   229  DATA  ·kcon+0x3D8(SB)/8, $0xa4506cebde82bde9
   230  DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7b2c67915
   231  DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7b2c67915
   232  DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2e372532b
   233  DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2e372532b
   234  DATA  ·kcon+0x400(SB)/8, $0xca273eceea26619c
   235  DATA  ·kcon+0x408(SB)/8, $0xca273eceea26619c
   236  DATA  ·kcon+0x410(SB)/8, $0xd186b8c721c0c207
   237  DATA  ·kcon+0x418(SB)/8, $0xd186b8c721c0c207
   238  DATA  ·kcon+0x420(SB)/8, $0xeada7dd6cde0eb1e
   239  DATA  ·kcon+0x428(SB)/8, $0xeada7dd6cde0eb1e
   240  DATA  ·kcon+0x430(SB)/8, $0xf57d4f7fee6ed178
   241  DATA  ·kcon+0x438(SB)/8, $0xf57d4f7fee6ed178
   242  DATA  ·kcon+0x440(SB)/8, $0x06f067aa72176fba
   243  DATA  ·kcon+0x448(SB)/8, $0x06f067aa72176fba
   244  DATA  ·kcon+0x450(SB)/8, $0x0a637dc5a2c898a6
   245  DATA  ·kcon+0x458(SB)/8, $0x0a637dc5a2c898a6
   246  DATA  ·kcon+0x460(SB)/8, $0x113f9804bef90dae
   247  DATA  ·kcon+0x468(SB)/8, $0x113f9804bef90dae
   248  DATA  ·kcon+0x470(SB)/8, $0x1b710b35131c471b
   249  DATA  ·kcon+0x478(SB)/8, $0x1b710b35131c471b
   250  DATA  ·kcon+0x480(SB)/8, $0x28db77f523047d84
   251  DATA  ·kcon+0x488(SB)/8, $0x28db77f523047d84
   252  DATA  ·kcon+0x490(SB)/8, $0x32caab7b40c72493
   253  DATA  ·kcon+0x498(SB)/8, $0x32caab7b40c72493
   254  DATA  ·kcon+0x4A0(SB)/8, $0x3c9ebe0a15c9bebc
   255  DATA  ·kcon+0x4A8(SB)/8, $0x3c9ebe0a15c9bebc
   256  DATA  ·kcon+0x4B0(SB)/8, $0x431d67c49c100d4c
   257  DATA  ·kcon+0x4B8(SB)/8, $0x431d67c49c100d4c
   258  DATA  ·kcon+0x4C0(SB)/8, $0x4cc5d4becb3e42b6
   259  DATA  ·kcon+0x4C8(SB)/8, $0x4cc5d4becb3e42b6
   260  DATA  ·kcon+0x4D0(SB)/8, $0x597f299cfc657e2a
   261  DATA  ·kcon+0x4D8(SB)/8, $0x597f299cfc657e2a
   262  DATA  ·kcon+0x4E0(SB)/8, $0x5fcb6fab3ad6faec
   263  DATA  ·kcon+0x4E8(SB)/8, $0x5fcb6fab3ad6faec
   264  DATA  ·kcon+0x4F0(SB)/8, $0x6c44198c4a475817
   265  DATA  ·kcon+0x4F8(SB)/8, $0x6c44198c4a475817
   266  DATA  ·kcon+0x500(SB)/8, $0x0000000000000000	// zero entry: pre-added by round 79, has no effect
   267  DATA  ·kcon+0x508(SB)/8, $0x0000000000000000
   268  DATA  ·kcon+0x510(SB)/8, $0x1011121314151617	// VPERM control used to pack the state for the final store
   269  DATA  ·kcon+0x518(SB)/8, $0x0001020304050607
   270  GLOBL ·kcon(SB), RODATA, $1312	// 82 entries of 16 bytes (0x520), read-only
   271  // SHA512ROUND0 performs one compression round for a message word xi that is
        // used as loaded; it does not touch the message schedule.
        //
        // On entry h already includes this round's Kt (pre-added by the previous
        // round, or by blockPOWER before the first round) and KI holds the constant
        // for the following round. The macro computes
        //
        //    h += xi + Ch(e,f,g) + BIGSIGMA1(e)    (h is now T1)
        //    g += KI                               (g becomes h in the next round)
        //    d += h                                (d is now the new e)
        //    h += BIGSIGMA0(a) + Maj(a,b,c)        (h is now the new a)
        //
        // and then reloads KI from (TBL)(idx). No working variable is moved between
        // registers: each call site rotates the a..h argument list by one position.
        // Clobbers FUNC, S0, S1 and KI.
   272  #define SHA512ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
   273  	VSEL		g, f, e, FUNC; \
   274  	VSHASIGMAD	$15, e, $1, S1; \
   275  	VADDUDM		xi, h, h; \
   276  	VSHASIGMAD	$0, a, $1, S0; \
   277  	VADDUDM		FUNC, h, h; \
   278  	VXOR		b, a, FUNC; \
   279  	VADDUDM		S1, h, h; \
   280  	VSEL		b, c, FUNC, FUNC; \
   281  	VADDUDM		KI, g, g; \
   282  	VADDUDM		h, d, d; \
   283  	VADDUDM		FUNC, S0, S0; \
   284  	LVX		(TBL)(idx), KI; \
   285  	VADDUDM		S0, h, h
   286  // SHA512ROUND1 performs the same compression round as SHA512ROUND0 on
        // (a..h, xi, idx) and, interleaved with it, advances the message schedule:
        //
        //    xj += SIGMA0(xj_1) + xj_9 + SIGMA1(xj_14)
        //
        // At the call sites xj is the schedule register that follows xi in V8-V23, and
        // xj_1, xj_9, xj_14 are the registers 1, 9 and 14 positions after xj (wrapping
        // from V23 to V8), so xj is replaced in place by the word sixteen positions
        // later in the schedule.
        // Clobbers FUNC, S0, S1, s0, s1 and KI.
   287  #define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
   288  	VSHASIGMAD	$0, xj_1, $0, s0; \
   289  	VSEL		g, f, e, FUNC; \
   290  	VSHASIGMAD	$15, e, $1, S1; \
   291  	VADDUDM		xi, h, h; \
   292  	VSHASIGMAD	$0, a, $1, S0; \
   293  	VSHASIGMAD	$15, xj_14, $0, s1; \
   294  	VADDUDM		FUNC, h, h; \
   295  	VXOR		b, a, FUNC; \
   296  	VADDUDM		xj_9, xj, xj; \
   297  	VADDUDM		S1, h, h; \
   298  	VSEL		b, c, FUNC, FUNC; \
   299  	VADDUDM		KI, g, g; \
   300  	VADDUDM		h, d, d; \
   301  	VADDUDM		FUNC, S0, S0; \
   302  	VADDUDM		s0, xj, xj; \
   303  	LVX		(TBL)(idx), KI; \
   304  	VADDUDM		S0, h, h; \
   305  	VADDUDM		s1, xj, xj
   306  
   307  // func blockPOWER(dig *Digest, p []byte)
        //
        // blockPOWER runs the SHA-512 compression function over p, 128 bytes at a
        // time, updating in place the eight 64-bit state words held in the first 64
        // bytes of *dig. Bytes of p after the last whole 128-byte block are ignored;
        // when p holds no whole block the routine returns without touching *dig.
        //
        // No stack frame is used ($0-32). Besides the registers named by the #defines
        // at the top of the file, CTR and VS24-VS31 are clobbered.
   308  TEXT ·blockPOWER(SB),0,$0-32
   309  	MOVD	dig+0(FP), CTX
   310  	MOVD	p_base+8(FP), INP
   311  	MOVD	p_len+16(FP), LEN
   312  	// Keep only whole blocks. Every pass of the loop below consumes 128 (1<<7) bytes.
   313  	SRD	$7, LEN
   314  	SLD	$7, LEN
   315  
   316  	ADD	INP, LEN, END
   317  
   318  	CMP	INP, END
   319  	BEQ	end	// nothing to hash
   320  
   321  	MOVD	$·kcon(SB), TBL_STRT
   322  
   323  	MOVD	R0, CNT	// CNT is not read again in this routine
   324  	MOVWZ	$0x010, R_x010
   325  	MOVWZ	$0x020, R_x020
   326  	MOVWZ	$0x030, R_x030
   327  	MOVD	$0x040, R_x040
   328  	MOVD	$0x050, R_x050
   329  	MOVD	$0x060, R_x060
   330  	MOVD	$0x070, R_x070
   331  	MOVD	$0x080, R_x080
   332  	MOVD	$0x090, R_x090
   333  	MOVD	$0x0a0, R_x0a0
   334  	MOVD	$0x0b0, R_x0b0
   335  	MOVD	$0x0c0, R_x0c0
   336  	MOVD	$0x0d0, R_x0d0
   337  	MOVD	$0x0e0, R_x0e0
   338  	MOVD	$0x0f0, R_x0f0
   339  	MOVD	$0x100, R_x100
   340  	MOVD	$0x110, R_x110
   341  
   342  
   343  #ifdef GOARCH_ppc64le
   344  	// Generate the mask used with VPERM for LE
   345  	MOVWZ	$8, TEMP
   346  	LVSL	(TEMP)(R0), LEMASK	// shift-left control for address 8: bytes 0x08..0x17
   347  	VSPLTISB	$0x0F, KI
   348  	VXOR	KI, LEMASK, LEMASK	// XOR with 0x0F: reverses the bytes within each doubleword
   349  #endif
   350  
   351  	LXVD2X	(CTX)(R_x000), VS32	// v0 = vs32
   352  	LXVD2X	(CTX)(R_x010), VS34	// v2 = vs34
   353  	LXVD2X	(CTX)(R_x020), VS36	// v4 = vs36
   354  
   355  	// unpack the input values into vector registers
   356  	VSLDOI	$8, V0, V0, V1
   357  	LXVD2X	(CTX)(R_x030), VS38	// v6 = vs38
   358  	VSLDOI	$8, V2, V2, V3
   359  	VSLDOI	$8, V4, V4, V5
   360  	VSLDOI	$8, V6, V6, V7
   361  	// One pass of this loop per 128-byte block.
   362  loop:
   363  	MOVD	TBL_STRT, TBL
   364  	LVX	(TBL)(R_x000), KI
   365  
   366  	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
   367  	ADD	$16, INP
   368  
   369  	// Copy V0-V7 to VS24-VS31
   370  
   371  	XXLOR	V0, V0, VS24
   372  	XXLOR	V1, V1, VS25
   373  	XXLOR	V2, V2, VS26
   374  	XXLOR	V3, V3, VS27
   375  	XXLOR	V4, V4, VS28
   376  	XXLOR	V5, V5, VS29
   377  	XXLOR	V6, V6, VS30
   378  	XXLOR	V7, V7, VS31
   379  
   380  	VADDUDM	KI, V7, V7	// h+K[i]
   381  	LVX	(TBL)(R_x010), KI
   382  	// Rounds 0..15. Each LXVD2X brings in two message words; VSLDOI rotates the second into place.
   383  	VPERMLE(V8,V8,LEMASK,V8)
   384  	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
   385  	LXVD2X	(INP)(R_x000), VS42	// load v10 (=vs42) in advance
   386  	VSLDOI	$8, V8, V8, V9
   387  	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
   388  	VPERMLE(V10,V10,LEMASK,V10)
   389  	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
   390  	LXVD2X	(INP)(R_x010), VS44	// load v12 (=vs44) in advance
   391  	VSLDOI	$8, V10, V10, V11
   392  	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
   393  	VPERMLE(V12,V12,LEMASK,V12)
   394  	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
   395  	LXVD2X	(INP)(R_x020), VS46	// load v14 (=vs46) in advance
   396  	VSLDOI	$8, V12, V12, V13
   397  	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
   398  	VPERMLE(V14,V14,LEMASK,V14)
   399  	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
   400  	LXVD2X	(INP)(R_x030), VS48	// load v16 (=vs48) in advance
   401  	VSLDOI	$8, V14, V14, V15
   402  	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
   403  	VPERMLE(V16,V16,LEMASK,V16)
   404  	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
   405  	LXVD2X	(INP)(R_x040), VS50	// load v18 (=vs50) in advance
   406  	VSLDOI	$8, V16, V16, V17
   407  	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
   408  	VPERMLE(V18,V18,LEMASK,V18)
   409  	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
   410  	LXVD2X	(INP)(R_x050), VS52	// load v20 (=vs52) in advance
   411  	VSLDOI	$8, V18, V18, V19
   412  	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
   413  	VPERMLE(V20,V20,LEMASK,V20)
   414  	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
   415  	LXVD2X	(INP)(R_x060), VS54	// load v22 (=vs54) in advance
   416  	VSLDOI	$8, V20, V20, V21
   417  	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
   418  	VPERMLE(V22,V22,LEMASK,V22)
   419  	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
   420  	VSLDOI	$8, V22, V22, V23
   421  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
   422  	// Rounds 16..79: four passes (CTR) over sixteen schedule-updating rounds.
   423  	MOVWZ	$4, TEMP
   424  	MOVWZ	TEMP, CTR
   425  	ADD	$0x120, TBL	// step past the 18 table entries fetched so far
   426  	ADD	$0x70, INP	// with the earlier +16, INP now addresses the next block
   427  
   428  L16_xx:
   429  	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
   430  	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
   431  	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
   432  	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
   433  	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
   434  	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
   435  	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
   436  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
   437  	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
   438  	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
   439  	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
   440  	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
   441  	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
   442  	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
   443  	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
   444  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
   445  	ADD	$0x100, TBL
   446  
   447  	BDNZ	L16_xx
   448  	// Add the state saved in VS24-VS31 at the top of the block to the working variables.
   449  	XXLOR	VS24, VS24, V10
   450  	XXLOR	VS25, VS25, V11
   451  	XXLOR	VS26, VS26, V12
   452  	XXLOR	VS27, VS27, V13
   453  	XXLOR	VS28, VS28, V14
   454  	XXLOR	VS29, VS29, V15
   455  	XXLOR	VS30, VS30, V16
   456  	XXLOR	VS31, VS31, V17
   457  	VADDUDM	V10, V0, V0
   458  	VADDUDM	V11, V1, V1
   459  	VADDUDM	V12, V2, V2
   460  	VADDUDM	V13, V3, V3
   461  	VADDUDM	V14, V4, V4
   462  	VADDUDM	V15, V5, V5
   463  	VADDUDM	V16, V6, V6
   464  	VADDUDM	V17, V7, V7
   465  
   466  	CMPU	INP, END
   467  	BLT	loop
   468  	// KI holds the ·kcon+0x510 entry fetched by the last round; use it to pack each register pair into one vector.
   469  #ifdef GOARCH_ppc64le
   470  	VPERM	V0, V1, KI, V0
   471  	VPERM	V2, V3, KI, V2
   472  	VPERM	V4, V5, KI, V4
   473  	VPERM	V6, V7, KI, V6
   474  #else
   475  	VPERM	V1, V0, KI, V0
   476  	VPERM	V3, V2, KI, V2
   477  	VPERM	V5, V4, KI, V4
   478  	VPERM	V7, V6, KI, V6
   479  #endif
   480  	STXVD2X	VS32, (CTX+R_x000)	// v0 = vs32
   481  	STXVD2X	VS34, (CTX+R_x010)	// v2 = vs34
   482  	STXVD2X	VS36, (CTX+R_x020)	// v4 = vs36
   483  	STXVD2X	VS38, (CTX+R_x030)	// v6 = vs38
   484  
   485  end:
   486  	RET
   487  
   488  

View as plain text