Text file src/crypto/sha512/sha512block_arm64.s

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  // Based on the Linux Kernel with the following comment:
     8  // Algorithm based on https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb87127bcefc17efab757606e1b1e333fd614dd0
     9  // Originally written by Ard Biesheuvel <ard.biesheuvel@linaro.org>
    10  
    11  #include "textflag.h"
    12  
     // SHA512TRANS is the shared front half of every round macro in this file.
     // It writes in0+rc0 (per 64-bit lane) to V5 and then swaps V5's two 64-bit
     // halves, builds V6 and V7 with 8-byte VEXT extracts over the i3/i2 and
     // i2/i1 register pairs, and finally adds V5 into i3.
     // Scratch registers: V5, V6, V7. Modified argument: i3.
     // i0 and i4 are accepted but not referenced by this macro.
     // The last line ends with a continuation backslash, so the line that
     // follows the macro is part of its definition and must stay empty.
     13  #define SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
     14  	VADD	in0.D2, rc0.D2, V5.D2 \
     15  	VEXT	$8, i3.B16, i2.B16, V6.B16 \
     16  	VEXT	$8, V5.B16, V5.B16, V5.B16 \
     17  	VEXT	$8, i2.B16, i1.B16, V7.B16 \
     18  	VADD	V5.D2, i3.D2, i3.D2 \
    19  
     // SHA512ROUND is the full step used while the message registers still
     // need to be rewritten. In order, it:
     //   - loads the next 16 bytes of round constants from (R4) into rc1 and
     //     post-increments R4 (rc1 is not consumed inside this macro);
     //   - runs SHA512TRANS with rc0 and in0;
     //   - reuses V5 (SHA512TRANS is done with it) for an 8-byte extract over
     //     the in4/in3 pair;
     //   - rewrites in0 with SHA512SU0 (from in1) and SHA512SU1 (from V5 and in2);
     //   - updates i3 with SHA512H and SHA512H2, storing i3+i1 into i4 between
     //     the two hash instructions.
     // Implicit operand: R4 (constant cursor). Scratch registers: V5, V6, V7.
     20  #define SHA512ROUND(i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
     21  	VLD1.P	16(R4), [rc1.D2] \
     22  	SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
     23  	VEXT	$8, in4.B16, in3.B16, V5.B16 \
     24  	SHA512SU0	in1.D2, in0.D2 \
     25  	SHA512H	V7.D2, V6, i3 \
     26  	SHA512SU1	V5.D2, in2.D2, in0.D2 \
     27  	VADD	i3.D2, i1.D2, i4.D2 \
     28  	SHA512H2	i0.D2, i1, i3
    29  
     // SHA512ROUND_NO_UPDATE is SHA512ROUND without the in0 rewrite: the
     // SHA512SU0/SHA512SU1 pair and the VEXT that feeds it are omitted, so in0
     // is only read. It still loads the next 16 bytes of round constants from
     // (R4) into rc1 and post-increments R4.
     // Implicit operand: R4. Scratch registers: V5, V6, V7 (via SHA512TRANS).
     30  #define SHA512ROUND_NO_UPDATE(i0, i1, i2, i3, i4, rc0, rc1, in0) \
     31  	VLD1.P	16(R4), [rc1.D2] \
     32  	SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
     33  	SHA512H	V7.D2, V6, i3 \
     34  	VADD	i3.D2, i1.D2, i4.D2 \
     35  	SHA512H2	i0.D2, i1, i3
    36  
     // SHA512ROUND_LAST is the step used once no further round constants are
     // needed: it performs the same SHA512TRANS / SHA512H / VADD / SHA512H2
     // sequence as the other round macros but issues no constant load, so R4 is
     // neither read nor advanced, and in0 is only read.
     // Scratch registers: V5, V6, V7 (via SHA512TRANS).
     37  #define SHA512ROUND_LAST(i0, i1, i2, i3, i4, rc0, in0) \
     38  	SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
     39  	SHA512H	V7.D2, V6, i3 \
     40  	VADD	i3.D2, i1.D2, i4.D2 \
     41  	SHA512H2	i0.D2, i1, i3
    42  
     43  // func blockAsm(dig *digest, p []byte)
     //
     // blockAsm folds p into the 64 bytes of hash state found at dig, consuming
     // 128 bytes of p per loop iteration, and writes the state back once at the end.
     // Each iteration runs 40 round-macro invocations (32 SHA512ROUND,
     // 4 SHA512ROUND_NO_UPDATE, 4 SHA512ROUND_LAST) and reads 640 bytes of round
     // constants (64 up front, then 16 per constant-loading macro).
     //
     // Register use:
     //   R0       dig
     //   R1       read cursor into p (post-incremented by the message loads)
     //   R2       bytes of p still to process
     //   R3       base address of the round constants
     //   R4       round-constant cursor, reset from R3 for every block
     //   V0-V4    working state rotated through the round macros
     //   V5-V7    scratch used inside the macros
     //   V8-V11   running digest
     //   V12-V19  current message block, rewritten in place by SHA512ROUND
     //   V20-V31  round constants
     //
     // NOTE(review): the loop is bottom-tested and only exits when R2 reaches
     // exactly zero, so the caller must pass a length that is a non-zero
     // multiple of 128 - confirm against the Go call site.
     // NOTE(review): the VLD1/VST1 on (R0) assume the eight 64-bit hash words
     // sit at offset 0 of *digest - confirm against the struct definition.
     44  TEXT ·blockAsm(SB),NOSPLIT,$0
     45  	MOVD	dig+0(FP), R0	// R0 = dig
     46  	MOVD	p_base+8(FP), R1	// R1 = &p[0]
     47  	MOVD	p_len+16(FP), R2	// R2 = len(p)
     48  	MOVD	·_K+0(SB), R3	// R3 = 64-bit word stored at ·_K, used below as the constant-table address (presumably _K is a slice, making this its data pointer - confirm)
     49  
     50  	// prefetch the start of the round-constant table well before its first use
     51  	PRFM	(R3), PLDL3KEEP
     52  	// load the 64-byte digest into V8-V11
     53  	VLD1	(R0), [V8.D2, V9.D2, V10.D2, V11.D2]
     54  loop:
     55  	// copy the running digest (V8-V11) into the working state V0-V3
     56  	VMOV	V8.B16, V0.B16
     57  	VMOV	V9.B16, V1.B16
     58  	VMOV	V10.B16, V2.B16
     59  	VMOV	V11.B16, V3.B16
     60  
     61  	// load 128 bytes of message data into V12-V19, advancing R1 past them
     62  	VLD1.P	64(R1), [V12.D2, V13.D2, V14.D2, V15.D2]
     63  	VLD1.P	64(R1), [V16.D2, V17.D2, V18.D2, V19.D2]
     64  
     65  	// reverse the bytes within each 64-bit lane so the message words are read as big-endian values
     66  	VREV64	V12.B16, V12.B16
     67  	VREV64	V13.B16, V13.B16
     68  	VREV64	V14.B16, V14.B16
     69  	VREV64	V15.B16, V15.B16
     70  	VREV64	V16.B16, V16.B16
     71  	VREV64	V17.B16, V17.B16
     72  	VREV64	V18.B16, V18.B16
     73  	VREV64	V19.B16, V19.B16
     74  
     75  	MOVD	R3, R4	// rewind the constant cursor to the start of the table
     76  	// load the first 64 bytes of round constants (four .D2 pairs) into V20-V23;
     // the round macros below fetch the remaining pairs 16 bytes at a time into V24-V31
     77  	VLD1.P	64(R4), [V20.D2, V21.D2, V22.D2, V23.D2]
     78  
     79  	SHA512ROUND(V0, V1, V2, V3, V4, V20, V24, V12, V13, V19, V16, V17)
     80  	SHA512ROUND(V3, V0, V4, V2, V1, V21, V25, V13, V14, V12, V17, V18)
     81  	SHA512ROUND(V2, V3, V1, V4, V0, V22, V26, V14, V15, V13, V18, V19)
     82  	SHA512ROUND(V4, V2, V0, V1, V3, V23, V27, V15, V16, V14, V19, V12)
     83  	SHA512ROUND(V1, V4, V3, V0, V2, V24, V28, V16, V17, V15, V12, V13)
     84  
     85  	SHA512ROUND(V0, V1, V2, V3, V4, V25, V29, V17, V18, V16, V13, V14)
     86  	SHA512ROUND(V3, V0, V4, V2, V1, V26, V30, V18, V19, V17, V14, V15)
     87  	SHA512ROUND(V2, V3, V1, V4, V0, V27, V31, V19, V12, V18, V15, V16)
     88  	SHA512ROUND(V4, V2, V0, V1, V3, V28, V24, V12, V13, V19, V16, V17)
     89  	SHA512ROUND(V1, V4, V3, V0, V2, V29, V25, V13, V14, V12, V17, V18)
     90  
     91  	SHA512ROUND(V0, V1, V2, V3, V4, V30, V26, V14, V15, V13, V18, V19)
     92  	SHA512ROUND(V3, V0, V4, V2, V1, V31, V27, V15, V16, V14, V19, V12)
     93  	SHA512ROUND(V2, V3, V1, V4, V0, V24, V28, V16, V17, V15, V12, V13)
     94  	SHA512ROUND(V4, V2, V0, V1, V3, V25, V29, V17, V18, V16, V13, V14)
     95  	SHA512ROUND(V1, V4, V3, V0, V2, V26, V30, V18, V19, V17, V14, V15)
     96  
     97  	SHA512ROUND(V0, V1, V2, V3, V4, V27, V31, V19, V12, V18, V15, V16)
     98  	SHA512ROUND(V3, V0, V4, V2, V1, V28, V24, V12, V13, V19, V16, V17)
     99  	SHA512ROUND(V2, V3, V1, V4, V0, V29, V25, V13, V14, V12, V17, V18)
    100  	SHA512ROUND(V4, V2, V0, V1, V3, V30, V26, V14, V15, V13, V18, V19)
    101  	SHA512ROUND(V1, V4, V3, V0, V2, V31, V27, V15, V16, V14, V19, V12)
    102  
    103  	SHA512ROUND(V0, V1, V2, V3, V4, V24, V28, V16, V17, V15, V12, V13)
    104  	SHA512ROUND(V3, V0, V4, V2, V1, V25, V29, V17, V18, V16, V13, V14)
    105  	SHA512ROUND(V2, V3, V1, V4, V0, V26, V30, V18, V19, V17, V14, V15)
    106  	SHA512ROUND(V4, V2, V0, V1, V3, V27, V31, V19, V12, V18, V15, V16)
    107  	SHA512ROUND(V1, V4, V3, V0, V2, V28, V24, V12, V13, V19, V16, V17)
    108  
    109  	SHA512ROUND(V0, V1, V2, V3, V4, V29, V25, V13, V14, V12, V17, V18)
    110  	SHA512ROUND(V3, V0, V4, V2, V1, V30, V26, V14, V15, V13, V18, V19)
    111  	SHA512ROUND(V2, V3, V1, V4, V0, V31, V27, V15, V16, V14, V19, V12)
    112  	SHA512ROUND(V4, V2, V0, V1, V3, V24, V28, V16, V17, V15, V12, V13)
    113  	SHA512ROUND(V1, V4, V3, V0, V2, V25, V29, V17, V18, V16, V13, V14)
    114  
    115  	SHA512ROUND(V0, V1, V2, V3, V4, V26, V30, V18, V19, V17, V14, V15)
    116  	SHA512ROUND(V3, V0, V4, V2, V1, V27, V31, V19, V12, V18, V15, V16)
    117  
    118  	SHA512ROUND_NO_UPDATE(V2, V3, V1, V4, V0, V28, V24, V12)
    119  	SHA512ROUND_NO_UPDATE(V4, V2, V0, V1, V3, V29, V25, V13)
    120  	SHA512ROUND_NO_UPDATE(V1, V4, V3, V0, V2, V30, V26, V14)
    121  	SHA512ROUND_NO_UPDATE(V0, V1, V2, V3, V4, V31, V27, V15)
    122  
    123  	SHA512ROUND_LAST(V3, V0, V4, V2, V1, V24, V16)
    124  	SHA512ROUND_LAST(V2, V3, V1, V4, V0, V25, V17)
    125  	SHA512ROUND_LAST(V4, V2, V0, V1, V3, V26, V18)
    126  	SHA512ROUND_LAST(V1, V4, V3, V0, V2, V27, V19)
    127  
    128  	// add this block's working state (V0-V3) into the running digest (V8-V11)
    129  	VADD	V0.D2, V8.D2, V8.D2
    130  	VADD	V1.D2, V9.D2, V9.D2
    131  	VADD	V2.D2, V10.D2, V10.D2
    132  	VADD	V3.D2, V11.D2, V11.D2
    133  	SUB	$128, R2	// one 128-byte block consumed
    134  	CBNZ	R2, loop	// repeat until the remaining length is exactly zero
    135  
    136  	VST1	[V8.D2, V9.D2, V10.D2, V11.D2], (R0)	// write the updated digest back through dig
    137  	RET
   138  

View as plain text