Text file src/crypto/internal/fips140/sha3/sha3_arm64.s

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
     9  // func keccakF1600NEON(a *[200]byte)
        //
        // keccakF1600NEON runs 24 rounds (R2 counts down from 24) of the
        // theta/rho/pi/chi/iota steps over the 200-byte state at a and updates it
        // in place: the state is loaded into V0-V24, transformed, and stored back.
        //
        // The round body uses VEOR3, VRAX1, VXAR and VBCAX, which belong to the
        // Armv8 SHA3 extension.
        // NOTE(review): presumably the Go caller only dispatches here after a CPU
        // feature check - confirm.
        //
        // Register use:
        //   R0       pointer into the state (advanced by the loads/stores, rewound in between)
        //   R1       pointer to the next entry of round_consts, advanced 8 bytes per round
        //   R2       rounds remaining
        //   V0-V24   Vi holds lane i of the state (the 8 bytes at offset 8*i); only the
        //            low 64-bit element is loaded and stored (.D1 arrangement)
        //   V25-V31  scratch
        //
        // Operand order used below (destination last):
        //   VEOR3 Va, Vm, Vn, Vd     Vd = Vn ^ Vm ^ Va
        //   VRAX1 Vm, Vn, Vd         Vd = Vn ^ rotl64(Vm, 1)
        //   VXAR  $n, Vm, Vn, Vd     Vd = rotr64(Vn ^ Vm, n), i.e. rotl64 by 64-n
        //   VBCAX Va, Vm, Vn, Vd     Vd = Vn ^ (Vm &^ Va)
        //
        // Notation in the comments below: a[i] is lane i at the start of the round,
        // C[x] = a[x]^a[x+5]^a[x+10]^a[x+15]^a[x+20], D[x] is the theta correction for
        // column x, and B[i] is lane i after theta, rho and pi (the input to chi).
        //
        // NOTE(review): the frame size is declared as $200 but no stack slot is
        // referenced in this function - confirm whether the frame is still needed.
    10  TEXT ·keccakF1600NEON(SB), $200-8
    11  	MOVD	a+0(FP), R0
    12  	MOVD	$round_consts<>(SB), R1
    13  	MOVD	$24, R2 // counter for loop
    14  
        	// Load the 25 lanes: twelve post-incremented 16-byte loads (two lanes
        	// each) followed by one 8-byte load for lane 24.
    15  	VLD1.P	16(R0), [V0.D1, V1.D1]
    16  	VLD1.P	16(R0), [V2.D1, V3.D1]
    17  	VLD1.P	16(R0), [V4.D1, V5.D1]
    18  	VLD1.P	16(R0), [V6.D1, V7.D1]
    19  	VLD1.P	16(R0), [V8.D1, V9.D1]
    20  	VLD1.P	16(R0), [V10.D1, V11.D1]
    21  	VLD1.P	16(R0), [V12.D1, V13.D1]
    22  	VLD1.P	16(R0), [V14.D1, V15.D1]
    23  	VLD1.P	16(R0), [V16.D1, V17.D1]
    24  	VLD1.P	16(R0), [V18.D1, V19.D1]
    25  	VLD1.P	16(R0), [V20.D1, V21.D1]
    26  	VLD1.P	16(R0), [V22.D1, V23.D1]
    27  	VLD1	(R0), [V24.D1] // lane 24; no post-increment
    28  
    29  	SUB	$192, R0, R0 // rewind R0 to the start of the state (12 increments of 16)
    30  
    31  loop:
    32  	// theta
    33  	VEOR3	 V20.B16, V15.B16, V10.B16, V25.B16 // V25 = a[10]^a[15]^a[20]
    34  	VEOR3	 V21.B16, V16.B16, V11.B16, V26.B16 // V26 = a[11]^a[16]^a[21]
    35  	VEOR3	 V22.B16, V17.B16, V12.B16, V27.B16 // V27 = a[12]^a[17]^a[22]
    36  	VEOR3	 V23.B16, V18.B16, V13.B16, V28.B16 // V28 = a[13]^a[18]^a[23]
    37  	VEOR3	 V24.B16, V19.B16, V14.B16, V29.B16 // V29 = a[14]^a[19]^a[24]
    38  	VEOR3	 V25.B16, V5.B16, V0.B16, V25.B16 // V25 = C[0]
    39  	VEOR3	 V26.B16, V6.B16, V1.B16, V26.B16 // V26 = C[1]
    40  	VEOR3	 V27.B16, V7.B16, V2.B16, V27.B16 // V27 = C[2]
    41  	VEOR3	 V28.B16, V8.B16, V3.B16, V28.B16 // V28 = C[3]
    42  	VEOR3	 V29.B16, V9.B16, V4.B16, V29.B16 // V29 = C[4]
    43  
        	// The last three VRAX1 overwrite a C[x] register in place; each one
        	// still reads a C value that has not been overwritten yet.
    44  	VRAX1	V27.D2, V25.D2, V30.D2 // V30 = D[1] = C[0] ^ rotl(C[2], 1)
    45  	VRAX1	V28.D2, V26.D2, V31.D2 // V31 = D[2] = C[1] ^ rotl(C[3], 1)
    46  	VRAX1	V29.D2, V27.D2, V27.D2 // V27 = D[3] = C[2] ^ rotl(C[4], 1)
    47  	VRAX1	V25.D2, V28.D2, V28.D2 // V28 = D[4] = C[3] ^ rotl(C[0], 1)
    48  	VRAX1	V26.D2, V29.D2, V29.D2 // V29 = D[0] = C[4] ^ rotl(C[1], 1)
    49  
    50  	// theta and rho and Pi
        	// Each VXAR XORs a lane with its D[x], rotates it, and writes the result
        	// to the register of its destination lane. Where that register is still
        	// live, the result is parked in a temporary (V25-V31, V3, V4 or V8) and
        	// picked up from there by the chi step. The D[x] registers V27-V31 are
        	// reused as temporaries on their last use.
    51  	VEOR	V29.B16, V0.B16, V0.B16 // B[0] = a[0] ^ D[0] (no rotation)
    52  
    53  	VXAR	$63, V30.D2, V1.D2, V25.D2 // B[10] (in V25) = rotl(a[1]^D[1], 1)
    54  
    55  	VXAR	$20, V30.D2, V6.D2, V1.D2 // B[1] = rotl(a[6]^D[1], 44)
    56  	VXAR	$44, V28.D2, V9.D2, V6.D2 // B[6] = rotl(a[9]^D[4], 20)
    57  	VXAR	$3, V31.D2, V22.D2, V9.D2 // B[9] = rotl(a[22]^D[2], 61)
    58  	VXAR	$25, V28.D2, V14.D2, V22.D2 // B[22] = rotl(a[14]^D[4], 39)
    59  	VXAR	$46, V29.D2, V20.D2, V14.D2 // B[14] = rotl(a[20]^D[0], 18)
    60  
    61  	VXAR	$2, V31.D2, V2.D2, V26.D2 // B[20] (in V26) = rotl(a[2]^D[2], 62)
    62  
    63  	VXAR	$21, V31.D2, V12.D2, V2.D2 // B[2] = rotl(a[12]^D[2], 43)
    64  	VXAR	$39, V27.D2, V13.D2, V12.D2 // B[12] = rotl(a[13]^D[3], 25)
    65  	VXAR	$56, V28.D2, V19.D2, V13.D2 // B[13] = rotl(a[19]^D[4], 8)
    66  	VXAR	$8, V27.D2, V23.D2, V19.D2 // B[19] = rotl(a[23]^D[3], 56)
    67  	VXAR	$23, V29.D2, V15.D2, V23.D2 // B[23] = rotl(a[15]^D[0], 41)
    68  
    69  	VXAR	$37, V28.D2, V4.D2, V15.D2 // B[15] = rotl(a[4]^D[4], 27)
    70  
    71  	VXAR	$50, V28.D2, V24.D2, V28.D2 // B[4] (in V28) = rotl(a[24]^D[4], 14); last use of D[4]
    72  	VXAR	$62, V30.D2, V21.D2, V24.D2 // B[24] = rotl(a[21]^D[1], 2)
    73  	VXAR	$9, V27.D2, V8.D2, V8.D2 // B[21] (in V8) = rotl(a[8]^D[3], 55)
    74  	VXAR	$19, V30.D2, V16.D2, V4.D2 // B[8] (in V4) = rotl(a[16]^D[1], 45)
    75  	VXAR	$28, V29.D2, V5.D2, V16.D2 // B[16] = rotl(a[5]^D[0], 36)
    76  
    77  	VXAR	$36, V27.D2, V3.D2, V5.D2 // B[5] = rotl(a[3]^D[3], 28)
    78  
    79  	VXAR	$43, V27.D2, V18.D2, V27.D2 // B[3] (in V27) = rotl(a[18]^D[3], 21); last use of D[3]
    80  	VXAR	$49, V31.D2, V17.D2, V3.D2 // B[18] (in V3) = rotl(a[17]^D[2], 15)
    81  	VXAR	$54, V30.D2, V11.D2, V30.D2 // B[17] (in V30) = rotl(a[11]^D[1], 10); last use of D[1]
    82  	VXAR	$58, V31.D2, V7.D2, V31.D2 // B[11] (in V31) = rotl(a[7]^D[2], 6); last use of D[2]
    83  	VXAR	$61, V29.D2, V10.D2, V29.D2 // B[7] (in V29) = rotl(a[10]^D[0], 3); last use of D[0]
    84  
    85  	// chi and iota
        	// For every row of five lanes: new a[i] = B[i] ^ (B[i+2] &^ B[i+1]), with
        	// the indices wrapping inside the row. Outputs land in V0-V24 again.
        	// Row 4: B[20]=V26, B[21]=V8, B[22..24]=V22..V24 -> V20..V24.
    86  	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
    87  	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
    88  	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
    89  	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
    90  	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16
    91  
        	// V26 is free now that row 4 has consumed B[20]: load the next round
        	// constant into both 64-bit elements and advance R1 by 8.
    92  	VLD1R.P	8(R1), [V26.D2]
    93  
        	// Row 3: B[15]=V15, B[16]=V16, B[17]=V30, B[18]=V3, B[19]=V19 -> V15..V19.
    94  	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
    95  	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
    96  	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
    97  	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
    98  	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16
    99  
        	// Row 2: B[10]=V25, B[11]=V31, B[12..14]=V12..V14 -> V10..V14.
   100  	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
   101  	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
   102  	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
   103  	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
   104  	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16
   105  
        	// Row 1: B[5]=V5, B[6]=V6, B[7]=V29, B[8]=V4, B[9]=V9 -> V5..V9.
   106  	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
   107  	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
   108  	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
   109  	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
   110  	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16
   111  
        	// Row 0: B[0..2]=V0..V2, B[3]=V27, B[4]=V28 -> V0..V4.
   112  	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
   113  	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
   114  
   115  	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16  // iota (chi part)
   116  
   117  	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
   118  	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16
   119  
   120  	VEOR	V26.B16, V0.B16, V0.B16 // iota: XOR this round's constant (V26) into lane 0
   121  
   122  	SUB		$1, R2, R2 // one round done
   123  	CBNZ	R2, loop // repeat until the counter reaches zero
   124  
        	// Store the 25 lanes back over the input, mirroring the load sequence.
   125  	VST1.P	[V0.D1, V1.D1], 16(R0)
   126  	VST1.P	[V2.D1, V3.D1], 16(R0)
   127  	VST1.P	[V4.D1, V5.D1], 16(R0)
   128  	VST1.P	[V6.D1, V7.D1], 16(R0)
   129  	VST1.P	[V8.D1, V9.D1], 16(R0)
   130  	VST1.P	[V10.D1, V11.D1], 16(R0)
   131  	VST1.P	[V12.D1, V13.D1], 16(R0)
   132  	VST1.P	[V14.D1, V15.D1], 16(R0)
   133  	VST1.P	[V16.D1, V17.D1], 16(R0)
   134  	VST1.P	[V18.D1, V19.D1], 16(R0)
   135  	VST1.P	[V20.D1, V21.D1], 16(R0)
   136  	VST1.P	[V22.D1, V23.D1], 16(R0)
   137  	VST1	[V24.D1], (R0) // lane 24
   138  
   139  	RET
   140  
   141  DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001 // round 0; one 8-byte constant per round, read in order via R1
   142  DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082 // round 1
   143  DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a // round 2
   144  DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000 // round 3
   145  DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b // round 4
   146  DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001 // round 5
   147  DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081 // round 6
   148  DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009 // round 7
   149  DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a // round 8
   150  DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088 // round 9
   151  DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009 // round 10
   152  DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a // round 11
   153  DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b // round 12
   154  DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b // round 13
   155  DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089 // round 14
   156  DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003 // round 15
   157  DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002 // round 16
   158  DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080 // round 17
   159  DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a // round 18
   160  DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a // round 19
   161  DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081 // round 20
   162  DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080 // round 21
   163  DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001 // round 22
   164  DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008 // round 23
   165  GLOBL	round_consts<>(SB), NOPTR|RODATA, $192 // 24 entries x 8 bytes; file-local, read-only, holds no pointers
   166  

View as plain text