// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// func keccakF1600NEON(a *[200]byte)
//
// Runs the 24-round Keccak-f[1600] permutation in place on the 25 64-bit
// lanes stored at a, using the VEOR3, VRAX1, VXAR and VBCAX instructions.
//
// Register roles:
//   R0       pointer to the state (walked by the post-indexed loads and
//            stores, rewound in between)
//   R1       cursor into round_consts<>, advanced by 8 bytes per round
//   R2       number of rounds still to run (starts at 24)
//   V0-V24   state lanes; lane i lives in the low 64 bits of Vi at the top
//            and at the bottom of every round
//   V25-V31  scratch (column parities, theta deltas, permuted lanes and the
//            round constant)
//
// NOTE(review): nothing below addresses the 200-byte local frame declared
// in the TEXT directive; it looks unused - confirm before changing it.
// NOTE(review): presumably callers check for CPU support of these
// instructions before calling; that check is not visible in this file.
TEXT ·keccakF1600NEON(SB), $200-8
	MOVD	a+0(FP), R0
	MOVD	$round_consts<>(SB), R1
	MOVD	$24, R2 // counter for loop

	// Load the 25 lanes, two per instruction, advancing R0 by 16 bytes each
	// time. The final lane is loaded without post-increment.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Twelve post-indexed loads moved R0 forward by 12*16 = 192 bytes;
	// rewind it so the stores after the loop start at lane 0 again.
	SUB	$192, R0, R0

loop:
	// theta
	// Column parities: V25..V29 = C[0]..C[4], where C[x] is the XOR of the
	// five lanes x, x+5, x+10, x+15 and x+20. Each parity is built in two
	// three-way XORs.
	VEOR3	V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	V29.B16, V9.B16, V4.B16, V29.B16

	// Theta deltas D[x] = C[x-1] ^ rotl(C[x+1], 1). The last three results
	// overwrite parities that are no longer needed, leaving:
	//   V30 = D[1], V31 = D[2], V27 = D[3], V28 = D[4], V29 = D[0]
	VRAX1	V27.D2, V25.D2, V30.D2
	VRAX1	V28.D2, V26.D2, V31.D2
	VRAX1	V29.D2, V27.D2, V27.D2
	VRAX1	V25.D2, V28.D2, V28.D2
	VRAX1	V26.D2, V29.D2, V29.D2

	// theta and rho and Pi
	// Lane 0 has rotation 0 and does not move, so it only needs the XOR.
	VEOR	V29.B16, V0.B16, V0.B16

	// Every other lane is XORed with its column delta, rotated and written
	// to its pi destination by a single VXAR. The immediate is a rotate
	// right, i.e. 64 minus the rho left-rotation. The order follows the pi
	// cycle so that each destination register has already been consumed;
	// where that is not possible the result is parked in a scratch register
	// (or in a state register whose old value is already used).
	VXAR	$63, V30.D2, V1.D2, V25.D2 // new lane 10, parked in V25

	VXAR	$20, V30.D2, V6.D2, V1.D2
	VXAR	$44, V28.D2, V9.D2, V6.D2
	VXAR	$3, V31.D2, V22.D2, V9.D2
	VXAR	$25, V28.D2, V14.D2, V22.D2
	VXAR	$46, V29.D2, V20.D2, V14.D2

	VXAR	$2, V31.D2, V2.D2, V26.D2 // new lane 20, parked in V26

	VXAR	$21, V31.D2, V12.D2, V2.D2
	VXAR	$39, V27.D2, V13.D2, V12.D2
	VXAR	$56, V28.D2, V19.D2, V13.D2
	VXAR	$8, V27.D2, V23.D2, V19.D2
	VXAR	$23, V29.D2, V15.D2, V23.D2
	VXAR	$37, V28.D2, V4.D2, V15.D2

	VXAR	$50, V28.D2, V24.D2, V28.D2 // new lane 4, parked in V28 (last use of D[4])

	VXAR	$62, V30.D2, V21.D2, V24.D2
	VXAR	$9, V27.D2, V8.D2, V8.D2 // new lane 21, parked in V8
	VXAR	$19, V30.D2, V16.D2, V4.D2 // new lane 8, parked in V4
	VXAR	$28, V29.D2, V5.D2, V16.D2
	VXAR	$36, V27.D2, V3.D2, V5.D2

	VXAR	$43, V27.D2, V18.D2, V27.D2 // new lane 3, parked in V27 (last use of D[3])

	VXAR	$49, V31.D2, V17.D2, V3.D2 // new lane 18, parked in V3
	VXAR	$54, V30.D2, V11.D2, V30.D2 // new lane 17, parked in V30 (last use of D[1])
	VXAR	$58, V31.D2, V7.D2, V31.D2 // new lane 11, parked in V31 (last use of D[2])
	VXAR	$61, V29.D2, V10.D2, V29.D2 // new lane 7, parked in V29 (last use of D[0])

	// chi and iota
	// At this point permuted lane i is in Vi, except for:
	//   3 -> V27, 4 -> V28, 7 -> V29, 8 -> V4, 10 -> V25, 11 -> V31,
	//   17 -> V30, 18 -> V3, 20 -> V26, 21 -> V8
	// Each VBCAX produces one output lane, B[x] ^ (B[x+2] &^ B[x+1]) within
	// a row of five, and writes it to its home register. Within each row,
	// outputs that land in a register still needed as an input are
	// computed last.

	// Row 4 (lanes 20-24).
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free now: fetch this round's constant (replicated into both
	// 64-bit lanes) and step R1 to the next table entry.
	VLD1R.P	8(R1), [V26.D2]

	// Row 3 (lanes 15-19).
	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	// Row 2 (lanes 10-14).
	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	// Row 1 (lanes 5-9).
	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	// Row 0 (lanes 0-4).
	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16

	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)

	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	VEOR	V26.B16, V0.B16, V0.B16 // iota

	// One round done; repeat until the counter reaches zero.
	SUB	$1, R2, R2
	CBNZ	R2, loop

	// Write the low 64 bits of V0..V24 back over the caller's state, in the
	// same two-lanes-per-instruction layout used by the loads.
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)

	RET

// Iota round constants, one 64-bit word per round, read sequentially through
// R1 by the loop above (24 entries * 8 bytes = 192 bytes).
DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001
DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082
DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a
DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000
DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b
DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001
DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081
DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009
DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a
DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088
DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009
DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a
DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b
DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b
DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089
DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003
DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002
DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080
DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a
DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a
DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081
DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080
DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001
DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008
GLOBL	round_consts<>(SB), NOPTR|RODATA, $192