// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32

DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32

// QR is the ChaCha8 quarter-round on a, b, c, and d.
// VROTRW rotates right, so the amounts $16, $20, $24, and $25
// implement ChaCha's left rotations by 16, 12, 8, and 7.
#define QR(a, b, c, d) \
	VADDW	a, b, a; \
	VXORV	d, a, d; \
	VROTRW	$16, d; \
	VADDW	c, d, c; \
	VXORV	b, c, b; \
	VROTRW	$20, b; \
	VADDW	a, b, a; \
	VXORV	d, a, d; \
	VROTRW	$24, d; \
	VADDW	c, d, c; \
	VXORV	b, c, b; \
	VROTRW	$25, b

// block computes 4 ChaCha8 blocks in parallel using 128-bit LSX
// vectors: register Vn holds state word n of all four blocks, one
// block per 32-bit lane.
//
// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
TEXT ·block(SB), NOSPLIT, $0
	// seed in R4
	// blocks in R5
	// counter in R6

	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
	BNE	R7, lsx_chacha8
	JMP	·block_generic(SB)
	RET

lsx_chacha8:
	MOVV	$·chachaConst(SB), R10
	MOVV	$·chachaIncRot(SB), R11

	// load constants
	// VLDREPL.W	$0, R10, V0
	WORD	$0x30200140
	// VLDREPL.W	$1, R10, V1
	WORD	$0x30200541
	// VLDREPL.W	$2, R10, V2
	WORD	$0x30200942
	// VLDREPL.W	$3, R10, V3
	WORD	$0x30200d43

	// load the four 32-bit increments (0, 1, 2, 3) from chachaIncRot;
	// they are added to the counter below so that each of the four
	// blocks gets its own counter value
	VMOVQ	(R11), V30

	// load seed
	// VLDREPL.W	$0, R4, V4
	WORD	$0x30200084
	// VLDREPL.W	$1, R4, V5
	WORD	$0x30200485
	// VLDREPL.W	$2, R4, V6
	WORD	$0x30200886
	// VLDREPL.W	$3, R4, V7
	WORD	$0x30200c87
	// VLDREPL.W	$4, R4, V8
	WORD	$0x30201088
	// VLDREPL.W	$5, R4, V9
	WORD	$0x30201489
	// VLDREPL.W	$6, R4, V10
	WORD	$0x3020188a
	// VLDREPL.W	$7, R4, V11
	WORD	$0x30201c8b

	// broadcast counter to all four lanes and add the per-block increments
	VMOVQ	R6, V12.W4
	VADDW	V12, V30, V12

	// zeros for the remaining three matrix entries
	VXORV	V13, V13, V13
	VXORV	V14, V14, V14
	VXORV	V15, V15, V15

	// save seed state for adding back later
	// (VORV with the all-zero V13 copies a vector register)
	VORV	V4, V13, V20
	VORV	V5, V13, V21
	VORV	V6, V13, V22
	VORV	V7, V13, V23
	VORV	V8, V13, V24
	VORV	V9, V13, V25
	VORV	V10, V13, V26
	VORV	V11, V13, V27

	// 4 iterations. Each iteration is a double round of 8 quarter-rounds
	// (4 on the columns, then 4 on the diagonals), giving ChaCha8's
	// 8 rounds in total.
	MOVV	$4, R7

loop:
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUBV	$1, R7
	BNE	R7, R0, loop

	// add the seed back into the key words; the constant and counter
	// rows are not added back
	VADDW	V4, V20, V4
	VADDW	V5, V21, V5
	VADDW	V6, V22, V6
	VADDW	V7, V23, V7
	VADDW	V8, V24, V8
	VADDW	V9, V25, V9
	VADDW	V10, V26, V10
	VADDW	V11, V27, V11

	// store blocks back to output buffer
	VMOVQ	V0, (R5)
	VMOVQ	V1, 16(R5)
	VMOVQ	V2, 32(R5)
	VMOVQ	V3, 48(R5)
	VMOVQ	V4, 64(R5)
	VMOVQ	V5, 80(R5)
	VMOVQ	V6, 96(R5)
	VMOVQ	V7, 112(R5)
	VMOVQ	V8, 128(R5)
	VMOVQ	V9, 144(R5)
	VMOVQ	V10, 160(R5)
	VMOVQ	V11, 176(R5)
	VMOVQ	V12, 192(R5)
	VMOVQ	V13, 208(R5)
	VMOVQ	V14, 224(R5)
	VMOVQ	V15, 240(R5)

	RET