// Copyright 2025 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( "fmt" "os" "simd" "unsafe" ) func load(s []float64) simd.Float64x4 { return simd.LoadFloat64x4((*[4]float64)(s[:4])) } type S1 = simd.Float64x4 type S2 simd.Float64x4 func (s S2) Len() int { return simd.Float64x4(s).Len() } func (s S2) Load(a []float64) S2 { return S2(load(a)) } func (s S2) Store(a *[4]float64) { simd.Float64x4(s).Store(a) } func (s S2) Add(a S2) S2 { return S2(simd.Float64x4(s).Add(simd.Float64x4(a))) } func (s S2) Mul(a S2) S2 { return S2(simd.Float64x4(s).Mul(simd.Float64x4(a))) } type S3 struct { simd.Float64x4 } func ip64_0(a, b []float64) float64 { s := 0.0 for i := range a { s += a[i] * b[i] } return s } func ip64_1(a, b []float64) float64 { var z S1 sum := z var i int stride := z.Len() for ; i <= len(a)-stride; i += stride { va := load(a[i:]) vb := load(b[i:]) sum = sum.Add(va.Mul(vb)) } var tmp [4]float64 sum.Store(&tmp) return tmp[0] + tmp[1] + tmp[2] + tmp[3] } func ip64_1a(a, b []float64) float64 { var z S1 sum := z var i int stride := z.Len() for ; i <= len(a)-stride; i += stride { va := load(a[i:]) vb := load(b[i:]) sum = FMA(sum, va, vb) } var tmp [4]float64 sum.Store(&tmp) return tmp[0] + tmp[1] + tmp[2] + tmp[3] } //go:noinline func FMA(a, b, c simd.Float64x4) simd.Float64x4 { return a.Add(b.Mul(c)) } func ip64_2(a, b []float64) float64 { var z S2 sum := z var i int stride := z.Len() for ; i <= len(a)-stride; i += stride { va := z.Load(a[i:]) vb := z.Load(b[i:]) sum = sum.Add(va.Mul(vb)) } var tmp [4]float64 sum.Store(&tmp) return tmp[0] + tmp[1] + tmp[2] + tmp[3] } func ip64_3(a, b []float64) float64 { var z S3 sum := z var i int stride := z.Len() for ; i <= len(a)-stride; i += stride { va := load(a[i:]) vb := load(b[i:]) sum = S3{sum.Add(va.Mul(vb))} } var tmp [4]float64 sum.Store(&tmp) return tmp[0] + tmp[1] + tmp[2] + tmp[3] } func main() { a := []float64{1, 2, 3, 4, 5, 6, 7, 8} ip0 := ip64_0(a, a) ip1 := ip64_1(a, a) ip1a := ip64_1a(a, a) ip2 := ip64_2(a, a) ip3 := ip64_3(a, a) fmt.Printf("Test IP = %f\n", ip0) fmt.Printf("SIMD IP 1 = %f\n", ip1) fmt.Printf("SIMD IP 1a = %f\n", ip1a) fmt.Printf("SIMD IP 2 = %f\n", ip2) fmt.Printf("SIMD IP 3 = %f\n", ip3) var z1 S1 var z2 S2 var z3 S2 s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3) fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3) fail := false if s1 != 32 || s2 != 32 || s3 != 32 { fmt.Println("Failed a sizeof check, should all be 32") fail = true } if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 { fmt.Println("Failed an inner product check, should all be", ip0) fail = true } if fail { os.Exit(1) } }