Source file src/simd/archsimd/internal/simd_test/simd_test.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package simd_test
     8  
     9  import (
    10  	"fmt"
    11  	"os"
    12  	"reflect"
    13  	"simd/archsimd"
    14  	"slices"
    15  	"testing"
    16  	"unsafe"
    17  )
    18  
    19  func TestMain(m *testing.M) {
    20  	if !archsimd.X86.AVX() {
    21  		fmt.Fprintln(os.Stderr, "Skipping tests: AVX is not available")
    22  		os.Exit(0)
    23  	}
    24  	os.Exit(m.Run())
    25  }
    26  
    27  var sink any
    28  
    29  func TestType(t *testing.T) {
    30  	// Testing:
    31  	// - Defined as another struct's field is ok
    32  	// - Pointer is ok
    33  	// - Type definition is ok
    34  	// - Type alias is ok
    35  	// - Type conversion is ok
    36  	// - Conversion to interface is ok
    37  	type alias = archsimd.Int32x4
    38  	type maskT archsimd.Mask32x4
    39  	type myStruct struct {
    40  		x alias
    41  		y *archsimd.Int32x4
    42  		z maskT
    43  	}
    44  	vals := [4]int32{1, 2, 3, 4}
    45  	v := myStruct{x: archsimd.LoadInt32x4(&vals)}
    46  	// the mask below keeps elements 0 and 1; the rest are zeroed.
    47  	want := []int32{2, 4, 0, 0}
    48  	y := archsimd.LoadInt32x4(&vals)
    49  	v.y = &y
    50  	sink = y
    51  
    52  	if !archsimd.X86.AVX512GFNI() {
    53  		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
    54  		return
    55  	}
    56  	v.z = maskT(archsimd.Mask32x4FromBits(0b0011))
    57  	*v.y = v.y.Add(v.x).Masked(archsimd.Mask32x4(v.z))
    58  
    59  	got := [4]int32{}
    60  	v.y.Store(&got)
    61  	checkSlices(t, got[:], want)
    62  }
    63  
    64  func TestUncomparable(t *testing.T) {
    65  	// Test that simd vectors are not comparable
    66  	var x, y any = archsimd.LoadUint32x4(&[4]uint32{1, 2, 3, 4}), archsimd.LoadUint32x4(&[4]uint32{5, 6, 7, 8})
    67  	shouldPanic := func(fn func()) {
    68  		defer func() {
    69  			if recover() == nil {
    70  				panic("did not panic")
    71  			}
    72  		}()
    73  		fn()
    74  	}
    75  	shouldPanic(func() { _ = x == y })
    76  }
    77  
    78  func TestFuncValue(t *testing.T) {
    79  	// Test that a simd intrinsic can be used as a function value.
    80  	xv := [4]int32{1, 2, 3, 4}
    81  	yv := [4]int32{5, 6, 7, 8}
    82  	want := []int32{6, 8, 10, 12}
    83  	x := archsimd.LoadInt32x4(&xv)
    84  	y := archsimd.LoadInt32x4(&yv)
    85  	fn := archsimd.Int32x4.Add
    86  	sink = fn
    87  	x = fn(x, y)
    88  	got := [4]int32{}
    89  	x.Store(&got)
    90  	checkSlices(t, got[:], want)
    91  }
    92  
    93  func TestReflectMethod(t *testing.T) {
    94  	// Test that a simd intrinsic can be accessed via reflection.
    95  	// NOTE: we don't yet support reflect method.Call.
    96  	xv := [4]int32{1, 2, 3, 4}
    97  	yv := [4]int32{5, 6, 7, 8}
    98  	want := []int32{6, 8, 10, 12}
    99  	x := archsimd.LoadInt32x4(&xv)
   100  	y := archsimd.LoadInt32x4(&yv)
   101  	m, ok := reflect.TypeOf(x).MethodByName("Add")
   102  	if !ok {
   103  		t.Fatal("Add method not found")
   104  	}
   105  	fn := m.Func.Interface().(func(x, y archsimd.Int32x4) archsimd.Int32x4)
   106  	x = fn(x, y)
   107  	got := [4]int32{}
   108  	x.Store(&got)
   109  	checkSlices(t, got[:], want)
   110  }
   111  
   112  func TestVectorConversion(t *testing.T) {
   113  	if !archsimd.X86.AVX512GFNI() {
   114  		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
   115  		return
   116  	}
   117  	xv := [4]int32{1, 2, 3, 4}
   118  	x := archsimd.LoadInt32x4(&xv)
   119  	xPromoted := x.AsInt64x2()
   120  	xPromotedDemoted := xPromoted.AsInt32x4()
   121  	got := [4]int32{}
   122  	xPromotedDemoted.Store(&got)
   123  	for i := range 4 {
   124  		if xv[i] != got[i] {
   125  			t.Errorf("Result at %d incorrect: want %d, got %d", i, xv[i], got[i])
   126  		}
   127  	}
   128  }
   129  
   130  func TestMaskConversion(t *testing.T) {
   131  	if !archsimd.X86.AVX512GFNI() {
   132  		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
   133  		return
   134  	}
   135  	x := archsimd.LoadInt32x4Slice([]int32{5, 0, 7, 0})
   136  	mask := archsimd.Int32x4{}.Sub(x).ToMask()
   137  	y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4}).Add(x).Masked(mask)
   138  	want := [4]int32{6, 0, 10, 0}
   139  	got := make([]int32, 4)
   140  	y.StoreSlice(got)
   141  	checkSlices(t, got[:], want[:])
   142  }
   143  
   144  func TestPermute(t *testing.T) {
   145  	if !archsimd.X86.AVX512() {
   146  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   147  		return
   148  	}
   149  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
   150  	indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
   151  	want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
   152  	got := make([]int64, 8)
   153  	archsimd.LoadInt64x8Slice(x).Permute(archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
   154  	checkSlices(t, got, want)
   155  }
   156  
   157  func TestPermuteOrZero(t *testing.T) {
   158  	x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
   159  	indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
   160  	want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
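        	// Negative indices produce zero in the result, hence the zeros in want.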
   161  	got := make([]uint8, len(x))
   162  	archsimd.LoadUint8x16Slice(x).PermuteOrZero(archsimd.LoadInt8x16Slice(indices)).StoreSlice(got)
   163  	checkSlices(t, got, want)
   164  }
   165  
   166  func TestConcatPermute(t *testing.T) {
   167  	if !archsimd.X86.AVX512() {
   168  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   169  		return
   170  	}
   171  	x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
   172  	y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
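        	// Indices 0..7 select from x; indices 8..15 select the corresponding element of y.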
   173  	indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
   174  	want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
   175  	got := make([]int64, 8)
   176  	archsimd.LoadInt64x8Slice(x).ConcatPermute(archsimd.LoadInt64x8Slice(y), archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
   177  	checkSlices(t, got, want)
   178  }
   179  
   180  func TestCompress(t *testing.T) {
   181  	if !archsimd.X86.AVX512() {
   182  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   183  		return
   184  	}
   185  	v1234 := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
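        	// Compress packs the mask-selected elements (bits 1 and 3, values 2 and 4)
        	// into the low lanes and zeroes the rest.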
   186  	v2400 := v1234.Compress(archsimd.Mask32x4FromBits(0b1010))
   187  	got := make([]int32, 4)
   188  	v2400.StoreSlice(got)
   189  	want := []int32{2, 4, 0, 0}
   190  	if !slices.Equal(got, want) {
   191  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
   192  	}
   193  }
   194  
   195  func TestExpand(t *testing.T) {
   196  	if !archsimd.X86.AVX512() {
   197  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   198  		return
   199  	}
   200  	v3400 := archsimd.LoadInt32x4Slice([]int32{3, 4, 0, 0})
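        	// Expand is the inverse of Compress: the low elements 3 and 4 are scattered
        	// into the mask-selected lanes 1 and 3, and the other lanes are zeroed.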
   201  	v0304 := v3400.Expand(archsimd.Mask32x4FromBits(0b1010))
   202  	got := make([]int32, 4)
   203  	v0304.StoreSlice(got)
   204  	want := []int32{0, 3, 0, 4}
   205  	if !slices.Equal(got, want) {
   206  		t.Errorf("want and got differ, want=%v, got=%v", want, got)
   207  	}
   208  }
   209  
   210  var testShiftAllVal uint64 = 3
   211  
   212  func TestShiftAll(t *testing.T) {
   213  	got := make([]int32, 4)
   214  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(2).StoreSlice(got)
   215  	for _, v := range got {
   216  		if v != 0b1100 {
   217  			t.Errorf("expect 0b1100, got %b", v)
   218  		}
   219  	}
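        	// Shifting by a package-level variable exercises the non-constant shift amount path.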
   220  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(testShiftAllVal).StoreSlice(got)
   221  	for _, v := range got {
   222  		if v != 0b11000 {
   223  			t.Errorf("expect 0b11000, got %b", v)
   224  		}
   225  	}
   226  }
   227  
   228  func TestSlicesInt8(t *testing.T) {
   229  	if !archsimd.X86.AVX2() {
   230  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   231  		return
   232  	}
   233  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   234  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   235  	v := archsimd.LoadInt8x32Slice(a)
   236  	b := make([]int8, 32, 32)
   237  	v.StoreSlice(b)
   238  	checkSlices(t, a, b)
   239  }
   240  
   241  func TestSlicesInt8SetElem(t *testing.T) {
   242  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   243  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   244  	v := archsimd.LoadInt8x16Slice(a)
   245  
   246  	v = v.SetElem(3, 13)
   247  	a[3] = 13
   248  
   249  	b := make([]int8, 16, 16)
   250  	v.StoreSlice(b)
   251  	checkSlices(t, a, b)
   252  }
   253  
   254  func TestSlicesInt8GetElem(t *testing.T) {
   255  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   256  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   257  	v := archsimd.LoadInt8x16Slice(a)
   258  	e := v.GetElem(2)
   259  	if e != a[2] {
   260  		t.Errorf("GetElem(2) = %d != a[2] = %d", e, a[2])
   261  	}
   262  
   263  }
   264  
   265  func TestSlicesInt8TooShortLoad(t *testing.T) {
   266  	if !archsimd.X86.AVX2() {
   267  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   268  		return
   269  	}
   270  	defer func() {
   271  		if r := recover(); r != nil {
   272  			t.Logf("Saw EXPECTED panic %v", r)
   273  		} else {
   274  			t.Errorf("Did not see expected panic")
   275  		}
   276  	}()
   277  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   278  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic
   279  	v := archsimd.LoadInt8x32Slice(a)
   280  	b := make([]int8, 32, 32)
   281  	v.StoreSlice(b)
   282  	checkSlices(t, a, b)
   283  }
   284  
   285  func TestSlicesInt8TooShortStore(t *testing.T) {
   286  	if !archsimd.X86.AVX2() {
   287  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   288  		return
   289  	}
   290  	defer func() {
   291  		if r := recover(); r != nil {
   292  			t.Logf("Saw EXPECTED panic %v", r)
   293  		} else {
   294  			t.Errorf("Did not see expected panic")
   295  		}
   296  	}()
   297  	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
   298  		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
   299  	v := archsimd.LoadInt8x32Slice(a)
   300  	b := make([]int8, 31) // TOO SHORT, should panic
   301  	v.StoreSlice(b)
   302  	checkSlices(t, a, b)
   303  }
   304  
   305  func TestSlicesFloat64(t *testing.T) {
   306  	a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine
   307  	v := archsimd.LoadFloat64x4Slice(a)
   308  	b := make([]float64, 4, 4)
   309  	v.StoreSlice(b)
   310  	for i := range b {
   311  		if a[i] != b[i] {
   312  			t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
   313  		}
   314  	}
   315  }
   316  
   317  // TODO: try to make this test smaller.
   318  func TestMergeLocals(t *testing.T) {
   319  	if !archsimd.X86.AVX2() {
   320  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   321  		return
   322  	}
   323  	testMergeLocalsWrapper(t, archsimd.Int64x4.Add)
   324  }
   325  
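        // forceSpill is never inlined, so calling it forces SIMD values that are live
        // across the call to be spilled to the stack and reloaded.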
   326  //go:noinline
   327  func forceSpill() {}
   328  
   329  func testMergeLocalsWrapper(t *testing.T, op func(archsimd.Int64x4, archsimd.Int64x4) archsimd.Int64x4) {
   330  	t.Helper()
   331  	s0 := []int64{0, 1, 2, 3}
   332  	s1 := []int64{-1, 0, -1, 0}
   333  	want := []int64{-1, 1, 1, 3}
   334  	v := archsimd.LoadInt64x4Slice(s0)
   335  	m := archsimd.LoadInt64x4Slice(s1)
   336  	forceSpill()
   337  	got := make([]int64, 4)
   338  	gotv := op(v, m)
   339  	gotv.StoreSlice(got)
   340  	for i := range len(want) {
   341  		if !(got[i] == want[i]) {
   342  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
   343  		}
   344  	}
   345  }
   346  
   347  func TestBitMaskFromBits(t *testing.T) {
   348  	if !archsimd.X86.AVX512() {
   349  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   350  		return
   351  	}
   352  	results := [2]int64{}
   353  	want := [2]int64{0, 6}
   354  	m := archsimd.Mask64x2FromBits(0b10)
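        	// Only bit 1 of the mask is set, so only element 1 of the sum (2+4=6) survives;
        	// element 0 is zeroed.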
   355  	archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
   356  	for i := range 2 {
   357  		if results[i] != want[i] {
   358  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   359  		}
   360  	}
   361  }
   362  
   363  var maskForTestBitMaskFromBitsLoad = uint8(0b10)
   364  
   365  func TestBitMaskFromBitsLoad(t *testing.T) {
   366  	if !archsimd.X86.AVX512() {
   367  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   368  		return
   369  	}
   370  	results := [2]int64{}
   371  	want := [2]int64{0, 6}
   372  	m := archsimd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
   373  	archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
   374  	for i := range 2 {
   375  		if results[i] != want[i] {
   376  			t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
   377  		}
   378  	}
   379  }
   380  
   381  func TestBitMaskToBits(t *testing.T) {
   382  	int8s := []int8{
   383  		0, 1, 1, 0, 0, 1, 0, 1,
   384  		1, 0, 1, 1, 0, 0, 1, 0,
   385  		1, 0, 0, 1, 1, 0, 1, 0,
   386  		0, 1, 1, 0, 0, 1, 0, 1,
   387  		1, 0, 0, 1, 0, 1, 1, 0,
   388  		0, 1, 0, 1, 1, 0, 0, 1,
   389  		1, 0, 1, 0, 0, 1, 1, 0,
   390  		0, 1, 1, 0, 1, 0, 0, 1,
   391  	}
   392  	int16s := make([]int16, 32)
   393  	for i := range int16s {
   394  		int16s[i] = int16(int8s[i])
   395  	}
   396  	int32s := make([]int32, 16)
   397  	for i := range int32s {
   398  		int32s[i] = int32(int8s[i])
   399  	}
   400  	int64s := make([]int64, 8)
   401  	for i := range int64s {
   402  		int64s[i] = int64(int8s[i])
   403  	}
   404  	want64 := uint64(0)
   405  	for i := range int8s {
   406  		want64 |= uint64(int8s[i]) << i
   407  	}
   408  	want32 := uint32(want64)
   409  	want16 := uint16(want64)
   410  	want8 := uint8(want64)
   411  	want4 := want8 & 0b1111
   412  	want2 := want4 & 0b11
   413  
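        	// Each ToMask().ToBits() result below should equal the low bits of the
        	// pattern, truncated to the vector's lane count.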
   414  	if v := archsimd.LoadInt8x16Slice(int8s[:16]).ToMask().ToBits(); v != want16 {
   415  		t.Errorf("want %b, got %b", want16, v)
   416  	}
   417  	if v := archsimd.LoadInt32x4Slice(int32s[:4]).ToMask().ToBits(); v != want4 {
   418  		t.Errorf("want %b, got %b", want4, v)
   419  	}
   420  	if v := archsimd.LoadInt32x8Slice(int32s[:8]).ToMask().ToBits(); v != want8 {
   421  		t.Errorf("want %b, got %b", want8, v)
   422  	}
   423  	if v := archsimd.LoadInt64x2Slice(int64s[:2]).ToMask().ToBits(); v != want2 {
   424  		t.Errorf("want %b, got %b", want2, v)
   425  	}
   426  	if v := archsimd.LoadInt64x4Slice(int64s[:4]).ToMask().ToBits(); v != want4 {
   427  		t.Errorf("want %b, got %b", want4, v)
   428  	}
   429  
   430  	if archsimd.X86.AVX2() {
   431  		if v := archsimd.LoadInt8x32Slice(int8s[:32]).ToMask().ToBits(); v != want32 {
   432  			t.Errorf("want %b, got %b", want32, v)
   433  		}
   434  	}
   435  
   436  	if archsimd.X86.AVX512() {
   437  		if v := archsimd.LoadInt8x64Slice(int8s).ToMask().ToBits(); v != want64 {
   438  			t.Errorf("want %b, got %b", want64, v)
   439  		}
   440  		if v := archsimd.LoadInt16x8Slice(int16s[:8]).ToMask().ToBits(); v != want8 {
   441  			t.Errorf("want %b, got %b", want8, v)
   442  		}
   443  		if v := archsimd.LoadInt16x16Slice(int16s[:16]).ToMask().ToBits(); v != want16 {
   444  			t.Errorf("want %b, got %b", want16, v)
   445  		}
   446  		if v := archsimd.LoadInt16x32Slice(int16s).ToMask().ToBits(); v != want32 {
   447  			t.Errorf("want %b, got %b", want32, v)
   448  		}
   449  		if v := archsimd.LoadInt32x16Slice(int32s).ToMask().ToBits(); v != want16 {
   450  			t.Errorf("want %b, got %b", want16, v)
   451  		}
   452  		if v := archsimd.LoadInt64x8Slice(int64s).ToMask().ToBits(); v != want8 {
   453  			t.Errorf("want %b, got %b", want8, v)
   454  		}
   455  	}
   456  }
   457  
   458  var maskForTestBitMaskToBitsStore uint8
   459  
   460  func TestBitMaskToBitsStore(t *testing.T) {
   461  	if !archsimd.X86.AVX512() {
   462  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   463  		return
   464  	}
   465  	maskForTestBitMaskToBitsStore = archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
   466  	if maskForTestBitMaskToBitsStore != 0b101 {
   467  		t.Errorf("Want 0b101, got %b", maskForTestBitMaskToBitsStore)
   468  	}
   469  }
   470  
   471  func TestMergeFloat(t *testing.T) {
   472  	if !archsimd.X86.AVX2() {
   473  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   474  		return
   475  	}
   476  	k := make([]int64, 4, 4)
   477  	s := make([]float64, 4, 4)
   478  
   479  	a := archsimd.LoadFloat64x4Slice([]float64{1, 2, 3, 4})
   480  	b := archsimd.LoadFloat64x4Slice([]float64{4, 2, 3, 1})
   481  	g := a.Greater(b)
   482  	g.ToInt64x4().StoreSlice(k)
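        	// Merge takes the receiver's element where the mask is set and b's element
        	// otherwise, so c is the elementwise maximum of a and b here.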
   483  	c := a.Merge(b, g)
   484  
   485  	c.StoreSlice(s)
   486  
   487  	checkSlices[int64](t, k, []int64{0, 0, 0, -1})
   488  	checkSlices[float64](t, s, []float64{4, 2, 3, 4})
   489  }
   490  
   491  func TestMergeFloat512(t *testing.T) {
   492  	if !archsimd.X86.AVX512() {
   493  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   494  		return
   495  	}
   496  
   497  	k := make([]int64, 8, 8)
   498  	s := make([]float64, 8, 8)
   499  
   500  	a := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   501  	b := archsimd.LoadFloat64x8Slice([]float64{8, 7, 6, 5, 4, 2, 3, 1})
   502  	g := a.Greater(b)
   503  	g.ToInt64x8().StoreSlice(k)
   504  	c := a.Merge(b, g)
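        	// Masked keeps the receiver's elements where the mask is set and zeroes the rest.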
   505  	d := a.Masked(g)
   506  
   507  	checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
   508  
   509  	c.StoreSlice(s)
   510  	checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
   511  
   512  	d.StoreSlice(s)
   513  	checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
   514  }
   515  
   516  var ro uint8 = 2
   517  
   518  func TestRotateAllVariable(t *testing.T) {
   519  	if !archsimd.X86.AVX512() {
   520  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   521  		return
   522  	}
   523  	got := make([]int32, 4)
   524  	archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).StoreSlice(got)
   525  	for _, v := range got {
   526  		if v != 0b1100 {
   527  			t.Errorf("Want 0b1100, got %b", v)
   528  		}
   529  	}
   530  }
   531  
   532  func TestBroadcastUint32x4(t *testing.T) {
   533  	s := make([]uint32, 4, 4)
   534  	archsimd.BroadcastUint32x4(123456789).StoreSlice(s)
   535  	checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
   536  }
   537  
   538  func TestBroadcastFloat32x8(t *testing.T) {
   539  	s := make([]float32, 8, 8)
   540  	archsimd.BroadcastFloat32x8(123456789).StoreSlice(s)
   541  	checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
   542  }
   543  
   544  func TestBroadcastFloat64x2(t *testing.T) {
   545  	s := make([]float64, 2, 2)
   546  	archsimd.BroadcastFloat64x2(123456789).StoreSlice(s)
   547  	checkSlices(t, s, []float64{123456789, 123456789})
   548  }
   549  
   550  func TestBroadcastUint64x2(t *testing.T) {
   551  	s := make([]uint64, 2, 2)
   552  	archsimd.BroadcastUint64x2(123456789).StoreSlice(s)
   553  	checkSlices(t, s, []uint64{123456789, 123456789})
   554  }
   555  
   556  func TestBroadcastUint16x8(t *testing.T) {
   557  	s := make([]uint16, 8, 8)
   558  	archsimd.BroadcastUint16x8(12345).StoreSlice(s)
   559  	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
   560  }
   561  
   562  func TestBroadcastInt8x32(t *testing.T) {
   563  	if !archsimd.X86.AVX2() {
   564  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   565  		return
   566  	}
   567  	s := make([]int8, 32, 32)
   568  	archsimd.BroadcastInt8x32(-123).StoreSlice(s)
   569  	checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
   570  		-123, -123, -123, -123, -123, -123, -123, -123,
   571  		-123, -123, -123, -123, -123, -123, -123, -123,
   572  		-123, -123, -123, -123, -123, -123, -123, -123,
   573  	})
   574  }
   575  
   576  func TestMaskOpt512(t *testing.T) {
   577  	if !archsimd.X86.AVX512() {
   578  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   579  		return
   580  	}
   581  
   582  	k := make([]int64, 8, 8)
   583  	s := make([]float64, 8, 8)
   584  
   585  	a := archsimd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
   586  	b := archsimd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
   587  	c := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
   588  	d := archsimd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
   589  	g := a.Greater(b)
   590  	e := c.Add(d).Masked(g)
   591  	e.StoreSlice(s)
   592  	g.ToInt64x8().StoreSlice(k)
   593  	checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
   594  	checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
   595  }
   596  
   597  // flattenedTranspose transposes x and y, regarded as a pair of 2x2
   598  // matrices, but then flattens the rows in order, i.e.
   599  // x: ABCD ==> a: A1B2
   600  // y: 1234     b: C3D4
   601  func flattenedTranspose(x, y archsimd.Int32x4) (a, b archsimd.Int32x4) {
   602  	return x.InterleaveLo(y), x.InterleaveHi(y)
   603  }
   604  
   605  func TestFlattenedTranspose(t *testing.T) {
   606  	r := make([]int32, 4, 4)
   607  	s := make([]int32, 4, 4)
   608  
   609  	x := archsimd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
   610  	y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
   611  	a, b := flattenedTranspose(x, y)
   612  
   613  	a.StoreSlice(r)
   614  	b.StoreSlice(s)
   615  
   616  	checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
   617  	checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
   618  
   619  }
   620  
   621  func TestClearAVXUpperBits(t *testing.T) {
   622  	// Test that ClearAVXUpperBits is safe even if there are SIMD values
   623  	// alive (although usually one should not do this).
   624  	if !archsimd.X86.AVX2() {
   625  		t.Skip("Test requires X86.AVX2, not available on this hardware")
   626  		return
   627  	}
   628  
   629  	r := make([]int64, 4)
   630  	s := make([]int64, 4)
   631  
   632  	x := archsimd.LoadInt64x4Slice([]int64{10, 20, 30, 40})
   633  	y := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
   634  
   635  	x.Add(y).StoreSlice(r)
   636  	archsimd.ClearAVXUpperBits()
   637  	x.Sub(y).StoreSlice(s)
   638  
   639  	checkSlices[int64](t, r, []int64{11, 22, 33, 44})
   640  	checkSlices[int64](t, s, []int64{9, 18, 27, 36})
   641  }
   642  
   643  func TestLeadingZeros(t *testing.T) {
   644  	if !archsimd.X86.AVX512() {
   645  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   646  		return
   647  	}
   648  
   649  	src := []uint64{0b1111, 0}
   650  	want := []uint64{60, 64}
   651  	got := make([]uint64, 2)
   652  	archsimd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got)
   653  	for i := range 2 {
   654  		if want[i] != got[i] {
   655  			t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
   656  		}
   657  	}
   658  }
   659  
   660  func TestIsZero(t *testing.T) {
   661  	v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
   662  	v2 := archsimd.LoadUint64x2Slice([]uint64{0, 0})
   663  	if v1.IsZero() {
   664  		t.Errorf("Result incorrect, want false, got true")
   665  	}
   666  	if !v2.IsZero() {
   667  		t.Errorf("Result incorrect, want true, got false")
   668  	}
   669  	if !v1.And(v2).IsZero() {
   670  		t.Errorf("Result incorrect, want true, got false")
   671  	}
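        	// AndNot computes receiver AND NOT argument, so v1.AndNot(v2) keeps v1's bits
        	// while v2.AndNot(v1) clears them.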
   672  	if v1.AndNot(v2).IsZero() {
   673  		t.Errorf("Result incorrect, want false, got true")
   674  	}
   675  	if !v2.And(v1).IsZero() {
   676  		t.Errorf("Result incorrect, want true, got false")
   677  	}
   678  	if !v2.AndNot(v1).IsZero() {
   679  		t.Errorf("Result incorrect, want true, got false")
   680  	}
   681  }
   682  
   683  func TestSelect4FromPairConst(t *testing.T) {
   684  	x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
   685  	y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
   686  
   687  	llll := x.SelectFromPair(0, 1, 2, 3, y)
   688  	hhhh := x.SelectFromPair(4, 5, 6, 7, y)
   689  	llhh := x.SelectFromPair(0, 1, 6, 7, y)
   690  	hhll := x.SelectFromPair(6, 7, 0, 1, y)
   691  
   692  	lllh := x.SelectFromPair(0, 1, 2, 7, y)
   693  	llhl := x.SelectFromPair(0, 1, 7, 2, y)
   694  	lhll := x.SelectFromPair(0, 7, 1, 2, y)
   695  	hlll := x.SelectFromPair(7, 0, 1, 2, y)
   696  
   697  	hhhl := x.SelectFromPair(4, 5, 6, 0, y)
   698  	hhlh := x.SelectFromPair(4, 5, 0, 6, y)
   699  	hlhh := x.SelectFromPair(4, 0, 5, 6, y)
   700  	lhhh := x.SelectFromPair(0, 4, 5, 6, y)
   701  
   702  	lhlh := x.SelectFromPair(0, 4, 1, 5, y)
   703  	hlhl := x.SelectFromPair(4, 0, 5, 1, y)
   704  	lhhl := x.SelectFromPair(0, 4, 5, 1, y)
   705  	hllh := x.SelectFromPair(4, 0, 1, 5, y)
   706  
   707  	r := make([]int32, 4, 4)
   708  
   709  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   710  		v.StoreSlice(r)
   711  		checkSlices[int32](t, r, []int32{a, b, c, d})
   712  	}
   713  
   714  	foo(llll, 0, 1, 2, 3)
   715  	foo(hhhh, 4, 5, 6, 7)
   716  	foo(llhh, 0, 1, 6, 7)
   717  	foo(hhll, 6, 7, 0, 1)
   718  
   719  	foo(lllh, 0, 1, 2, 7)
   720  	foo(llhl, 0, 1, 7, 2)
   721  	foo(lhll, 0, 7, 1, 2)
   722  	foo(hlll, 7, 0, 1, 2)
   723  
   724  	foo(hhhl, 4, 5, 6, 0)
   725  	foo(hhlh, 4, 5, 0, 6)
   726  	foo(hlhh, 4, 0, 5, 6)
   727  	foo(lhhh, 0, 4, 5, 6)
   728  
   729  	foo(lhlh, 0, 4, 1, 5)
   730  	foo(hlhl, 4, 0, 5, 1)
   731  	foo(lhhl, 0, 4, 5, 1)
   732  	foo(hllh, 4, 0, 1, 5)
   733  }
   734  
   735  //go:noinline
   736  func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
   737  	return x.SelectFromPair(a, b, c, d, y)
   738  }
   739  
   740  func TestSelect4FromPairVar(t *testing.T) {
   741  	x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
   742  	y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
   743  
   744  	llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
   745  	hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
   746  	llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
   747  	hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
   748  
   749  	lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
   750  	llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
   751  	lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
   752  	hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
   753  
   754  	hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
   755  	hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
   756  	hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
   757  	lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
   758  
   759  	lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
   760  	hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
   761  	lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
   762  	hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
   763  
   764  	r := make([]int32, 4, 4)
   765  
   766  	foo := func(v archsimd.Int32x4, a, b, c, d int32) {
   767  		v.StoreSlice(r)
   768  		checkSlices[int32](t, r, []int32{a, b, c, d})
   769  	}
   770  
   771  	foo(llll, 0, 1, 2, 3)
   772  	foo(hhhh, 4, 5, 6, 7)
   773  	foo(llhh, 0, 1, 6, 7)
   774  	foo(hhll, 6, 7, 0, 1)
   775  
   776  	foo(lllh, 0, 1, 2, 7)
   777  	foo(llhl, 0, 1, 7, 2)
   778  	foo(lhll, 0, 7, 1, 2)
   779  	foo(hlll, 7, 0, 1, 2)
   780  
   781  	foo(hhhl, 4, 5, 6, 0)
   782  	foo(hhlh, 4, 5, 0, 6)
   783  	foo(hlhh, 4, 0, 5, 6)
   784  	foo(lhhh, 0, 4, 5, 6)
   785  
   786  	foo(lhlh, 0, 4, 1, 5)
   787  	foo(hlhl, 4, 0, 5, 1)
   788  	foo(lhhl, 0, 4, 5, 1)
   789  	foo(hllh, 4, 0, 1, 5)
   790  }
   791  
   792  func TestSelect4FromPairConstGrouped(t *testing.T) {
   793  	x := archsimd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13})
   794  	y := archsimd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17})
   795  
   796  	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
   797  	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
   798  	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
   799  	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
   800  
   801  	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
   802  	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
   803  	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
   804  	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
   805  
   806  	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
   807  	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
   808  	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
   809  	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
   810  
   811  	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
   812  	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
   813  	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
   814  	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
   815  
   816  	r := make([]float32, 8, 8)
   817  
   818  	foo := func(v archsimd.Float32x8, a, b, c, d float32) {
   819  		v.StoreSlice(r)
   820  		checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
   821  	}
   822  
   823  	foo(llll, 0, 1, 2, 3)
   824  	foo(hhhh, 4, 5, 6, 7)
   825  	foo(llhh, 0, 1, 6, 7)
   826  	foo(hhll, 6, 7, 0, 1)
   827  
   828  	foo(lllh, 0, 1, 2, 7)
   829  	foo(llhl, 0, 1, 7, 2)
   830  	foo(lhll, 0, 7, 1, 2)
   831  	foo(hlll, 7, 0, 1, 2)
   832  
   833  	foo(hhhl, 4, 5, 6, 0)
   834  	foo(hhlh, 4, 5, 0, 6)
   835  	foo(hlhh, 4, 0, 5, 6)
   836  	foo(lhhh, 0, 4, 5, 6)
   837  
   838  	foo(lhlh, 0, 4, 1, 5)
   839  	foo(hlhl, 4, 0, 5, 1)
   840  	foo(lhhl, 0, 4, 5, 1)
   841  	foo(hllh, 4, 0, 1, 5)
   842  }
   843  
   844  func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
   845  	if !archsimd.X86.AVX512() {
   846  		t.Skip("Test requires X86.AVX512, not available on this hardware")
   847  		return
   848  	}
   849  	x := archsimd.LoadUint32x16Slice([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
   850  	y := archsimd.LoadUint32x16Slice([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
   851  
   852  	llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
   853  	hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
   854  	llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
   855  	hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
   856  
   857  	lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
   858  	llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
   859  	lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
   860  	hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
   861  
   862  	hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
   863  	hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
   864  	hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
   865  	lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
   866  
   867  	lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
   868  	hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
   869  	lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
   870  	hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
   871  
   872  	r := make([]uint32, 16, 16)
   873  
   874  	foo := func(v archsimd.Uint32x16, a, b, c, d uint32) {
   875  		v.StoreSlice(r)
   876  		checkSlices[uint32](t, r, []uint32{a, b, c, d,
   877  			10 + a, 10 + b, 10 + c, 10 + d,
   878  			20 + a, 20 + b, 20 + c, 20 + d,
   879  			30 + a, 30 + b, 30 + c, 30 + d,
   880  		})
   881  	}
   882  
   883  	foo(llll, 0, 1, 2, 3)
   884  	foo(hhhh, 4, 5, 6, 7)
   885  	foo(llhh, 0, 1, 6, 7)
   886  	foo(hhll, 6, 7, 0, 1)
   887  
   888  	foo(lllh, 0, 1, 2, 7)
   889  	foo(llhl, 0, 1, 7, 2)
   890  	foo(lhll, 0, 7, 1, 2)
   891  	foo(hlll, 7, 0, 1, 2)
   892  
   893  	foo(hhhl, 4, 5, 6, 0)
   894  	foo(hhlh, 4, 5, 0, 6)
   895  	foo(hlhh, 4, 0, 5, 6)
   896  	foo(lhhh, 0, 4, 5, 6)
   897  
   898  	foo(lhlh, 0, 4, 1, 5)
   899  	foo(hlhl, 4, 0, 5, 1)
   900  	foo(lhhl, 0, 4, 5, 1)
   901  	foo(hllh, 4, 0, 1, 5)
   902  }
   903  
   904  func TestSelect128FromPair(t *testing.T) {
   905  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   906  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   907  
   908  	aa := x.Select128FromPair(0, 0, y)
   909  	ab := x.Select128FromPair(0, 1, y)
   910  	bc := x.Select128FromPair(1, 2, y)
   911  	cd := x.Select128FromPair(2, 3, y)
   912  	da := x.Select128FromPair(3, 0, y)
   913  	dc := x.Select128FromPair(3, 2, y)
   914  
   915  	r := make([]uint64, 4, 4)
   916  
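        	// Each 128-bit half of a Uint64x4 holds two consecutive values, so half
        	// index a corresponds to the value pair {2a, 2a+1}.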
   917  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   918  		a, b = 2*a, 2*b
   919  		v.StoreSlice(r)
   920  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   921  	}
   922  
   923  	foo(aa, 0, 0)
   924  	foo(ab, 0, 1)
   925  	foo(bc, 1, 2)
   926  	foo(cd, 2, 3)
   927  	foo(da, 3, 0)
   928  	foo(dc, 3, 2)
   929  }
   930  
   931  func TestSelect128FromPairError(t *testing.T) {
   932  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   933  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   934  
   935  	defer func() {
   936  		if r := recover(); r != nil {
   937  			t.Logf("Saw expected panic %v", r)
   938  		}
   939  	}()
   940  	_ = x.Select128FromPair(0, 4, y)
   941  
   942  	t.Errorf("Should have panicked")
   943  }
   944  
   945  //go:noinline
   946  func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
   947  	return x.Select128FromPair(lo, hi, y)
   948  }
   949  
   950  func TestSelect128FromPairVar(t *testing.T) {
   951  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
   952  	y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
   953  
   954  	aa := select128FromPair(x, 0, 0, y)
   955  	ab := select128FromPair(x, 0, 1, y)
   956  	bc := select128FromPair(x, 1, 2, y)
   957  	cd := select128FromPair(x, 2, 3, y)
   958  	da := select128FromPair(x, 3, 0, y)
   959  	dc := select128FromPair(x, 3, 2, y)
   960  
   961  	r := make([]uint64, 4, 4)
   962  
   963  	foo := func(v archsimd.Uint64x4, a, b uint64) {
   964  		a, b = 2*a, 2*b
   965  		v.StoreSlice(r)
   966  		checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
   967  	}
   968  
   969  	foo(aa, 0, 0)
   970  	foo(ab, 0, 1)
   971  	foo(bc, 1, 2)
   972  	foo(cd, 2, 3)
   973  	foo(da, 3, 0)
   974  	foo(dc, 3, 2)
   975  }
   976  
   977  func TestSelect2FromPairConst(t *testing.T) {
   978  	x := archsimd.LoadUint64x2Slice([]uint64{0, 1})
   979  	y := archsimd.LoadUint64x2Slice([]uint64{2, 3})
   980  
   981  	ll := x.SelectFromPair(0, 1, y)
   982  	hh := x.SelectFromPair(3, 2, y)
   983  	lh := x.SelectFromPair(0, 3, y)
   984  	hl := x.SelectFromPair(2, 1, y)
   985  
   986  	r := make([]uint64, 2, 2)
   987  
   988  	foo := func(v archsimd.Uint64x2, a, b uint64) {
   989  		v.StoreSlice(r)
   990  		checkSlices[uint64](t, r, []uint64{a, b})
   991  	}
   992  
   993  	foo(ll, 0, 1)
   994  	foo(hh, 3, 2)
   995  	foo(lh, 0, 3)
   996  	foo(hl, 2, 1)
   997  }
   998  
   999  func TestSelect2FromPairConstGroupedUint(t *testing.T) {
  1000  	x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 10, 11})
  1001  	y := archsimd.LoadUint64x4Slice([]uint64{2, 3, 12, 13})
  1002  
  1003  	ll := x.SelectFromPairGrouped(0, 1, y)
  1004  	hh := x.SelectFromPairGrouped(3, 2, y)
  1005  	lh := x.SelectFromPairGrouped(0, 3, y)
  1006  	hl := x.SelectFromPairGrouped(2, 1, y)
  1007  
  1008  	r := make([]uint64, 4, 4)
  1009  
  1010  	foo := func(v archsimd.Uint64x4, a, b uint64) {
  1011  		v.StoreSlice(r)
  1012  		checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
  1013  	}
  1014  
  1015  	foo(ll, 0, 1)
  1016  	foo(hh, 3, 2)
  1017  	foo(lh, 0, 3)
  1018  	foo(hl, 2, 1)
  1019  }
  1020  
  1021  func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
  1022  	x := archsimd.LoadFloat64x4Slice([]float64{0, 1, 10, 11})
  1023  	y := archsimd.LoadFloat64x4Slice([]float64{2, 3, 12, 13})
  1024  
  1025  	ll := x.SelectFromPairGrouped(0, 1, y)
  1026  	hh := x.SelectFromPairGrouped(3, 2, y)
  1027  	lh := x.SelectFromPairGrouped(0, 3, y)
  1028  	hl := x.SelectFromPairGrouped(2, 1, y)
  1029  
  1030  	r := make([]float64, 4, 4)
  1031  
  1032  	foo := func(v archsimd.Float64x4, a, b float64) {
  1033  		v.StoreSlice(r)
  1034  		checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
  1035  	}
  1036  
  1037  	foo(ll, 0, 1)
  1038  	foo(hh, 3, 2)
  1039  	foo(lh, 0, 3)
  1040  	foo(hl, 2, 1)
  1041  }
  1042  
  1043  func TestSelect2FromPairConstGroupedInt(t *testing.T) {
  1044  	x := archsimd.LoadInt64x4Slice([]int64{0, 1, 10, 11})
  1045  	y := archsimd.LoadInt64x4Slice([]int64{2, 3, 12, 13})
  1046  
  1047  	ll := x.SelectFromPairGrouped(0, 1, y)
  1048  	hh := x.SelectFromPairGrouped(3, 2, y)
  1049  	lh := x.SelectFromPairGrouped(0, 3, y)
  1050  	hl := x.SelectFromPairGrouped(2, 1, y)
  1051  
  1052  	r := make([]int64, 4, 4)
  1053  
  1054  	foo := func(v archsimd.Int64x4, a, b int64) {
  1055  		v.StoreSlice(r)
  1056  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
  1057  	}
  1058  
  1059  	foo(ll, 0, 1)
  1060  	foo(hh, 3, 2)
  1061  	foo(lh, 0, 3)
  1062  	foo(hl, 2, 1)
  1063  }
  1064  
  1065  func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
  1066  	if !archsimd.X86.AVX512() {
  1067  		t.Skip("Test requires X86.AVX512, not available on this hardware")
  1068  		return
  1069  	}
  1070  
  1071  	x := archsimd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31})
  1072  	y := archsimd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33})
  1073  
  1074  	ll := x.SelectFromPairGrouped(0, 1, y)
  1075  	hh := x.SelectFromPairGrouped(3, 2, y)
  1076  	lh := x.SelectFromPairGrouped(0, 3, y)
  1077  	hl := x.SelectFromPairGrouped(2, 1, y)
  1078  
  1079  	r := make([]int64, 8, 8)
  1080  
  1081  	foo := func(v archsimd.Int64x8, a, b int64) {
  1082  		v.StoreSlice(r)
  1083  		checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
  1084  	}
  1085  
  1086  	foo(ll, 0, 1)
  1087  	foo(hh, 3, 2)
  1088  	foo(lh, 0, 3)
  1089  	foo(hl, 2, 1)
  1090  }
  1091  
  1092  func TestString(t *testing.T) {
  1093  	x := archsimd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
  1094  	y := archsimd.LoadInt64x4Slice([]int64{-4, -5, -6, -7})
  1095  	z := archsimd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
  1096  	w := archsimd.LoadFloat64x4Slice([]float64{0.5, 1.5, -2.5, 3.5e9})
  1097  
  1098  	sx := "{0,1,2,3}"
  1099  	sy := "{-4,-5,-6,-7}"
  1100  	sz := "{0.5,1.5,-2.5,3.5e+09}"
  1101  	sw := sz
  1102  
  1103  	if x.String() != sx {
  1104  		t.Errorf("x=%s wanted %s", x, sx)
  1105  	}
  1106  	if y.String() != sy {
  1107  		t.Errorf("y=%s wanted %s", y, sy)
  1108  	}
  1109  	if z.String() != sz {
  1110  		t.Errorf("z=%s wanted %s", z, sz)
  1111  	}
  1112  	if w.String() != sw {
  1113  		t.Errorf("w=%s wanted %s", w, sw)
  1114  	}
  1115  	t.Logf("w=%s", w)
  1116  	t.Logf("x=%s", x)
  1117  	t.Logf("y=%s", y)
  1118  	t.Logf("z=%s", z)
  1119  }
  1120  
  1121  // a returns a slice of 16 int32s.
  1122  func a() []int32 {
  1123  	return make([]int32, 16, 16)
  1124  }
  1125  
  1126  // applyTo3 returns a 16-element slice of the results of
  1127  // applying f to the respective elements of vectors x, y, and z.
  1128  func applyTo3(x, y, z archsimd.Int32x16, f func(x, y, z int32) int32) []int32 {
  1129  	ax, ay, az := a(), a(), a()
  1130  	x.StoreSlice(ax)
  1131  	y.StoreSlice(ay)
  1132  	z.StoreSlice(az)
  1133  
  1134  	r := a()
  1135  	for i := range r {
  1136  		r[i] = f(ax[i], ay[i], az[i])
  1137  	}
  1138  	return r
  1139  }
  1140  
  1141  // applyTo4 returns a 16-element slice of the results of
  1142  // applying f to the respective elements of vectors x, y, z, and w.
  1143  func applyTo4(x, y, z, w archsimd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
  1144  	ax, ay, az, aw := a(), a(), a(), a()
  1145  	x.StoreSlice(ax)
  1146  	y.StoreSlice(ay)
  1147  	z.StoreSlice(az)
  1148  	w.StoreSlice(aw)
  1149  
  1150  	r := make([]int32, len(ax), len(ax))
  1151  	for i := range r {
  1152  		r[i] = f(ax[i], ay[i], az[i], aw[i])
  1153  	}
  1154  	return r
  1155  }
  1156  
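        // TestSelectTernOptInt32x16 chains Xor/And/Not expressions that the compiler can
        // fuse into single ternary-logic operations on AVX512 (hence "TernOpt"), and
        // checks the results against the scalar applyTo3/applyTo4 references.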
  1157  func TestSelectTernOptInt32x16(t *testing.T) {
  1158  	if !archsimd.X86.AVX512() {
  1159  		t.Skip("Test requires X86.AVX512, not available on this hardware")
  1160  		return
  1161  	}
  1162  	ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1163  	ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
  1164  	az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
  1165  	aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
  1166  	am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
  1167  
  1168  	x := archsimd.LoadInt32x16Slice(ax)
  1169  	y := archsimd.LoadInt32x16Slice(ay)
  1170  	z := archsimd.LoadInt32x16Slice(az)
  1171  	w := archsimd.LoadInt32x16Slice(aw)
  1172  	m := archsimd.LoadInt32x16Slice(am)
  1173  
  1174  	foo := func(v archsimd.Int32x16, s []int32) {
  1175  		r := make([]int32, 16, 16)
  1176  		v.StoreSlice(r)
  1177  		checkSlices[int32](t, r, s)
  1178  	}
  1179  
  1180  	t0 := w.Xor(y).Xor(z)
  1181  	ft0 := func(w, y, z int32) int32 {
  1182  		return w ^ y ^ z
  1183  	}
  1184  	foo(t0, applyTo3(w, y, z, ft0))
  1185  
  1186  	t1 := m.And(w.Xor(y).Xor(z.Not()))
  1187  	ft1 := func(m, w, y, z int32) int32 {
  1188  		return m & (w ^ y ^ ^z)
  1189  	}
  1190  	foo(t1, applyTo4(m, w, y, z, ft1))
  1191  
  1192  	t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
  1193  	ft2 := func(x, y, z int32) int32 {
  1194  		return (x ^ y ^ z) & (x ^ y ^ ^z)
  1195  	}
  1196  	foo(t2, applyTo3(x, y, z, ft2))
  1197  }
  1198  
  1199  func TestMaskedMerge(t *testing.T) {
  1200  	if !archsimd.X86.AVX2() {
  1201  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1202  		return
  1203  	}
  1204  	x := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
  1205  	y := archsimd.LoadInt64x4Slice([]int64{5, 6, 1, 1})
  1206  	z := archsimd.LoadInt64x4Slice([]int64{-1, -2, -3, -4})
  1207  	res := make([]int64, 4)
  1208  	expected := []int64{6, 8, -3, -4}
  1209  	mask := x.Less(y)
  1210  	if archsimd.X86.AVX512() {
  1211  		x.Add(y).Merge(z, mask).StoreSlice(res)
  1212  	} else {
  1213  		x.Add(y).Merge(z, mask).StoreSlice(res)
  1214  	}
  1215  	for i := range 4 {
  1216  		if res[i] != expected[i] {
  1217  			t.Errorf("got %d wanted %d", res[i], expected[i])
  1218  		}
  1219  	}
  1220  }
  1221  
  1222  func TestPermuteScalars(t *testing.T) {
  1223  	x := []int32{11, 12, 13, 14}
  1224  	want := []int32{12, 13, 14, 11}
  1225  	got := make([]int32, 4)
  1226  	archsimd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
  1227  	checkSlices(t, got, want)
  1228  }
  1229  
  1230  func TestPermuteScalarsGrouped(t *testing.T) {
  1231  	if !archsimd.X86.AVX2() {
  1232  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1233  		return
  1234  	}
  1235  	x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
  1236  	want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
  1237  	got := make([]int32, 8)
  1238  	archsimd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
  1239  	checkSlices(t, got, want)
  1240  }
  1241  
  1242  func TestPermuteScalarsHi(t *testing.T) {
  1243  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
  1244  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
  1245  	got := make([]int16, len(x))
  1246  	archsimd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
  1247  	checkSlices(t, got, want)
  1248  }
  1249  
  1250  func TestPermuteScalarsLo(t *testing.T) {
  1251  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
  1252  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
  1253  	got := make([]int16, len(x))
  1254  	archsimd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
  1255  	checkSlices(t, got, want)
  1256  }
  1257  
  1258  func TestPermuteScalarsHiGrouped(t *testing.T) {
  1259  	if !archsimd.X86.AVX2() {
  1260  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1261  		return
  1262  	}
  1263  	x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
  1264  	want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
  1265  	got := make([]int16, len(x))
  1266  	archsimd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
  1267  	checkSlices(t, got, want)
  1268  }
  1269  
  1270  func TestPermuteScalarsLoGrouped(t *testing.T) {
  1271  	if !archsimd.X86.AVX2() {
  1272  		t.Skip("Test requires X86.AVX2, not available on this hardware")
  1273  		return
  1274  	}
  1275  	x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
  1276  	want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
  1277  	got := make([]int16, len(x))
  1278  	archsimd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
  1279  	checkSlices(t, got, want)
  1280  }
  1281  
  1282  func TestClMul(t *testing.T) {
  1283  	var x = archsimd.LoadUint64x2Slice([]uint64{1, 5})
  1284  	var y = archsimd.LoadUint64x2Slice([]uint64{3, 9})
  1285  
  1286  	foo := func(v archsimd.Uint64x2, s []uint64) {
  1287  		r := make([]uint64, 2, 2)
  1288  		v.StoreSlice(r)
  1289  		checkSlices[uint64](t, r, s)
  1290  	}
  1291  
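        	// CarrylessMultiply(i, j, y) XOR-multiplies lane i of the receiver by lane j
        	// of y; e.g. 5 clmul 3 = 0b101 clmul 0b011 = 0b1111 = 15.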
  1292  	foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
  1293  	foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
  1294  	foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
  1295  	foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
  1296  	foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
  1297  
  1298  }
  1299  
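        // addPairsSlice and subPairsSlice are scalar references for AddPairs/SubPairs:
        // element i of the result is a[2i]+a[2i+1] (or the difference) for the first
        // half, and the same over b for the second half.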
  1300  func addPairsSlice[T number](a, b []T) []T {
  1301  	r := make([]T, len(a))
  1302  	for i := range len(a) / 2 {
  1303  		r[i] = a[2*i] + a[2*i+1]
  1304  		r[i+len(a)/2] = b[2*i] + b[2*i+1]
  1305  	}
  1306  	return r
  1307  }
  1308  
  1309  func subPairsSlice[T number](a, b []T) []T {
  1310  	r := make([]T, len(a))
  1311  	for i := range len(a) / 2 {
  1312  		r[i] = a[2*i] - a[2*i+1]
  1313  		r[i+len(a)/2] = b[2*i] - b[2*i+1]
  1314  	}
  1315  	return r
  1316  }
  1317  
  1318  func addPairsGroupedSlice[T number](a, b []T) []T {
  1319  	group := int(128 / (8 * unsafe.Sizeof(a[0]))) // elements of T per 128-bit group
  1320  	r := make([]T, 0, len(a))
  1321  	for i := range len(a) / group {
  1322  		r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
  1323  	}
  1324  	return r
  1325  }
  1326  
  1327  func subPairsGroupedSlice[T number](a, b []T) []T {
  1328  	group := int(128 / (8 * unsafe.Sizeof(a[0]))) // elements of T per 128-bit group
  1329  	r := make([]T, 0, len(a))
  1330  	for i := range len(a) / group {
  1331  		r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
  1332  	}
  1333  	return r
  1334  }
  1335  
  1336  func TestAddSubPairs(t *testing.T) {
  1337  	testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
  1338  	testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
  1339  	testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
  1340  	testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
  1341  	testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
  1342  	testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
  1343  	testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
  1344  	testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
  1345  	testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
  1346  	testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
  1347  	testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
  1348  	testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
  1349  
  1350  	// Grouped versions
  1351  	if archsimd.X86.AVX2() {
  1352  		testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
  1353  		testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
  1354  		testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
  1355  		testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
  1356  		testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
  1357  		testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
  1358  		testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
  1359  		testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
  1360  		testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
  1361  		testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
  1362  		testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
  1363  		testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
  1364  	}
  1365  }
  1366  
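        // convConcatSlice applies conv to every element of a and b and concatenates the
        // results; it is the scalar reference for the saturating narrowing Concat methods.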
  1367  func convConcatSlice[T, U number](a, b []T, conv func(T) U) []U {
  1368  	r := make([]U, len(a)+len(b))
  1369  	for i, v := range a {
  1370  		r[i] = conv(v)
  1371  	}
  1372  	for i, v := range b {
  1373  		r[len(a)+i] = conv(v)
  1374  	}
  1375  	return r
  1376  }
  1377  
  1378  func convConcatGroupedSlice[T, U number](a, b []T, conv func(T) U) []U {
  1379  	group := int(128 / (8 * unsafe.Sizeof(a[0]))) // elements of T per 128-bit group
  1380  	r := make([]U, 0, len(a)+len(b))
  1381  	for i := 0; i < len(a)/group; i++ {
  1382  		r = append(r, convConcatSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group], conv)...)
  1383  	}
  1384  	return r
  1385  }
  1386  
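        // TestSaturateConcat checks the SaturateTo*Concat methods against the scalar
        // references above. The forSlicePair, satToInt16, satToUint16, and
        // checkSlicesLogInput helpers (and the int32s test data) are defined in other
        // files of this test package.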
  1387  func TestSaturateConcat(t *testing.T) {
  1388  	// Int32x4.SaturateToInt16Concat
  1389  	forSlicePair(t, int32s, 4, func(x, y []int32) bool {
  1390  		a, b := archsimd.LoadInt32x4Slice(x), archsimd.LoadInt32x4Slice(y)
  1391  		var out [8]int16
  1392  		a.SaturateToInt16Concat(b).Store(&out)
  1393  		want := convConcatSlice(x, y, satToInt16)
  1394  		return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1395  	})
  1396  	// Int32x4.SaturateToUint16Concat
  1397  	forSlicePair(t, int32s, 4, func(x, y []int32) bool {
  1398  		a, b := archsimd.LoadInt32x4Slice(x), archsimd.LoadInt32x4Slice(y)
  1399  		var out [8]uint16
  1400  		a.SaturateToUint16Concat(b).Store(&out)
  1401  		want := convConcatSlice(x, y, satToUint16)
  1402  		return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1403  	})
  1404  
  1405  	if archsimd.X86.AVX2() {
  1406  		// Int32x8.SaturateToInt16ConcatGrouped
  1407  		forSlicePair(t, int32s, 8, func(x, y []int32) bool {
  1408  			a, b := archsimd.LoadInt32x8Slice(x), archsimd.LoadInt32x8Slice(y)
  1409  			var out [16]int16
  1410  			a.SaturateToInt16ConcatGrouped(b).Store(&out)
  1411  			want := convConcatGroupedSlice(x, y, satToInt16)
  1412  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1413  		})
  1414  		// Int32x8.SaturateToUint16ConcatGrouped
  1415  		forSlicePair(t, int32s, 8, func(x, y []int32) bool {
  1416  			a, b := archsimd.LoadInt32x8Slice(x), archsimd.LoadInt32x8Slice(y)
  1417  			var out [16]uint16
  1418  			a.SaturateToUint16ConcatGrouped(b).Store(&out)
  1419  			want := convConcatGroupedSlice(x, y, satToUint16)
  1420  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1421  		})
  1422  	}
  1423  
  1424  	if archsimd.X86.AVX512() {
  1425  		// Int32x16.SaturateToInt16ConcatGrouped
  1426  		forSlicePair(t, int32s, 16, func(x, y []int32) bool {
  1427  			a, b := archsimd.LoadInt32x16Slice(x), archsimd.LoadInt32x16Slice(y)
  1428  			var out [32]int16
  1429  			a.SaturateToInt16ConcatGrouped(b).Store(&out)
  1430  			want := convConcatGroupedSlice(x, y, satToInt16)
  1431  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1432  		})
  1433  		// Int32x16.SaturateToUint16ConcatGrouped
  1434  		forSlicePair(t, int32s, 16, func(x, y []int32) bool {
  1435  			a, b := archsimd.LoadInt32x16Slice(x), archsimd.LoadInt32x16Slice(y)
  1436  			var out [32]uint16
  1437  			a.SaturateToUint16ConcatGrouped(b).Store(&out)
  1438  			want := convConcatGroupedSlice(x, y, satToUint16)
  1439  			return checkSlicesLogInput(t, out[:], want, 0, func() { t.Logf("x=%v, y=%v", x, y) })
  1440  		})
  1441  	}
  1442  }
  1443  
