Source file src/simd/simd_emulated.go

     1  // Copyright 2026 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && !(amd64 || wasm || arm64)
     6  
     7  package simd
     8  
     9  import (
    10  	"fmt"
    11  	"math"
    12  	"math/bits"
    13  )
    14  
    15  // VectorSize returns the bit length of the emulated vector (fixed to 128).
    16  func VectorBitSize() int {
    17  	return 128
    18  }
    19  
    20  // Emulated returns whether simd is emulated.
    21  func Emulated() bool {
    22  	return true
    23  }
    24  
    25  // HasHardwareCarrylessMultiply returns whether this platform
    26  // has a hardware-implemented version of carryless multiply.
    27  // With default GODEBUG=simd settings, if this is false,
    28  // it is emulated and merely slow, but with non-default settings
    29  // this can indicate the possibility of a missing instruction
    30  // that will fail ("SIGILL") if it is executed.
    31  func HasHardwareCarrylessMultiply() bool {
    32  	return false
    33  }
    34  
// _simd is a zero-size marker embedded as a blank field in the vector
// types of this file.
type _simd struct {
	// A zero-length array occupies no space. Because its element type is a
	// func type, which is not comparable, any struct containing _simd
	// cannot be compared with ==.
	// NOTE(review): presumably that is the intent of this field — confirm.
	_ [0]func(*_simd) *_simd
}
    38  
    39  // Int8s represents a 128-bit vector of 16 int8 elements.
    40  type Int8s struct {
    41  	_    _simd
    42  	a, b uint64
    43  }
    44  
    45  // LoadInt8s loads a slice of int8 into an Int8s vector.
    46  func LoadInt8s(s []int8) Int8s {
    47  	var a, b uint64
    48  	for i := 0; i < 16; i++ {
    49  		val := uint64(uint8(s[i]))
    50  		if i < 8 {
    51  			a |= val << (8 * i)
    52  		} else {
    53  			b |= val << (8 * (i - 8))
    54  		}
    55  	}
    56  	return Int8s{a: a, b: b}
    57  }
    58  
    59  // LoadInt8sPart loads a partial slice of int8 into an Int8s vector.
    60  func LoadInt8sPart(s []int8) (Int8s, int) {
    61  	var a, b uint64
    62  	n := len(s)
    63  	if n > 16 {
    64  		n = 16
    65  	}
    66  	for i := 0; i < n; i++ {
    67  		val := uint64(uint8(s[i]))
    68  		if i < 8 {
    69  			a |= val << (8 * i)
    70  		} else {
    71  			b |= val << (8 * (i - 8))
    72  		}
    73  	}
    74  	return Int8s{a: a, b: b}, n
    75  }
    76  
    77  func (x Int8s) get(i int) int8 {
    78  	if i < 8 {
    79  		return int8(x.a >> (8 * i))
    80  	}
    81  	return int8(x.b >> (8 * (i - 8)))
    82  }
    83  
    84  func (x *Int8s) set(i int, v int8) {
    85  	val := uint64(uint8(v))
    86  	if i < 8 {
    87  		mask := uint64(0xff) << (8 * i)
    88  		x.a = (x.a &^ mask) | (val << (8 * i))
    89  	} else {
    90  		mask := uint64(0xff) << (8 * (i - 8))
    91  		x.b = (x.b &^ mask) | (val << (8 * (i - 8)))
    92  	}
    93  }
    94  
    95  // Abs returns the element-wise absolute value of x.
    96  func (x Int8s) Abs() Int8s {
    97  	var res Int8s
    98  	for i := 0; i < 16; i++ {
    99  		v := x.get(i)
   100  		if v < 0 {
   101  			res.set(i, -v)
   102  		} else {
   103  			res.set(i, v)
   104  		}
   105  	}
   106  	return res
   107  }
   108  
   109  // Add returns the element-wise sum of x and y.
   110  func (x Int8s) Add(y Int8s) Int8s {
   111  	var res Int8s
   112  	for i := 0; i < 16; i++ {
   113  		res.set(i, x.get(i)+y.get(i))
   114  	}
   115  	return res
   116  }
   117  
   118  // AddSaturated returns the element-wise saturated sum of x and y.
   119  func (x Int8s) AddSaturated(y Int8s) Int8s {
   120  	var res Int8s
   121  	for i := 0; i < 16; i++ {
   122  		sum := int(x.get(i)) + int(y.get(i))
   123  		if sum > math.MaxInt8 {
   124  			res.set(i, math.MaxInt8)
   125  		} else if sum < math.MinInt8 {
   126  			res.set(i, math.MinInt8)
   127  		} else {
   128  			res.set(i, int8(sum))
   129  		}
   130  	}
   131  	return res
   132  }
   133  
   134  // And returns the bitwise AND of x and y.
   135  func (x Int8s) And(y Int8s) Int8s {
   136  	return Int8s{a: x.a & y.a, b: x.b & y.b}
   137  }
   138  
   139  // AndNot returns the bitwise AND NOT of x and y.
   140  func (x Int8s) AndNot(y Int8s) Int8s {
   141  	return Int8s{a: x.a &^ y.a, b: x.b &^ y.b}
   142  }
   143  
   144  // Equal returns a mask indicating where x and y are equal.
   145  func (x Int8s) Equal(y Int8s) Mask8s {
   146  	var res Mask8s
   147  	for i := 0; i < 16; i++ {
   148  		if x.get(i) == y.get(i) {
   149  			res.set(i, true)
   150  		}
   151  	}
   152  	return res
   153  }
   154  
   155  // Greater returns a mask indicating where x is greater than y.
   156  func (x Int8s) Greater(y Int8s) Mask8s {
   157  	var res Mask8s
   158  	for i := 0; i < 16; i++ {
   159  		if x.get(i) > y.get(i) {
   160  			res.set(i, true)
   161  		}
   162  	}
   163  	return res
   164  }
   165  
   166  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
   167  func (x Int8s) GreaterEqual(y Int8s) Mask8s {
   168  	var res Mask8s
   169  	for i := 0; i < 16; i++ {
   170  		if x.get(i) >= y.get(i) {
   171  			res.set(i, true)
   172  		}
   173  	}
   174  	return res
   175  }
   176  
   177  // Less returns a mask indicating where x is less than y.
   178  func (x Int8s) Less(y Int8s) Mask8s {
   179  	var res Mask8s
   180  	for i := 0; i < 16; i++ {
   181  		if x.get(i) < y.get(i) {
   182  			res.set(i, true)
   183  		}
   184  	}
   185  	return res
   186  }
   187  
   188  // LessEqual returns a mask indicating where x is less than or equal to y.
   189  func (x Int8s) LessEqual(y Int8s) Mask8s {
   190  	var res Mask8s
   191  	for i := 0; i < 16; i++ {
   192  		if x.get(i) <= y.get(i) {
   193  			res.set(i, true)
   194  		}
   195  	}
   196  	return res
   197  }
   198  
   199  // NotEqual returns a mask indicating where x and y are not equal.
   200  func (x Int8s) NotEqual(y Int8s) Mask8s {
   201  	var res Mask8s
   202  	for i := 0; i < 16; i++ {
   203  		if x.get(i) != y.get(i) {
   204  			res.set(i, true)
   205  		}
   206  	}
   207  	return res
   208  }
   209  
   210  // Len returns the number of elements in the vector.
   211  func (x Int8s) Len() int {
   212  	return 16
   213  }
   214  
   215  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
   216  func (x Int8s) Masked(mask Mask8s) Int8s {
   217  	return Int8s{a: x.a & mask.a, b: x.b & mask.b}
   218  }
   219  
   220  // Max returns the element-wise maximum of x and y.
   221  func (x Int8s) Max(y Int8s) Int8s {
   222  	var res Int8s
   223  	for i := 0; i < 16; i++ {
   224  		vx := x.get(i)
   225  		vy := y.get(i)
   226  		if vx > vy {
   227  			res.set(i, vx)
   228  		} else {
   229  			res.set(i, vy)
   230  		}
   231  	}
   232  	return res
   233  }
   234  
   235  // Mul returns the element-wise product of x and y.
   236  func (x Int8s) Mul(y Int8s) Int8s {
   237  	var res Int8s
   238  	for i := 0; i < 16; i++ {
   239  		res.set(i, x.get(i)*y.get(i))
   240  	}
   241  	return res
   242  }
   243  
   244  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
   245  func (x Int8s) IfElse(mask Mask8s, y Int8s) Int8s {
   246  	return Int8s{
   247  		a: (x.a & mask.a) | (y.a &^ mask.a),
   248  		b: (x.b & mask.b) | (y.b &^ mask.b),
   249  	}
   250  }
   251  
   252  // Min returns the element-wise minimum of x and y.
   253  func (x Int8s) Min(y Int8s) Int8s {
   254  	var res Int8s
   255  	for i := 0; i < 16; i++ {
   256  		vx := x.get(i)
   257  		vy := y.get(i)
   258  		if vx < vy {
   259  			res.set(i, vx)
   260  		} else {
   261  			res.set(i, vy)
   262  		}
   263  	}
   264  	return res
   265  }
   266  
   267  // Neg returns the element-wise negation of x.
   268  func (x Int8s) Neg() Int8s {
   269  	var res Int8s
   270  	for i := 0; i < 16; i++ {
   271  		res.set(i, -x.get(i))
   272  	}
   273  	return res
   274  }
   275  
   276  // Not returns the bitwise NOT of x.
   277  func (x Int8s) Not() Int8s {
   278  	return Int8s{a: ^x.a, b: ^x.b}
   279  }
   280  
   281  // Or returns the bitwise OR of x and y.
   282  func (x Int8s) Or(y Int8s) Int8s {
   283  	return Int8s{a: x.a | y.a, b: x.b | y.b}
   284  }
   285  
   286  // Store stores the vector elements into the slice s.
   287  func (x Int8s) Store(s []int8) {
   288  	for i := 0; i < 16 && i < len(s); i++ {
   289  		s[i] = x.get(i)
   290  	}
   291  }
   292  
   293  // StorePart stores a partial vector into the slice s.
   294  func (x Int8s) StorePart(s []int8) int {
   295  	x.Store(s)
   296  	return min(len(s), x.Len())
   297  }
   298  
   299  // String returns a string representation of the vector.
   300  func (x Int8s) String() string {
   301  	var parts [16]int8
   302  	for i := 0; i < 16; i++ {
   303  		parts[i] = x.get(i)
   304  	}
   305  	return fmt.Sprint(parts)
   306  }
   307  
   308  // Sub returns the element-wise difference of x and y.
   309  func (x Int8s) Sub(y Int8s) Int8s {
   310  	var res Int8s
   311  	for i := 0; i < 16; i++ {
   312  		res.set(i, x.get(i)-y.get(i))
   313  	}
   314  	return res
   315  }
   316  
   317  // SubSaturated returns the element-wise saturated difference of x and y.
   318  func (x Int8s) SubSaturated(y Int8s) Int8s {
   319  	var res Int8s
   320  	for i := 0; i < 16; i++ {
   321  		diff := int(x.get(i)) - int(y.get(i))
   322  		if diff > math.MaxInt8 {
   323  			res.set(i, math.MaxInt8)
   324  		} else if diff < math.MinInt8 {
   325  			res.set(i, math.MinInt8)
   326  		} else {
   327  			res.set(i, int8(diff))
   328  		}
   329  	}
   330  	return res
   331  }
   332  
   333  // ToMask returns a mask representation of the vector.
   334  func (x Int8s) ToMask() Mask8s {
   335  	var res Mask8s
   336  	for i := 0; i < 16; i++ {
   337  		if x.get(i) != 0 {
   338  			res.set(i, true)
   339  		}
   340  	}
   341  	return res
   342  }
   343  
   344  // Xor returns the bitwise XOR of x and y.
   345  func (x Int8s) Xor(y Int8s) Int8s {
   346  	return Int8s{a: x.a ^ y.a, b: x.b ^ y.b}
   347  }
   348  
   349  // ConvertToUint8 converts the vector elements to uint8.
   350  func (x Int8s) ConvertToUint8() Uint8s {
   351  	return Uint8s{a: x.a, b: x.b}
   352  }
   353  
   354  // ToBits reinterprets the vector bits as a Uint8s vector.
   355  func (x Int8s) ToBits() Uint8s {
   356  	return Uint8s{a: x.a, b: x.b}
   357  }
   358  
   359  // Int16s represents a 128-bit vector of 8 int16 elements.
   360  type Int16s struct {
   361  	_    _simd
   362  	a, b uint64
   363  }
   364  
   365  // LoadInt16s loads a slice of int16 into an Int16s vector.
   366  func LoadInt16s(s []int16) Int16s {
   367  	var a, b uint64
   368  	for i := 0; i < 8; i++ {
   369  		val := uint64(uint16(s[i]))
   370  		if i < 4 {
   371  			a |= val << (16 * i)
   372  		} else {
   373  			b |= val << (16 * (i - 4))
   374  		}
   375  	}
   376  	return Int16s{a: a, b: b}
   377  }
   378  
   379  // LoadInt16sPart loads a partial slice of int16 into an Int16s vector.
   380  func LoadInt16sPart(s []int16) (Int16s, int) {
   381  	var a, b uint64
   382  	n := len(s)
   383  	if n > 8 {
   384  		n = 8
   385  	}
   386  	for i := 0; i < n; i++ {
   387  		val := uint64(uint16(s[i]))
   388  		if i < 4 {
   389  			a |= val << (16 * i)
   390  		} else {
   391  			b |= val << (16 * (i - 4))
   392  		}
   393  	}
   394  	return Int16s{a: a, b: b}, n
   395  }
   396  
   397  func (x Int16s) get(i int) int16 {
   398  	if i < 4 {
   399  		return int16(x.a >> (16 * i))
   400  	}
   401  	return int16(x.b >> (16 * (i - 4)))
   402  }
   403  
   404  func (x *Int16s) set(i int, v int16) {
   405  	val := uint64(uint16(v))
   406  	if i < 4 {
   407  		mask := uint64(0xffff) << (16 * i)
   408  		x.a = (x.a &^ mask) | (val << (16 * i))
   409  	} else {
   410  		mask := uint64(0xffff) << (16 * (i - 4))
   411  		x.b = (x.b &^ mask) | (val << (16 * (i - 4)))
   412  	}
   413  }
   414  
   415  // Abs returns the element-wise absolute value of x.
   416  func (x Int16s) Abs() Int16s {
   417  	var res Int16s
   418  	for i := 0; i < 8; i++ {
   419  		v := x.get(i)
   420  		if v < 0 {
   421  			res.set(i, -v)
   422  		} else {
   423  			res.set(i, v)
   424  		}
   425  	}
   426  	return res
   427  }
   428  
   429  // Add returns the element-wise sum of x and y.
   430  func (x Int16s) Add(y Int16s) Int16s {
   431  	var res Int16s
   432  	for i := 0; i < 8; i++ {
   433  		res.set(i, x.get(i)+y.get(i))
   434  	}
   435  	return res
   436  }
   437  
   438  // AddSaturated returns the element-wise saturated sum of x and y.
   439  func (x Int16s) AddSaturated(y Int16s) Int16s {
   440  	var res Int16s
   441  	for i := 0; i < 8; i++ {
   442  		sum := int(x.get(i)) + int(y.get(i))
   443  		if sum > math.MaxInt16 {
   444  			res.set(i, math.MaxInt16)
   445  		} else if sum < math.MinInt16 {
   446  			res.set(i, math.MinInt16)
   447  		} else {
   448  			res.set(i, int16(sum))
   449  		}
   450  	}
   451  	return res
   452  }
   453  
   454  // And returns the bitwise AND of x and y.
   455  func (x Int16s) And(y Int16s) Int16s {
   456  	return Int16s{a: x.a & y.a, b: x.b & y.b}
   457  }
   458  
   459  // AndNot returns the bitwise AND NOT of x and y.
   460  func (x Int16s) AndNot(y Int16s) Int16s {
   461  	return Int16s{a: x.a &^ y.a, b: x.b &^ y.b}
   462  }
   463  
   464  // Equal returns a mask indicating where x and y are equal.
   465  func (x Int16s) Equal(y Int16s) Mask16s {
   466  	var res Mask16s
   467  	for i := 0; i < 8; i++ {
   468  		if x.get(i) == y.get(i) {
   469  			res.set(i, true)
   470  		}
   471  	}
   472  	return res
   473  }
   474  
   475  // Greater returns a mask indicating where x is greater than y.
   476  func (x Int16s) Greater(y Int16s) Mask16s {
   477  	var res Mask16s
   478  	for i := 0; i < 8; i++ {
   479  		if x.get(i) > y.get(i) {
   480  			res.set(i, true)
   481  		}
   482  	}
   483  	return res
   484  }
   485  
   486  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
   487  func (x Int16s) GreaterEqual(y Int16s) Mask16s {
   488  	var res Mask16s
   489  	for i := 0; i < 8; i++ {
   490  		if x.get(i) >= y.get(i) {
   491  			res.set(i, true)
   492  		}
   493  	}
   494  	return res
   495  }
   496  
   497  // Less returns a mask indicating where x is less than y.
   498  func (x Int16s) Less(y Int16s) Mask16s {
   499  	var res Mask16s
   500  	for i := 0; i < 8; i++ {
   501  		if x.get(i) < y.get(i) {
   502  			res.set(i, true)
   503  		}
   504  	}
   505  	return res
   506  }
   507  
   508  // LessEqual returns a mask indicating where x is less than or equal to y.
   509  func (x Int16s) LessEqual(y Int16s) Mask16s {
   510  	var res Mask16s
   511  	for i := 0; i < 8; i++ {
   512  		if x.get(i) <= y.get(i) {
   513  			res.set(i, true)
   514  		}
   515  	}
   516  	return res
   517  }
   518  
   519  // NotEqual returns a mask indicating where x and y are not equal.
   520  func (x Int16s) NotEqual(y Int16s) Mask16s {
   521  	var res Mask16s
   522  	for i := 0; i < 8; i++ {
   523  		if x.get(i) != y.get(i) {
   524  			res.set(i, true)
   525  		}
   526  	}
   527  	return res
   528  }
   529  
   530  // Len returns the number of elements in the vector.
   531  func (x Int16s) Len() int {
   532  	return 8
   533  }
   534  
   535  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
   536  func (x Int16s) Masked(mask Mask16s) Int16s {
   537  	return Int16s{a: x.a & mask.a, b: x.b & mask.b}
   538  }
   539  
   540  // Max returns the element-wise maximum of x and y.
   541  func (x Int16s) Max(y Int16s) Int16s {
   542  	var res Int16s
   543  	for i := 0; i < 8; i++ {
   544  		vx := x.get(i)
   545  		vy := y.get(i)
   546  		if vx > vy {
   547  			res.set(i, vx)
   548  		} else {
   549  			res.set(i, vy)
   550  		}
   551  	}
   552  	return res
   553  }
   554  
   555  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
   556  func (x Int16s) IfElse(mask Mask16s, y Int16s) Int16s {
   557  	return Int16s{
   558  		a: (x.a & mask.a) | (y.a &^ mask.a),
   559  		b: (x.b & mask.b) | (y.b &^ mask.b),
   560  	}
   561  }
   562  
   563  // Min returns the element-wise minimum of x and y.
   564  func (x Int16s) Min(y Int16s) Int16s {
   565  	var res Int16s
   566  	for i := 0; i < 8; i++ {
   567  		vx := x.get(i)
   568  		vy := y.get(i)
   569  		if vx < vy {
   570  			res.set(i, vx)
   571  		} else {
   572  			res.set(i, vy)
   573  		}
   574  	}
   575  	return res
   576  }
   577  
   578  // Mul returns the element-wise product of x and y.
   579  func (x Int16s) Mul(y Int16s) Int16s {
   580  	var res Int16s
   581  	for i := 0; i < 8; i++ {
   582  		res.set(i, x.get(i)*y.get(i))
   583  	}
   584  	return res
   585  }
   586  
   587  // Neg returns the element-wise negation of x.
   588  func (x Int16s) Neg() Int16s {
   589  	var res Int16s
   590  	for i := 0; i < 8; i++ {
   591  		res.set(i, -x.get(i))
   592  	}
   593  	return res
   594  }
   595  
   596  // Not returns the bitwise NOT of x.
   597  func (x Int16s) Not() Int16s {
   598  	return Int16s{a: ^x.a, b: ^x.b}
   599  }
   600  
   601  // Or returns the bitwise OR of x and y.
   602  func (x Int16s) Or(y Int16s) Int16s {
   603  	return Int16s{a: x.a | y.a, b: x.b | y.b}
   604  }
   605  
   606  // ShiftAllLeft shifts all elements left by y bits.
   607  func (x Int16s) ShiftAllLeft(y uint8) Int16s {
   608  	var res Int16s
   609  	for i := 0; i < 8; i++ {
   610  		res.set(i, x.get(i)<<y)
   611  	}
   612  	return res
   613  }
   614  
   615  // ShiftAllRight shifts all elements right by y bits.
   616  func (x Int16s) ShiftAllRight(y uint8) Int16s {
   617  	var res Int16s
   618  	for i := 0; i < 8; i++ {
   619  		res.set(i, x.get(i)>>y)
   620  	}
   621  	return res
   622  }
   623  
   624  // RotateAllLeft rotates all elements left by dist bits.
   625  func (x Int16s) RotateAllLeft(dist uint64) Int16s {
   626  	var res Int16s
   627  	d := dist & 15
   628  	for i := 0; i < 8; i++ {
   629  		u := uint16(x.get(i))
   630  		r := (u << d) | (u >> ((16 - d) & 15))
   631  		res.set(i, int16(r))
   632  	}
   633  	return res
   634  }
   635  
   636  // RotateAllRight rotates all elements right by dist bits.
   637  func (x Int16s) RotateAllRight(dist uint64) Int16s {
   638  	var res Int16s
   639  	d := dist & 15
   640  	for i := 0; i < 8; i++ {
   641  		u := uint16(x.get(i))
   642  		r := (u >> d) | (u << ((16 - d) & 15))
   643  		res.set(i, int16(r))
   644  	}
   645  	return res
   646  }
   647  
   648  // Store stores the vector elements into the slice s.
   649  func (x Int16s) Store(s []int16) {
   650  	for i := 0; i < 8 && i < len(s); i++ {
   651  		s[i] = x.get(i)
   652  	}
   653  }
   654  
   655  // StorePart stores a partial vector into the slice s.
   656  func (x Int16s) StorePart(s []int16) int {
   657  	x.Store(s)
   658  	return min(len(s), x.Len())
   659  }
   660  
   661  // String returns a string representation of the vector.
   662  func (x Int16s) String() string {
   663  	var parts [8]int16
   664  	for i := 0; i < 8; i++ {
   665  		parts[i] = x.get(i)
   666  	}
   667  	return fmt.Sprint(parts)
   668  }
   669  
   670  // Sub returns the element-wise difference of x and y.
   671  func (x Int16s) Sub(y Int16s) Int16s {
   672  	var res Int16s
   673  	for i := 0; i < 8; i++ {
   674  		res.set(i, x.get(i)-y.get(i))
   675  	}
   676  	return res
   677  }
   678  
   679  // SubSaturated returns the element-wise saturated difference of x and y.
   680  func (x Int16s) SubSaturated(y Int16s) Int16s {
   681  	var res Int16s
   682  	for i := 0; i < 8; i++ {
   683  		diff := int(x.get(i)) - int(y.get(i))
   684  		if diff > math.MaxInt16 {
   685  			res.set(i, math.MaxInt16)
   686  		} else if diff < math.MinInt16 {
   687  			res.set(i, math.MinInt16)
   688  		} else {
   689  			res.set(i, int16(diff))
   690  		}
   691  	}
   692  	return res
   693  }
   694  
   695  // ToMask returns a mask representation of the vector.
   696  func (x Int16s) ToMask() Mask16s {
   697  	var res Mask16s
   698  	for i := 0; i < 8; i++ {
   699  		if x.get(i) != 0 {
   700  			res.set(i, true)
   701  		}
   702  	}
   703  	return res
   704  }
   705  
   706  // Xor returns the bitwise XOR of x and y.
   707  func (x Int16s) Xor(y Int16s) Int16s {
   708  	return Int16s{a: x.a ^ y.a, b: x.b ^ y.b}
   709  }
   710  
   711  // ConvertToUint16 converts the vector elements to uint16.
   712  func (x Int16s) ConvertToUint16() Uint16s {
   713  	return Uint16s{a: x.a, b: x.b}
   714  }
   715  
   716  // ToBits reinterprets the vector bits as a Uint16s vector.
   717  func (x Int16s) ToBits() Uint16s {
   718  	return Uint16s{a: x.a, b: x.b}
   719  }
   720  
   721  // Int32s represents a 128-bit vector of 4 int32 elements.
   722  type Int32s struct {
   723  	_    _simd
   724  	a, b uint64
   725  }
   726  
   727  // LoadInt32s loads a slice of int32 into an Int32s vector.
   728  func LoadInt32s(s []int32) Int32s {
   729  	var a, b uint64
   730  	for i := 0; i < 4; i++ {
   731  		val := uint64(uint32(s[i]))
   732  		if i < 2 {
   733  			a |= val << (32 * i)
   734  		} else {
   735  			b |= val << (32 * (i - 2))
   736  		}
   737  	}
   738  	return Int32s{a: a, b: b}
   739  }
   740  
   741  // LoadInt32sPart loads a partial slice of int32 into an Int32s vector.
   742  func LoadInt32sPart(s []int32) (Int32s, int) {
   743  	var a, b uint64
   744  	n := len(s)
   745  	if n > 4 {
   746  		n = 4
   747  	}
   748  	for i := 0; i < n; i++ {
   749  		val := uint64(uint32(s[i]))
   750  		if i < 2 {
   751  			a |= val << (32 * i)
   752  		} else {
   753  			b |= val << (32 * (i - 2))
   754  		}
   755  	}
   756  	return Int32s{a: a, b: b}, n
   757  }
   758  
   759  func (x Int32s) get(i int) int32 {
   760  	if i < 2 {
   761  		return int32(x.a >> (32 * i))
   762  	}
   763  	return int32(x.b >> (32 * (i - 2)))
   764  }
   765  
   766  func (x *Int32s) set(i int, v int32) {
   767  	val := uint64(uint32(v))
   768  	if i < 2 {
   769  		mask := uint64(0xffffffff) << (32 * i)
   770  		x.a = (x.a &^ mask) | (val << (32 * i))
   771  	} else {
   772  		mask := uint64(0xffffffff) << (32 * (i - 2))
   773  		x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
   774  	}
   775  }
   776  
   777  // Abs returns the element-wise absolute value of x.
   778  func (x Int32s) Abs() Int32s {
   779  	var res Int32s
   780  	for i := 0; i < 4; i++ {
   781  		v := x.get(i)
   782  		if v < 0 {
   783  			res.set(i, -v)
   784  		} else {
   785  			res.set(i, v)
   786  		}
   787  	}
   788  	return res
   789  }
   790  
   791  // Add returns the element-wise sum of x and y.
   792  func (x Int32s) Add(y Int32s) Int32s {
   793  	var res Int32s
   794  	for i := 0; i < 4; i++ {
   795  		res.set(i, x.get(i)+y.get(i))
   796  	}
   797  	return res
   798  }
   799  
   800  // And returns the bitwise AND of x and y.
   801  func (x Int32s) And(y Int32s) Int32s {
   802  	return Int32s{a: x.a & y.a, b: x.b & y.b}
   803  }
   804  
   805  // AndNot returns the bitwise AND NOT of x and y.
   806  func (x Int32s) AndNot(y Int32s) Int32s {
   807  	return Int32s{a: x.a &^ y.a, b: x.b &^ y.b}
   808  }
   809  
   810  // ConvertToFloat32 converts the vector elements to float32.
   811  func (x Int32s) ConvertToFloat32() Float32s {
   812  	var res Float32s
   813  	for i := 0; i < 4; i++ {
   814  		res.set(i, float32(x.get(i)))
   815  	}
   816  	return res
   817  }
   818  
   819  // Equal returns a mask indicating where x and y are equal.
   820  func (x Int32s) Equal(y Int32s) Mask32s {
   821  	var res Mask32s
   822  	for i := 0; i < 4; i++ {
   823  		if x.get(i) == y.get(i) {
   824  			res.set(i, true)
   825  		}
   826  	}
   827  	return res
   828  }
   829  
   830  // Greater returns a mask indicating where x is greater than y.
   831  func (x Int32s) Greater(y Int32s) Mask32s {
   832  	var res Mask32s
   833  	for i := 0; i < 4; i++ {
   834  		if x.get(i) > y.get(i) {
   835  			res.set(i, true)
   836  		}
   837  	}
   838  	return res
   839  }
   840  
   841  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
   842  func (x Int32s) GreaterEqual(y Int32s) Mask32s {
   843  	var res Mask32s
   844  	for i := 0; i < 4; i++ {
   845  		if x.get(i) >= y.get(i) {
   846  			res.set(i, true)
   847  		}
   848  	}
   849  	return res
   850  }
   851  
   852  // Less returns a mask indicating where x is less than y.
   853  func (x Int32s) Less(y Int32s) Mask32s {
   854  	var res Mask32s
   855  	for i := 0; i < 4; i++ {
   856  		if x.get(i) < y.get(i) {
   857  			res.set(i, true)
   858  		}
   859  	}
   860  	return res
   861  }
   862  
   863  // LessEqual returns a mask indicating where x is less than or equal to y.
   864  func (x Int32s) LessEqual(y Int32s) Mask32s {
   865  	var res Mask32s
   866  	for i := 0; i < 4; i++ {
   867  		if x.get(i) <= y.get(i) {
   868  			res.set(i, true)
   869  		}
   870  	}
   871  	return res
   872  }
   873  
   874  // NotEqual returns a mask indicating where x and y are not equal.
   875  func (x Int32s) NotEqual(y Int32s) Mask32s {
   876  	var res Mask32s
   877  	for i := 0; i < 4; i++ {
   878  		if x.get(i) != y.get(i) {
   879  			res.set(i, true)
   880  		}
   881  	}
   882  	return res
   883  }
   884  
   885  // Len returns the number of elements in the vector.
   886  func (x Int32s) Len() int {
   887  	return 4
   888  }
   889  
   890  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
   891  func (x Int32s) Masked(mask Mask32s) Int32s {
   892  	return Int32s{a: x.a & mask.a, b: x.b & mask.b}
   893  }
   894  
   895  // Max returns the element-wise maximum of x and y.
   896  func (x Int32s) Max(y Int32s) Int32s {
   897  	var res Int32s
   898  	for i := 0; i < 4; i++ {
   899  		vx := x.get(i)
   900  		vy := y.get(i)
   901  		if vx > vy {
   902  			res.set(i, vx)
   903  		} else {
   904  			res.set(i, vy)
   905  		}
   906  	}
   907  	return res
   908  }
   909  
   910  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
   911  func (x Int32s) IfElse(mask Mask32s, y Int32s) Int32s {
   912  	return Int32s{
   913  		a: (x.a & mask.a) | (y.a &^ mask.a),
   914  		b: (x.b & mask.b) | (y.b &^ mask.b),
   915  	}
   916  }
   917  
   918  // Min returns the element-wise minimum of x and y.
   919  func (x Int32s) Min(y Int32s) Int32s {
   920  	var res Int32s
   921  	for i := 0; i < 4; i++ {
   922  		vx := x.get(i)
   923  		vy := y.get(i)
   924  		if vx < vy {
   925  			res.set(i, vx)
   926  		} else {
   927  			res.set(i, vy)
   928  		}
   929  	}
   930  	return res
   931  }
   932  
   933  // Mul returns the element-wise product of x and y.
   934  func (x Int32s) Mul(y Int32s) Int32s {
   935  	var res Int32s
   936  	for i := 0; i < 4; i++ {
   937  		res.set(i, x.get(i)*y.get(i))
   938  	}
   939  	return res
   940  }
   941  
   942  // Neg returns the element-wise negation of x.
   943  func (x Int32s) Neg() Int32s {
   944  	var res Int32s
   945  	for i := 0; i < 4; i++ {
   946  		res.set(i, -x.get(i))
   947  	}
   948  	return res
   949  }
   950  
   951  // Not returns the bitwise NOT of x.
   952  func (x Int32s) Not() Int32s {
   953  	return Int32s{a: ^x.a, b: ^x.b}
   954  }
   955  
   956  // Or returns the bitwise OR of x and y.
   957  func (x Int32s) Or(y Int32s) Int32s {
   958  	return Int32s{a: x.a | y.a, b: x.b | y.b}
   959  }
   960  
   961  // ShiftAllLeft shifts all elements left by y bits.
   962  func (x Int32s) ShiftAllLeft(y uint8) Int32s {
   963  	var res Int32s
   964  	for i := 0; i < 4; i++ {
   965  		res.set(i, x.get(i)<<y)
   966  	}
   967  	return res
   968  }
   969  
   970  // ShiftAllRight shifts all elements right by y bits.
   971  func (x Int32s) ShiftAllRight(y uint8) Int32s {
   972  	var res Int32s
   973  	for i := 0; i < 4; i++ {
   974  		res.set(i, x.get(i)>>y)
   975  	}
   976  	return res
   977  }
   978  
   979  // RotateAllLeft rotates all elements left by dist bits.
   980  func (x Int32s) RotateAllLeft(dist uint64) Int32s {
   981  	var res Int32s
   982  	d := dist & 31
   983  	for i := 0; i < 4; i++ {
   984  		u := uint32(x.get(i))
   985  		r := (u << d) | (u >> ((32 - d) & 31))
   986  		res.set(i, int32(r))
   987  	}
   988  	return res
   989  }
   990  
   991  // RotateAllRight rotates all elements right by dist bits.
   992  func (x Int32s) RotateAllRight(dist uint64) Int32s {
   993  	var res Int32s
   994  	d := dist & 31
   995  	for i := 0; i < 4; i++ {
   996  		u := uint32(x.get(i))
   997  		r := (u >> d) | (u << ((32 - d) & 31))
   998  		res.set(i, int32(r))
   999  	}
  1000  	return res
  1001  }
  1002  
  1003  // Store stores the vector elements into the slice s.
  1004  func (x Int32s) Store(s []int32) {
  1005  	for i := 0; i < 4 && i < len(s); i++ {
  1006  		s[i] = x.get(i)
  1007  	}
  1008  }
  1009  
  1010  // StorePart stores a partial vector into the slice s.
  1011  func (x Int32s) StorePart(s []int32) int {
  1012  	x.Store(s)
  1013  	return min(len(s), x.Len())
  1014  }
  1015  
  1016  // String returns a string representation of the vector.
  1017  func (x Int32s) String() string {
  1018  	var parts [4]int32
  1019  	for i := 0; i < 4; i++ {
  1020  		parts[i] = x.get(i)
  1021  	}
  1022  	return fmt.Sprint(parts)
  1023  }
  1024  
  1025  // Sub returns the element-wise difference of x and y.
  1026  func (x Int32s) Sub(y Int32s) Int32s {
  1027  	var res Int32s
  1028  	for i := 0; i < 4; i++ {
  1029  		res.set(i, x.get(i)-y.get(i))
  1030  	}
  1031  	return res
  1032  }
  1033  
  1034  // ToMask returns a mask representation of the vector.
  1035  func (x Int32s) ToMask() Mask32s {
  1036  	var res Mask32s
  1037  	for i := 0; i < 4; i++ {
  1038  		if x.get(i) != 0 {
  1039  			res.set(i, true)
  1040  		}
  1041  	}
  1042  	return res
  1043  }
  1044  
  1045  // Xor returns the bitwise XOR of x and y.
  1046  func (x Int32s) Xor(y Int32s) Int32s {
  1047  	return Int32s{a: x.a ^ y.a, b: x.b ^ y.b}
  1048  }
  1049  
  1050  // ConvertToUint32 converts the vector elements to uint32.
  1051  func (x Int32s) ConvertToUint32() Uint32s {
  1052  	return Uint32s{a: x.a, b: x.b}
  1053  }
  1054  
  1055  // ToBits reinterprets the vector bits as a Uint32s vector.
  1056  func (x Int32s) ToBits() Uint32s {
  1057  	return Uint32s{a: x.a, b: x.b}
  1058  }
  1059  
  1060  // Int64s represents a 128-bit vector of 2 int64 elements.
  1061  type Int64s struct {
  1062  	_    _simd
  1063  	a, b uint64
  1064  }
  1065  
  1066  // LoadInt64s loads a slice of int64 into an Int64s vector.
  1067  func LoadInt64s(s []int64) Int64s {
  1068  	var a, b uint64
  1069  	a = uint64(s[0])
  1070  	b = uint64(s[1])
  1071  	return Int64s{a: a, b: b}
  1072  }
  1073  
  1074  // LoadInt64sPart loads a partial slice of int64 into an Int64s vector.
  1075  func LoadInt64sPart(s []int64) (Int64s, int) {
  1076  	var a, b uint64
  1077  	if len(s) > 0 {
  1078  		a = uint64(s[0])
  1079  	}
  1080  	if len(s) > 1 {
  1081  		b = uint64(s[1])
  1082  	}
  1083  	return Int64s{a: a, b: b}, len(s)
  1084  }
  1085  
  1086  func (x Int64s) get(i int) int64 {
  1087  	if i == 0 {
  1088  		return int64(x.a)
  1089  	}
  1090  	return int64(x.b)
  1091  }
  1092  
  1093  func (x *Int64s) set(i int, v int64) {
  1094  	if i == 0 {
  1095  		x.a = uint64(v)
  1096  	} else {
  1097  		x.b = uint64(v)
  1098  	}
  1099  }
  1100  
  1101  // Add returns the element-wise sum of x and y.
  1102  func (x Int64s) Add(y Int64s) Int64s {
  1103  	return Int64s{a: x.a + y.a, b: x.b + y.b}
  1104  }
  1105  
  1106  // And returns the bitwise AND of x and y.
  1107  func (x Int64s) And(y Int64s) Int64s {
  1108  	return Int64s{a: x.a & y.a, b: x.b & y.b}
  1109  }
  1110  
  1111  // AndNot returns the bitwise AND NOT of x and y.
  1112  func (x Int64s) AndNot(y Int64s) Int64s {
  1113  	return Int64s{a: x.a &^ y.a, b: x.b &^ y.b}
  1114  }
  1115  
  1116  // Equal returns a mask indicating where x and y are equal.
  1117  func (x Int64s) Equal(y Int64s) Mask64s {
  1118  	var res Mask64s
  1119  	if x.a == y.a {
  1120  		res.a = ^uint64(0)
  1121  	}
  1122  	if x.b == y.b {
  1123  		res.b = ^uint64(0)
  1124  	}
  1125  	return res
  1126  }
  1127  
  1128  // Greater returns a mask indicating where x is greater than y.
  1129  func (x Int64s) Greater(y Int64s) Mask64s {
  1130  	var res Mask64s
  1131  	if int64(x.a) > int64(y.a) {
  1132  		res.a = ^uint64(0)
  1133  	}
  1134  	if int64(x.b) > int64(y.b) {
  1135  		res.b = ^uint64(0)
  1136  	}
  1137  	return res
  1138  }
  1139  
  1140  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  1141  func (x Int64s) GreaterEqual(y Int64s) Mask64s {
  1142  	var res Mask64s
  1143  	if int64(x.a) >= int64(y.a) {
  1144  		res.a = ^uint64(0)
  1145  	}
  1146  	if int64(x.b) >= int64(y.b) {
  1147  		res.b = ^uint64(0)
  1148  	}
  1149  	return res
  1150  }
  1151  
  1152  // Less returns a mask indicating where x is less than y.
  1153  func (x Int64s) Less(y Int64s) Mask64s {
  1154  	var res Mask64s
  1155  	if int64(x.a) < int64(y.a) {
  1156  		res.a = ^uint64(0)
  1157  	}
  1158  	if int64(x.b) < int64(y.b) {
  1159  		res.b = ^uint64(0)
  1160  	}
  1161  	return res
  1162  }
  1163  
  1164  // LessEqual returns a mask indicating where x is less than or equal to y.
  1165  func (x Int64s) LessEqual(y Int64s) Mask64s {
  1166  	var res Mask64s
  1167  	if int64(x.a) <= int64(y.a) {
  1168  		res.a = ^uint64(0)
  1169  	}
  1170  	if int64(x.b) <= int64(y.b) {
  1171  		res.b = ^uint64(0)
  1172  	}
  1173  	return res
  1174  }
  1175  
  1176  // NotEqual returns a mask indicating where x and y are not equal.
  1177  func (x Int64s) NotEqual(y Int64s) Mask64s {
  1178  	var res Mask64s
  1179  	if x.a != y.a {
  1180  		res.a = ^uint64(0)
  1181  	}
  1182  	if x.b != y.b {
  1183  		res.b = ^uint64(0)
  1184  	}
  1185  	return res
  1186  }
  1187  
  1188  // Len returns the number of elements in the vector.
  1189  func (x Int64s) Len() int {
  1190  	return 2
  1191  }
  1192  
  1193  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  1194  func (x Int64s) Masked(mask Mask64s) Int64s {
  1195  	return Int64s{a: x.a & mask.a, b: x.b & mask.b}
  1196  }
  1197  
  1198  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  1199  func (x Int64s) IfElse(mask Mask64s, y Int64s) Int64s {
  1200  	return Int64s{
  1201  		a: (x.a & mask.a) | (y.a &^ mask.a),
  1202  		b: (x.b & mask.b) | (y.b &^ mask.b),
  1203  	}
  1204  }
  1205  
  1206  // Neg returns the element-wise negation of x.
  1207  func (x Int64s) Neg() Int64s {
  1208  	return Int64s{a: uint64(-int64(x.a)), b: uint64(-int64(x.b))}
  1209  }
  1210  
  1211  // Not returns the bitwise NOT of x.
  1212  func (x Int64s) Not() Int64s {
  1213  	return Int64s{a: ^x.a, b: ^x.b}
  1214  }
  1215  
  1216  // Or returns the bitwise OR of x and y.
  1217  func (x Int64s) Or(y Int64s) Int64s {
  1218  	return Int64s{a: x.a | y.a, b: x.b | y.b}
  1219  }
  1220  
  1221  // ShiftAllLeft shifts all elements left by y bits.
  1222  func (x Int64s) ShiftAllLeft(y uint8) Int64s {
  1223  	return Int64s{a: x.a << y, b: x.b << y}
  1224  }
  1225  
  1226  // RotateAllLeft rotates all elements left by dist bits.
  1227  func (x Int64s) RotateAllLeft(dist uint64) Int64s {
  1228  	d := dist & 63
  1229  	return Int64s{
  1230  		a: (x.a << d) | (x.a >> ((64 - d) & 63)),
  1231  		b: (x.b << d) | (x.b >> ((64 - d) & 63)),
  1232  	}
  1233  }
  1234  
  1235  // RotateAllRight rotates all elements right by dist bits.
  1236  func (x Int64s) RotateAllRight(dist uint64) Int64s {
  1237  	d := dist & 63
  1238  	return Int64s{
  1239  		a: (x.a >> d) | (x.a << ((64 - d) & 63)),
  1240  		b: (x.b >> d) | (x.b << ((64 - d) & 63)),
  1241  	}
  1242  }
  1243  
  1244  // Store stores the vector elements into the slice s.
  1245  func (x Int64s) Store(s []int64) {
  1246  	if len(s) > 0 {
  1247  		s[0] = int64(x.a)
  1248  	}
  1249  	if len(s) > 1 {
  1250  		s[1] = int64(x.b)
  1251  	}
  1252  }
  1253  
  1254  // StorePart stores a partial vector into the slice s.
  1255  func (x Int64s) StorePart(s []int64) int {
  1256  	x.Store(s)
  1257  	return min(len(s), x.Len())
  1258  }
  1259  
  1260  // String returns a string representation of the vector.
  1261  func (x Int64s) String() string {
  1262  	return fmt.Sprint([2]int64{int64(x.a), int64(x.b)})
  1263  }
  1264  
  1265  // Sub returns the element-wise difference of x and y.
  1266  func (x Int64s) Sub(y Int64s) Int64s {
  1267  	return Int64s{a: x.a - y.a, b: x.b - y.b}
  1268  }
  1269  
  1270  // ToMask returns a mask representation of the vector.
  1271  func (x Int64s) ToMask() Mask64s {
  1272  	var res Mask64s
  1273  	if x.a != 0 {
  1274  		res.a = ^uint64(0)
  1275  	}
  1276  	if x.b != 0 {
  1277  		res.b = ^uint64(0)
  1278  	}
  1279  	return res
  1280  }
  1281  
  1282  // Xor returns the bitwise XOR of x and y.
  1283  func (x Int64s) Xor(y Int64s) Int64s {
  1284  	return Int64s{a: x.a ^ y.a, b: x.b ^ y.b}
  1285  }
  1286  
  1287  // ConvertToUint64 converts the vector elements to uint64.
  1288  func (x Int64s) ConvertToUint64() Uint64s {
  1289  	return Uint64s{a: x.a, b: x.b}
  1290  }
  1291  
  1292  // ToBits reinterprets the vector bits as a Uint64s vector.
  1293  func (x Int64s) ToBits() Uint64s {
  1294  	return Uint64s{a: x.a, b: x.b}
  1295  }
  1296  
  1297  // Uint8s represents a 128-bit vector of 16 uint8 elements.
  1298  type Uint8s struct {
  1299  	_    _simd
  1300  	a, b uint64
  1301  }
  1302  
  1303  // LoadUint8s loads a slice of uint8 into an Uint8s vector.
  1304  func LoadUint8s(s []uint8) Uint8s {
  1305  	var a, b uint64
  1306  	for i := 0; i < 16; i++ {
  1307  		val := uint64(s[i])
  1308  		if i < 8 {
  1309  			a |= val << (8 * i)
  1310  		} else {
  1311  			b |= val << (8 * (i - 8))
  1312  		}
  1313  	}
  1314  	return Uint8s{a: a, b: b}
  1315  }
  1316  
  1317  // LoadUint8sPart loads a partial slice of uint8 into an Uint8s vector.
  1318  func LoadUint8sPart(s []uint8) (Uint8s, int) {
  1319  	var a, b uint64
  1320  	n := len(s)
  1321  	if n > 16 {
  1322  		n = 16
  1323  	}
  1324  	for i := 0; i < n; i++ {
  1325  		val := uint64(s[i])
  1326  		if i < 8 {
  1327  			a |= val << (8 * i)
  1328  		} else {
  1329  			b |= val << (8 * (i - 8))
  1330  		}
  1331  	}
  1332  	return Uint8s{a: a, b: b}, n
  1333  }
  1334  
  1335  func (x Uint8s) get(i int) uint8 {
  1336  	if i < 8 {
  1337  		return uint8(x.a >> (8 * i))
  1338  	}
  1339  	return uint8(x.b >> (8 * (i - 8)))
  1340  }
  1341  
  1342  func (x *Uint8s) set(i int, v uint8) {
  1343  	val := uint64(v)
  1344  	if i < 8 {
  1345  		mask := uint64(0xff) << (8 * i)
  1346  		x.a = (x.a &^ mask) | (val << (8 * i))
  1347  	} else {
  1348  		mask := uint64(0xff) << (8 * (i - 8))
  1349  		x.b = (x.b &^ mask) | (val << (8 * (i - 8)))
  1350  	}
  1351  }
  1352  
  1353  // Add returns the element-wise sum of x and y.
  1354  func (x Uint8s) Add(y Uint8s) Uint8s {
  1355  	var res Uint8s
  1356  	for i := 0; i < 16; i++ {
  1357  		res.set(i, x.get(i)+y.get(i))
  1358  	}
  1359  	return res
  1360  }
  1361  
  1362  // AddSaturated returns the element-wise saturated sum of x and y.
  1363  func (x Uint8s) AddSaturated(y Uint8s) Uint8s {
  1364  	var res Uint8s
  1365  	for i := 0; i < 16; i++ {
  1366  		sum := int(x.get(i)) + int(y.get(i))
  1367  		if sum > math.MaxUint8 {
  1368  			res.set(i, math.MaxUint8)
  1369  		} else {
  1370  			res.set(i, uint8(sum))
  1371  		}
  1372  	}
  1373  	return res
  1374  }
  1375  
  1376  // And returns the bitwise AND of x and y.
  1377  func (x Uint8s) And(y Uint8s) Uint8s {
  1378  	return Uint8s{a: x.a & y.a, b: x.b & y.b}
  1379  }
  1380  
  1381  // AndNot returns the bitwise AND NOT of x and y.
  1382  func (x Uint8s) AndNot(y Uint8s) Uint8s {
  1383  	return Uint8s{a: x.a &^ y.a, b: x.b &^ y.b}
  1384  }
  1385  
  1386  // Average returns the element-wise average of x and y.
  1387  func (x Uint8s) Average(y Uint8s) Uint8s {
  1388  	var res Uint8s
  1389  	for i := 0; i < 16; i++ {
  1390  		res.set(i, uint8((int(x.get(i))+int(y.get(i))+1)>>1))
  1391  	}
  1392  	return res
  1393  }
  1394  
  1395  // Equal returns a mask indicating where x and y are equal.
  1396  func (x Uint8s) Equal(y Uint8s) Mask8s {
  1397  	var res Mask8s
  1398  	for i := 0; i < 16; i++ {
  1399  		if x.get(i) == y.get(i) {
  1400  			res.set(i, true)
  1401  		}
  1402  	}
  1403  	return res
  1404  }
  1405  
  1406  // NotEqual returns a mask indicating where x and y are not equal.
  1407  func (x Uint8s) NotEqual(y Uint8s) Mask8s {
  1408  	var res Mask8s
  1409  	for i := 0; i < 16; i++ {
  1410  		if x.get(i) != y.get(i) {
  1411  			res.set(i, true)
  1412  		}
  1413  	}
  1414  	return res
  1415  }
  1416  
  1417  // Len returns the number of elements in the vector.
  1418  func (x Uint8s) Len() int {
  1419  	return 16
  1420  }
  1421  
  1422  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  1423  func (x Uint8s) Masked(mask Mask8s) Uint8s {
  1424  	return Uint8s{a: x.a & mask.a, b: x.b & mask.b}
  1425  }
  1426  
  1427  // Max returns the element-wise maximum of x and y.
  1428  func (x Uint8s) Max(y Uint8s) Uint8s {
  1429  	var res Uint8s
  1430  	for i := 0; i < 16; i++ {
  1431  		vx := x.get(i)
  1432  		vy := y.get(i)
  1433  		if vx > vy {
  1434  			res.set(i, vx)
  1435  		} else {
  1436  			res.set(i, vy)
  1437  		}
  1438  	}
  1439  	return res
  1440  }
  1441  
  1442  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  1443  func (x Uint8s) IfElse(mask Mask8s, y Uint8s) Uint8s {
  1444  	return Uint8s{
  1445  		a: (x.a & mask.a) | (y.a &^ mask.a),
  1446  		b: (x.b & mask.b) | (y.b &^ mask.b),
  1447  	}
  1448  }
  1449  
  1450  // Min returns the element-wise minimum of x and y.
  1451  func (x Uint8s) Min(y Uint8s) Uint8s {
  1452  	var res Uint8s
  1453  	for i := 0; i < 16; i++ {
  1454  		vx := x.get(i)
  1455  		vy := y.get(i)
  1456  		if vx < vy {
  1457  			res.set(i, vx)
  1458  		} else {
  1459  			res.set(i, vy)
  1460  		}
  1461  	}
  1462  	return res
  1463  }
  1464  
  1465  // Mul returns the element-wise product of x and y.
  1466  func (x Uint8s) Mul(y Uint8s) Uint8s {
  1467  	var res Uint8s
  1468  	for i := 0; i < 16; i++ {
  1469  		res.set(i, x.get(i)*y.get(i))
  1470  	}
  1471  	return res
  1472  }
  1473  
  1474  // Not returns the bitwise NOT of x.
  1475  func (x Uint8s) Not() Uint8s {
  1476  	return Uint8s{a: ^x.a, b: ^x.b}
  1477  }
  1478  
  1479  // Or returns the bitwise OR of x and y.
  1480  func (x Uint8s) Or(y Uint8s) Uint8s {
  1481  	return Uint8s{a: x.a | y.a, b: x.b | y.b}
  1482  }
  1483  
  1484  // Store stores the vector elements into the slice s.
  1485  func (x Uint8s) Store(s []uint8) {
  1486  	for i := 0; i < 16 && i < len(s); i++ {
  1487  		s[i] = x.get(i)
  1488  	}
  1489  }
  1490  
  1491  // StorePart stores a partial vector into the slice s.
  1492  func (x Uint8s) StorePart(s []uint8) int {
  1493  	x.Store(s)
  1494  	return min(len(s), x.Len())
  1495  }
  1496  
  1497  // String returns a string representation of the vector.
  1498  func (x Uint8s) String() string {
  1499  	var parts [16]uint8
  1500  	for i := 0; i < 16; i++ {
  1501  		parts[i] = x.get(i)
  1502  	}
  1503  	return fmt.Sprint(parts)
  1504  }
  1505  
  1506  // Sub returns the element-wise difference of x and y.
  1507  func (x Uint8s) Sub(y Uint8s) Uint8s {
  1508  	var res Uint8s
  1509  	for i := 0; i < 16; i++ {
  1510  		res.set(i, x.get(i)-y.get(i))
  1511  	}
  1512  	return res
  1513  }
  1514  
  1515  // SubSaturated returns the element-wise saturated difference of x and y.
  1516  func (x Uint8s) SubSaturated(y Uint8s) Uint8s {
  1517  	var res Uint8s
  1518  	for i := 0; i < 16; i++ {
  1519  		vx := x.get(i)
  1520  		vy := y.get(i)
  1521  		if vx < vy {
  1522  			res.set(i, 0)
  1523  		} else {
  1524  			res.set(i, vx-vy)
  1525  		}
  1526  	}
  1527  	return res
  1528  }
  1529  
  1530  // Xor returns the bitwise XOR of x and y.
  1531  func (x Uint8s) Xor(y Uint8s) Uint8s {
  1532  	return Uint8s{a: x.a ^ y.a, b: x.b ^ y.b}
  1533  }
  1534  
  1535  // BitsToInt8 reinterprets the vector bits as an Int8s vector.
  1536  func (x Uint8s) BitsToInt8() Int8s {
  1537  	return Int8s{a: x.a, b: x.b}
  1538  }
  1539  
  1540  // ConvertToInt8 converts the vector elements to int8.
  1541  func (x Uint8s) ConvertToInt8() Int8s {
  1542  	return Int8s{a: x.a, b: x.b}
  1543  }
  1544  
  1545  // ReshapeToUint16s reinterprets the vector bits as a Uint16s vector.
  1546  func (x Uint8s) ReshapeToUint16s() Uint16s {
  1547  	return Uint16s{a: x.a, b: x.b}
  1548  }
  1549  
  1550  // ReshapeToUint32s reinterprets the vector bits as a Uint32s vector.
  1551  func (x Uint8s) ReshapeToUint32s() Uint32s {
  1552  	return Uint32s{a: x.a, b: x.b}
  1553  }
  1554  
  1555  // ReshapeToUint64s reinterprets the vector bits as a Uint64s vector.
  1556  func (x Uint8s) ReshapeToUint64s() Uint64s {
  1557  	return Uint64s{a: x.a, b: x.b}
  1558  }
  1559  
  1560  // Uint16s represents a 128-bit vector of 8 uint16 elements.
  1561  type Uint16s struct {
  1562  	_    _simd
  1563  	a, b uint64
  1564  }
  1565  
  1566  // LoadUint16s loads a slice of uint16 into an Uint16s vector.
  1567  func LoadUint16s(s []uint16) Uint16s {
  1568  	var a, b uint64
  1569  	for i := 0; i < 8; i++ {
  1570  		val := uint64(s[i])
  1571  		if i < 4 {
  1572  			a |= val << (16 * i)
  1573  		} else {
  1574  			b |= val << (16 * (i - 4))
  1575  		}
  1576  	}
  1577  	return Uint16s{a: a, b: b}
  1578  }
  1579  
  1580  // LoadUint16sPart loads a partial slice of uint16 into an Uint16s vector.
  1581  func LoadUint16sPart(s []uint16) (Uint16s, int) {
  1582  	var a, b uint64
  1583  	n := len(s)
  1584  	if n > 8 {
  1585  		n = 8
  1586  	}
  1587  	for i := 0; i < n; i++ {
  1588  		val := uint64(s[i])
  1589  		if i < 4 {
  1590  			a |= val << (16 * i)
  1591  		} else {
  1592  			b |= val << (16 * (i - 4))
  1593  		}
  1594  	}
  1595  	return Uint16s{a: a, b: b}, n
  1596  }
  1597  
  1598  func (x Uint16s) get(i int) uint16 {
  1599  	if i < 4 {
  1600  		return uint16(x.a >> (16 * i))
  1601  	}
  1602  	return uint16(x.b >> (16 * (i - 4)))
  1603  }
  1604  
  1605  func (x *Uint16s) set(i int, v uint16) {
  1606  	val := uint64(v)
  1607  	if i < 4 {
  1608  		mask := uint64(0xffff) << (16 * i)
  1609  		x.a = (x.a &^ mask) | (val << (16 * i))
  1610  	} else {
  1611  		mask := uint64(0xffff) << (16 * (i - 4))
  1612  		x.b = (x.b &^ mask) | (val << (16 * (i - 4)))
  1613  	}
  1614  }
  1615  
  1616  // Add returns the element-wise sum of x and y.
  1617  func (x Uint16s) Add(y Uint16s) Uint16s {
  1618  	var res Uint16s
  1619  	for i := 0; i < 8; i++ {
  1620  		res.set(i, x.get(i)+y.get(i))
  1621  	}
  1622  	return res
  1623  }
  1624  
  1625  // AddSaturated returns the element-wise saturated sum of x and y.
  1626  func (x Uint16s) AddSaturated(y Uint16s) Uint16s {
  1627  	var res Uint16s
  1628  	for i := 0; i < 8; i++ {
  1629  		sum := int(x.get(i)) + int(y.get(i))
  1630  		if sum > math.MaxUint16 {
  1631  			res.set(i, math.MaxUint16)
  1632  		} else {
  1633  			res.set(i, uint16(sum))
  1634  		}
  1635  	}
  1636  	return res
  1637  }
  1638  
  1639  // And returns the bitwise AND of x and y.
  1640  func (x Uint16s) And(y Uint16s) Uint16s {
  1641  	return Uint16s{a: x.a & y.a, b: x.b & y.b}
  1642  }
  1643  
  1644  // AndNot returns the bitwise AND NOT of x and y.
  1645  func (x Uint16s) AndNot(y Uint16s) Uint16s {
  1646  	return Uint16s{a: x.a &^ y.a, b: x.b &^ y.b}
  1647  }
  1648  
  1649  // Average returns the element-wise average of x and y.
  1650  func (x Uint16s) Average(y Uint16s) Uint16s {
  1651  	var res Uint16s
  1652  	for i := 0; i < 8; i++ {
  1653  		res.set(i, uint16((int(x.get(i))+int(y.get(i))+1)>>1))
  1654  	}
  1655  	return res
  1656  }
  1657  
  1658  // Equal returns a mask indicating where x and y are equal.
  1659  func (x Uint16s) Equal(y Uint16s) Mask16s {
  1660  	var res Mask16s
  1661  	for i := 0; i < 8; i++ {
  1662  		if x.get(i) == y.get(i) {
  1663  			res.set(i, true)
  1664  		}
  1665  	}
  1666  	return res
  1667  }
  1668  
  1669  // Greater returns a mask indicating where x is greater than y.
  1670  func (x Uint16s) Greater(y Uint16s) Mask16s {
  1671  	var res Mask16s
  1672  	for i := 0; i < 8; i++ {
  1673  		if x.get(i) > y.get(i) {
  1674  			res.set(i, true)
  1675  		}
  1676  	}
  1677  	return res
  1678  }
  1679  
  1680  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  1681  func (x Uint16s) GreaterEqual(y Uint16s) Mask16s {
  1682  	var res Mask16s
  1683  	for i := 0; i < 8; i++ {
  1684  		if x.get(i) >= y.get(i) {
  1685  			res.set(i, true)
  1686  		}
  1687  	}
  1688  	return res
  1689  }
  1690  
  1691  // Less returns a mask indicating where x is less than y.
  1692  func (x Uint16s) Less(y Uint16s) Mask16s {
  1693  	var res Mask16s
  1694  	for i := 0; i < 8; i++ {
  1695  		if x.get(i) < y.get(i) {
  1696  			res.set(i, true)
  1697  		}
  1698  	}
  1699  	return res
  1700  }
  1701  
  1702  // LessEqual returns a mask indicating where x is less than or equal to y.
  1703  func (x Uint16s) LessEqual(y Uint16s) Mask16s {
  1704  	var res Mask16s
  1705  	for i := 0; i < 8; i++ {
  1706  		if x.get(i) <= y.get(i) {
  1707  			res.set(i, true)
  1708  		}
  1709  	}
  1710  	return res
  1711  }
  1712  
  1713  // NotEqual returns a mask indicating where x and y are not equal.
  1714  func (x Uint16s) NotEqual(y Uint16s) Mask16s {
  1715  	var res Mask16s
  1716  	for i := 0; i < 8; i++ {
  1717  		if x.get(i) != y.get(i) {
  1718  			res.set(i, true)
  1719  		}
  1720  	}
  1721  	return res
  1722  }
  1723  
  1724  // Len returns the number of elements in the vector.
  1725  func (x Uint16s) Len() int {
  1726  	return 8
  1727  }
  1728  
  1729  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  1730  func (x Uint16s) Masked(mask Mask16s) Uint16s {
  1731  	return Uint16s{a: x.a & mask.a, b: x.b & mask.b}
  1732  }
  1733  
  1734  // Max returns the element-wise maximum of x and y.
  1735  func (x Uint16s) Max(y Uint16s) Uint16s {
  1736  	var res Uint16s
  1737  	for i := 0; i < 8; i++ {
  1738  		vx := x.get(i)
  1739  		vy := y.get(i)
  1740  		if vx > vy {
  1741  			res.set(i, vx)
  1742  		} else {
  1743  			res.set(i, vy)
  1744  		}
  1745  	}
  1746  	return res
  1747  }
  1748  
  1749  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  1750  func (x Uint16s) IfElse(mask Mask16s, y Uint16s) Uint16s {
  1751  	return Uint16s{
  1752  		a: (x.a & mask.a) | (y.a &^ mask.a),
  1753  		b: (x.b & mask.b) | (y.b &^ mask.b),
  1754  	}
  1755  }
  1756  
  1757  // Min returns the element-wise minimum of x and y.
  1758  func (x Uint16s) Min(y Uint16s) Uint16s {
  1759  	var res Uint16s
  1760  	for i := 0; i < 8; i++ {
  1761  		vx := x.get(i)
  1762  		vy := y.get(i)
  1763  		if vx < vy {
  1764  			res.set(i, vx)
  1765  		} else {
  1766  			res.set(i, vy)
  1767  		}
  1768  	}
  1769  	return res
  1770  }
  1771  
  1772  // Mul returns the element-wise product of x and y.
  1773  func (x Uint16s) Mul(y Uint16s) Uint16s {
  1774  	var res Uint16s
  1775  	for i := 0; i < 8; i++ {
  1776  		res.set(i, x.get(i)*y.get(i))
  1777  	}
  1778  	return res
  1779  }
  1780  
  1781  // Not returns the bitwise NOT of x.
  1782  func (x Uint16s) Not() Uint16s {
  1783  	return Uint16s{a: ^x.a, b: ^x.b}
  1784  }
  1785  
  1786  // Or returns the bitwise OR of x and y.
  1787  func (x Uint16s) Or(y Uint16s) Uint16s {
  1788  	return Uint16s{a: x.a | y.a, b: x.b | y.b}
  1789  }
  1790  
  1791  // ShiftAllLeft shifts all elements left by y bits.
  1792  func (x Uint16s) ShiftAllLeft(y uint8) Uint16s {
  1793  	var res Uint16s
  1794  	for i := 0; i < 8; i++ {
  1795  		res.set(i, x.get(i)<<y)
  1796  	}
  1797  	return res
  1798  }
  1799  
  1800  // ShiftAllRight shifts all elements right by y bits.
  1801  func (x Uint16s) ShiftAllRight(y uint8) Uint16s {
  1802  	var res Uint16s
  1803  	for i := 0; i < 8; i++ {
  1804  		res.set(i, x.get(i)>>y)
  1805  	}
  1806  	return res
  1807  }
  1808  
  1809  // RotateAllLeft rotates all elements left by dist bits.
  1810  func (x Uint16s) RotateAllLeft(dist uint64) Uint16s {
  1811  	var res Uint16s
  1812  	d := dist & 15
  1813  	for i := 0; i < 8; i++ {
  1814  		u := x.get(i)
  1815  		r := (u << d) | (u >> ((16 - d) & 15))
  1816  		res.set(i, r)
  1817  	}
  1818  	return res
  1819  }
  1820  
  1821  // RotateAllRight rotates all elements right by dist bits.
  1822  func (x Uint16s) RotateAllRight(dist uint64) Uint16s {
  1823  	var res Uint16s
  1824  	d := dist & 15
  1825  	for i := 0; i < 8; i++ {
  1826  		u := x.get(i)
  1827  		r := (u >> d) | (u << ((16 - d) & 15))
  1828  		res.set(i, r)
  1829  	}
  1830  	return res
  1831  }
  1832  
  1833  // Store stores the vector elements into the slice s.
  1834  func (x Uint16s) Store(s []uint16) {
  1835  	for i := 0; i < 8 && i < len(s); i++ {
  1836  		s[i] = x.get(i)
  1837  	}
  1838  }
  1839  
  1840  // StorePart stores a partial vector into the slice s.
  1841  func (x Uint16s) StorePart(s []uint16) int {
  1842  	x.Store(s)
  1843  	return min(len(s), x.Len())
  1844  }
  1845  
  1846  // String returns a string representation of the vector.
  1847  func (x Uint16s) String() string {
  1848  	var parts [8]uint16
  1849  	for i := 0; i < 8; i++ {
  1850  		parts[i] = x.get(i)
  1851  	}
  1852  	return fmt.Sprint(parts)
  1853  }
  1854  
  1855  // Sub returns the element-wise difference of x and y.
  1856  func (x Uint16s) Sub(y Uint16s) Uint16s {
  1857  	var res Uint16s
  1858  	for i := 0; i < 8; i++ {
  1859  		res.set(i, x.get(i)-y.get(i))
  1860  	}
  1861  	return res
  1862  }
  1863  
  1864  // SubSaturated returns the element-wise saturated difference of x and y.
  1865  func (x Uint16s) SubSaturated(y Uint16s) Uint16s {
  1866  	var res Uint16s
  1867  	for i := 0; i < 8; i++ {
  1868  		vx := x.get(i)
  1869  		vy := y.get(i)
  1870  		if vx < vy {
  1871  			res.set(i, 0)
  1872  		} else {
  1873  			res.set(i, vx-vy)
  1874  		}
  1875  	}
  1876  	return res
  1877  }
  1878  
  1879  // Xor returns the bitwise XOR of x and y.
  1880  func (x Uint16s) Xor(y Uint16s) Uint16s {
  1881  	return Uint16s{a: x.a ^ y.a, b: x.b ^ y.b}
  1882  }
  1883  
  1884  // BitsToInt16 reinterprets the vector bits as an Int16s vector.
  1885  func (x Uint16s) BitsToInt16() Int16s {
  1886  	return Int16s{a: x.a, b: x.b}
  1887  }
  1888  
  1889  // ConvertToInt16 converts the vector elements to int16.
  1890  func (x Uint16s) ConvertToInt16() Int16s {
  1891  	return Int16s{a: x.a, b: x.b}
  1892  }
  1893  
  1894  // ReshapeToUint32s reinterprets the vector bits as a Uint32s vector.
  1895  func (x Uint16s) ReshapeToUint32s() Uint32s {
  1896  	return Uint32s{a: x.a, b: x.b}
  1897  }
  1898  
  1899  // ReshapeToUint64s reinterprets the vector bits as a Uint64s vector.
  1900  func (x Uint16s) ReshapeToUint64s() Uint64s {
  1901  	return Uint64s{a: x.a, b: x.b}
  1902  }
  1903  
  1904  // ReshapeToUint8s reinterprets the vector bits as a Uint8s vector.
  1905  func (x Uint16s) ReshapeToUint8s() Uint8s {
  1906  	return Uint8s{a: x.a, b: x.b}
  1907  }
  1908  
  1909  // Uint32s represents a 128-bit vector of 4 uint32 elements.
  1910  type Uint32s struct {
  1911  	_    _simd
  1912  	a, b uint64
  1913  }
  1914  
  1915  // LoadUint32s loads a slice of uint32 into an Uint32s vector.
  1916  func LoadUint32s(s []uint32) Uint32s {
  1917  	var a, b uint64
  1918  	for i := 0; i < 4; i++ {
  1919  		val := uint64(s[i])
  1920  		if i < 2 {
  1921  			a |= val << (32 * i)
  1922  		} else {
  1923  			b |= val << (32 * (i - 2))
  1924  		}
  1925  	}
  1926  	return Uint32s{a: a, b: b}
  1927  }
  1928  
  1929  // LoadUint32sPart loads a partial slice of uint32 into an Uint32s vector.
  1930  func LoadUint32sPart(s []uint32) (Uint32s, int) {
  1931  	var a, b uint64
  1932  	n := len(s)
  1933  	if n > 4 {
  1934  		n = 4
  1935  	}
  1936  	for i := 0; i < n; i++ {
  1937  		val := uint64(s[i])
  1938  		if i < 2 {
  1939  			a |= val << (32 * i)
  1940  		} else {
  1941  			b |= val << (32 * (i - 2))
  1942  		}
  1943  	}
  1944  	return Uint32s{a: a, b: b}, n
  1945  }
  1946  
  1947  func (x Uint32s) get(i int) uint32 {
  1948  	if i < 2 {
  1949  		return uint32(x.a >> (32 * i))
  1950  	}
  1951  	return uint32(x.b >> (32 * (i - 2)))
  1952  }
  1953  
  1954  func (x *Uint32s) set(i int, v uint32) {
  1955  	val := uint64(v)
  1956  	if i < 2 {
  1957  		mask := uint64(0xffffffff) << (32 * i)
  1958  		x.a = (x.a &^ mask) | (val << (32 * i))
  1959  	} else {
  1960  		mask := uint64(0xffffffff) << (32 * (i - 2))
  1961  		x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
  1962  	}
  1963  }
  1964  
  1965  // Add returns the element-wise sum of x and y.
  1966  func (x Uint32s) Add(y Uint32s) Uint32s {
  1967  	var res Uint32s
  1968  	for i := 0; i < 4; i++ {
  1969  		res.set(i, x.get(i)+y.get(i))
  1970  	}
  1971  	return res
  1972  }
  1973  
  1974  // And returns the bitwise AND of x and y.
  1975  func (x Uint32s) And(y Uint32s) Uint32s {
  1976  	return Uint32s{a: x.a & y.a, b: x.b & y.b}
  1977  }
  1978  
  1979  // AndNot returns the bitwise AND NOT of x and y.
  1980  func (x Uint32s) AndNot(y Uint32s) Uint32s {
  1981  	return Uint32s{a: x.a &^ y.a, b: x.b &^ y.b}
  1982  }
  1983  
  1984  // Equal returns a mask indicating where x and y are equal.
  1985  func (x Uint32s) Equal(y Uint32s) Mask32s {
  1986  	var res Mask32s
  1987  	for i := 0; i < 4; i++ {
  1988  		if x.get(i) == y.get(i) {
  1989  			res.set(i, true)
  1990  		}
  1991  	}
  1992  	return res
  1993  }
  1994  
  1995  // Greater returns a mask indicating where x is greater than y.
  1996  func (x Uint32s) Greater(y Uint32s) Mask32s {
  1997  	var res Mask32s
  1998  	for i := 0; i < 4; i++ {
  1999  		if x.get(i) > y.get(i) {
  2000  			res.set(i, true)
  2001  		}
  2002  	}
  2003  	return res
  2004  }
  2005  
  2006  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2007  func (x Uint32s) GreaterEqual(y Uint32s) Mask32s {
  2008  	var res Mask32s
  2009  	for i := 0; i < 4; i++ {
  2010  		if x.get(i) >= y.get(i) {
  2011  			res.set(i, true)
  2012  		}
  2013  	}
  2014  	return res
  2015  }
  2016  
  2017  // Less returns a mask indicating where x is less than y.
  2018  func (x Uint32s) Less(y Uint32s) Mask32s {
  2019  	var res Mask32s
  2020  	for i := 0; i < 4; i++ {
  2021  		if x.get(i) < y.get(i) {
  2022  			res.set(i, true)
  2023  		}
  2024  	}
  2025  	return res
  2026  }
  2027  
  2028  // LessEqual returns a mask indicating where x is less than or equal to y.
  2029  func (x Uint32s) LessEqual(y Uint32s) Mask32s {
  2030  	var res Mask32s
  2031  	for i := 0; i < 4; i++ {
  2032  		if x.get(i) <= y.get(i) {
  2033  			res.set(i, true)
  2034  		}
  2035  	}
  2036  	return res
  2037  }
  2038  
  2039  // NotEqual returns a mask indicating where x and y are not equal.
  2040  func (x Uint32s) NotEqual(y Uint32s) Mask32s {
  2041  	var res Mask32s
  2042  	for i := 0; i < 4; i++ {
  2043  		if x.get(i) != y.get(i) {
  2044  			res.set(i, true)
  2045  		}
  2046  	}
  2047  	return res
  2048  }
  2049  
  2050  // Len returns the number of elements in the vector.
  2051  func (x Uint32s) Len() int {
  2052  	return 4
  2053  }
  2054  
  2055  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2056  func (x Uint32s) Masked(mask Mask32s) Uint32s {
  2057  	return Uint32s{a: x.a & mask.a, b: x.b & mask.b}
  2058  }
  2059  
  2060  // Max returns the element-wise maximum of x and y.
  2061  func (x Uint32s) Max(y Uint32s) Uint32s {
  2062  	var res Uint32s
  2063  	for i := 0; i < 4; i++ {
  2064  		vx := x.get(i)
  2065  		vy := y.get(i)
  2066  		if vx > vy {
  2067  			res.set(i, vx)
  2068  		} else {
  2069  			res.set(i, vy)
  2070  		}
  2071  	}
  2072  	return res
  2073  }
  2074  
  2075  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2076  func (x Uint32s) IfElse(mask Mask32s, y Uint32s) Uint32s {
  2077  	return Uint32s{
  2078  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2079  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2080  	}
  2081  }
  2082  
  2083  // Min returns the element-wise minimum of x and y.
  2084  func (x Uint32s) Min(y Uint32s) Uint32s {
  2085  	var res Uint32s
  2086  	for i := 0; i < 4; i++ {
  2087  		vx := x.get(i)
  2088  		vy := y.get(i)
  2089  		if vx < vy {
  2090  			res.set(i, vx)
  2091  		} else {
  2092  			res.set(i, vy)
  2093  		}
  2094  	}
  2095  	return res
  2096  }
  2097  
  2098  // Mul returns the element-wise product of x and y.
  2099  func (x Uint32s) Mul(y Uint32s) Uint32s {
  2100  	var res Uint32s
  2101  	for i := 0; i < 4; i++ {
  2102  		res.set(i, x.get(i)*y.get(i))
  2103  	}
  2104  	return res
  2105  }
  2106  
  2107  // Not returns the bitwise NOT of x.
  2108  func (x Uint32s) Not() Uint32s {
  2109  	return Uint32s{a: ^x.a, b: ^x.b}
  2110  }
  2111  
  2112  // Or returns the bitwise OR of x and y.
  2113  func (x Uint32s) Or(y Uint32s) Uint32s {
  2114  	return Uint32s{a: x.a | y.a, b: x.b | y.b}
  2115  }
  2116  
  2117  // ShiftAllLeft shifts all elements left by y bits.
  2118  func (x Uint32s) ShiftAllLeft(y uint8) Uint32s {
  2119  	var res Uint32s
  2120  	for i := 0; i < 4; i++ {
  2121  		res.set(i, x.get(i)<<y)
  2122  	}
  2123  	return res
  2124  }
  2125  
  2126  // ShiftAllRight shifts all elements right by y bits.
  2127  func (x Uint32s) ShiftAllRight(y uint8) Uint32s {
  2128  	var res Uint32s
  2129  	for i := 0; i < 4; i++ {
  2130  		res.set(i, x.get(i)>>y)
  2131  	}
  2132  	return res
  2133  }
  2134  
  2135  // RotateAllLeft rotates all elements left by dist bits.
  2136  func (x Uint32s) RotateAllLeft(dist uint64) Uint32s {
  2137  	var res Uint32s
  2138  	d := dist & 31
  2139  	for i := 0; i < 4; i++ {
  2140  		u := x.get(i)
  2141  		r := (u << d) | (u >> ((32 - d) & 31))
  2142  		res.set(i, r)
  2143  	}
  2144  	return res
  2145  }
  2146  
  2147  // RotateAllRight rotates all elements right by dist bits.
  2148  func (x Uint32s) RotateAllRight(dist uint64) Uint32s {
  2149  	var res Uint32s
  2150  	d := dist & 31
  2151  	for i := 0; i < 4; i++ {
  2152  		u := x.get(i)
  2153  		r := (u >> d) | (u << ((32 - d) & 31))
  2154  		res.set(i, r)
  2155  	}
  2156  	return res
  2157  }
  2158  
  2159  // Store stores the vector elements into the slice s.
  2160  func (x Uint32s) Store(s []uint32) {
  2161  	for i := 0; i < 4 && i < len(s); i++ {
  2162  		s[i] = x.get(i)
  2163  	}
  2164  }
  2165  
  2166  // StorePart stores a partial vector into the slice s.
  2167  func (x Uint32s) StorePart(s []uint32) int {
  2168  	x.Store(s)
  2169  	return min(len(s), x.Len())
  2170  }
  2171  
  2172  // String returns a string representation of the vector.
  2173  func (x Uint32s) String() string {
  2174  	var parts [4]uint32
  2175  	for i := 0; i < 4; i++ {
  2176  		parts[i] = x.get(i)
  2177  	}
  2178  	return fmt.Sprint(parts)
  2179  }
  2180  
  2181  // Sub returns the element-wise difference of x and y.
  2182  func (x Uint32s) Sub(y Uint32s) Uint32s {
  2183  	var res Uint32s
  2184  	for i := 0; i < 4; i++ {
  2185  		res.set(i, x.get(i)-y.get(i))
  2186  	}
  2187  	return res
  2188  }
  2189  
  2190  // Xor returns the bitwise XOR of x and y.
  2191  func (x Uint32s) Xor(y Uint32s) Uint32s {
  2192  	return Uint32s{a: x.a ^ y.a, b: x.b ^ y.b}
  2193  }
  2194  
  2195  // BitsToFloat32 reinterprets the vector bits as a Float32s vector.
  2196  func (x Uint32s) BitsToFloat32() Float32s {
  2197  	return Float32s{a: x.a, b: x.b}
  2198  }
  2199  
  2200  // BitsToInt32 reinterprets the vector bits as an Int32s vector.
  2201  func (x Uint32s) BitsToInt32() Int32s {
  2202  	return Int32s{a: x.a, b: x.b}
  2203  }
  2204  
  2205  // ConvertToInt32 converts the vector elements to int32.
  2206  func (x Uint32s) ConvertToInt32() Int32s {
  2207  	return Int32s{a: x.a, b: x.b}
  2208  }
  2209  
  2210  // ReshapeToUint16s reinterprets the vector bits as a Uint16s vector.
  2211  func (x Uint32s) ReshapeToUint16s() Uint16s {
  2212  	return Uint16s{a: x.a, b: x.b}
  2213  }
  2214  
  2215  // ReshapeToUint64s reinterprets the vector bits as a Uint64s vector.
  2216  func (x Uint32s) ReshapeToUint64s() Uint64s {
  2217  	return Uint64s{a: x.a, b: x.b}
  2218  }
  2219  
  2220  // ReshapeToUint8s reinterprets the vector bits as a Uint8s vector.
  2221  func (x Uint32s) ReshapeToUint8s() Uint8s {
  2222  	return Uint8s{a: x.a, b: x.b}
  2223  }
  2224  
  2225  // Uint64s represents a 128-bit vector of 2 uint64 elements.
  2226  type Uint64s struct {
  2227  	_    _simd
  2228  	a, b uint64
  2229  }
  2230  
  2231  // LoadUint64s loads a slice of uint64 into an Uint64s vector.
  2232  func LoadUint64s(s []uint64) Uint64s {
  2233  	var a, b uint64
  2234  	a = s[0]
  2235  	b = s[1]
  2236  	return Uint64s{a: a, b: b}
  2237  }
  2238  
  2239  // LoadUint64sPart loads a partial slice of uint64 into an Uint64s vector.
  2240  func LoadUint64sPart(s []uint64) (Uint64s, int) {
  2241  	n := len(s)
  2242  	var a, b uint64
  2243  	if n > 0 {
  2244  		a = s[0]
  2245  	}
  2246  	if n > 1 {
  2247  		b = s[1]
  2248  	}
  2249  	return Uint64s{a: a, b: b}, n
  2250  }
  2251  
  2252  func (x Uint64s) get(i int) uint64 {
  2253  	if i == 0 {
  2254  		return x.a
  2255  	}
  2256  	return x.b
  2257  }
  2258  
  2259  func (x *Uint64s) set(i int, v uint64) {
  2260  	if i == 0 {
  2261  		x.a = v
  2262  	} else {
  2263  		x.b = v
  2264  	}
  2265  }
  2266  
  2267  // Add returns the element-wise sum of x and y.
  2268  func (x Uint64s) Add(y Uint64s) Uint64s {
  2269  	return Uint64s{a: x.a + y.a, b: x.b + y.b}
  2270  }
  2271  
  2272  // And returns the bitwise AND of x and y.
  2273  func (x Uint64s) And(y Uint64s) Uint64s {
  2274  	return Uint64s{a: x.a & y.a, b: x.b & y.b}
  2275  }
  2276  
  2277  // AndNot returns the bitwise AND NOT of x and y.
  2278  func (x Uint64s) AndNot(y Uint64s) Uint64s {
  2279  	return Uint64s{a: x.a &^ y.a, b: x.b &^ y.b}
  2280  }
  2281  
  2282  // Equal returns a mask indicating where x and y are equal.
  2283  func (x Uint64s) Equal(y Uint64s) Mask64s {
  2284  	var res Mask64s
  2285  	if x.a == y.a {
  2286  		res.a = ^uint64(0)
  2287  	}
  2288  	if x.b == y.b {
  2289  		res.b = ^uint64(0)
  2290  	}
  2291  	return res
  2292  }
  2293  
  2294  // Greater returns a mask indicating where x is greater than y.
  2295  func (x Uint64s) Greater(y Uint64s) Mask64s {
  2296  	var res Mask64s
  2297  	for i := 0; i < 2; i++ {
  2298  		if x.get(i) > y.get(i) {
  2299  			res.set(i, true)
  2300  		}
  2301  	}
  2302  	return res
  2303  }
  2304  
  2305  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2306  func (x Uint64s) GreaterEqual(y Uint64s) Mask64s {
  2307  	var res Mask64s
  2308  	for i := 0; i < 2; i++ {
  2309  		if x.get(i) >= y.get(i) {
  2310  			res.set(i, true)
  2311  		}
  2312  	}
  2313  	return res
  2314  }
  2315  
  2316  // Less returns a mask indicating where x is less than y.
  2317  func (x Uint64s) Less(y Uint64s) Mask64s {
  2318  	var res Mask64s
  2319  	for i := 0; i < 2; i++ {
  2320  		if x.get(i) < y.get(i) {
  2321  			res.set(i, true)
  2322  		}
  2323  	}
  2324  	return res
  2325  }
  2326  
  2327  // LessEqual returns a mask indicating where x is less than or equal to y.
  2328  func (x Uint64s) LessEqual(y Uint64s) Mask64s {
  2329  	var res Mask64s
  2330  	for i := 0; i < 2; i++ {
  2331  		if x.get(i) <= y.get(i) {
  2332  			res.set(i, true)
  2333  		}
  2334  	}
  2335  	return res
  2336  }
  2337  
  2338  // NotEqual returns a mask indicating where x and y are not equal.
  2339  func (x Uint64s) NotEqual(y Uint64s) Mask64s {
  2340  	var res Mask64s
  2341  	if x.a != y.a {
  2342  		res.a = ^uint64(0)
  2343  	}
  2344  	if x.b != y.b {
  2345  		res.b = ^uint64(0)
  2346  	}
  2347  	return res
  2348  }
  2349  
  2350  // Len returns the number of elements in the vector.
  2351  func (x Uint64s) Len() int {
  2352  	return 2
  2353  }
  2354  
  2355  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2356  func (x Uint64s) Masked(mask Mask64s) Uint64s {
  2357  	return Uint64s{a: x.a & mask.a, b: x.b & mask.b}
  2358  }
  2359  
  2360  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2361  func (x Uint64s) IfElse(mask Mask64s, y Uint64s) Uint64s {
  2362  	return Uint64s{
  2363  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2364  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2365  	}
  2366  }
  2367  
  2368  // Not returns the bitwise NOT of x.
  2369  func (x Uint64s) Not() Uint64s {
  2370  	return Uint64s{a: ^x.a, b: ^x.b}
  2371  }
  2372  
  2373  // Or returns the bitwise OR of x and y.
  2374  func (x Uint64s) Or(y Uint64s) Uint64s {
  2375  	return Uint64s{a: x.a | y.a, b: x.b | y.b}
  2376  }
  2377  
  2378  // ShiftAllLeft shifts all elements left by y bits.
  2379  func (x Uint64s) ShiftAllLeft(y uint8) Uint64s {
  2380  	return Uint64s{a: x.a << y, b: x.b << y}
  2381  }
  2382  
  2383  // ShiftAllRight shifts all elements right by y bits.
  2384  func (x Uint64s) ShiftAllRight(y uint8) Uint64s {
  2385  	return Uint64s{a: x.a >> y, b: x.b >> y}
  2386  }
  2387  
  2388  // RotateAllLeft rotates all elements left by dist bits.
  2389  func (x Uint64s) RotateAllLeft(dist uint64) Uint64s {
  2390  	d := dist & 63
  2391  	return Uint64s{
  2392  		a: (x.a << d) | (x.a >> ((64 - d) & 63)),
  2393  		b: (x.b << d) | (x.b >> ((64 - d) & 63)),
  2394  	}
  2395  }
  2396  
  2397  // RotateAllRight rotates all elements right by dist bits.
  2398  func (x Uint64s) RotateAllRight(dist uint64) Uint64s {
  2399  	d := dist & 63
  2400  	return Uint64s{
  2401  		a: (x.a >> d) | (x.a << ((64 - d) & 63)),
  2402  		b: (x.b >> d) | (x.b << ((64 - d) & 63)),
  2403  	}
  2404  }
  2405  
  2406  // Store stores the vector elements into the slice s.
  2407  func (x Uint64s) Store(s []uint64) {
  2408  	if len(s) > 0 {
  2409  		s[0] = x.a
  2410  	}
  2411  	if len(s) > 1 {
  2412  		s[1] = x.b
  2413  	}
  2414  }
  2415  
  2416  // StorePart stores a partial vector into the slice s.
  2417  func (x Uint64s) StorePart(s []uint64) int {
  2418  	x.Store(s)
  2419  	return min(len(s), x.Len())
  2420  }
  2421  
  2422  // String returns a string representation of the vector.
  2423  func (x Uint64s) String() string {
  2424  	return fmt.Sprint([2]uint64{x.a, x.b})
  2425  }
  2426  
  2427  // Sub returns the element-wise difference of x and y.
  2428  func (x Uint64s) Sub(y Uint64s) Uint64s {
  2429  	return Uint64s{a: x.a - y.a, b: x.b - y.b}
  2430  }
  2431  
  2432  // Xor returns the bitwise XOR of x and y.
  2433  func (x Uint64s) Xor(y Uint64s) Uint64s {
  2434  	return Uint64s{a: x.a ^ y.a, b: x.b ^ y.b}
  2435  }
  2436  
  2437  // BitsToFloat64 reinterprets the vector bits as a Float64s vector.
  2438  func (x Uint64s) BitsToFloat64() Float64s {
  2439  	return Float64s{a: x.a, b: x.b}
  2440  }
  2441  
  2442  // BitsToInt64 reinterprets the vector bits as an Int64s vector.
  2443  func (x Uint64s) BitsToInt64() Int64s {
  2444  	return Int64s{a: x.a, b: x.b}
  2445  }
  2446  
  2447  // ConvertToInt64 converts the vector elements to int64.
  2448  func (x Uint64s) ConvertToInt64() Int64s {
  2449  	return Int64s{a: x.a, b: x.b}
  2450  }
  2451  
  2452  // ReshapeToUint16s reinterprets the vector bits as a Uint16s vector.
  2453  func (x Uint64s) ReshapeToUint16s() Uint16s {
  2454  	return Uint16s{a: x.a, b: x.b}
  2455  }
  2456  
  2457  // ReshapeToUint32s reinterprets the vector bits as a Uint32s vector.
  2458  func (x Uint64s) ReshapeToUint32s() Uint32s {
  2459  	return Uint32s{a: x.a, b: x.b}
  2460  }
  2461  
  2462  // ReshapeToUint8s reinterprets the vector bits as a Uint8s vector.
  2463  func (x Uint64s) ReshapeToUint8s() Uint8s {
  2464  	return Uint8s{a: x.a, b: x.b}
  2465  }
  2466  
  2467  // Float32s represents a 128-bit vector of 4 float32 elements.
  2468  type Float32s struct {
  2469  	_    _simd
  2470  	a, b uint64
  2471  }
  2472  
  2473  // LoadFloat32s loads a slice of float32 into an Float32s vector.
  2474  func LoadFloat32s(s []float32) Float32s {
  2475  	var a, b uint64
  2476  	for i := 0; i < 4; i++ {
  2477  		val := uint64(math.Float32bits(s[i]))
  2478  		if i < 2 {
  2479  			a |= val << (32 * i)
  2480  		} else {
  2481  			b |= val << (32 * (i - 2))
  2482  		}
  2483  	}
  2484  	return Float32s{a: a, b: b}
  2485  }
  2486  
  2487  // LoadFloat32sPart loads a partial slice of float32 into an Float32s vector.
  2488  func LoadFloat32sPart(s []float32) (Float32s, int) {
  2489  	var a, b uint64
  2490  	n := len(s)
  2491  	if n > 4 {
  2492  		n = 4
  2493  	}
  2494  	for i := 0; i < n; i++ {
  2495  		val := uint64(math.Float32bits(s[i]))
  2496  		if i < 2 {
  2497  			a |= val << (32 * i)
  2498  		} else {
  2499  			b |= val << (32 * (i - 2))
  2500  		}
  2501  	}
  2502  	return Float32s{a: a, b: b}, n
  2503  }
  2504  
  2505  func (x Float32s) get(i int) float32 {
  2506  	if i < 2 {
  2507  		return math.Float32frombits(uint32(x.a >> (32 * i)))
  2508  	}
  2509  	return math.Float32frombits(uint32(x.b >> (32 * (i - 2))))
  2510  }
  2511  
  2512  func (x *Float32s) set(i int, v float32) {
  2513  	val := uint64(math.Float32bits(v))
  2514  	if i < 2 {
  2515  		mask := uint64(0xffffffff) << (32 * i)
  2516  		x.a = (x.a &^ mask) | (val << (32 * i))
  2517  	} else {
  2518  		mask := uint64(0xffffffff) << (32 * (i - 2))
  2519  		x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
  2520  	}
  2521  }
  2522  
  2523  // Abs returns the element-wise absolute value of x.
  2524  func (x Float32s) Abs() Float32s {
  2525  	var res Float32s
  2526  	for i := 0; i < 4; i++ {
  2527  		v := x.get(i)
  2528  		if v < 0 {
  2529  			res.set(i, -v)
  2530  		} else {
  2531  			res.set(i, v)
  2532  		}
  2533  	}
  2534  	return res
  2535  }
  2536  
  2537  // Add returns the element-wise sum of x and y.
  2538  func (x Float32s) Add(y Float32s) Float32s {
  2539  	var res Float32s
  2540  	for i := 0; i < 4; i++ {
  2541  		res.set(i, x.get(i)+y.get(i))
  2542  	}
  2543  	return res
  2544  }
  2545  
  2546  // ConvertToInt32 converts the vector elements to int32.
  2547  func (x Float32s) ConvertToInt32() Int32s {
  2548  	var res Int32s
  2549  	for i := 0; i < 4; i++ {
  2550  		res.set(i, int32(x.get(i)))
  2551  	}
  2552  	return res
  2553  }
  2554  
  2555  // Div returns the element-wise quotient of x and y.
  2556  func (x Float32s) Div(y Float32s) Float32s {
  2557  	var res Float32s
  2558  	for i := 0; i < 4; i++ {
  2559  		res.set(i, x.get(i)/y.get(i))
  2560  	}
  2561  	return res
  2562  }
  2563  
  2564  // Equal returns a mask indicating where x and y are equal.
  2565  func (x Float32s) Equal(y Float32s) Mask32s {
  2566  	var res Mask32s
  2567  	for i := 0; i < 4; i++ {
  2568  		if x.get(i) == y.get(i) {
  2569  			res.set(i, true)
  2570  		}
  2571  	}
  2572  	return res
  2573  }
  2574  
  2575  // Greater returns a mask indicating where x is greater than y.
  2576  func (x Float32s) Greater(y Float32s) Mask32s {
  2577  	var res Mask32s
  2578  	for i := 0; i < 4; i++ {
  2579  		if x.get(i) > y.get(i) {
  2580  			res.set(i, true)
  2581  		}
  2582  	}
  2583  	return res
  2584  }
  2585  
  2586  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2587  func (x Float32s) GreaterEqual(y Float32s) Mask32s {
  2588  	var res Mask32s
  2589  	for i := 0; i < 4; i++ {
  2590  		if x.get(i) >= y.get(i) {
  2591  			res.set(i, true)
  2592  		}
  2593  	}
  2594  	return res
  2595  }
  2596  
  2597  // Len returns the number of elements in the vector.
  2598  func (x Float32s) Len() int {
  2599  	return 4
  2600  }
  2601  
  2602  // Less returns a mask indicating where x is less than y.
  2603  func (x Float32s) Less(y Float32s) Mask32s {
  2604  	var res Mask32s
  2605  	for i := 0; i < 4; i++ {
  2606  		if x.get(i) < y.get(i) {
  2607  			res.set(i, true)
  2608  		}
  2609  	}
  2610  	return res
  2611  }
  2612  
  2613  // LessEqual returns a mask indicating where x is less than or equal to y.
  2614  func (x Float32s) LessEqual(y Float32s) Mask32s {
  2615  	var res Mask32s
  2616  	for i := 0; i < 4; i++ {
  2617  		if x.get(i) <= y.get(i) {
  2618  			res.set(i, true)
  2619  		}
  2620  	}
  2621  	return res
  2622  }
  2623  
  2624  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2625  func (x Float32s) Masked(mask Mask32s) Float32s {
  2626  	return Float32s{a: x.a & mask.a, b: x.b & mask.b}
  2627  }
  2628  
  2629  // Max returns the element-wise maximum of x and y.
  2630  func (x Float32s) Max(y Float32s) Float32s {
  2631  	var res Float32s
  2632  	for i := 0; i < 4; i++ {
  2633  		vx := x.get(i)
  2634  		vy := y.get(i)
  2635  		if vx > vy {
  2636  			res.set(i, vx)
  2637  		} else {
  2638  			res.set(i, vy)
  2639  		}
  2640  	}
  2641  	return res
  2642  }
  2643  
  2644  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2645  func (x Float32s) IfElse(mask Mask32s, y Float32s) Float32s {
  2646  	return Float32s{
  2647  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2648  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2649  	}
  2650  }
  2651  
  2652  // Min returns the element-wise minimum of x and y.
  2653  func (x Float32s) Min(y Float32s) Float32s {
  2654  	var res Float32s
  2655  	for i := 0; i < 4; i++ {
  2656  		vx := x.get(i)
  2657  		vy := y.get(i)
  2658  		if vx < vy {
  2659  			res.set(i, vx)
  2660  		} else {
  2661  			res.set(i, vy)
  2662  		}
  2663  	}
  2664  	return res
  2665  }
  2666  
  2667  // Mul returns the element-wise product of x and y.
  2668  func (x Float32s) Mul(y Float32s) Float32s {
  2669  	var res Float32s
  2670  	for i := 0; i < 4; i++ {
  2671  		res.set(i, x.get(i)*y.get(i))
  2672  	}
  2673  	return res
  2674  }
  2675  
  2676  // MulAdd returns x * y + z element-wise.
  2677  func (x Float32s) MulAdd(y, z Float32s) Float32s {
  2678  	var res Float32s
  2679  	for i := 0; i < 4; i++ {
  2680  		res.set(i, x.get(i)+y.get(i)*z.get(i))
  2681  	}
  2682  	return res
  2683  }
  2684  
  2685  // Neg returns the element-wise negation of x.
  2686  func (x Float32s) Neg() Float32s {
  2687  	var res Float32s
  2688  	for i := 0; i < 4; i++ {
  2689  		res.set(i, -(x.get(i)))
  2690  	}
  2691  	return res
  2692  }
  2693  
  2694  // NotEqual returns a mask indicating where x and y are not equal.
  2695  func (x Float32s) NotEqual(y Float32s) Mask32s {
  2696  	var res Mask32s
  2697  	for i := 0; i < 4; i++ {
  2698  		if x.get(i) != y.get(i) {
  2699  			res.set(i, true)
  2700  		}
  2701  	}
  2702  	return res
  2703  }
  2704  
  2705  // Sqrt returns the element-wise square root of x.
  2706  func (x Float32s) Sqrt() Float32s {
  2707  	var res Float32s
  2708  	for i := 0; i < 4; i++ {
  2709  		res.set(i, float32(math.Sqrt(float64(x.get(i)))))
  2710  	}
  2711  	return res
  2712  }
  2713  
  2714  // Store stores the vector elements into the slice s.
  2715  func (x Float32s) Store(s []float32) {
  2716  	for i := 0; i < 4 && i < len(s); i++ {
  2717  		s[i] = x.get(i)
  2718  	}
  2719  }
  2720  
  2721  // StorePart stores a partial vector into the slice s.
  2722  func (x Float32s) StorePart(s []float32) int {
  2723  	x.Store(s)
  2724  	return min(len(s), x.Len())
  2725  }
  2726  
  2727  // String returns a string representation of the vector.
  2728  func (x Float32s) String() string {
  2729  	var parts [4]float32
  2730  	for i := 0; i < 4; i++ {
  2731  		parts[i] = x.get(i)
  2732  	}
  2733  	return fmt.Sprint(parts)
  2734  }
  2735  
  2736  // Sub returns the element-wise difference of x and y.
  2737  func (x Float32s) Sub(y Float32s) Float32s {
  2738  	var res Float32s
  2739  	for i := 0; i < 4; i++ {
  2740  		res.set(i, x.get(i)-y.get(i))
  2741  	}
  2742  	return res
  2743  }
  2744  
  2745  // ToBits reinterprets the vector bits as a Uint32s vector.
  2746  func (x Float32s) ToBits() Uint32s {
  2747  	return Uint32s{a: x.a, b: x.b}
  2748  }
  2749  
  2750  // Float64s represents a 128-bit vector of 2 float64 elements.
  2751  type Float64s struct {
  2752  	_    _simd
  2753  	a, b uint64
  2754  }
  2755  
  2756  // LoadFloat64s loads a slice of float64 into an Float64s vector.
  2757  func LoadFloat64s(s []float64) Float64s {
  2758  	var a, b uint64
  2759  	a = math.Float64bits(s[0])
  2760  	b = math.Float64bits(s[1])
  2761  	return Float64s{a: a, b: b}
  2762  }
  2763  
  2764  // LoadFloat64sPart loads a partial slice of float64 into an Float64s vector.
  2765  func LoadFloat64sPart(s []float64) (Float64s, int) {
  2766  	n := len(s)
  2767  	var a, b uint64
  2768  	if n > 0 {
  2769  		a = math.Float64bits(s[0])
  2770  	}
  2771  	if n > 1 {
  2772  		b = math.Float64bits(s[1])
  2773  	}
  2774  	return Float64s{a: a, b: b}, n
  2775  }
  2776  
  2777  func (x Float64s) get(i int) float64 {
  2778  	if i == 0 {
  2779  		return math.Float64frombits(x.a)
  2780  	}
  2781  	return math.Float64frombits(x.b)
  2782  }
  2783  
  2784  func (x *Float64s) set(i int, v float64) {
  2785  	if i == 0 {
  2786  		x.a = math.Float64bits(v)
  2787  	} else {
  2788  		x.b = math.Float64bits(v)
  2789  	}
  2790  }
  2791  
  2792  // Abs returns the element-wise absolute value of x.
  2793  func (x Float64s) Abs() Float64s {
  2794  	var res Float64s
  2795  	for i := 0; i < 4; i++ {
  2796  		v := x.get(i)
  2797  		if v < 0 {
  2798  			res.set(i, -v)
  2799  		} else {
  2800  			res.set(i, v)
  2801  		}
  2802  	}
  2803  	return res
  2804  }
  2805  
  2806  // Add returns the element-wise sum of x and y.
  2807  func (x Float64s) Add(y Float64s) Float64s {
  2808  	var res Float64s
  2809  	res.set(0, x.get(0)+y.get(0))
  2810  	res.set(1, x.get(1)+y.get(1))
  2811  	return res
  2812  }
  2813  
  2814  // Div returns the element-wise quotient of x and y.
  2815  func (x Float64s) Div(y Float64s) Float64s {
  2816  	var res Float64s
  2817  	res.set(0, x.get(0)/y.get(0))
  2818  	res.set(1, x.get(1)/y.get(1))
  2819  	return res
  2820  }
  2821  
  2822  // Equal returns a mask indicating where x and y are equal.
  2823  func (x Float64s) Equal(y Float64s) Mask64s {
  2824  	var res Mask64s
  2825  	if x.get(0) == y.get(0) {
  2826  		res.a = ^uint64(0)
  2827  	}
  2828  	if x.get(1) == y.get(1) {
  2829  		res.b = ^uint64(0)
  2830  	}
  2831  	return res
  2832  }
  2833  
  2834  // Greater returns a mask indicating where x is greater than y.
  2835  func (x Float64s) Greater(y Float64s) Mask64s {
  2836  	var res Mask64s
  2837  	if x.get(0) > y.get(0) {
  2838  		res.a = ^uint64(0)
  2839  	}
  2840  	if x.get(1) > y.get(1) {
  2841  		res.b = ^uint64(0)
  2842  	}
  2843  	return res
  2844  }
  2845  
  2846  // GreaterEqual returns a mask indicating where x is greater than or equal to y.
  2847  func (x Float64s) GreaterEqual(y Float64s) Mask64s {
  2848  	var res Mask64s
  2849  	if x.get(0) >= y.get(0) {
  2850  		res.a = ^uint64(0)
  2851  	}
  2852  	if x.get(1) >= y.get(1) {
  2853  		res.b = ^uint64(0)
  2854  	}
  2855  	return res
  2856  }
  2857  
  2858  // Len returns the number of elements in the vector.
  2859  func (x Float64s) Len() int {
  2860  	return 2
  2861  }
  2862  
  2863  // Less returns a mask indicating where x is less than y.
  2864  func (x Float64s) Less(y Float64s) Mask64s {
  2865  	var res Mask64s
  2866  	if x.get(0) < y.get(0) {
  2867  		res.a = ^uint64(0)
  2868  	}
  2869  	if x.get(1) < y.get(1) {
  2870  		res.b = ^uint64(0)
  2871  	}
  2872  	return res
  2873  }
  2874  
  2875  // LessEqual returns a mask indicating where x is less than or equal to y.
  2876  func (x Float64s) LessEqual(y Float64s) Mask64s {
  2877  	var res Mask64s
  2878  	if x.get(0) <= y.get(0) {
  2879  		res.a = ^uint64(0)
  2880  	}
  2881  	if x.get(1) <= y.get(1) {
  2882  		res.b = ^uint64(0)
  2883  	}
  2884  	return res
  2885  }
  2886  
  2887  // Masked returns a new vector with elements from x where mask is true, and zero elsewhere.
  2888  func (x Float64s) Masked(mask Mask64s) Float64s {
  2889  	return Float64s{a: x.a & mask.a, b: x.b & mask.b}
  2890  }
  2891  
  2892  // Max returns the element-wise maximum of x and y.
  2893  func (x Float64s) Max(y Float64s) Float64s {
  2894  	var res Float64s
  2895  	vx := x.get(0)
  2896  	vy := y.get(0)
  2897  	if vx > vy {
  2898  		res.set(0, vx)
  2899  	} else {
  2900  		res.set(0, vy)
  2901  	}
  2902  	vx = x.get(1)
  2903  	vy = y.get(1)
  2904  	if vx > vy {
  2905  		res.set(1, vx)
  2906  	} else {
  2907  		res.set(1, vy)
  2908  	}
  2909  	return res
  2910  }
  2911  
  2912  // IfElse returns a new vector with elements from x where mask is true, and y where mask is false.
  2913  func (x Float64s) IfElse(mask Mask64s, y Float64s) Float64s {
  2914  	return Float64s{
  2915  		a: (x.a & mask.a) | (y.a &^ mask.a),
  2916  		b: (x.b & mask.b) | (y.b &^ mask.b),
  2917  	}
  2918  }
  2919  
  2920  // Min returns the element-wise minimum of x and y.
  2921  func (x Float64s) Min(y Float64s) Float64s {
  2922  	var res Float64s
  2923  	vx := x.get(0)
  2924  	vy := y.get(0)
  2925  	if vx < vy {
  2926  		res.set(0, vx)
  2927  	} else {
  2928  		res.set(0, vy)
  2929  	}
  2930  	vx = x.get(1)
  2931  	vy = y.get(1)
  2932  	if vx < vy {
  2933  		res.set(1, vx)
  2934  	} else {
  2935  		res.set(1, vy)
  2936  	}
  2937  	return res
  2938  }
  2939  
  2940  // Mul returns the element-wise product of x and y.
  2941  func (x Float64s) Mul(y Float64s) Float64s {
  2942  	var res Float64s
  2943  	res.set(0, x.get(0)*y.get(0))
  2944  	res.set(1, x.get(1)*y.get(1))
  2945  	return res
  2946  }
  2947  
  2948  // MulAdd returns x * y + z element-wise.
  2949  func (x Float64s) MulAdd(y, z Float64s) Float64s {
  2950  	var res Float64s
  2951  	res.set(0, x.get(0)+y.get(0)*z.get(0))
  2952  	res.set(1, x.get(1)+y.get(1)*z.get(1))
  2953  	return res
  2954  }
  2955  
  2956  // Neg returns the element-wise negation of x.
  2957  func (x Float64s) Neg() Float64s {
  2958  	var res Float64s
  2959  	for i := 0; i < 4; i++ {
  2960  		res.set(i, -(x.get(i)))
  2961  	}
  2962  	return res
  2963  }
  2964  
  2965  // NotEqual returns a mask indicating where x and y are not equal.
  2966  func (x Float64s) NotEqual(y Float64s) Mask64s {
  2967  	var res Mask64s
  2968  	if x.get(0) != y.get(0) {
  2969  		res.a = ^uint64(0)
  2970  	}
  2971  	if x.get(1) != y.get(1) {
  2972  		res.b = ^uint64(0)
  2973  	}
  2974  	return res
  2975  }
  2976  
  2977  // Sqrt returns the element-wise square root of x.
  2978  func (x Float64s) Sqrt() Float64s {
  2979  	var res Float64s
  2980  	res.set(0, math.Sqrt(x.get(0)))
  2981  	res.set(1, math.Sqrt(x.get(1)))
  2982  	return res
  2983  }
  2984  
  2985  // Store stores the vector elements into the slice s.
  2986  func (x Float64s) Store(s []float64) {
  2987  	if len(s) > 0 {
  2988  		s[0] = x.get(0)
  2989  	}
  2990  	if len(s) > 1 {
  2991  		s[1] = x.get(1)
  2992  	}
  2993  }
  2994  
  2995  // StorePart stores a partial vector into the slice s.
  2996  func (x Float64s) StorePart(s []float64) int {
  2997  	x.Store(s)
  2998  	return min(len(s), x.Len())
  2999  }
  3000  
  3001  // String returns a string representation of the vector.
  3002  func (x Float64s) String() string {
  3003  	return fmt.Sprint([2]float64{x.get(0), x.get(1)})
  3004  }
  3005  
  3006  // Sub returns the element-wise difference of x and y.
  3007  func (x Float64s) Sub(y Float64s) Float64s {
  3008  	var res Float64s
  3009  	res.set(0, x.get(0)-y.get(0))
  3010  	res.set(1, x.get(1)-y.get(1))
  3011  	return res
  3012  }
  3013  
  3014  // ToBits reinterprets the vector bits as a Uint64s vector.
  3015  func (x Float64s) ToBits() Uint64s {
  3016  	return Uint64s{a: x.a, b: x.b}
  3017  }
  3018  
  3019  // Mask8s represents a 128-bit mask vector for 16 int8/uint8 elements.
  3020  type Mask8s struct {
  3021  	_    _simd
  3022  	a, b uint64
  3023  }
  3024  
  3025  func (x *Mask8s) set(i int, v bool) {
  3026  	if v {
  3027  		if i < 8 {
  3028  			mask := uint64(0xff) << (8 * i)
  3029  			x.a |= mask
  3030  		} else {
  3031  			mask := uint64(0xff) << (8 * (i - 8))
  3032  			x.b |= mask
  3033  		}
  3034  	}
  3035  }
  3036  
  3037  // And returns the bitwise AND of x and y.
  3038  func (x Mask8s) And(y Mask8s) Mask8s {
  3039  	return Mask8s{a: x.a & y.a, b: x.b & y.b}
  3040  }
  3041  
  3042  // Or returns the bitwise OR of x and y.
  3043  func (x Mask8s) Or(y Mask8s) Mask8s {
  3044  	return Mask8s{a: x.a | y.a, b: x.b | y.b}
  3045  }
  3046  
  3047  // String returns a string representation of the vector.
  3048  func (x Mask8s) String() string {
  3049  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3050  }
  3051  
  3052  // ToInt8s converts the mask to an Int8s vector.
  3053  func (x Mask8s) ToInt8s() Int8s {
  3054  	return Int8s{a: x.a, b: x.b}
  3055  }
  3056  
  3057  // Mask16s represents a 128-bit mask vector for 8 int16/uint16 elements.
  3058  type Mask16s struct {
  3059  	_    _simd
  3060  	a, b uint64
  3061  }
  3062  
  3063  func (x *Mask16s) set(i int, v bool) {
  3064  	if v {
  3065  		if i < 4 {
  3066  			mask := uint64(0xffff) << (16 * i)
  3067  			x.a |= mask
  3068  		} else {
  3069  			mask := uint64(0xffff) << (16 * (i - 4))
  3070  			x.b |= mask
  3071  		}
  3072  	}
  3073  }
  3074  
  3075  // And returns the bitwise AND of x and y.
  3076  func (x Mask16s) And(y Mask16s) Mask16s {
  3077  	return Mask16s{a: x.a & y.a, b: x.b & y.b}
  3078  }
  3079  
  3080  // Or returns the bitwise OR of x and y.
  3081  func (x Mask16s) Or(y Mask16s) Mask16s {
  3082  	return Mask16s{a: x.a | y.a, b: x.b | y.b}
  3083  }
  3084  
  3085  // String returns a string representation of the vector.
  3086  func (x Mask16s) String() string {
  3087  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3088  }
  3089  
  3090  // ToInt16s converts the mask to an Int16s vector.
  3091  func (x Mask16s) ToInt16s() Int16s {
  3092  	return Int16s{a: x.a, b: x.b}
  3093  }
  3094  
  3095  // Mask32s represents a 128-bit mask vector for 4 int32/uint32/float32 elements.
  3096  type Mask32s struct {
  3097  	_    _simd
  3098  	a, b uint64
  3099  }
  3100  
  3101  func (x *Mask32s) set(i int, v bool) {
  3102  	if v {
  3103  		if i < 2 {
  3104  			mask := uint64(0xffffffff) << (32 * i)
  3105  			x.a |= mask
  3106  		} else {
  3107  			mask := uint64(0xffffffff) << (32 * (i - 2))
  3108  			x.b |= mask
  3109  		}
  3110  	}
  3111  }
  3112  
  3113  // And returns the bitwise AND of x and y.
  3114  func (x Mask32s) And(y Mask32s) Mask32s {
  3115  	return Mask32s{a: x.a & y.a, b: x.b & y.b}
  3116  }
  3117  
  3118  // Or returns the bitwise OR of x and y.
  3119  func (x Mask32s) Or(y Mask32s) Mask32s {
  3120  	return Mask32s{a: x.a | y.a, b: x.b | y.b}
  3121  }
  3122  
  3123  // String returns a string representation of the vector.
  3124  func (x Mask32s) String() string {
  3125  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3126  }
  3127  
  3128  // ToInt32s converts the mask to an Int32s vector.
  3129  func (x Mask32s) ToInt32s() Int32s {
  3130  	return Int32s{a: x.a, b: x.b}
  3131  }
  3132  
  3133  // Mask64s represents a 128-bit mask vector for 2 int64/uint64/float64 elements.
  3134  type Mask64s struct {
  3135  	_    _simd
  3136  	a, b uint64
  3137  }
  3138  
  3139  func (x *Mask64s) set(i int, v bool) {
  3140  	if v {
  3141  		if i == 0 {
  3142  			x.a = ^uint64(0)
  3143  		} else {
  3144  			x.b = ^uint64(0)
  3145  		}
  3146  	}
  3147  }
  3148  
  3149  // And returns the bitwise AND of x and y.
  3150  func (x Mask64s) And(y Mask64s) Mask64s {
  3151  	return Mask64s{a: x.a & y.a, b: x.b & y.b}
  3152  }
  3153  
  3154  // Or returns the bitwise OR of x and y.
  3155  func (x Mask64s) Or(y Mask64s) Mask64s {
  3156  	return Mask64s{a: x.a | y.a, b: x.b | y.b}
  3157  }
  3158  
  3159  // String returns a string representation of the vector.
  3160  func (x Mask64s) String() string {
  3161  	return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
  3162  }
  3163  
  3164  // ToInt64s converts the mask to an Int64s vector.
  3165  func (x Mask64s) ToInt64s() Int64s {
  3166  	return Int64s{a: x.a, b: x.b}
  3167  }
  3168  
  3169  func newT(lo, hi uint64) Uint64s {
  3170  	return Uint64s{a: lo, b: hi}
  3171  }
  3172  
  3173  // mwl returns the 128-bit product of the lower halves of x and y
  3174  func (x Uint64s) mwl(y Uint64s) Uint64s {
  3175  	hi, lo := bits.Mul64(x.a, y.a)
  3176  	return Uint64s{a: lo, b: hi}
  3177  }
  3178  
// Bit-selection masks used by clmul. Each mask is a 128-bit value
// (low word first, then high word) that keeps every fifth bit,
// starting at a different offset.
var (
	// For mK, bits J such that J mod 5 == K are set
	// (J counts from 0 at the least significant bit of the low word
	// and continues through the high word, so bit 64 belongs to m4).
	m0 = newT(0x1084210842108421, 0x2108421084210842)
	m1 = newT(0x2108421084210842, 0x4210842108421084)
	m2 = newT(0x4210842108421084, 0x8421084210842108)
	m3 = newT(0x8421084210842108, 0x0842108421084210)
	m4 = newT(0x0842108421084210, 0x1084210842108421)
)
  3187  
// clmul computes a carryless (XOR-based) multiplication of the low
// words of x and y using ordinary integer multiplication, and returns
// the 128-bit result with the low 64 bits in a and the high 64 bits
// in b.
//
// Each operand is split into five sparse pieces, piece K keeping only
// the bits whose position is congruent to K modulo 5. The pieces are
// multiplied pairwise with mwl, which uses only the low word of each
// piece. Products whose piece indices sum to the same residue modulo 5
// are XORed together and then masked with the matching mK, which keeps
// only the bit positions of that residue class. The five masked
// results occupy disjoint bit positions and are combined with Or.
//
// NOTE(review): the method appears to rely on the 5-bit spacing to
// absorb the carries produced by the integer multiplications, so that
// the bit kept at each position equals the XOR of the partial
// products; confirm against the algorithm's reference before changing
// the spacing or the masks.
func (x Uint64s) clmul(y Uint64s) Uint64s {
	// Sparse pieces of x: xK holds the bits of x at positions J with J mod 5 == K.
	x0 := x.And(m0)
	x1 := x.And(m1)
	x2 := x.And(m2)
	x3 := x.And(m3)
	x4 := x.And(m4)

	// Sparse pieces of y, split the same way.
	y0 := y.And(m0)
	y1 := y.And(m1)
	y2 := y.And(m2)
	y3 := y.And(m3)
	y4 := y.And(m4)

	// sum of x, y indices == K mod 5; mask index = K
	// Each line below gathers the five (xI, yJ) pairs with (I+J) mod 5 == K,
	// keeps only the result bits of residue class K, and merges them into z.
	z := (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)

	return z
}
  3210  
  3211  // CarrylessMultiplyEven computes the carryless
  3212  // multiplications of selected even halves of the elements of x and y.
  3213  // The result fills the 128 bits of each even-odd pair.
  3214  //
  3215  // A carryless multiplication uses bitwise XOR instead of
  3216  // add-with-carry, for example (in base two):
  3217  //
  3218  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  3219  //
  3220  // This also models multiplication of polynomials with coefficients
  3221  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  3222  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  3223  // polynomial terms, but coefficients "add" with XOR.)
  3224  func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s {
  3225  	return x.clmul(y)
  3226  }
  3227  
  3228  // CarrylessMultiplyOdd computes the carryless
  3229  // multiplications of selected odd halves of the elements of x and y.
  3230  // The result fills the 128 bits of each even-odd pair.
  3231  //
  3232  // A carryless multiplication uses bitwise XOR instead of
  3233  // add-with-carry, for example (in base two):
  3234  //
  3235  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  3236  //
  3237  // This also models multiplication of polynomials with coefficients
  3238  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  3239  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  3240  // polynomial terms, but coefficients "add" with XOR.)
  3241  func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s {
  3242  	x.a = x.b
  3243  	y.a = y.b
  3244  	return x.clmul(y)
  3245  }
  3246  
  3247  const (
  3248  	by8  = 0x0101010101010101
  3249  	by16 = 0x0001000100010001
  3250  )
  3251  
  3252  // BroadcastInt8 fills the elements of a slice with its argument value.
  3253  func BroadcastInt8s(x int8) Int8s {
  3254  	v := (255 & uint64(x)) * by8
  3255  	return Int8s{a: v, b: v}
  3256  }
  3257  
  3258  // BroadcastInt16 fills the elements of a slice with its argument value.
  3259  func BroadcastInt16s(x int16) Int16s {
  3260  	v := (65535 & uint64(x)) * by16
  3261  	return Int16s{a: v, b: v}
  3262  }
  3263  
  3264  // BroadcastInt32 fills the elements of a slice with its argument value.
  3265  func BroadcastInt32s(x int32) Int32s {
  3266  	v := uint64(x) & 0xffffffff
  3267  	v = v<<32 | v
  3268  	return Int32s{a: v, b: v}
  3269  }
  3270  
  3271  // BroadcastInt64 fills the elements of a slice with its argument value.
  3272  func BroadcastInt64s(x int64) Int64s {
  3273  	v := uint64(x)
  3274  	return Int64s{a: v, b: v}
  3275  }
  3276  
  3277  // BroadcastUint8 fills the elements of a slice with its argument value.
  3278  func BroadcastUint8s(x uint8) Uint8s {
  3279  	v := uint64(x) * by8
  3280  	return Uint8s{a: v, b: v}
  3281  
  3282  }
  3283  
  3284  // BroadcastUint16 fills the elements of a slice with its argument value.
  3285  func BroadcastUint16s(x uint16) Uint16s {
  3286  	v := uint64(x) * by16
  3287  	return Uint16s{a: v, b: v}
  3288  
  3289  }
  3290  
  3291  // BroadcastUint32 fills the elements of a slice with its argument value.
  3292  func BroadcastUint32s(x uint32) Uint32s {
  3293  	v := uint64(x)
  3294  	v = v<<32 | v
  3295  	return Uint32s{a: v, b: v}
  3296  }
  3297  
  3298  // BroadcastUint64 fills the elements of a slice with its argument value.
  3299  func BroadcastUint64s(x uint64) Uint64s {
  3300  	return Uint64s{a: x, b: x}
  3301  }
  3302  
  3303  // BroadcastFloat32 fills the elements of a slice with its argument value.
  3304  func BroadcastFloat32s(x float32) Float32s {
  3305  	v := uint64(math.Float32bits(x))
  3306  	v = v<<32 | v
  3307  	return Float32s{a: v, b: v}
  3308  }
  3309  
  3310  // BroadcastFloat64 fills the elements of a slice with its argument value.
  3311  func BroadcastFloat64s(x float64) Float64s {
  3312  	v := math.Float64bits(x)
  3313  	return Float64s{a: v, b: v}
  3314  }
  3315  

View as plain text