Source file src/simd/archsimd/shuffles_amd64.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package archsimd
     8  
     9  // Source patterns for the four selectors (a, b, c, d) passed to
    10  // SelectFromPair and SelectFromPairGrouped.
    11  // Each name has one letter per selector, in the order a, b, c, d:
    12  // L (Low) means the element comes from the 'x' vector, and
    13  // H (High) means it comes from the 'y' vector.
    14  // The underlying integer value is a bitmask where:
    15  // Bit 0: Source of element 'a' (0 for x, 1 for y)
    16  // Bit 1: Source of element 'b' (0 for x, 1 for y)
    17  // Bit 2: Source of element 'c' (0 for x, 1 for y)
    18  // Bit 3: Source of element 'd' (0 for x, 1 for y)
    19  // Names put the least-significant bit on the LEFT, the reverse of the literals.
    20  const (
    21  	_LLLL = 0b0000 // a:x, b:x, c:x, d:x
    22  	_HLLL = 0b0001 // a:y, b:x, c:x, d:x
    23  	_LHLL = 0b0010 // a:x, b:y, c:x, d:x
    24  	_HHLL = 0b0011 // a:y, b:y, c:x, d:x
    25  	_LLHL = 0b0100 // a:x, b:x, c:y, d:x
    26  	_HLHL = 0b0101 // a:y, b:x, c:y, d:x
    27  	_LHHL = 0b0110 // a:x, b:y, c:y, d:x
    28  	_HHHL = 0b0111 // a:y, b:y, c:y, d:x
    29  	_LLLH = 0b1000 // a:x, b:x, c:x, d:y
    30  	_HLLH = 0b1001 // a:y, b:x, c:x, d:y
    31  	_LHLH = 0b1010 // a:x, b:y, c:x, d:y
    32  	_HHLH = 0b1011 // a:y, b:y, c:x, d:y
    33  	_LLHH = 0b1100 // a:x, b:x, c:y, d:y
    34  	_HLHH = 0b1101 // a:y, b:x, c:y, d:y
    35  	_LHHH = 0b1110 // a:x, b:y, c:y, d:y
    36  	_HHHH = 0b1111 // a:y, b:y, c:y, d:y
    37  )
    38  
    39  // Source patterns for the selectors passed to SelectFromPair and
    40  // SelectFromPairGrouped for two-element vectors. As with the
    41  // four-letter names, the leftmost letter corresponds to bit 0.
    42  const (
    43  	_LL = 0b00 // no bits set
    44  	_HL = 0b01 // bit 0 set
    45  	_LH = 0b10 // bit 1 set
    46  	_HH = 0b11 // bits 0 and 1 set
    47  )
    48  
    49  // SelectFromPair returns the selection of four elements from the two
    50  // vectors x and y, where selector values in the range 0-3 specify
    51  // elements from x and values in the range 4-7 specify the 0-3 elements
    52  // of y.  When the selectors are constants and the selection can be
    53  // implemented in a single instruction, it will be, otherwise it
    54  // requires two.  a is the source index of the least element in the
    55  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
    56  // elements in the output.  For example,
    57  //
    58  //	{1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
    59  //
    60  // returns {4,8,25,81}.
    61  //
    62  // If the selectors are not constant this will translate to a function
    63  // call.
    64  //
    65  // Asm: VSHUFPS, CPU Feature: AVX
    66  func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 {
    67  	// pattern gets the concatenation of "x or y?" bits
    68  	// (0 == x, 1 == y)
    69  	// This will determine operand choice/order and whether a second
    70  	// instruction is needed.
    71  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
    72  
    73  	// a-d are masked down to their offsets within x or y
    74  	// this is not necessary for x, but this is easier on the
    75  	// eyes and reduces the risk of an error now or later.
    76  	a, b, c, d = a&3, b&3, c&3, d&3
    77  
    78  	switch pattern {
    79  	case _LLLL:
    80  		return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
    81  	case _HHHH:
    82  		return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
    83  	case _LLHH:
    84  		return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
    85  	case _HHLL:
    86  		return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
    87  
    88  	case _HLLL:
    89  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
    90  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
    91  	case _LHLL:
    92  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
    93  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
    94  
    95  	case _HLHH:
    96  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
    97  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
    98  	case _LHHH:
    99  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   100  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   101  
   102  	case _LLLH:
   103  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   104  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   105  	case _LLHL:
   106  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   107  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   108  	case _HHLH:
   109  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   110  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   111  	case _HHHL:
   112  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   113  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   114  
   115  	case _LHLH:
   116  		z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
   117  		return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   118  	case _HLHL:
   119  		z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
   120  		return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   121  	case _HLLH:
   122  		z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
   123  		return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   124  	case _LHHL:
   125  		z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
   126  		return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   127  	}
   128  	panic("missing case, switch should be exhaustive")
   129  }
   130  
   131  // SelectFromPair returns the selection of four elements from the two
   132  // vectors x and y, where selector values in the range 0-3 specify
   133  // elements from x and values in the range 4-7 specify the 0-3 elements
   134  // of y.  When the selectors are constants and can be the selection
   135  // can be implemented in a single instruction, it will be, otherwise
   136  // it requires two. a is the source index of the least element in the
   137  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   138  // elements in the output.  For example,
   139  //
   140  //	{1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
   141  //
   142  // returns {4,8,25,81}.
   143  //
   144  // If the selectors are not constant this will translate to a function
   145  // call.
   146  //
   147  // Asm: VSHUFPS, CPU Feature: AVX
   148  func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 {
   149  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   150  
   151  	a, b, c, d = a&3, b&3, c&3, d&3
   152  
   153  	switch pattern {
   154  	case _LLLL:
   155  		return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
   156  	case _HHHH:
   157  		return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
   158  	case _LLHH:
   159  		return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
   160  	case _HHLL:
   161  		return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
   162  
   163  	case _HLLL:
   164  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   165  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   166  	case _LHLL:
   167  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   168  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   169  
   170  	case _HLHH:
   171  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   172  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   173  	case _LHHH:
   174  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   175  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   176  
   177  	case _LLLH:
   178  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   179  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   180  	case _LLHL:
   181  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   182  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   183  	case _HHLH:
   184  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   185  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   186  	case _HHHL:
   187  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   188  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   189  
   190  	case _LHLH:
   191  		z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
   192  		return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   193  	case _HLHL:
   194  		z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
   195  		return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   196  	case _HLLH:
   197  		z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
   198  		return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   199  	case _LHHL:
   200  		z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
   201  		return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   202  	}
   203  	panic("missing case, switch should be exhaustive")
   204  }
   205  
   206  // SelectFromPair returns the selection of four elements from the two
   207  // vectors x and y, where selector values in the range 0-3 specify
   208  // elements from x and values in the range 4-7 specify the 0-3 elements
   209  // of y.  When the selectors are constants and can be the selection
   210  // can be implemented in a single instruction, it will be, otherwise
   211  // it requires two. a is the source index of the least element in the
   212  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   213  // elements in the output.  For example,
   214  //
   215  //	{1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81})
   216  //
   217  // returns {4,8,25,81}.
   218  //
   219  // If the selectors are not constant this will translate to a function
   220  // call.
   221  //
   222  // Asm: VSHUFPS, CPU Feature: AVX
   223  func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 {
   224  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   225  
   226  	a, b, c, d = a&3, b&3, c&3, d&3
   227  
   228  	switch pattern {
   229  	case _LLLL:
   230  		return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
   231  	case _HHHH:
   232  		return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
   233  	case _LLHH:
   234  		return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
   235  	case _HHLL:
   236  		return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
   237  
   238  	case _HLLL:
   239  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   240  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   241  	case _LHLL:
   242  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   243  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   244  
   245  	case _HLHH:
   246  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   247  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   248  	case _LHHH:
   249  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   250  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   251  
   252  	case _LLLH:
   253  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   254  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   255  	case _LLHL:
   256  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   257  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   258  	case _HHLH:
   259  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   260  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   261  	case _HHHL:
   262  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   263  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   264  
   265  	case _LHLH:
   266  		z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
   267  		return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   268  	case _HLHL:
   269  		z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
   270  		return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   271  	case _HLLH:
   272  		z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
   273  		return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   274  	case _LHHL:
   275  		z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
   276  		return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   277  	}
   278  	panic("missing case, switch should be exhaustive")
   279  }
   280  
   281  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   282  // the vectors x and y, the selection of four elements from  x and y,
   283  // where selector values in the range 0-3 specify elements from x and
   284  // values in the range 4-7 specify the 0-3 elements of y.
   285  // When the selectors are constants and can be the selection
   286  // can be implemented in a single instruction, it will be, otherwise
   287  // it requires two. a is the source index of the least element in the
   288  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   289  // elements in the output.  For example,
   290  //
   291  //	{1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
   292  //
   293  // returns {4,8,25,81,64,128,169,289}.
   294  //
   295  // If the selectors are not constant this will translate to a function
   296  // call.
   297  //
   298  // Asm: VSHUFPS, CPU Feature: AVX
   299  func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
   300  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   301  
   302  	a, b, c, d = a&3, b&3, c&3, d&3
   303  
   304  	switch pattern {
   305  	case _LLLL:
   306  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   307  	case _HHHH:
   308  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   309  	case _LLHH:
   310  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   311  	case _HHLL:
   312  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   313  
   314  	case _HLLL:
   315  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   316  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   317  	case _LHLL:
   318  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   319  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   320  
   321  	case _HLHH:
   322  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   323  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   324  	case _LHHH:
   325  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   326  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   327  
   328  	case _LLLH:
   329  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   330  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   331  	case _LLHL:
   332  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   333  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   334  	case _HHLH:
   335  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   336  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   337  	case _HHHL:
   338  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   339  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   340  
   341  	case _LHLH:
   342  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   343  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   344  	case _HLHL:
   345  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   346  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   347  	case _HLLH:
   348  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   349  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   350  	case _LHHL:
   351  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   352  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   353  	}
   354  	panic("missing case, switch should be exhaustive")
   355  }
   356  
   357  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   358  // the vectors x and y, the selection of four elements from  x and y,
   359  // where selector values in the range 0-3 specify elements from x and
   360  // values in the range 4-7 specify the 0-3 elements of y.
   361  // When the selectors are constants and can be the selection
   362  // can be implemented in a single instruction, it will be, otherwise
   363  // it requires two. a is the source index of the least element in the
   364  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   365  // elements in the output.  For example,
   366  //
   367  //	{1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
   368  //
   369  // returns {4,8,25,81,64,128,169,289}.
   370  //
   371  // If the selectors are not constant this will translate to a function
   372  // call.
   373  //
   374  // Asm: VSHUFPS, CPU Feature: AVX
   375  func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
   376  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   377  
   378  	a, b, c, d = a&3, b&3, c&3, d&3
   379  
   380  	switch pattern {
   381  	case _LLLL:
   382  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   383  	case _HHHH:
   384  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   385  	case _LLHH:
   386  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   387  	case _HHLL:
   388  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   389  
   390  	case _HLLL:
   391  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   392  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   393  	case _LHLL:
   394  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   395  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   396  
   397  	case _HLHH:
   398  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   399  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   400  	case _LHHH:
   401  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   402  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   403  
   404  	case _LLLH:
   405  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   406  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   407  	case _LLHL:
   408  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   409  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   410  	case _HHLH:
   411  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   412  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   413  	case _HHHL:
   414  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   415  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   416  
   417  	case _LHLH:
   418  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   419  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   420  	case _HLHL:
   421  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   422  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   423  	case _HLLH:
   424  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   425  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   426  	case _LHHL:
   427  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   428  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   429  	}
   430  	panic("missing case, switch should be exhaustive")
   431  }
   432  
   433  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   434  // the vectors x and y, the selection of four elements from  x and y,
   435  // where selector values in the range 0-3 specify elements from x and
   436  // values in the range 4-7 specify the 0-3 elements of y.
   437  // When the selectors are constants and can be the selection
   438  // can be implemented in a single instruction, it will be, otherwise
   439  // it requires two. a is the source index of the least element in the
   440  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   441  // elements in the output.  For example,
   442  //
   443  //	{1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
   444  //
   445  // returns {4,8,25,81,64,128,169,289}.
   446  //
   447  // If the selectors are not constant this will translate to a function
   448  // call.
   449  //
   450  // Asm: VSHUFPS, CPU Feature: AVX
   451  func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
   452  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   453  
   454  	a, b, c, d = a&3, b&3, c&3, d&3
   455  
   456  	switch pattern {
   457  	case _LLLL:
   458  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   459  	case _HHHH:
   460  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   461  	case _LLHH:
   462  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   463  	case _HHLL:
   464  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   465  
   466  	case _HLLL:
   467  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   468  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   469  	case _LHLL:
   470  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   471  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   472  
   473  	case _HLHH:
   474  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   475  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   476  	case _LHHH:
   477  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   478  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   479  
   480  	case _LLLH:
   481  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   482  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   483  	case _LLHL:
   484  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   485  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   486  	case _HHLH:
   487  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   488  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   489  	case _HHHL:
   490  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   491  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   492  
   493  	case _LHLH:
   494  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   495  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   496  	case _HLHL:
   497  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   498  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   499  	case _HLLH:
   500  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   501  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   502  	case _LHHL:
   503  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   504  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   505  	}
   506  	panic("missing case, switch should be exhaustive")
   507  }
   508  
   509  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   510  // of the vectors x and y, the selection of four elements from  x and y,
   511  // where selector values in the range 0-3 specify elements from x and
   512  // values in the range 4-7 specify the 0-3 elements of y.
   513  // When the selectors are constants and can be the selection
   514  // can be implemented in a single instruction, it will be, otherwise
   515  // it requires two.
   516  //
   517  // If the selectors are not constant this will translate to a function
   518  // call.
   519  //
   520  // Asm: VSHUFPS, CPU Feature: AVX512
   521  func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
   522  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   523  
   524  	a, b, c, d = a&3, b&3, c&3, d&3
   525  
   526  	switch pattern {
   527  	case _LLLL:
   528  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   529  	case _HHHH:
   530  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   531  	case _LLHH:
   532  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   533  	case _HHLL:
   534  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   535  
   536  	case _HLLL:
   537  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   538  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   539  	case _LHLL:
   540  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   541  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   542  
   543  	case _HLHH:
   544  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   545  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   546  	case _LHHH:
   547  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   548  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   549  
   550  	case _LLLH:
   551  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   552  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   553  	case _LLHL:
   554  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   555  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   556  	case _HHLH:
   557  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   558  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   559  	case _HHHL:
   560  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   561  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   562  
   563  	case _LHLH:
   564  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   565  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   566  	case _HLHL:
   567  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   568  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   569  	case _HLLH:
   570  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   571  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   572  	case _LHHL:
   573  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   574  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   575  	}
   576  	panic("missing case, switch should be exhaustive")
   577  }
   578  
   579  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   580  // of the vectors x and y, the selection of four elements from  x and y,
   581  // where selector values in the range 0-3 specify elements from x and
   582  // values in the range 4-7 specify the 0-3 elements of y.
   583  // When the selectors are constants and can be the selection
   584  // can be implemented in a single instruction, it will be, otherwise
   585  // it requires two.
   586  //
   587  // If the selectors are not constant this will translate to a function
   588  // call.
   589  //
   590  // Asm: VSHUFPS, CPU Feature: AVX512
   591  func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
   592  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   593  
   594  	a, b, c, d = a&3, b&3, c&3, d&3
   595  
   596  	switch pattern {
   597  	case _LLLL:
   598  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   599  	case _HHHH:
   600  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   601  	case _LLHH:
   602  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   603  	case _HHLL:
   604  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   605  
   606  	case _HLLL:
   607  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   608  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   609  	case _LHLL:
   610  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   611  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   612  
   613  	case _HLHH:
   614  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   615  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   616  	case _LHHH:
   617  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   618  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   619  
   620  	case _LLLH:
   621  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   622  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   623  	case _LLHL:
   624  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   625  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   626  	case _HHLH:
   627  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   628  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   629  	case _HHHL:
   630  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   631  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   632  
   633  	case _LHLH:
   634  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   635  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   636  	case _HLHL:
   637  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   638  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   639  	case _HLLH:
   640  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   641  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   642  	case _LHHL:
   643  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   644  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   645  	}
   646  	panic("missing case, switch should be exhaustive")
   647  }
   648  
   649  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   650  // of the vectors x and y, the selection of four elements from  x and y,
   651  // where selector values in the range 0-3 specify elements from x and
   652  // values in the range 4-7 specify the 0-3 elements of y.
   653  // When the selectors are constants and the selection can be
   654  // implemented in a single instruction, it will be; otherwise
   655  // it requires two.
   656  //
   657  // If the selectors are not constant this will translate to a function
   658  // call.
   659  //
   660  // Asm: VSHUFPS, CPU Feature: AVX512
   661  func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
   662  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   663  
   664  	a, b, c, d = a&3, b&3, c&3, d&3
   665  
   666  	switch pattern {
   667  	case _LLLL:
   668  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   669  	case _HHHH:
   670  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   671  	case _LLHH:
   672  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   673  	case _HHLL:
   674  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   675  
   676  	case _HLLL:
   677  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   678  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   679  	case _LHLL:
   680  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   681  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   682  
   683  	case _HLHH:
   684  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   685  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   686  	case _LHHH:
   687  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   688  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   689  
   690  	case _LLLH:
   691  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   692  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   693  	case _LLHL:
   694  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   695  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   696  	case _HHLH:
   697  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   698  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   699  	case _HHHL:
   700  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   701  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   702  
   703  	case _LHLH:
   704  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   705  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   706  	case _HLHL:
   707  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   708  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   709  	case _HLLH:
   710  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   711  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   712  	case _LHHL:
   713  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   714  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   715  	}
   716  	panic("missing case, switch should be exhaustive")
   717  }
   718  
   719  // cscimm4 converts the 4 vector element indices into a single
   720  // uint8 for use as an immediate.
   721  func cscimm4(a, b, c, d uint8) uint8 {
   722  	return uint8(a + b<<2 + c<<4 + d<<6)
   723  }
   724  
   725  // cscimm2 converts the 2 vector element indices into a single
   726  // uint8 for use as an immediate.
   727  func cscimm2(a, b uint8) uint8 {
   728  	return uint8(a + b<<1)
   729  }
   730  
   731  // cscimm2g2 converts the 2 vector element indices into a single
   732  // uint8 for use as an immediate, but duplicated for VSHUFPD
   733  // to emulate grouped behavior of VSHUFPS
   734  func cscimm2g2(a, b uint8) uint8 {
   735  	g := cscimm2(a, b)
   736  	return g + g<<2
   737  }
   738  
   739  // cscimm2g4 converts the 2 vector element indices into a single
   740  // uint8 for use as an immediate, but with four copies for VSHUFPD
   741  // to emulate grouped behavior of VSHUFPS
   742  func cscimm2g4(a, b uint8) uint8 {
   743  	g := cscimm2g2(a, b)
   744  	return g + g<<4
   745  }
   746  
   747  // SelectFromPair returns the selection of two elements from the two
   748  // vectors x and y, where selector values in the range 0-1 specify
   749  // elements from x and values in the range 2-3 specify the 0-1 elements
   750  // of y.  When the selectors are constants the selection can be
   751  // implemented in a single instruction.
   752  //
   753  // If the selectors are not constant this will translate to a function
   754  // call.
   755  //
   756  // Asm: VSHUFPD, CPU Feature: AVX
   757  func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 {
   758  	pattern := (a&2)>>1 + (b & 2)
   759  
   760  	a, b = a&1, b&1
   761  
   762  	switch pattern {
   763  	case _LL:
   764  		return x.concatSelectedConstant(cscimm2(a, b), x)
   765  	case _HH:
   766  		return y.concatSelectedConstant(cscimm2(a, b), y)
   767  	case _LH:
   768  		return x.concatSelectedConstant(cscimm2(a, b), y)
   769  	case _HL:
   770  		return y.concatSelectedConstant(cscimm2(a, b), x)
   771  	}
   772  	panic("missing case, switch should be exhaustive")
   773  }
   774  
   775  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   776  // the vectors x and y, the selection of two elements from the two
   777  // vectors x and y, where selector values in the range 0-1 specify
   778  // elements from x and values in the range 2-3 specify the 0-1 elements
   779  // of y.  When the selectors are constants the selection can be
   780  // implemented in a single instruction.
   781  //
   782  // If the selectors are not constant this will translate to a function
   783  // call.
   784  //
   785  // Asm: VSHUFPD, CPU Feature: AVX
   786  func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 {
   787  	pattern := (a&2)>>1 + (b & 2)
   788  
   789  	a, b = a&1, b&1
   790  
   791  	switch pattern {
   792  	case _LL:
   793  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   794  	case _HH:
   795  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   796  	case _LH:
   797  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   798  	case _HL:
   799  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   800  	}
   801  	panic("missing case, switch should be exhaustive")
   802  }
   803  
   804  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   805  // of the vectors x and y, the selection of two elements from the two
   806  // vectors x and y, where selector values in the range 0-1 specify
   807  // elements from x and values in the range 2-3 specify the 0-1 elements
   808  // of y.  When the selectors are constants the selection can be
   809  // implemented in a single instruction.
   810  //
   811  // If the selectors are not constant this will translate to a function
   812  // call.
   813  //
   814  // Asm: VSHUFPD, CPU Feature: AVX512
   815  func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 {
   816  	pattern := (a&2)>>1 + (b & 2)
   817  
   818  	a, b = a&1, b&1
   819  
   820  	switch pattern {
   821  	case _LL:
   822  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   823  	case _HH:
   824  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   825  	case _LH:
   826  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   827  	case _HL:
   828  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   829  	}
   830  	panic("missing case, switch should be exhaustive")
   831  }
   832  
   833  // SelectFromPair returns the selection of two elements from the two
   834  // vectors x and y, where selector values in the range 0-1 specify
   835  // elements from x and values in the range 2-3 specify the 0-1 elements
   836  // of y.  When the selectors are constants the selection can be
   837  // implemented in a single instruction.
   838  //
   839  // If the selectors are not constant this will translate to a function
   840  // call.
   841  //
   842  // Asm: VSHUFPD, CPU Feature: AVX
   843  func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 {
   844  	pattern := (a&2)>>1 + (b & 2)
   845  
   846  	a, b = a&1, b&1
   847  
   848  	switch pattern {
   849  	case _LL:
   850  		return x.concatSelectedConstant(cscimm2(a, b), x)
   851  	case _HH:
   852  		return y.concatSelectedConstant(cscimm2(a, b), y)
   853  	case _LH:
   854  		return x.concatSelectedConstant(cscimm2(a, b), y)
   855  	case _HL:
   856  		return y.concatSelectedConstant(cscimm2(a, b), x)
   857  	}
   858  	panic("missing case, switch should be exhaustive")
   859  }
   860  
   861  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   862  // the vectors x and y, the selection of two elements from the two
   863  // vectors x and y, where selector values in the range 0-1 specify
   864  // elements from x and values in the range 2-3 specify the 0-1 elements
   865  // of y.  When the selectors are constants the selection can be
   866  // implemented in a single instruction.
   867  //
   868  // If the selectors are not constant this will translate to a function
   869  // call.
   870  //
   871  // Asm: VSHUFPD, CPU Feature: AVX
   872  func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 {
   873  	pattern := (a&2)>>1 + (b & 2)
   874  
   875  	a, b = a&1, b&1
   876  
   877  	switch pattern {
   878  	case _LL:
   879  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   880  	case _HH:
   881  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   882  	case _LH:
   883  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   884  	case _HL:
   885  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   886  	}
   887  	panic("missing case, switch should be exhaustive")
   888  }
   889  
   890  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   891  // of the vectors x and y, the selection of two elements from the two
   892  // vectors x and y, where selector values in the range 0-1 specify
   893  // elements from x and values in the range 2-3 specify the 0-1 elements
   894  // of y.  When the selectors are constants the selection can be
   895  // implemented in a single instruction.
   896  //
   897  // If the selectors are not constant this will translate to a function
   898  // call.
   899  //
   900  // Asm: VSHUFPD, CPU Feature: AVX512
   901  func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 {
   902  	pattern := (a&2)>>1 + (b & 2)
   903  
   904  	a, b = a&1, b&1
   905  
   906  	switch pattern {
   907  	case _LL:
   908  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   909  	case _HH:
   910  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   911  	case _LH:
   912  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   913  	case _HL:
   914  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   915  	}
   916  	panic("missing case, switch should be exhaustive")
   917  }
   918  
   919  // SelectFromPair returns the selection of two elements from the two
   920  // vectors x and y, where selector values in the range 0-1 specify
   921  // elements from x and values in the range 2-3 specify the 0-1 elements
   922  // of y.  When the selectors are constants the selection can be
   923  // implemented in a single instruction.
   924  //
   925  // If the selectors are not constant this will translate to a function
   926  // call.
   927  //
   928  // Asm: VSHUFPD, CPU Feature: AVX
   929  func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 {
   930  	pattern := (a&2)>>1 + (b & 2)
   931  
   932  	a, b = a&1, b&1
   933  
   934  	switch pattern {
   935  	case _LL:
   936  		return x.concatSelectedConstant(cscimm2(a, b), x)
   937  	case _HH:
   938  		return y.concatSelectedConstant(cscimm2(a, b), y)
   939  	case _LH:
   940  		return x.concatSelectedConstant(cscimm2(a, b), y)
   941  	case _HL:
   942  		return y.concatSelectedConstant(cscimm2(a, b), x)
   943  	}
   944  	panic("missing case, switch should be exhaustive")
   945  }
   946  
   947  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   948  // the vectors x and y, the selection of two elements from the two
   949  // vectors x and y, where selector values in the range 0-1 specify
   950  // elements from x and values in the range 2-3 specify the 0-1 elements
   951  // of y.  When the selectors are constants the selection can be
   952  // implemented in a single instruction.
   953  //
   954  // If the selectors are not constant this will translate to a function
   955  // call.
   956  //
   957  // Asm: VSHUFPD, CPU Feature: AVX
   958  func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 {
   959  	pattern := (a&2)>>1 + (b & 2)
   960  
   961  	a, b = a&1, b&1
   962  
   963  	switch pattern {
   964  	case _LL:
   965  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   966  	case _HH:
   967  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   968  	case _LH:
   969  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   970  	case _HL:
   971  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   972  	}
   973  	panic("missing case, switch should be exhaustive")
   974  }
   975  
   976  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   977  // of the vectors x and y, the selection of two elements from the two
   978  // vectors x and y, where selector values in the range 0-1 specify
   979  // elements from x and values in the range 2-3 specify the 0-1 elements
   980  // of y.  When the selectors are constants the selection can be
   981  // implemented in a single instruction.
   982  //
   983  // If the selectors are not constant this will translate to a function
   984  // call.
   985  //
   986  // Asm: VSHUFPD, CPU Feature: AVX512
   987  func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
   988  	pattern := (a&2)>>1 + (b & 2)
   989  
   990  	a, b = a&1, b&1
   991  
   992  	switch pattern {
   993  	case _LL:
   994  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   995  	case _HH:
   996  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   997  	case _LH:
   998  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   999  	case _HL:
  1000  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
  1001  	}
  1002  	panic("missing case, switch should be exhaustive")
  1003  }
  1004  
  1005  /* PermuteScalars */
  1006  
  1007  // PermuteScalars performs a permutation of vector x's elements using the supplied indices:
  1008  //
  1009  //	result = {x[a], x[b], x[c], x[d]}
  1010  //
  1011  // Parameters a,b,c,d should have values between 0 and 3.
  1012  // If a through d are constants, then an instruction will be inlined, otherwise
  1013  // a jump table may be generated.
  1014  //
  1015  // Asm: VPSHUFD, CPU Feature: AVX
  1016  func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
  1017  	return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1018  }
  1019  
  1020  // PermuteScalars performs a permutation of vector x's elements using the supplied indices:
  1021  //
  1022  //	result = {x[a], x[b], x[c], x[d]}
  1023  //
  1024  // Parameters a,b,c,d should have values between 0 and 3.
  1025  // If a through d are constants, then an instruction will be inlined, otherwise
  1026  // a jump table may be generated.
  1027  //
  1028  // Asm: VPSHUFD, CPU Feature: AVX
  1029  func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
  1030  	return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1031  }
  1032  
  1033  /* PermuteScalarsGrouped */
  1034  
  1035  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1036  //
  1037  //	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
  1038  //
  1039  // Parameters a,b,c,d should have values between 0 and 3.
  1040  // If a through d are constants, then an instruction will be inlined, otherwise
  1041  // a jump table may be generated.
  1042  //
  1043  // Asm: VPSHUFD, CPU Feature: AVX2
  1044  func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
  1045  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1046  }
  1047  
  1048  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1049  //
  1050  //	 result =
  1051  //		 {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
  1052  //			x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
  1053  //
  1054  // Parameters a,b,c,d should have values between 0 and 3.
  1055  // If a through d are constants, then an instruction will be inlined, otherwise
  1056  // a jump table may be generated.
  1057  //
  1058  // Asm: VPSHUFD, CPU Feature: AVX512
  1059  func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
  1060  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1061  }
  1062  
  1063  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1064  //
  1065  //	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
  1066  //
  1067  // Parameters a,b,c,d should have values between 0 and 3.
  1068  // If a through d are constants, then an instruction will be inlined, otherwise
  1069  // a jump table is generated.
  1070  //
  1071  // Asm: VPSHUFD, CPU Feature: AVX2
  1072  func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
  1073  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1074  }
  1075  
  1076  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1077  //
  1078  //	 result =
  1079  //		 {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
  1080  //			x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
  1081  //
  1082  // Parameters a,b,c,d should have values between 0 and 3.
  1083  // If a through d are constants, then an instruction will be inlined, otherwise
  1084  // a jump table is generated.
  1085  //
  1086  // Asm: VPSHUFD, CPU Feature: AVX512
  1087  func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
  1088  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1089  }
  1090  
  1091  /* PermuteScalarsHi */
  1092  
  1093  // PermuteScalarsHi performs a permutation of vector x using the supplied indices:
  1094  //
  1095  //	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
  1096  //
  1097  // Parameters a,b,c,d should have values between 0 and 3.
  1098  // If a through d are constants, then an instruction will be inlined, otherwise
  1099  // a jump table is generated.
  1100  //
  1101  // Asm: VPSHUFHW, CPU Feature: AVX512
  1102  func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
  1103  	return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1104  }
  1105  
  1106  // PermuteScalarsHi performs a permutation of vector x using the supplied indices:
  1107  //
  1108  //	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
  1109  //
  1110  // Parameters a,b,c,d should have values between 0 and 3.
  1111  // If a through d are constants, then an instruction will be inlined, otherwise
  1112  // a jump table is generated.
  1113  //
  1114  // Asm: VPSHUFHW, CPU Feature: AVX512
  1115  func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
  1116  	return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1117  }
  1118  
  1119  /* PermuteScalarsHiGrouped */
  1120  
  1121  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1122  //
  1123  //	 result =
  1124  //		  {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
  1125  //			x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
  1126  //
  1127  // Parameters a,b,c,d should have values between 0 and 3.
  1128  // If a through d are constants, then an instruction will be inlined, otherwise
  1129  // a jump table is generated.
  1130  //
  1131  // Asm: VPSHUFHW, CPU Feature: AVX2
  1132  func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
  1133  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1134  }
  1135  
  1136  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1137  //
  1138  //	 result =
  1139  //		  {x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
  1140  //			x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
  1141  //			x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
  1142  //			x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
  1143  //
  1144  // Parameters a,b,c,d should have values between 0 and 3.
  1145  // If a through d are constants, then an instruction will be inlined, otherwise
  1146  // a jump table is generated.
  1147  //
  1148  // Asm: VPSHUFHW, CPU Feature: AVX512
  1149  func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
  1150  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1151  }
  1152  
  1153  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1154  //
  1155  //	 result =
  1156  //	  {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
  1157  //		x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
  1158  //
  1159  // Each group is of size 128-bit.
  1160  //
  1161  // Parameters a,b,c,d should have values between 0 and 3.
  1162  // If a through d are constants, then an instruction will be inlined, otherwise
  1163  // a jump table is generated.
  1164  //
  1165  // Asm: VPSHUFHW, CPU Feature: AVX2
  1166  func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
  1167  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1168  }
  1169  
  1170  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1171  //
  1172  //	 result =
  1173  //		 {  x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
  1174  //			x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
  1175  //			x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
  1176  //			x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
  1177  //
  1178  // Parameters a,b,c,d should have values between 0 and 3.
  1179  // If a through d are constants, then an instruction will be inlined, otherwise
  1180  // a jump table is generated.
  1181  //
  1182  // Asm: VPSHUFHW, CPU Feature: AVX512
  1183  func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
  1184  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1185  }
  1186  
  1187  /* PermuteScalarsLo */
  1188  
  1189  // PermuteScalarsLo performs a permutation of vector x using the supplied indices:
  1190  //
  1191  //	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
  1192  //
  1193  // Parameters a,b,c,d should have values between 0 and 3.
  1194  // If a through d are constants, then an instruction will be inlined, otherwise
  1195  // a jump table is generated.
  1196  //
  1197  // Asm: VPSHUFLW, CPU Feature: AVX512
  1198  func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
  1199  	return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1200  }
  1201  
  1202  // PermuteScalarsLo performs a permutation of vector x using the supplied indices:
  1203  //
  1204  //	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
  1205  //
  1206  // Parameters a,b,c,d should have values between 0 and 3.
  1207  // If a through d are constants, then an instruction will be inlined, otherwise
  1208  // a jump table is generated.
  1209  //
  1210  // Asm: VPSHUFLW, CPU Feature: AVX512
  1211  func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
  1212  	return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1213  }
  1214  
  1215  /* PermuteScalarsLoGrouped */
  1216  
  1217  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1218  //
  1219  //	 result =
  1220  //	 {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
  1221  //		 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
  1222  //
  1223  // Parameters a,b,c,d should have values between 0 and 3.
  1224  // If a through d are constants, then an instruction will be inlined, otherwise
  1225  // a jump table is generated.
  1226  //
  1227  // Asm: VPSHUFLW, CPU Feature: AVX2
  1228  func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
  1229  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1230  }
  1231  
  1232  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1233  //
  1234  //	 result =
  1235  //	 {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
  1236  //		x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
  1237  //		x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
  1238  //		x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
  1239  //
  1240  // Parameters a,b,c,d should have values between 0 and 3.
  1241  // If a through d are constants, then an instruction will be inlined, otherwise
  1242  // a jump table is generated.
  1243  //
  1244  // Asm: VPSHUFLW, CPU Feature: AVX512
  1245  func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
  1246  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1247  }
  1248  
  1249  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1250  //
  1251  //	 result = {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
  1252  //		x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
  1253  //
  1254  // Parameters a,b,c,d should have values between 0 and 3.
  1255  // If a through d are constants, then an instruction will be inlined, otherwise
  1256  // a jump table is generated.
  1257  //
  1258  // Asm: VPSHUFLW, CPU Feature: AVX2
  1259  func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
  1260  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1261  }
  1262  
  1263  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1264  //
  1265  //	 result =
  1266  //	 {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
  1267  //		x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
  1268  //		x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
  1269  //		x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
  1270  //
  1271  // Each group is of size 128-bit.
  1272  //
  1273  // Parameters a,b,c,d should have values between 0 and 3.
  1274  // If a through d are constants, then an instruction will be inlined, otherwise
  1275  // a jump table is generated.
  1276  //
  1277  // Asm: VPSHUFLW, CPU Feature: AVX512
  1278  func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
  1279  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1280  }
  1281  
  1282  // CarrylessMultiply computes one of four possible carryless
  1283  // multiplications of selected high and low halves of x and y,
  1284  // depending on the values of a and b, returning the 128-bit
  1285  // product in the concatenated two elements of the result.
  1286  // a selects the low (0) or high (1) element of x and
  1287  // b selects the low (0) or high (1) element of y.
  1288  //
  1289  // A carryless multiplication uses bitwise XOR instead of
  1290  // add-with-carry, for example (in base two):
  1291  //
  1292  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  1293  //
  1294  // This also models multiplication of polynomials with coefficients
  1295  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  1296  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  1297  // polynomial terms, but coefficients "add" with XOR.)
  1298  //
  1299  // constant values of a and b will result in better performance,
  1300  // otherwise the intrinsic may translate into a jump table.
  1301  //
  1302  // Asm: VPCLMULQDQ, CPU Feature: AVX
  1303  func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
  1304  	return x.carrylessMultiply(a&1+((b&1)<<4), y)
  1305  }
  1306  
  1307  // CarrylessMultiplyGrouped computes one of four possible carryless
  1308  // multiplications of selected high and low halves of each of the two
  1309  // 128-bit lanes of x and y, depending on the values of a and b,
  1310  // and returns the four 128-bit products in the result's lanes.
  1311  // a selects the low (0) or high (1) elements of x's lanes and
  1312  // b selects the low (0) or high (1) elements of y's lanes.
  1313  //
  1314  // A carryless multiplication uses bitwise XOR instead of
  1315  // add-with-carry, for example (in base two):
  1316  //
  1317  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  1318  //
  1319  // This also models multiplication of polynomials with coefficients
  1320  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  1321  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  1322  // polynomial terms, but coefficients "add" with XOR.)
  1323  //
  1324  // constant values of a and b will result in better performance,
  1325  // otherwise the intrinsic may translate into a jump table.
  1326  //
  1327  // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
  1328  func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
  1329  	return x.carrylessMultiply(a&1+((b&1)<<4), y)
  1330  }
  1331  
  1332  // CarrylessMultiplyGrouped computes one of four possible carryless
  1333  // multiplications of selected high and low halves of each of the four
  1334  // 128-bit lanes of x and y, depending on the values of a and b,
  1335  // and returns the four 128-bit products in the result's lanes.
  1336  // a selects the low (0) or high (1) elements of x's lanes and
  1337  // b selects the low (0) or high (1) elements of y's lanes.
  1338  //
  1339  // A carryless multiplication uses bitwise XOR instead of
  1340  // add-with-carry, for example (in base two):
  1341  //
  1342  //	11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
  1343  //
  1344  // This also models multiplication of polynomials with coefficients
  1345  // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
  1346  // x**2 + 0x + 1 = x**2 + 1 modeled by 101.  (Note that "+" adds
  1347  // polynomial terms, but coefficients "add" with XOR.)
  1348  //
  1349  // constant values of a and b will result in better performance,
  1350  // otherwise the intrinsic may translate into a jump table.
  1351  //
  1352  // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
  1353  func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
  1354  	return x.carrylessMultiply(a&1+((b&1)<<4), y)
  1355  }
  1356  

View as plain text