Source file src/simd/shuffles_amd64.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package simd
     8  
     9  // These constants represent the source pattern for the four parameters
    10  // (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped.
    11  // L means the element comes from the 'x' vector (Low), and
    12  // H means it comes from the 'y' vector (High).
    13  // The order of the letters corresponds to elements a, b, c, d.
    14  // The underlying integer value is a bitmask where:
    15  // Bit 0: Source of element 'a' (0 for x, 1 for y)
    16  // Bit 1: Source of element 'b' (0 for x, 1 for y)
    17  // Bit 2: Source of element 'c' (0 for x, 1 for y)
    18  // Bit 3: Source of element 'd' (0 for x, 1 for y)
    19  // Note that the least-significant bit is on the LEFT in this encoding.
    20  const (
    21  	_LLLL = iota // a:x, b:x, c:x, d:x
    22  	_HLLL        // a:y, b:x, c:x, d:x
    23  	_LHLL        // a:x, b:y, c:x, d:x
    24  	_HHLL        // a:y, b:y, c:x, d:x
    25  	_LLHL        // a:x, b:x, c:y, d:x
    26  	_HLHL        // a:y, b:x, c:y, d:x
    27  	_LHHL        // a:x, b:y, c:y, d:x
    28  	_HHHL        // a:y, b:y, c:y, d:x
    29  	_LLLH        // a:x, b:x, c:x, d:y
    30  	_HLLH        // a:y, b:x, c:x, d:y
    31  	_LHLH        // a:x, b:y, c:x, d:y
    32  	_HHLH        // a:y, b:y, c:x, d:y
    33  	_LLHH        // a:x, b:x, c:y, d:y
    34  	_HLHH        // a:y, b:x, c:y, d:y
    35  	_LHHH        // a:x, b:y, c:y, d:y
    36  	_HHHH        // a:y, b:y, c:y, d:y
    37  )
    38  
    39  // These constants represent the source pattern for the four parameters
    40  // (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for
    41  // two-element vectors.
    42  const (
    43  	_LL = iota
    44  	_HL
    45  	_LH
    46  	_HH
    47  )
    48  
    49  // SelectFromPair returns the selection of four elements from the two
    50  // vectors x and y, where selector values in the range 0-3 specify
    51  // elements from x and values in the range 4-7 specify the 0-3 elements
    52  // of y.  When the selectors are constants and the selection can be
    53  // implemented in a single instruction, it will be, otherwise it
    54  // requires two.  a is the source index of the least element in the
    55  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
    56  // elements in the output.  For example,
    57  // {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
    58  //
    59  // If the selectors are not constant this will translate to a function
    60  // call.
    61  //
    62  // Asm: VSHUFPS, CPU Feature: AVX
    63  func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 {
    64  	// pattern gets the concatenation of "x or y?" bits
    65  	// (0 == x, 1 == y)
    66  	// This will determine operand choice/order and whether a second
    67  	// instruction is needed.
    68  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
    69  
    70  	// a-d are masked down to their offsets within x or y
    71  	// this is not necessary for x, but this is easier on the
    72  	// eyes and reduces the risk of an error now or later.
    73  	a, b, c, d = a&3, b&3, c&3, d&3
    74  
    75  	switch pattern {
    76  	case _LLLL:
    77  		return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
    78  	case _HHHH:
    79  		return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
    80  	case _LLHH:
    81  		return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
    82  	case _HHLL:
    83  		return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
    84  
    85  	case _HLLL:
    86  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
    87  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
    88  	case _LHLL:
    89  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
    90  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
    91  
    92  	case _HLHH:
    93  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
    94  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
    95  	case _LHHH:
    96  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
    97  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
    98  
    99  	case _LLLH:
   100  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   101  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   102  	case _LLHL:
   103  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   104  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   105  	case _HHLH:
   106  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   107  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   108  	case _HHHL:
   109  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   110  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   111  
   112  	case _LHLH:
   113  		z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
   114  		return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   115  	case _HLHL:
   116  		z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
   117  		return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   118  	case _HLLH:
   119  		z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
   120  		return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   121  	case _LHHL:
   122  		z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
   123  		return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   124  	}
   125  	panic("missing case, switch should be exhaustive")
   126  }
   127  
   128  // SelectFromPair returns the selection of four elements from the two
   129  // vectors x and y, where selector values in the range 0-3 specify
   130  // elements from x and values in the range 4-7 specify the 0-3 elements
   131  // of y.  When the selectors are constants and can be the selection
   132  // can be implemented in a single instruction, it will be, otherwise
   133  // it requires two. a is the source index of the least element in the
   134  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   135  // elements in the output.  For example,
   136  // {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
   137  //
   138  // If the selectors are not constant this will translate to a function
   139  // call.
   140  //
   141  // Asm: VSHUFPS, CPU Feature: AVX
   142  func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 {
   143  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   144  
   145  	a, b, c, d = a&3, b&3, c&3, d&3
   146  
   147  	switch pattern {
   148  	case _LLLL:
   149  		return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
   150  	case _HHHH:
   151  		return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
   152  	case _LLHH:
   153  		return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
   154  	case _HHLL:
   155  		return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
   156  
   157  	case _HLLL:
   158  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   159  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   160  	case _LHLL:
   161  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   162  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   163  
   164  	case _HLHH:
   165  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   166  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   167  	case _LHHH:
   168  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   169  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   170  
   171  	case _LLLH:
   172  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   173  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   174  	case _LLHL:
   175  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   176  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   177  	case _HHLH:
   178  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   179  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   180  	case _HHHL:
   181  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   182  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   183  
   184  	case _LHLH:
   185  		z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
   186  		return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   187  	case _HLHL:
   188  		z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
   189  		return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   190  	case _HLLH:
   191  		z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
   192  		return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   193  	case _LHHL:
   194  		z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
   195  		return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   196  	}
   197  	panic("missing case, switch should be exhaustive")
   198  }
   199  
   200  // SelectFromPair returns the selection of four elements from the two
   201  // vectors x and y, where selector values in the range 0-3 specify
   202  // elements from x and values in the range 4-7 specify the 0-3 elements
   203  // of y.  When the selectors are constants and can be the selection
   204  // can be implemented in a single instruction, it will be, otherwise
   205  // it requires two. a is the source index of the least element in the
   206  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   207  // elements in the output.  For example,
   208  // {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
   209  //
   210  // If the selectors are not constant this will translate to a function
   211  // call.
   212  //
   213  // Asm: VSHUFPS, CPU Feature: AVX
   214  func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 {
   215  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   216  
   217  	a, b, c, d = a&3, b&3, c&3, d&3
   218  
   219  	switch pattern {
   220  	case _LLLL:
   221  		return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
   222  	case _HHHH:
   223  		return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
   224  	case _LLHH:
   225  		return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
   226  	case _HHLL:
   227  		return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
   228  
   229  	case _HLLL:
   230  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   231  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   232  	case _LHLL:
   233  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   234  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
   235  
   236  	case _HLHH:
   237  		z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
   238  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   239  	case _LHHH:
   240  		z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
   241  		return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
   242  
   243  	case _LLLH:
   244  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   245  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   246  	case _LLHL:
   247  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   248  		return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   249  	case _HHLH:
   250  		z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
   251  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   252  	case _HHHL:
   253  		z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
   254  		return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
   255  
   256  	case _LHLH:
   257  		z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
   258  		return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   259  	case _HLHL:
   260  		z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
   261  		return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   262  	case _HLLH:
   263  		z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
   264  		return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   265  	case _LHHL:
   266  		z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
   267  		return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   268  	}
   269  	panic("missing case, switch should be exhaustive")
   270  }
   271  
   272  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   273  // the vectors x and y, the selection of four elements from  x and y,
   274  // where selector values in the range 0-3 specify elements from x and
   275  // values in the range 4-7 specify the 0-3 elements of y.
   276  // When the selectors are constants and can be the selection
   277  // can be implemented in a single instruction, it will be, otherwise
   278  // it requires two. a is the source index of the least element in the
   279  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   280  // elements in the output.  For example,
   281  // {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
   282  //
   283  //	returns {4,8,25,81,64,128,169,289}
   284  //
   285  // If the selectors are not constant this will translate to a function
   286  // call.
   287  //
   288  // Asm: VSHUFPS, CPU Feature: AVX
   289  func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
   290  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   291  
   292  	a, b, c, d = a&3, b&3, c&3, d&3
   293  
   294  	switch pattern {
   295  	case _LLLL:
   296  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   297  	case _HHHH:
   298  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   299  	case _LLHH:
   300  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   301  	case _HHLL:
   302  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   303  
   304  	case _HLLL:
   305  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   306  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   307  	case _LHLL:
   308  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   309  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   310  
   311  	case _HLHH:
   312  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   313  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   314  	case _LHHH:
   315  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   316  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   317  
   318  	case _LLLH:
   319  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   320  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   321  	case _LLHL:
   322  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   323  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   324  	case _HHLH:
   325  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   326  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   327  	case _HHHL:
   328  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   329  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   330  
   331  	case _LHLH:
   332  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   333  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   334  	case _HLHL:
   335  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   336  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   337  	case _HLLH:
   338  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   339  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   340  	case _LHHL:
   341  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   342  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   343  	}
   344  	panic("missing case, switch should be exhaustive")
   345  }
   346  
   347  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   348  // the vectors x and y, the selection of four elements from  x and y,
   349  // where selector values in the range 0-3 specify elements from x and
   350  // values in the range 4-7 specify the 0-3 elements of y.
   351  // When the selectors are constants and can be the selection
   352  // can be implemented in a single instruction, it will be, otherwise
   353  // it requires two. a is the source index of the least element in the
   354  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   355  // elements in the output.  For example,
   356  // {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
   357  //
   358  //	returns {4,8,25,81,64,128,169,289}
   359  //
   360  // If the selectors are not constant this will translate to a function
   361  // call.
   362  //
   363  // Asm: VSHUFPS, CPU Feature: AVX
   364  func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
   365  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   366  
   367  	a, b, c, d = a&3, b&3, c&3, d&3
   368  
   369  	switch pattern {
   370  	case _LLLL:
   371  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   372  	case _HHHH:
   373  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   374  	case _LLHH:
   375  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   376  	case _HHLL:
   377  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   378  
   379  	case _HLLL:
   380  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   381  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   382  	case _LHLL:
   383  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   384  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   385  
   386  	case _HLHH:
   387  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   388  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   389  	case _LHHH:
   390  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   391  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   392  
   393  	case _LLLH:
   394  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   395  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   396  	case _LLHL:
   397  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   398  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   399  	case _HHLH:
   400  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   401  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   402  	case _HHHL:
   403  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   404  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   405  
   406  	case _LHLH:
   407  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   408  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   409  	case _HLHL:
   410  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   411  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   412  	case _HLLH:
   413  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   414  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   415  	case _LHHL:
   416  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   417  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   418  	}
   419  	panic("missing case, switch should be exhaustive")
   420  }
   421  
   422  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   423  // the vectors x and y, the selection of four elements from  x and y,
   424  // where selector values in the range 0-3 specify elements from x and
   425  // values in the range 4-7 specify the 0-3 elements of y.
   426  // When the selectors are constants and can be the selection
   427  // can be implemented in a single instruction, it will be, otherwise
   428  // it requires two. a is the source index of the least element in the
   429  // output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
   430  // elements in the output.  For example,
   431  // {1,2,4,8,16,32,64,128}.SelectFromPair(2,3,5,7,{9,25,49,81,121,169,225,289})
   432  //
   433  //	returns {4,8,25,81,64,128,169,289}
   434  //
   435  // If the selectors are not constant this will translate to a function
   436  // call.
   437  //
   438  // Asm: VSHUFPS, CPU Feature: AVX
   439  func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
   440  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   441  
   442  	a, b, c, d = a&3, b&3, c&3, d&3
   443  
   444  	switch pattern {
   445  	case _LLLL:
   446  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   447  	case _HHHH:
   448  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   449  	case _LLHH:
   450  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   451  	case _HHLL:
   452  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   453  
   454  	case _HLLL:
   455  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   456  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   457  	case _LHLL:
   458  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   459  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   460  
   461  	case _HLHH:
   462  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   463  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   464  	case _LHHH:
   465  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   466  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   467  
   468  	case _LLLH:
   469  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   470  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   471  	case _LLHL:
   472  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   473  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   474  	case _HHLH:
   475  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   476  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   477  	case _HHHL:
   478  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   479  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   480  
   481  	case _LHLH:
   482  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   483  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   484  	case _HLHL:
   485  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   486  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   487  	case _HLLH:
   488  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   489  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   490  	case _LHHL:
   491  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   492  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   493  	}
   494  	panic("missing case, switch should be exhaustive")
   495  }
   496  
   497  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   498  // of the vectors x and y, the selection of four elements from  x and y,
   499  // where selector values in the range 0-3 specify elements from x and
   500  // values in the range 4-7 specify the 0-3 elements of y.
   501  // When the selectors are constants and can be the selection
   502  // can be implemented in a single instruction, it will be, otherwise
   503  // it requires two.
   504  //
   505  // If the selectors are not constant this will translate to a function
   506  // call.
   507  //
   508  // Asm: VSHUFPS, CPU Feature: AVX512
   509  func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
   510  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   511  
   512  	a, b, c, d = a&3, b&3, c&3, d&3
   513  
   514  	switch pattern {
   515  	case _LLLL:
   516  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   517  	case _HHHH:
   518  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   519  	case _LLHH:
   520  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   521  	case _HHLL:
   522  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   523  
   524  	case _HLLL:
   525  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   526  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   527  	case _LHLL:
   528  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   529  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   530  
   531  	case _HLHH:
   532  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   533  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   534  	case _LHHH:
   535  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   536  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   537  
   538  	case _LLLH:
   539  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   540  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   541  	case _LLHL:
   542  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   543  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   544  	case _HHLH:
   545  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   546  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   547  	case _HHHL:
   548  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   549  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   550  
   551  	case _LHLH:
   552  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   553  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   554  	case _HLHL:
   555  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   556  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   557  	case _HLLH:
   558  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   559  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   560  	case _LHHL:
   561  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   562  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   563  	}
   564  	panic("missing case, switch should be exhaustive")
   565  }
   566  
   567  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   568  // of the vectors x and y, the selection of four elements from  x and y,
   569  // where selector values in the range 0-3 specify elements from x and
   570  // values in the range 4-7 specify the 0-3 elements of y.
   571  // When the selectors are constants and can be the selection
   572  // can be implemented in a single instruction, it will be, otherwise
   573  // it requires two.
   574  //
   575  // If the selectors are not constant this will translate to a function
   576  // call.
   577  //
   578  // Asm: VSHUFPS, CPU Feature: AVX512
   579  func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
   580  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   581  
   582  	a, b, c, d = a&3, b&3, c&3, d&3
   583  
   584  	switch pattern {
   585  	case _LLLL:
   586  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   587  	case _HHHH:
   588  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   589  	case _LLHH:
   590  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   591  	case _HHLL:
   592  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   593  
   594  	case _HLLL:
   595  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   596  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   597  	case _LHLL:
   598  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   599  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   600  
   601  	case _HLHH:
   602  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   603  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   604  	case _LHHH:
   605  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   606  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   607  
   608  	case _LLLH:
   609  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   610  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   611  	case _LLHL:
   612  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   613  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   614  	case _HHLH:
   615  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   616  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   617  	case _HHHL:
   618  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   619  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   620  
   621  	case _LHLH:
   622  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   623  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   624  	case _HLHL:
   625  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   626  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   627  	case _HLLH:
   628  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   629  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   630  	case _LHHL:
   631  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   632  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   633  	}
   634  	panic("missing case, switch should be exhaustive")
   635  }
   636  
   637  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   638  // of the vectors x and y, the selection of four elements from  x and y,
   639  // where selector values in the range 0-3 specify elements from x and
   640  // values in the range 4-7 specify the 0-3 elements of y.
// When the selectors are constants and the selection can be
// implemented in a single instruction, it will be, otherwise
// it requires two.
   644  //
   645  // If the selectors are not constant this will translate to a function
   646  // call.
   647  //
   648  // Asm: VSHUFPS, CPU Feature: AVX512
   649  func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
   650  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
   651  
   652  	a, b, c, d = a&3, b&3, c&3, d&3
   653  
   654  	switch pattern {
   655  	case _LLLL:
   656  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   657  	case _HHHH:
   658  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   659  	case _LLHH:
   660  		return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
   661  	case _HHLL:
   662  		return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
   663  
   664  	case _HLLL:
   665  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   666  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   667  	case _LHLL:
   668  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   669  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
   670  
   671  	case _HLHH:
   672  		z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
   673  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   674  	case _LHHH:
   675  		z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
   676  		return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
   677  
   678  	case _LLLH:
   679  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   680  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   681  	case _LLHL:
   682  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   683  		return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   684  	case _HHLH:
   685  		z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
   686  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   687  	case _HHHL:
   688  		z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
   689  		return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
   690  
   691  	case _LHLH:
   692  		z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
   693  		return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
   694  	case _HLHL:
   695  		z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
   696  		return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
   697  	case _HLLH:
   698  		z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
   699  		return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
   700  	case _LHHL:
   701  		z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
   702  		return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
   703  	}
   704  	panic("missing case, switch should be exhaustive")
   705  }
   706  
   707  // cscimm4 converts the 4 vector element indices into a single
   708  // uint8 for use as an immediate.
   709  func cscimm4(a, b, c, d uint8) uint8 {
   710  	return uint8(a + b<<2 + c<<4 + d<<6)
   711  }
   712  
   713  // cscimm2 converts the 2 vector element indices into a single
   714  // uint8 for use as an immediate.
   715  func cscimm2(a, b uint8) uint8 {
   716  	return uint8(a + b<<1)
   717  }
   718  
   719  // cscimm2g2 converts the 2 vector element indices into a single
   720  // uint8 for use as an immediate, but duplicated for VSHUFPD
   721  // to emulate grouped behavior of VSHUFPS
   722  func cscimm2g2(a, b uint8) uint8 {
   723  	g := cscimm2(a, b)
   724  	return g + g<<2
   725  }
   726  
   727  // cscimm2g4 converts the 2 vector element indices into a single
   728  // uint8 for use as an immediate, but with four copies for VSHUFPD
   729  // to emulate grouped behavior of VSHUFPS
   730  func cscimm2g4(a, b uint8) uint8 {
   731  	g := cscimm2g2(a, b)
   732  	return g + g<<4
   733  }
   734  
   735  // SelectFromPair returns the selection of two elements from the two
   736  // vectors x and y, where selector values in the range 0-1 specify
   737  // elements from x and values in the range 2-3 specify the 0-1 elements
   738  // of y.  When the selectors are constants the selection can be
   739  // implemented in a single instruction.
   740  //
   741  // If the selectors are not constant this will translate to a function
   742  // call.
   743  //
   744  // Asm: VSHUFPD, CPU Feature: AVX
   745  func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 {
   746  	pattern := (a&2)>>1 + (b & 2)
   747  
   748  	a, b = a&1, b&1
   749  
   750  	switch pattern {
   751  	case _LL:
   752  		return x.concatSelectedConstant(cscimm2(a, b), x)
   753  	case _HH:
   754  		return y.concatSelectedConstant(cscimm2(a, b), y)
   755  	case _LH:
   756  		return x.concatSelectedConstant(cscimm2(a, b), y)
   757  	case _HL:
   758  		return y.concatSelectedConstant(cscimm2(a, b), x)
   759  	}
   760  	panic("missing case, switch should be exhaustive")
   761  }
   762  
   763  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   764  // the vectors x and y, the selection of two elements from the two
   765  // vectors x and y, where selector values in the range 0-1 specify
   766  // elements from x and values in the range 2-3 specify the 0-1 elements
   767  // of y.  When the selectors are constants the selection can be
   768  // implemented in a single instruction.
   769  //
   770  // If the selectors are not constant this will translate to a function
   771  // call.
   772  //
   773  // Asm: VSHUFPD, CPU Feature: AVX
   774  func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 {
   775  	pattern := (a&2)>>1 + (b & 2)
   776  
   777  	a, b = a&1, b&1
   778  
   779  	switch pattern {
   780  	case _LL:
   781  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   782  	case _HH:
   783  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   784  	case _LH:
   785  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   786  	case _HL:
   787  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   788  	}
   789  	panic("missing case, switch should be exhaustive")
   790  }
   791  
   792  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   793  // of the vectors x and y, the selection of two elements from the two
   794  // vectors x and y, where selector values in the range 0-1 specify
   795  // elements from x and values in the range 2-3 specify the 0-1 elements
   796  // of y.  When the selectors are constants the selection can be
   797  // implemented in a single instruction.
   798  //
   799  // If the selectors are not constant this will translate to a function
   800  // call.
   801  //
   802  // Asm: VSHUFPD, CPU Feature: AVX512
   803  func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 {
   804  	pattern := (a&2)>>1 + (b & 2)
   805  
   806  	a, b = a&1, b&1
   807  
   808  	switch pattern {
   809  	case _LL:
   810  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   811  	case _HH:
   812  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   813  	case _LH:
   814  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   815  	case _HL:
   816  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   817  	}
   818  	panic("missing case, switch should be exhaustive")
   819  }
   820  
   821  // SelectFromPair returns the selection of two elements from the two
   822  // vectors x and y, where selector values in the range 0-1 specify
   823  // elements from x and values in the range 2-3 specify the 0-1 elements
   824  // of y.  When the selectors are constants the selection can be
   825  // implemented in a single instruction.
   826  //
   827  // If the selectors are not constant this will translate to a function
   828  // call.
   829  //
   830  // Asm: VSHUFPD, CPU Feature: AVX
   831  func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 {
   832  	pattern := (a&2)>>1 + (b & 2)
   833  
   834  	a, b = a&1, b&1
   835  
   836  	switch pattern {
   837  	case _LL:
   838  		return x.concatSelectedConstant(cscimm2(a, b), x)
   839  	case _HH:
   840  		return y.concatSelectedConstant(cscimm2(a, b), y)
   841  	case _LH:
   842  		return x.concatSelectedConstant(cscimm2(a, b), y)
   843  	case _HL:
   844  		return y.concatSelectedConstant(cscimm2(a, b), x)
   845  	}
   846  	panic("missing case, switch should be exhaustive")
   847  }
   848  
   849  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   850  // the vectors x and y, the selection of two elements from the two
   851  // vectors x and y, where selector values in the range 0-1 specify
   852  // elements from x and values in the range 2-3 specify the 0-1 elements
   853  // of y.  When the selectors are constants the selection can be
   854  // implemented in a single instruction.
   855  //
   856  // If the selectors are not constant this will translate to a function
   857  // call.
   858  //
   859  // Asm: VSHUFPD, CPU Feature: AVX
   860  func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 {
   861  	pattern := (a&2)>>1 + (b & 2)
   862  
   863  	a, b = a&1, b&1
   864  
   865  	switch pattern {
   866  	case _LL:
   867  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   868  	case _HH:
   869  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   870  	case _LH:
   871  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   872  	case _HL:
   873  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   874  	}
   875  	panic("missing case, switch should be exhaustive")
   876  }
   877  
   878  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   879  // of the vectors x and y, the selection of two elements from the two
   880  // vectors x and y, where selector values in the range 0-1 specify
   881  // elements from x and values in the range 2-3 specify the 0-1 elements
   882  // of y.  When the selectors are constants the selection can be
   883  // implemented in a single instruction.
   884  //
   885  // If the selectors are not constant this will translate to a function
   886  // call.
   887  //
   888  // Asm: VSHUFPD, CPU Feature: AVX512
   889  func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 {
   890  	pattern := (a&2)>>1 + (b & 2)
   891  
   892  	a, b = a&1, b&1
   893  
   894  	switch pattern {
   895  	case _LL:
   896  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   897  	case _HH:
   898  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   899  	case _LH:
   900  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   901  	case _HL:
   902  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   903  	}
   904  	panic("missing case, switch should be exhaustive")
   905  }
   906  
   907  // SelectFromPair returns the selection of two elements from the two
   908  // vectors x and y, where selector values in the range 0-1 specify
   909  // elements from x and values in the range 2-3 specify the 0-1 elements
   910  // of y.  When the selectors are constants the selection can be
   911  // implemented in a single instruction.
   912  //
   913  // If the selectors are not constant this will translate to a function
   914  // call.
   915  //
   916  // Asm: VSHUFPD, CPU Feature: AVX
   917  func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 {
   918  	pattern := (a&2)>>1 + (b & 2)
   919  
   920  	a, b = a&1, b&1
   921  
   922  	switch pattern {
   923  	case _LL:
   924  		return x.concatSelectedConstant(cscimm2(a, b), x)
   925  	case _HH:
   926  		return y.concatSelectedConstant(cscimm2(a, b), y)
   927  	case _LH:
   928  		return x.concatSelectedConstant(cscimm2(a, b), y)
   929  	case _HL:
   930  		return y.concatSelectedConstant(cscimm2(a, b), x)
   931  	}
   932  	panic("missing case, switch should be exhaustive")
   933  }
   934  
   935  // SelectFromPairGrouped returns, for each of the two 128-bit halves of
   936  // the vectors x and y, the selection of two elements from the two
   937  // vectors x and y, where selector values in the range 0-1 specify
   938  // elements from x and values in the range 2-3 specify the 0-1 elements
   939  // of y.  When the selectors are constants the selection can be
   940  // implemented in a single instruction.
   941  //
   942  // If the selectors are not constant this will translate to a function
   943  // call.
   944  //
   945  // Asm: VSHUFPD, CPU Feature: AVX
   946  func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 {
   947  	pattern := (a&2)>>1 + (b & 2)
   948  
   949  	a, b = a&1, b&1
   950  
   951  	switch pattern {
   952  	case _LL:
   953  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   954  	case _HH:
   955  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   956  	case _LH:
   957  		return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
   958  	case _HL:
   959  		return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
   960  	}
   961  	panic("missing case, switch should be exhaustive")
   962  }
   963  
   964  // SelectFromPairGrouped returns, for each of the four 128-bit subvectors
   965  // of the vectors x and y, the selection of two elements from the two
   966  // vectors x and y, where selector values in the range 0-1 specify
   967  // elements from x and values in the range 2-3 specify the 0-1 elements
   968  // of y.  When the selectors are constants the selection can be
   969  // implemented in a single instruction.
   970  //
   971  // If the selectors are not constant this will translate to a function
   972  // call.
   973  //
   974  // Asm: VSHUFPD, CPU Feature: AVX512
   975  func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
   976  	pattern := (a&2)>>1 + (b & 2)
   977  
   978  	a, b = a&1, b&1
   979  
   980  	switch pattern {
   981  	case _LL:
   982  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   983  	case _HH:
   984  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   985  	case _LH:
   986  		return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
   987  	case _HL:
   988  		return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
   989  	}
   990  	panic("missing case, switch should be exhaustive")
   991  }
   992  
   993  /* PermuteScalars */
   994  
   995  // PermuteScalars performs a permutation of vector x's elements using the supplied indices:
   996  //
   997  //	result = {x[a], x[b], x[c], x[d]}
   998  //
   999  // Parameters a,b,c,d should have values between 0 and 3.
  1000  // If a through d are constants, then an instruction will be inlined, otherwise
  1001  // a jump table may be generated.
  1002  //
  1003  // Asm: VPSHUFD, CPU Feature: AVX
  1004  func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
  1005  	return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1006  }
  1007  
  1008  // PermuteScalars performs a permutation of vector x's elements using the supplied indices:
  1009  //
  1010  //	result = {x[a], x[b], x[c], x[d]}
  1011  //
  1012  // Parameters a,b,c,d should have values between 0 and 3.
  1013  // If a through d are constants, then an instruction will be inlined, otherwise
  1014  // a jump table may be generated.
  1015  //
  1016  // Asm: VPSHUFD, CPU Feature: AVX
  1017  func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
  1018  	return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1019  }
  1020  
  1021  /* PermuteScalarsGrouped */
  1022  
  1023  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1024  //
  1025  //	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
  1026  //
  1027  // Parameters a,b,c,d should have values between 0 and 3.
  1028  // If a through d are constants, then an instruction will be inlined, otherwise
  1029  // a jump table may be generated.
  1030  //
  1031  // Asm: VPSHUFD, CPU Feature: AVX2
  1032  func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
  1033  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1034  }
  1035  
  1036  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1037  //
  1038  //	 result =
  1039  //		 {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
  1040  //			x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
  1041  //
  1042  // Parameters a,b,c,d should have values between 0 and 3.
  1043  // If a through d are constants, then an instruction will be inlined, otherwise
  1044  // a jump table may be generated.
  1045  //
  1046  // Asm: VPSHUFD, CPU Feature: AVX512
  1047  func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
  1048  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1049  }
  1050  
  1051  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1052  //
  1053  //	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
  1054  //
  1055  // Parameters a,b,c,d should have values between 0 and 3.
  1056  // If a through d are constants, then an instruction will be inlined, otherwise
  1057  // a jump table is generated.
  1058  //
  1059  // Asm: VPSHUFD, CPU Feature: AVX2
  1060  func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
  1061  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1062  }
  1063  
  1064  // PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
  1065  //
  1066  //	 result =
  1067  //		 {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
  1068  //			x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
  1069  //
  1070  // Parameters a,b,c,d should have values between 0 and 3.
  1071  // If a through d are constants, then an instruction will be inlined, otherwise
  1072  // a jump table is generated.
  1073  //
  1074  // Asm: VPSHUFD, CPU Feature: AVX512
  1075  func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
  1076  	return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1077  }
  1078  
  1079  /* PermuteScalarsHi */
  1080  
  1081  // PermuteScalarsHi performs a permutation of vector x using the supplied indices:
  1082  //
  1083  // result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
  1084  //
  1085  // Parameters a,b,c,d should have values between 0 and 3.
  1086  // If a through d are constants, then an instruction will be inlined, otherwise
  1087  // a jump table is generated.
  1088  //
  1089  // Asm: VPSHUFHW, CPU Feature: AVX512
  1090  func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
  1091  	return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1092  }
  1093  
  1094  // PermuteScalarsHi performs a permutation of vector x using the supplied indices:
  1095  //
  1096  // result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
  1097  //
  1098  // Parameters a,b,c,d should have values between 0 and 3.
  1099  // If a through d are constants, then an instruction will be inlined, otherwise
  1100  // a jump table is generated.
  1101  //
  1102  // Asm: VPSHUFHW, CPU Feature: AVX512
  1103  func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
  1104  	return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1105  }
  1106  
  1107  /* PermuteScalarsHiGrouped */
  1108  
  1109  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1110  //
  1111  //	 result =
  1112  //		  {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
  1113  //			x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
  1114  //
  1115  // Parameters a,b,c,d should have values between 0 and 3.
  1116  // If a through d are constants, then an instruction will be inlined, otherwise
  1117  // a jump table is generated.
  1118  //
  1119  // Asm: VPSHUFHW, CPU Feature: AVX2
  1120  func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
  1121  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1122  }
  1123  
  1124  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1125  //
  1126  //	 result =
  1127  //		  {x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
  1128  //			x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
  1129  //			x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
  1130  //			x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
  1131  //
  1132  // Parameters a,b,c,d should have values between 0 and 3.
  1133  // If a through d are constants, then an instruction will be inlined, otherwise
  1134  // a jump table is generated.
  1135  //
  1136  // Asm: VPSHUFHW, CPU Feature: AVX512
  1137  func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
  1138  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1139  }
  1140  
  1141  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1142  //
  1143  //	 result =
  1144  //	  {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
  1145  //		x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
  1146  //
  1147  // Each group is of size 128-bit.
  1148  //
  1149  // Parameters a,b,c,d should have values between 0 and 3.
  1150  // If a through d are constants, then an instruction will be inlined, otherwise
  1151  // a jump table is generated.
  1152  //
  1153  // Asm: VPSHUFHW, CPU Feature: AVX2
  1154  func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
  1155  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1156  }
  1157  
  1158  // PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
  1159  //
  1160  //	 result =
  1161  //		 {  x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
  1162  //			x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
  1163  //			x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
  1164  //			x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
  1165  //
  1166  // Parameters a,b,c,d should have values between 0 and 3.
  1167  // If a through d are constants, then an instruction will be inlined, otherwise
  1168  // a jump table is generated.
  1169  //
  1170  // Asm: VPSHUFHW, CPU Feature: AVX512
  1171  func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
  1172  	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1173  }
  1174  
  1175  /* PermuteScalarsLo */
  1176  
  1177  // PermuteScalarsLo performs a permutation of vector x using the supplied indices:
  1178  //
  1179  //	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
  1180  //
  1181  // Parameters a,b,c,d should have values between 0 and 3.
  1182  // If a through d are constants, then an instruction will be inlined, otherwise
  1183  // a jump table is generated.
  1184  //
  1185  // Asm: VPSHUFLW, CPU Feature: AVX512
  1186  func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
  1187  	return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1188  }
  1189  
  1190  // PermuteScalarsLo performs a permutation of vector x using the supplied indices:
  1191  //
  1192  //	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
  1193  //
  1194  // Parameters a,b,c,d should have values between 0 and 3.
  1195  // If a through d are constants, then an instruction will be inlined, otherwise
  1196  // a jump table is generated.
  1197  //
  1198  // Asm: VPSHUFLW, CPU Feature: AVX512
  1199  func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
  1200  	return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1201  }
  1202  
  1203  /* PermuteScalarsLoGrouped */
  1204  
  1205  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1206  //
  1207  //	 result =
  1208  //	 {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
  1209  //		 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
  1210  //
  1211  // Parameters a,b,c,d should have values between 0 and 3.
  1212  // If a through d are constants, then an instruction will be inlined, otherwise
  1213  // a jump table is generated.
  1214  //
  1215  // Asm: VPSHUFLW, CPU Feature: AVX2
  1216  func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
  1217  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1218  }
  1219  
  1220  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1221  //
  1222  //	 result =
  1223  //	 {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
  1224  //		x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
  1225  //		x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
  1226  //		x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
  1227  //
  1228  // Parameters a,b,c,d should have values between 0 and 3.
  1229  // If a through d are constants, then an instruction will be inlined, otherwise
  1230  // a jump table is generated.
  1231  //
  1232  // Asm: VPSHUFLW, CPU Feature: AVX512
  1233  func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
  1234  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1235  }
  1236  
  1237  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1238  //
  1239  //	 result = {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
  1240  //		x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
  1241  //
  1242  // Parameters a,b,c,d should have values between 0 and 3.
  1243  // If a through d are constants, then an instruction will be inlined, otherwise
  1244  // a jump table is generated.
  1245  //
  1246  // Asm: VPSHUFLW, CPU Feature: AVX2
  1247  func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
  1248  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1249  }
  1250  
  1251  // PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
  1252  //
  1253  //	 result =
  1254  //	 {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
  1255  //		x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
  1256  //		x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
  1257  //		x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
  1258  //
  1259  // Each group is of size 128-bit.
  1260  //
  1261  // Parameters a,b,c,d should have values between 0 and 3.
  1262  // If a through d are constants, then an instruction will be inlined, otherwise
  1263  // a jump table is generated.
  1264  //
  1265  // Asm: VPSHUFLW, CPU Feature: AVX512
  1266  func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
  1267  	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
  1268  }
  1269  

View as plain text