Source file src/simd/archsimd/ops_internal_amd64.go

     1  // Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.
     2  
     3  //go:build goexperiment.simd
     4  
     5  package archsimd
     6  
     7  /* blend */
     8  
     9  // blend blends two vectors based on mask values, choosing either
    10  // the first (x) or the second (y) based on whether the third (mask) is false or true.
    11  //
    12  // Asm: VPBLENDVB, CPU Feature: AVX
    13  func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16
    14  
    15  // blend blends two vectors based on mask values, choosing either
    16  // the first (x) or the second (y) based on whether the third (mask) is false or true.
    17  //
    18  // Asm: VPBLENDVB, CPU Feature: AVX2
    19  func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32
    20  
    21  /* blendMasked */
    22  
    23  // blendMasked blends two vectors based on mask values, choosing either
    24  // the first (x) or the second (y) based on whether the third (mask) is false or true.
    25  //
    26  // This operation is applied selectively under a write mask.
    27  //
    28  // Asm: VPBLENDMB, CPU Feature: AVX512
    29  func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64
    30  
    31  // blendMasked blends two vectors based on mask values, choosing either
    32  // the first (x) or the second (y) based on whether the third (mask) is false or true.
    33  //
    34  // This operation is applied selectively under a write mask.
    35  //
    36  // Asm: VPBLENDMW, CPU Feature: AVX512
    37  func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32
    38  
    39  // blendMasked blends two vectors based on mask values, choosing either
    40  // the first (x) or the second (y) based on whether the third (mask) is false or true.
    41  //
    42  // This operation is applied selectively under a write mask.
    43  //
    44  // Asm: VPBLENDMD, CPU Feature: AVX512
    45  func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
    46  
    47  // blendMasked blends two vectors based on mask values, choosing either
    48  // the first (x) or the second (y) based on whether the third (mask) is false or true.
    49  //
    50  // This operation is applied selectively under a write mask.
    51  //
    52  // Asm: VPBLENDMQ, CPU Feature: AVX512
    53  func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
    54  
    55  /* carrylessMultiply */
    56  
    57  // carrylessMultiply computes one of four possible Galois polynomial
    58  // products of selected high and low halves of x and y,
    59  // depending on the value of xyHiLo, returning the 128-bit
    60  // product in the concatenated two elements of the result.
    61  // Bit 0 selects the low (0) or high (1) element of x and
    62  // bit 4 selects the low (0x00) or high (0x10) element of y.
    63  //
    64  // xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
    65  //
    66  // Asm: VPCLMULQDQ, CPU Feature: AVX
    67  func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
    68  
    69  // carrylessMultiply computes one of four possible Galois polynomial
    70  // products of selected high and low halves of each of the two
    71  // 128-bit lanes of x and y, depending on the value of xyHiLo,
    72  // and returns the two 128-bit products in the result's lanes.
    73  // Bit 0 selects the low (0) or high (1) elements of x's lanes and
    74  // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
    75  //
    76  // xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
    77  //
    78  // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
    79  func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
    80  
    81  // carrylessMultiply computes one of four possible Galois polynomial
    82  // products of selected high and low halves of each of the four
    83  // 128-bit lanes of x and y, depending on the value of xyHiLo,
    84  // and returns the four 128-bit products in the result's lanes.
    85  // Bit 0 selects the low (0) or high (1) elements of x's lanes and
    86  // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
    87  //
    88  // xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
    89  //
    90  // Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
    91  func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
    92  
    93  /* concatSelectedConstant */
    94  
    95  // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
    96  // halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
    97  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
    98  // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
    99  // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
   100  //
   101  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   102  //
   103  // Asm: VSHUFPS, CPU Feature: AVX
   104  func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4
   105  
   106  // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
   107  // halves of the output.  The selection is chosen by the constant parameter hilo
   108  // where hi and lo are each one bit specifying which 64-bit element to select
   109  // from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
   110  // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
   111  // selecting from y, is 1, and selects 7.
   112  //
   113  // hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   114  //
   115  // Asm: VSHUFPD, CPU Feature: AVX
   116  func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2
   117  
   118  // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
   119  // halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
   120  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   121  // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
   122  // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
   123  //
   124  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   125  //
   126  // Asm: VSHUFPS, CPU Feature: AVX
   127  func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4
   128  
   129  // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
   130  // halves of the output.  The selection is chosen by the constant parameter hilo
   131  // where hi and lo are each one bit specifying which 64-bit element to select
   132  // from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
   133  // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
   134  // selecting from y, is 1, and selects 7.
   135  //
   136  // hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   137  //
   138  // Asm: VSHUFPD, CPU Feature: AVX
   139  func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2
   140  
   141  // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
   142  // halves of the output.  The selection is chosen by the constant parameter h1h0l1l0
   143  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   144  // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
   145  // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
   146  //
   147  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   148  //
   149  // Asm: VSHUFPS, CPU Feature: AVX
   150  func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4
   151  
   152  // concatSelectedConstant concatenates selected elements from x and y into the lower and upper
   153  // halves of the output.  The selection is chosen by the constant parameter hilo
   154  // where hi and lo are each one bit specifying which 64-bit element to select
   155  // from y and x.  For example {4,5}.concatSelectedConstant(0b10, {6,7})
   156  // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
   157  // selecting from y, is 1, and selects 7.
   158  //
   159  // hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   160  //
   161  // Asm: VSHUFPD, CPU Feature: AVX
   162  func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2
   163  
   164  /* concatSelectedConstantGrouped */
   165  
   166  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   167  // into the lower and upper halves of corresponding subvectors of the output.
   168  // The selection is chosen by the constant parameter h1h0l1l0
   169  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   170  // For example,
   171  // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
   172  // returns {2,0,5,7,10,8,13,15}
   173  // (don't forget that the binary constant is written big-endian).
   174  //
   175  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   176  //
   177  // Asm: VSHUFPS, CPU Feature: AVX
   178  func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8
   179  
   180  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   181  // into the lower and upper halves of corresponding subvectors of the output.
   182  // The selection is chosen by the constant parameter h1h0l1l0
   183  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   184  // For example,
   185  //
   186  //	{0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
   187  //	 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
   188  //
   189  // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
   190  //
   191  // (don't forget that the binary constant is written big-endian).
   192  //
   193  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   194  //
   195  // Asm: VSHUFPS, CPU Feature: AVX512
   196  func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16
   197  
   198  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   199  // into the lower and upper halves of corresponding subvectors of the output.
   200  // The selections are specified by the constant parameter hilos where each
   201  // hi and lo pair select 64-bit elements from the corresponding 128-bit
   202  // subvectors of x and y.
   203  //
   204  // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
   205  // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
   206  // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
   207  // then 1, selecting element 1 from x's upper 128 bits (9), then 1,
   208  // selecting element 1 from y's upper 128 bits (11).
   209  // This differs from the same method applied to a 32x8 vector, where
   210  // the 8-bit constant performs the same selection on both subvectors.
   211  //
   212  // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   213  //
   214  // Asm: VSHUFPD, CPU Feature: AVX
   215  func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4
   216  
   217  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   218  // into the lower and upper halves of corresponding subvectors of the output.
   219  // The selections are specified by the constant parameter hilos where each
   220  // hi and lo pair select 64-bit elements from the corresponding 128-bit
   221  // subvectors of x and y.
   222  //
   223  // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
   224  // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
   225  // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
   226  // then 1, selecting element 1 from x's next 128 bits (9), then 1,
   227  // selecting element 1 from y's next 128 bits (11).  The next two 0 bits select
   228  // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
   229  // 1 bits select the upper elements from x and y's last 128 bits (17, 19).
   230  // This differs from the same method applied to a 32x8 or 32x16 vector, where
   231  // the 8-bit constant performs the same selection on all the subvectors.
   232  //
   233  // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   234  //
   235  // Asm: VSHUFPD, CPU Feature: AVX512
   236  func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8
   237  
   238  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   239  // into the lower and upper halves of corresponding subvectors of the output.
   240  // The selection is chosen by the constant parameter h1h0l1l0
   241  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   242  // For example,
   243  // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
   244  // returns {2,0,5,7,10,8,13,15}
   245  // (don't forget that the binary constant is written big-endian).
   246  //
   247  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   248  //
   249  // Asm: VSHUFPS, CPU Feature: AVX
   250  func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8
   251  
   252  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   253  // into the lower and upper halves of corresponding subvectors of the output.
   254  // The selection is chosen by the constant parameter h1h0l1l0
   255  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   256  // For example,
   257  //
   258  //	{0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
   259  //	 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
   260  //
   261  // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
   262  //
   263  // (don't forget that the binary constant is written big-endian).
   264  //
   265  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   266  //
   267  // Asm: VSHUFPS, CPU Feature: AVX512
   268  func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16
   269  
   270  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   271  // into the lower and upper halves of corresponding subvectors of the output.
   272  // The selections are specified by the constant parameter hilos where each
   273  // hi and lo pair select 64-bit elements from the corresponding 128-bit
   274  // subvectors of x and y.
   275  //
   276  // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
   277  // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
   278  // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
   279  // then 1, selecting element 1 from x's upper 128 bits (9), then 1,
   280  // selecting element 1 from y's upper 128 bits (11).
   281  // This differs from the same method applied to a 32x8 vector, where
   282  // the 8-bit constant performs the same selection on both subvectors.
   283  //
   284  // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   285  //
   286  // Asm: VSHUFPD, CPU Feature: AVX
   287  func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4
   288  
   289  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   290  // into the lower and upper halves of corresponding subvectors of the output.
   291  // The selections are specified by the constant parameter hilos where each
   292  // hi and lo pair select 64-bit elements from the corresponding 128-bit
   293  // subvectors of x and y.
   294  //
   295  // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
   296  // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
   297  // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
   298  // then 1, selecting element 1 from x's next 128 bits (9), then 1,
   299  // selecting element 1 from y's next 128 bits (11).  The next two 0 bits select
   300  // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
   301  // 1 bits select the upper elements from x and y's last 128 bits (17, 19).
   302  // This differs from the same method applied to a 32x8 or 32x16 vector, where
   303  // the 8-bit constant performs the same selection on all the subvectors.
   304  //
   305  // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   306  //
   307  // Asm: VSHUFPD, CPU Feature: AVX512
   308  func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8
   309  
   310  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   311  // into the lower and upper halves of corresponding subvectors of the output.
   312  // The selection is chosen by the constant parameter h1h0l1l0
   313  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   314  // For example,
   315  // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
   316  // returns {2,0,5,7,10,8,13,15}
   317  // (don't forget that the binary constant is written big-endian).
   318  //
   319  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   320  //
   321  // Asm: VSHUFPS, CPU Feature: AVX
   322  func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8
   323  
   324  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   325  // into the lower and upper halves of corresponding subvectors of the output.
   326  // The selection is chosen by the constant parameter h1h0l1l0
   327  // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
   328  // For example,
   329  //
   330  //	{0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
   331  //	 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
   332  //
   333  // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
   334  //
   335  // (don't forget that the binary constant is written big-endian).
   336  //
   337  // h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   338  //
   339  // Asm: VSHUFPS, CPU Feature: AVX512
   340  func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16
   341  
   342  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   343  // into the lower and upper halves of corresponding subvectors of the output.
   344  // The selections are specified by the constant parameter hilos where each
   345  // hi and lo pair select 64-bit elements from the corresponding 128-bit
   346  // subvectors of x and y.
   347  //
   348  // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
   349  // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
   350  // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
   351  // then 1, selecting element 1 from x's upper 128 bits (9), then 1,
   352  // selecting element 1 from y's upper 128 bits (11).
   353  // This differs from the same method applied to a 32x8 vector, where
   354  // the 8-bit constant performs the same selection on both subvectors.
   355  //
   356  // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   357  //
   358  // Asm: VSHUFPD, CPU Feature: AVX
   359  func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4
   360  
   361  // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
   362  // into the lower and upper halves of corresponding subvectors of the output.
   363  // The selections are specified by the constant parameter hilos where each
   364  // hi and lo pair select 64-bit elements from the corresponding 128-bit
   365  // subvectors of x and y.
   366  //
   367  // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
   368  // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
   369  // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
   370  // then 1, selecting element 1 from x's next 128 bits (9), then 1,
   371  // selecting element 1 from y's next 128 bits (11).  The next two 0 bits select
   372  // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
   373  // 1 bits select the upper elements from x and y's last 128 bits (17, 19).
   374  // This differs from the same method applied to a 32x8 or 32x16 vector, where
   375  // the 8-bit constant performs the same selection on all the subvectors.
   376  //
   377  // hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   378  //
   379  // Asm: VSHUFPD, CPU Feature: AVX512
   380  func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
   381  
   382  /* permuteScalars */
   383  
   384  // permuteScalars performs a permutation of vector x using constant indices:
   385  //
   386  //	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
   387  //
   388  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   389  //
   390  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   391  //
   392  // Asm: VPSHUFD, CPU Feature: AVX
   393  func (x Int32x4) permuteScalars(indices uint8) Int32x4
   394  
   395  // permuteScalars performs a permutation of vector x using constant indices:
   396  //
   397  //	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
   398  //
   399  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   400  //
   401  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   402  //
   403  // Asm: VPSHUFD, CPU Feature: AVX
   404  func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
   405  
   406  /* permuteScalarsGrouped */
   407  
   408  // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
   409  //
   410  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
   411  //
   412  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   413  // Each group is of size 128-bit.
   414  //
   415  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   416  //
   417  // Asm: VPSHUFD, CPU Feature: AVX2
   418  func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
   419  
   420  // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
   421  //
   422  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
   423  //
   424  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   425  // Each group is of size 128-bit.
   426  //
   427  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   428  //
   429  // Asm: VPSHUFD, CPU Feature: AVX512
   430  func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
   431  
   432  // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
   433  //
   434  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
   435  //
   436  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   437  // Each group is of size 128-bit.
   438  //
   439  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   440  //
   441  // Asm: VPSHUFD, CPU Feature: AVX2
   442  func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
   443  
   444  // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
   445  //
   446  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
   447  //
   448  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   449  // Each group is of size 128-bit.
   450  //
   451  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   452  //
   453  // Asm: VPSHUFD, CPU Feature: AVX512
   454  func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
   455  
   456  /* permuteScalarsHi */
   457  
   458  // permuteScalarsHi performs a permutation of vector x using constant indices:
   459  //
   460  //	result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
   461  //
   462  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   463  //
   464  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   465  //
   466  // Asm: VPSHUFHW, CPU Feature: AVX512
   467  func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
   468  
   469  // permuteScalarsHi performs a permutation of vector x using constant indices:
   470  //
   471  //	result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
   472  //
   473  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   474  //
   475  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   476  //
   477  // Asm: VPSHUFHW, CPU Feature: AVX512
   478  func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
   479  
   480  /* permuteScalarsHiGrouped */
   481  
   482  // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
   483  // result =
   484  //
   485  //	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
   486  //	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
   487  //
   488  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   489  // Each group is of size 128-bit.
   490  //
   491  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   492  //
   493  // Asm: VPSHUFHW, CPU Feature: AVX2
   494  func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
   495  
   496  // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
   497  // result =
   498  //
   499  //	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
   500  //	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
   501  //
   502  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   503  // Each group is of size 128-bit.
   504  //
   505  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   506  //
   507  // Asm: VPSHUFHW, CPU Feature: AVX512
   508  func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
   509  
   510  // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
   511  // result =
   512  //
   513  //	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
   514  //	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
   515  //
   516  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   517  // Each group is of size 128-bit.
   518  //
   519  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   520  //
   521  // Asm: VPSHUFHW, CPU Feature: AVX2
   522  func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
   523  
   524  // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
   525  // result =
   526  //
   527  //	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
   528  //	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
   529  //
   530  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   531  // Each group is of size 128-bit.
   532  //
   533  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   534  //
   535  // Asm: VPSHUFHW, CPU Feature: AVX512
   536  func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
   537  
   538  /* permuteScalarsLo */
   539  
   540  // permuteScalarsLo performs a permutation of vector x using constant indices:
   541  //
   542  //	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
   543  //
   544  // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
   545  //
   546  // indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
   547  //
   548  // Asm: VPSHUFLW, CPU Feature: AVX512
   549  func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
   550  
   551  // permuteScalarsLo permutes the lower four elements of x as selected by indices; the upper four elements are copied unchanged:
   552  //
   553  //	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
   554  //
   555  // indices packs four 2-bit values into one byte; indices[0:2] is the first of them.
   556  //
   557  // indices should be a constant for best performance; a non-constant value is translated into a jump table.
   558  //
   559  // Asm: VPSHUFLW, CPU Feature: AVX512
   560  func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
   561  
   562  /* permuteScalarsLoGrouped */
   563  
   564  // permuteScalarsLoGrouped permutes the lower four elements of each 128-bit group of x as selected by indices; the upper four elements of each group are copied unchanged:
   565  //
   566  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
   567  //	 x_group1[indices[0:2]], ...}
   568  //
   569  // indices packs four 2-bit values into one byte; indices[0:2] is the first of them.
   570  // Each group is 128 bits wide.
   571  //
   572  // indices should be a constant for best performance; a non-constant value is translated into a jump table.
   573  //
   574  // Asm: VPSHUFLW, CPU Feature: AVX2
   575  func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
   576  
   577  // permuteScalarsLoGrouped permutes the lower four elements of each 128-bit group of x as selected by indices; the upper four elements of each group are copied unchanged:
   578  //
   579  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
   580  //	 x_group1[indices[0:2]], ...}
   581  //
   582  // indices packs four 2-bit values into one byte; indices[0:2] is the first of them.
   583  // Each group is 128 bits wide.
   584  //
   585  // indices should be a constant for best performance; a non-constant value is translated into a jump table.
   586  //
   587  // Asm: VPSHUFLW, CPU Feature: AVX512
   588  func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
   589  
   590  // permuteScalarsLoGrouped permutes the lower four elements of each 128-bit group of x as selected by indices; the upper four elements of each group are copied unchanged:
   591  //
   592  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
   593  //	 x_group1[indices[0:2]], ...}
   594  //
   595  // indices packs four 2-bit values into one byte; indices[0:2] is the first of them.
   596  // Each group is 128 bits wide.
   597  //
   598  // indices should be a constant for best performance; a non-constant value is translated into a jump table.
   599  //
   600  // Asm: VPSHUFLW, CPU Feature: AVX2
   601  func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
   602  
   603  // permuteScalarsLoGrouped permutes the lower four elements of each 128-bit group of x as selected by indices; the upper four elements of each group are copied unchanged:
   604  //
   605  //	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
   606  //	 x_group1[indices[0:2]], ...}
   607  //
   608  // indices packs four 2-bit values into one byte; indices[0:2] is the first of them.
   609  // Each group is 128 bits wide.
   610  //
   611  // indices should be a constant for best performance; a non-constant value is translated into a jump table.
   612  //
   613  // Asm: VPSHUFLW, CPU Feature: AVX512
   614  func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
   615  
   616  /* tern */
   617  
   618  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   619  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   620  //
   621  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   622  //
   623  // Asm: VPTERNLOGD, CPU Feature: AVX512
   624  func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4
   625  
   626  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   627  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   628  //
   629  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   630  //
   631  // Asm: VPTERNLOGD, CPU Feature: AVX512
   632  func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8
   633  
   634  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   635  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   636  //
   637  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   638  //
   639  // Asm: VPTERNLOGD, CPU Feature: AVX512
   640  func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16
   641  
   642  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   643  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   644  //
   645  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   646  //
   647  // Asm: VPTERNLOGQ, CPU Feature: AVX512
   648  func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2
   649  
   650  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   651  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   652  //
   653  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   654  //
   655  // Asm: VPTERNLOGQ, CPU Feature: AVX512
   656  func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4
   657  
   658  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   659  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   660  //
   661  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   662  //
   663  // Asm: VPTERNLOGQ, CPU Feature: AVX512
   664  func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8
   665  
   666  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   667  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   668  //
   669  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   670  //
   671  // Asm: VPTERNLOGD, CPU Feature: AVX512
   672  func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4
   673  
   674  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   675  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   676  //
   677  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   678  //
   679  // Asm: VPTERNLOGD, CPU Feature: AVX512
   680  func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8
   681  
   682  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   683  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   684  //
   685  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   686  //
   687  // Asm: VPTERNLOGD, CPU Feature: AVX512
   688  func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16
   689  
   690  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   691  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   692  //
   693  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   694  //
   695  // Asm: VPTERNLOGQ, CPU Feature: AVX512
   696  func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2
   697  
   698  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   699  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   700  //
   701  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   702  //
   703  // Asm: VPTERNLOGQ, CPU Feature: AVX512
   704  func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4
   705  
   706  // tern applies the three-input bitwise logical function selected by the 8-bit truth table to x, y and z.
   707  // For each bit position, the result bit is equal to 1 & (table >> (x<<2 + y<<1 + z)), where x, y and z are the corresponding input bits.
   708  //
   709  // table should be a constant for best performance; a non-constant value is translated into a jump table.
   710  //
   711  // Asm: VPTERNLOGQ, CPU Feature: AVX512
   712  func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8
   713  

View as plain text