Source file src/simd/archsimd/ops_internal_amd64.go
// Code generated by 'simdgen -o godefs -goroot $GOROOT -xedPath $XED_PATH go.yaml types.yaml categories.yaml'; DO NOT EDIT.

//go:build goexperiment.simd

package archsimd

/* blend */

// blend blends two vectors based on mask values, choosing either
// the first (x) or the second (y) based on whether the third (mask) is false or true.
//
// Asm: VPBLENDVB, CPU Feature: AVX
func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16

// blend blends two vectors based on mask values, choosing either
// the first (x) or the second (y) based on whether the third (mask) is false or true.
//
// Asm: VPBLENDVB, CPU Feature: AVX2
func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32

/* blendMasked */

// blendMasked blends two vectors based on mask values, choosing either
// the first (x) or the second (y) based on whether the third (mask) is false or true.
//
// This operation is applied selectively under a write mask.
//
// Asm: VPBLENDMB, CPU Feature: AVX512
func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64

// blendMasked blends two vectors based on mask values, choosing either
// the first (x) or the second (y) based on whether the third (mask) is false or true.
//
// This operation is applied selectively under a write mask.
//
// Asm: VPBLENDMW, CPU Feature: AVX512
func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32

// blendMasked blends two vectors based on mask values, choosing either
// the first (x) or the second (y) based on whether the third (mask) is false or true.
//
// This operation is applied selectively under a write mask.
//
// Asm: VPBLENDMD, CPU Feature: AVX512
func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16

// blendMasked blends two vectors based on mask values, choosing either
// the first (x) or the second (y) based on whether the third (mask) is false or true.
//
// This operation is applied selectively under a write mask.
//
// Asm: VPBLENDMQ, CPU Feature: AVX512
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8

/* carrylessMultiply */

// carrylessMultiply computes one of four possible Galois polynomial
// products of selected high and low halves of x and y,
// depending on the value of xyHiLo, returning the 128-bit
// product in the concatenated two elements of the result.
// Bit 0 selects the low (0) or high (1) element of x and
// bit 4 selects the low (0x00) or high (0x10) element of y.
//
// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX
func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2

// carrylessMultiply computes one of four possible Galois polynomial
// products of selected high and low halves of each of the two
// 128-bit lanes of x and y, depending on the value of xyHiLo,
// and returns the two 128-bit products in the result's lanes.
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
//
// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4

// carrylessMultiply computes one of four possible Galois polynomial
// products of selected high and low halves of each of the four
// 128-bit lanes of x and y, depending on the value of xyHiLo,
// and returns the four 128-bit products in the result's lanes.
// Bit 0 selects the low (0) or high (1) elements of x's lanes and
// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
//
// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8

/* concatSelectedConstant */

// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX
func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4

// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
// halves of the output. The selection is chosen by the constant parameter hilo
// where hi and lo are each one bit specifying which 64-bit element to select
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
// selecting from y, is 1, and selects 7.
//
// hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX
func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2

// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
// halves of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX
func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4

// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
// halves of the output. The selection is chosen by the constant parameter hilo
// where hi and lo are each one bit specifying which 64-bit element to select
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
// selecting from y, is 1, and selects 7.
//
// hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX
func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2

// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX
func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4

// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
// halves of the output. The selection is chosen by the constant parameter hilo
// where hi and lo are each one bit specifying which 64-bit element to select
// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
// selecting from y, is 1, and selects 7.
//
// hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX
func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2

/* concatSelectedConstantGrouped */

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example,
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
// returns {2,0,5,7,10,8,13,15}
// (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX
func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example,
//
//	{0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
//		0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
//
// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
//
// (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX512
func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selections are specified by the constant parameter hilos where each
// hi and lo pair select 64-bit elements from the corresponding 128-bit
// subvectors of x and y.
//
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
// selecting element 1 from y's upper 128 bits (11).
// This differs from the same method applied to a 32x8 vector, where
// the 8-bit constant performs the same selection on both subvectors.
//
// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX
func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selections are specified by the constant parameter hilos where each
// hi and lo pair select 64-bit elements from the corresponding 128-bit
// subvectors of x and y.
//
// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
// then 1, selecting element 1 from x's next 128 bits (9), then 1,
// selecting element 1 from y's next 128 bits (11). The next two 0 bits select
// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
// This differs from the same method applied to a 32x8 or 32x16 vector, where
// the 8-bit constant performs the same selection on all the subvectors.
//
// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX512
func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example,
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
// returns {2,0,5,7,10,8,13,15}
// (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX
func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example,
//
//	{0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
//		0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
//
// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
//
// (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX512
func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selections are specified by the constant parameter hilos where each
// hi and lo pair select 64-bit elements from the corresponding 128-bit
// subvectors of x and y.
//
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
// selecting element 1 from y's upper 128 bits (11).
// This differs from the same method applied to a 32x8 vector, where
// the 8-bit constant performs the same selection on both subvectors.
//
// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX
func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selections are specified by the constant parameter hilos where each
// hi and lo pair select 64-bit elements from the corresponding 128-bit
// subvectors of x and y.
//
// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
// then 1, selecting element 1 from x's next 128 bits (9), then 1,
// selecting element 1 from y's next 128 bits (11). The next two 0 bits select
// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
// This differs from the same method applied to a 32x8 or 32x16 vector, where
// the 8-bit constant performs the same selection on all the subvectors.
//
// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX512
func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example,
// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
// returns {2,0,5,7,10,8,13,15}
// (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX
func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selection is chosen by the constant parameter h1h0l1l0
// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
// For example,
//
//	{0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
//		0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
//
// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
//
// (don't forget that the binary constant is written big-endian).
//
// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPS, CPU Feature: AVX512
func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selections are specified by the constant parameter hilos where each
// hi and lo pair select 64-bit elements from the corresponding 128-bit
// subvectors of x and y.
//
// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
// selecting element 1 from y's upper 128 bits (11).
// This differs from the same method applied to a 32x8 vector, where
// the 8-bit constant performs the same selection on both subvectors.
//
// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX
func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4

// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
// into the lower and upper halves of corresponding subvectors of the output.
// The selections are specified by the constant parameter hilos where each
// hi and lo pair select 64-bit elements from the corresponding 128-bit
// subvectors of x and y.
//
// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
// then 1, selecting element 1 from x's next 128 bits (9), then 1,
// selecting element 1 from y's next 128 bits (11). The next two 0 bits select
// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
// This differs from the same method applied to a 32x8 or 32x16 vector, where
// the 8-bit constant performs the same selection on all the subvectors.
//
// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VSHUFPD, CPU Feature: AVX512
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8

/* permuteScalars */

// permuteScalars performs a permutation of vector x using constant indices:
//
//	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) permuteScalars(indices uint8) Int32x4

// permuteScalars performs a permutation of vector x using constant indices:
//
//	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) permuteScalars(indices uint8) Uint32x4

/* permuteScalarsGrouped */

// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8

// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16

// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8

// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16

/* permuteScalarsHi */

// permuteScalarsHi performs a permutation of vector x using constant indices:
//
//	result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8

// permuteScalarsHi performs a permutation of vector x using constant indices:
//
//	result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8

/* permuteScalarsHiGrouped */

// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16

// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32

// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16

// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32

/* permuteScalarsLo */

// permuteScalarsLo performs a permutation of vector x using constant indices:
//
//	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8

// permuteScalarsLo performs a permutation of vector x using constant indices:
//
//	result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8

/* permuteScalarsLoGrouped */

// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
//	 x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16

// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
//	 x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32

// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
//	 x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16

// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
//	 x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32

/* tern */

// tern performs a logical operation on three vectors based on the 8-bit truth table.
// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)),
// i.e. each result bit is the bit of table indexed by the corresponding bits of x, y and z.
//
// table results in better performance when it's a constant; a non-constant value will be translated into a jump table.
622 // 623 // Asm: VPTERNLOGD, CPU Feature: AVX512 624 func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 625 626 // tern performs a logical operation on three vectors based on the 8-bit truth table. 627 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 628 // 629 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 630 // 631 // Asm: VPTERNLOGD, CPU Feature: AVX512 632 func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 633 634 // tern performs a logical operation on three vectors based on the 8-bit truth table. 635 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 636 // 637 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 638 // 639 // Asm: VPTERNLOGD, CPU Feature: AVX512 640 func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 641 642 // tern performs a logical operation on three vectors based on the 8-bit truth table. 643 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 644 // 645 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 646 // 647 // Asm: VPTERNLOGQ, CPU Feature: AVX512 648 func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 649 650 // tern performs a logical operation on three vectors based on the 8-bit truth table. 651 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 652 // 653 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 654 // 655 // Asm: VPTERNLOGQ, CPU Feature: AVX512 656 func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 657 658 // tern performs a logical operation on three vectors based on the 8-bit truth table. 
659 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 660 // 661 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 662 // 663 // Asm: VPTERNLOGQ, CPU Feature: AVX512 664 func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 665 666 // tern performs a logical operation on three vectors based on the 8-bit truth table. 667 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 668 // 669 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 670 // 671 // Asm: VPTERNLOGD, CPU Feature: AVX512 672 func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 673 674 // tern performs a logical operation on three vectors based on the 8-bit truth table. 675 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 676 // 677 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 678 // 679 // Asm: VPTERNLOGD, CPU Feature: AVX512 680 func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 681 682 // tern performs a logical operation on three vectors based on the 8-bit truth table. 683 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 684 // 685 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 686 // 687 // Asm: VPTERNLOGD, CPU Feature: AVX512 688 func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 689 690 // tern performs a logical operation on three vectors based on the 8-bit truth table. 691 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 692 // 693 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
694 // 695 // Asm: VPTERNLOGQ, CPU Feature: AVX512 696 func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 697 698 // tern performs a logical operation on three vectors based on the 8-bit truth table. 699 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 700 // 701 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 702 // 703 // Asm: VPTERNLOGQ, CPU Feature: AVX512 704 func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 705 706 // tern performs a logical operation on three vectors based on the 8-bit truth table. 707 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 708 // 709 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 710 // 711 // Asm: VPTERNLOGQ, CPU Feature: AVX512 712 func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 713