Source file src/simd/ops_internal_amd64.go
1 // Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. 2 3 //go:build goexperiment.simd 4 5 package simd 6 7 /* blend */ 8 9 // blend blends two vectors based on mask values, choosing either 10 // the first or the second based on whether the third is false or true 11 // 12 // Asm: VPBLENDVB, CPU Feature: AVX 13 func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16 14 15 // blend blends two vectors based on mask values, choosing either 16 // the first or the second based on whether the third is false or true 17 // 18 // Asm: VPBLENDVB, CPU Feature: AVX2 19 func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32 20 21 /* blendMasked */ 22 23 // blendMasked blends two vectors based on mask values, choosing either 24 // the first or the second based on whether the third is false or true 25 // 26 // This operation is applied selectively under a write mask. 27 // 28 // Asm: VPBLENDMB, CPU Feature: AVX512 29 func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64 30 31 // blendMasked blends two vectors based on mask values, choosing either 32 // the first or the second based on whether the third is false or true 33 // 34 // This operation is applied selectively under a write mask. 35 // 36 // Asm: VPBLENDMW, CPU Feature: AVX512 37 func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32 38 39 // blendMasked blends two vectors based on mask values, choosing either 40 // the first or the second based on whether the third is false or true 41 // 42 // This operation is applied selectively under a write mask. 43 // 44 // Asm: VPBLENDMD, CPU Feature: AVX512 45 func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16 46 47 // blendMasked blends two vectors based on mask values, choosing either 48 // the first or the second based on whether the third is false or true 49 // 50 // This operation is applied selectively under a write mask. 
51 // 52 // Asm: VPBLENDMQ, CPU Feature: AVX512 53 func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8 54 55 /* concatSelectedConstant */ 56 57 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 58 // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 59 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 60 // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns 61 // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). 62 // 63 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 64 // 65 // Asm: VSHUFPS, CPU Feature: AVX 66 func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4 67 68 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 69 // halves of the output. The selection is chosen by the constant parameter hilo 70 // where hi and lo are each one bit specifying which 64-bit element to select 71 // from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) 72 // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, 73 // selecting from y, is 1, and selects 7. 74 // 75 // hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 76 // 77 // Asm: VSHUFPD, CPU Feature: AVX 78 func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2 79 80 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 81 // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 82 // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
83 // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns 84 // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). 85 // 86 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 87 // 88 // Asm: VSHUFPS, CPU Feature: AVX 89 func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4 90 91 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 92 // halves of the output. The selection is chosen by the constant parameter hilo 93 // where hi and lo are each one bit specifying which 64-bit element to select 94 // from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) 95 // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, 96 // selecting from y, is 1, and selects 7. 97 // 98 // hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 99 // 100 // Asm: VSHUFPD, CPU Feature: AVX 101 func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2 102 103 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 104 // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 105 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 106 // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns 107 // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). 108 // 109 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 110 // 111 // Asm: VSHUFPS, CPU Feature: AVX 112 func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4 113 114 // concatSelectedConstant concatenates selected elements from x and y into the lower and upper 115 // halves of the output.
The selection is chosen by the constant parameter hilo 116 // where hi and lo are each one bit specifying which 64-bit element to select 117 // from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7}) 118 // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1, 119 // selecting from y, is 1, and selects 7. 120 // 121 // hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table. 122 // 123 // Asm: VSHUFPD, CPU Feature: AVX 124 func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2 125 126 /* concatSelectedConstantGrouped */ 127 128 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 129 // into the lower and upper halves of corresponding subvectors of the output. 130 // The selection is chosen by the constant parameter h1h0l1l0 131 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 132 // For example, 133 // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) 134 // returns {2,0,5,7,10,8,13,15} 135 // (don't forget that the binary constant is written big-endian). 136 // 137 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 138 // 139 // Asm: VSHUFPS, CPU Feature: AVX 140 func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8 141 142 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 143 // into the lower and upper halves of corresponding subvectors of the output. 144 // The selection is chosen by the constant parameter h1h0l1l0 145 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 
146 // For example, 147 // 148 // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( 149 // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) 150 // 151 // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} 152 // 153 // (don't forget that the binary constant is written big-endian). 154 // 155 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 156 // 157 // Asm: VSHUFPS, CPU Feature: AVX512 158 func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16 159 160 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 161 // into the lower and upper halves of corresponding subvectors of the output. 162 // The selections are specified by the constant parameter hilos where each 163 // hi and lo pair select 64-bit elements from the corresponding 128-bit 164 // subvectors of x and y. 165 // 166 // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) 167 // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least 168 // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 169 // then 1, selecting element 1 from x's upper 128 bits (9), then 1, 170 // selecting element 1 from y's upper 128 bits (11). 171 // This differs from the same method applied to a 32x8 vector, where 172 // the 8-bit constant performs the same selection on both subvectors. 173 // 174 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 175 // 176 // Asm: VSHUFPD, CPU Feature: AVX 177 func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4 178 179 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 180 // into the lower and upper halves of corresponding subvectors of the output. 
181 // The selections are specified by the constant parameter hilos where each 182 // hi and lo pair select 64-bit elements from the corresponding 128-bit 183 // subvectors of x and y. 184 // 185 // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) 186 // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 187 // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 188 // then 1, selecting element 1 from x's next 128 bits (9), then 1, 189 // selecting element 1 from y's next 128 bits (11). The next two 0 bits select 190 // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two 191 // 1 bits select the upper elements from x and y's last 128 bits (17, 19). 192 // This differs from the same method applied to a 32x8 or 32x16 vector, where 193 // the 8-bit constant performs the same selection on all the subvectors. 194 // 195 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 196 // 197 // Asm: VSHUFPD, CPU Feature: AVX512 198 func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8 199 200 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 201 // into the lower and upper halves of corresponding subvectors of the output. 202 // The selection is chosen by the constant parameter h1h0l1l0 203 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 204 // For example, 205 // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) 206 // returns {2,0,5,7,10,8,13,15} 207 // (don't forget that the binary constant is written big-endian). 208 // 209 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
210 // 211 // Asm: VSHUFPS, CPU Feature: AVX 212 func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8 213 214 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 215 // into the lower and upper halves of corresponding subvectors of the output. 216 // The selection is chosen by the constant parameter h1h0l1l0 217 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 218 // For example, 219 // 220 // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( 221 // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) 222 // 223 // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} 224 // 225 // (don't forget that the binary constant is written big-endian). 226 // 227 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 228 // 229 // Asm: VSHUFPS, CPU Feature: AVX512 230 func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16 231 232 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 233 // into the lower and upper halves of corresponding subvectors of the output. 234 // The selections are specified by the constant parameter hilos where each 235 // hi and lo pair select 64-bit elements from the corresponding 128-bit 236 // subvectors of x and y. 237 // 238 // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) 239 // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least 240 // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 241 // then 1, selecting element 1 from x's upper 128 bits (9), then 1, 242 // selecting element 1 from y's upper 128 bits (11). 243 // This differs from the same method applied to a 32x8 vector, where 244 // the 8-bit constant performs the same selection on both subvectors. 
245 // 246 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 247 // 248 // Asm: VSHUFPD, CPU Feature: AVX 249 func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4 250 251 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 252 // into the lower and upper halves of corresponding subvectors of the output. 253 // The selections are specified by the constant parameter hilos where each 254 // hi and lo pair select 64-bit elements from the corresponding 128-bit 255 // subvectors of x and y. 256 // 257 // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) 258 // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 259 // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 260 // then 1, selecting element 1 from x's next 128 bits (9), then 1, 261 // selecting element 1 from y's next 128 bits (11). The next two 0 bits select 262 // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two 263 // 1 bits select the upper elements from x and y's last 128 bits (17, 19). 264 // This differs from the same method applied to a 32x8 or 32x16 vector, where 265 // the 8-bit constant performs the same selection on all the subvectors. 266 // 267 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 268 // 269 // Asm: VSHUFPD, CPU Feature: AVX512 270 func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8 271 272 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 273 // into the lower and upper halves of corresponding subvectors of the output.
274 // The selection is chosen by the constant parameter h1h0l1l0 275 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 276 // For example, 277 // {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15}) 278 // returns {2,0,5,7,10,8,13,15} 279 // (don't forget that the binary constant is written big-endian). 280 // 281 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 282 // 283 // Asm: VSHUFPS, CPU Feature: AVX 284 func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8 285 286 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 287 // into the lower and upper halves of corresponding subvectors of the output. 288 // The selection is chosen by the constant parameter h1h0l1l0 289 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. 290 // For example, 291 // 292 // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( 293 // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) 294 // 295 // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} 296 // 297 // (don't forget that the binary constant is written big-endian). 298 // 299 // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. 300 // 301 // Asm: VSHUFPS, CPU Feature: AVX512 302 func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16 303 304 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 305 // into the lower and upper halves of corresponding subvectors of the output. 306 // The selections are specified by the constant parameter hilos where each 307 // hi and lo pair select 64-bit elements from the corresponding 128-bit 308 // subvectors of x and y. 
309 // 310 // For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11}) 311 // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least 312 // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 313 // then 1, selecting element 1 from x's upper 128 bits (9), then 1, 314 // selecting element 1 from y's upper 128 bits (11). 315 // This differs from the same method applied to a 32x8 vector, where 316 // the 8-bit constant performs the same selection on both subvectors. 317 // 318 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 319 // 320 // Asm: VSHUFPD, CPU Feature: AVX 321 func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4 322 323 // concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y 324 // into the lower and upper halves of corresponding subvectors of the output. 325 // The selections are specified by the constant parameter hilos where each 326 // hi and lo pair select 64-bit elements from the corresponding 128-bit 327 // subvectors of x and y. 328 // 329 // For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19}) 330 // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's 331 // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), 332 // then 1, selecting element 1 from x's next 128 bits (9), then 1, 333 // selecting element 1 from y's next 128 bits (11). The next two 0 bits select 334 // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two 335 // 1 bits select the upper elements from x and y's last 128 bits (17, 19). 336 // This differs from the same method applied to a 32x8 or 32x16 vector, where 337 // the 8-bit constant performs the same selection on all the subvectors.
338 // 339 // hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table. 340 // 341 // Asm: VSHUFPD, CPU Feature: AVX512 342 func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 343 344 /* permuteScalars */ 345 346 // permuteScalars performs a permutation of vector x using constant indices: 347 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} 348 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 349 // 350 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 351 // 352 // Asm: VPSHUFD, CPU Feature: AVX 353 func (x Int32x4) permuteScalars(indices uint8) Int32x4 354 355 // permuteScalars performs a permutation of vector x using constant indices: 356 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} 357 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 358 // 359 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 360 // 361 // Asm: VPSHUFD, CPU Feature: AVX 362 func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 363 364 /* permuteScalarsGrouped */ 365 366 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 367 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 368 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 369 // Each group is of size 128-bit. 370 // 371 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
372 // 373 // Asm: VPSHUFD, CPU Feature: AVX2 374 func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 375 376 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 377 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 378 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 379 // Each group is of size 128-bit. 380 // 381 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 382 // 383 // Asm: VPSHUFD, CPU Feature: AVX512 384 func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 385 386 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 387 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 388 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 389 // Each group is of size 128-bit. 390 // 391 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 392 // 393 // Asm: VPSHUFD, CPU Feature: AVX2 394 func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8 395 396 // permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: 397 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} 398 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 399 // Each group is of size 128-bit. 400 // 401 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
402 // 403 // Asm: VPSHUFD, CPU Feature: AVX512 404 func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 405 406 /* permuteScalarsHi */ 407 408 // permuteScalarsHi performs a permutation of vector x using constant indices: 409 // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} 410 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 411 // 412 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 413 // 414 // Asm: VPSHUFHW, CPU Feature: AVX512 415 func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8 416 417 // permuteScalarsHi performs a permutation of vector x using constant indices: 418 // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} 419 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 420 // 421 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 422 // 423 // Asm: VPSHUFHW, CPU Feature: AVX512 424 func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8 425 426 /* permuteScalarsHiGrouped */ 427 428 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 429 // result = 430 // 431 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 432 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 433 // 434 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 435 // Each group is of size 128-bit. 436 // 437 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
438 // 439 // Asm: VPSHUFHW, CPU Feature: AVX2 440 func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16 441 442 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 443 // result = 444 // 445 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 446 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 447 // 448 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 449 // Each group is of size 128-bit. 450 // 451 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 452 // 453 // Asm: VPSHUFHW, CPU Feature: AVX512 454 func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32 455 456 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 457 // result = 458 // 459 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 460 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 461 // 462 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 463 // Each group is of size 128-bit. 464 // 465 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
466 // 467 // Asm: VPSHUFHW, CPU Feature: AVX2 468 func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16 469 470 // permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: 471 // result = 472 // 473 // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], 474 // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} 475 // 476 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 477 // Each group is of size 128-bit. 478 // 479 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 480 // 481 // Asm: VPSHUFHW, CPU Feature: AVX512 482 func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 483 484 /* permuteScalarsLo */ 485 486 // permuteScalarsLo performs a permutation of vector x using constant indices: 487 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} 488 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 489 // 490 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 491 // 492 // Asm: VPSHUFLW, CPU Feature: AVX512 493 func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8 494 495 // permuteScalarsLo performs a permutation of vector x using constant indices: 496 // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} 497 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 498 // 499 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
500 // 501 // Asm: VPSHUFLW, CPU Feature: AVX512 502 func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8 503 504 /* permuteScalarsLoGrouped */ 505 506 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 507 // 508 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 509 // x_group1[indices[0:2]], ...} 510 // 511 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 512 // Each group is of size 128-bit. 513 // 514 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 515 // 516 // Asm: VPSHUFLW, CPU Feature: AVX2 517 func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16 518 519 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 520 // 521 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 522 // x_group1[indices[0:2]], ...} 523 // 524 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 525 // Each group is of size 128-bit. 526 // 527 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 528 // 529 // Asm: VPSHUFLW, CPU Feature: AVX512 530 func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32 531 532 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 533 // 534 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 535 // x_group1[indices[0:2]], ...} 536 // 537 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 538 // Each group is of size 128-bit.
539 // 540 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 541 // 542 // Asm: VPSHUFLW, CPU Feature: AVX2 543 func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16 544 545 // permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: 546 // 547 // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], 548 // x_group1[indices[0:2]], ...} 549 // 550 // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 551 // Each group is of size 128-bit. 552 // 553 // indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 554 // 555 // Asm: VPSHUFLW, CPU Feature: AVX512 556 func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32 557 558 /* tern */ 559 560 // tern performs a logical operation on three vectors based on the 8-bit truth table. 561 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 562 // 563 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 564 // 565 // Asm: VPTERNLOGD, CPU Feature: AVX512 566 func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 567 568 // tern performs a logical operation on three vectors based on the 8-bit truth table. 569 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 570 // 571 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 572 // 573 // Asm: VPTERNLOGD, CPU Feature: AVX512 574 func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 575 576 // tern performs a logical operation on three vectors based on the 8-bit truth table.
577 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 578 // 579 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 580 // 581 // Asm: VPTERNLOGD, CPU Feature: AVX512 582 func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 583 584 // tern performs a logical operation on three vectors based on the 8-bit truth table. 585 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 586 // 587 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 588 // 589 // Asm: VPTERNLOGQ, CPU Feature: AVX512 590 func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 591 592 // tern performs a logical operation on three vectors based on the 8-bit truth table. 593 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 594 // 595 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 596 // 597 // Asm: VPTERNLOGQ, CPU Feature: AVX512 598 func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 599 600 // tern performs a logical operation on three vectors based on the 8-bit truth table. 601 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 602 // 603 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 604 // 605 // Asm: VPTERNLOGQ, CPU Feature: AVX512 606 func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 607 608 // tern performs a logical operation on three vectors based on the 8-bit truth table. 609 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 610 // 611 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
612 // 613 // Asm: VPTERNLOGD, CPU Feature: AVX512 614 func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 615 616 // tern performs a logical operation on three vectors based on the 8-bit truth table. 617 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 618 // 619 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 620 // 621 // Asm: VPTERNLOGD, CPU Feature: AVX512 622 func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 623 624 // tern performs a logical operation on three vectors based on the 8-bit truth table. 625 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 626 // 627 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 628 // 629 // Asm: VPTERNLOGD, CPU Feature: AVX512 630 func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 631 632 // tern performs a logical operation on three vectors based on the 8-bit truth table. 633 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 634 // 635 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 636 // 637 // Asm: VPTERNLOGQ, CPU Feature: AVX512 638 func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 639 640 // tern performs a logical operation on three vectors based on the 8-bit truth table. 641 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 642 // 643 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 644 // 645 // Asm: VPTERNLOGQ, CPU Feature: AVX512 646 func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 647 648 // tern performs a logical operation on three vectors based on the 8-bit truth table. 
649 // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) 650 // 651 // table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 652 // 653 // Asm: VPTERNLOGQ, CPU Feature: AVX512 654 func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 655