// Copyright 2026 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //go:build goexperiment.simd && wasm package archsimd var nn = [2]int64{-1 << 63, -1 << 63} var f0s = [16]int8{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0} var ff00s = [8]int16{-1, 0, -1, 0, -1, 0, -1, 0} var ffff0000s = [4]int32{-1, 0, -1, 0} // For unsigned comparison, the trick for converting it into // signed comparisonm is to notice that the unsigned range is // the same as the signed range plus 1 << bitwidth-1. // And adding or subtracting the sign bit is the same as XORing // it. Thus, XOR both sign bits and then used the signed // comparison operations. // Less return a mask vector of x[i] < y[i] func (x Uint64x2) Less(y Uint64x2) Mask64x2 { signs := LoadInt64x2Array(&nn) ix := x.BitsToInt64().Xor(signs) iy := y.BitsToInt64().Xor(signs) return ix.Less(iy) } // LessEqual return a mask vector of x[i] <= y[i] func (x Uint64x2) LessEqual(y Uint64x2) Mask64x2 { signs := LoadInt64x2Array(&nn) ix := x.BitsToInt64().Xor(signs) iy := y.BitsToInt64().Xor(signs) return ix.LessEqual(iy) } // Greater return a mask vector of x[i] > y[i] func (x Uint64x2) Greater(y Uint64x2) Mask64x2 { signs := LoadInt64x2Array(&nn) ix := x.BitsToInt64().Xor(signs) iy := y.BitsToInt64().Xor(signs) return ix.Greater(iy) } // GreaterEqual return a mask vector of x[i] >= y[i] func (x Uint64x2) GreaterEqual(y Uint64x2) Mask64x2 { signs := LoadInt64x2Array(&nn) ix := x.BitsToInt64().Xor(signs) iy := y.BitsToInt64().Xor(signs) return ix.GreaterEqual(iy) } // Max returns the elementswise maximum of elements in x and y func (x Int64x2) Max(y Int64x2) Int64x2 { mask := x.Greater(y).ToInt64x2() return x.And(mask).Or(y.AndNot(mask)) } // Min returns the elementswise minimum of elements in x and y func (x Int64x2) Min(y Int64x2) Int64x2 { mask := x.Less(y).ToInt64x2() return x.And(mask).Or(y.AndNot(mask)) } // Max returns the elementswise maximum of elements in x and y func (x Uint64x2) Max(y Uint64x2) Uint64x2 { mask := x.Greater(y).ToInt64x2().ToBits() return x.And(mask).Or(y.AndNot(mask)) } // Min returns the elementswise minimum of elements in x and y func (x Uint64x2) Min(y Uint64x2) Uint64x2 { mask := x.Less(y).ToInt64x2().ToBits() return x.And(mask).Or(y.AndNot(mask)) } // Mul returns the elementswise product of elements in x and y func (x Int8x16) Mul(y Int8x16) Int8x16 { // To obtain an 8-bit multiply, split the vectors into even and odd // elements, shift odds into even position, widen elements in both // vectors, multiply, discard high parts, realign the odd results // and combine. mask := LoadInt8x16Array(&f0s) mask16 := mask.ToBits().ReshapeToUint16s() xe := x.And(mask).ToBits().ReshapeToUint16s() xo := x.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) ye := y.And(mask).ToBits().ReshapeToUint16s() yo := y.AndNot(mask).ToBits().ReshapeToUint16s().ShiftAllRight(8) pe := xe.Mul(ye).And(mask16) po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) return pe.Or(po).ReshapeToUint8s().BitsToInt8() } // Mul returns the elementswise product of elements in x and y func (x Uint8x16) Mul(y Uint8x16) Uint8x16 { mask := LoadInt8x16Array(&f0s).ToBits() mask16 := mask.ReshapeToUint16s() xe := x.And(mask).ReshapeToUint16s() xo := x.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) ye := y.And(mask).ReshapeToUint16s() yo := y.AndNot(mask).ReshapeToUint16s().ShiftAllRight(8) pe := xe.Mul(ye).And(mask16) po := xo.Mul(yo).And(mask16).ShiftAllLeft(8) return pe.Or(po).ReshapeToUint8s() } // OnesCount returns the number of set bits in each vector element func (x Int16x8) OnesCount() Int16x8 { mask := LoadInt8x16Array(&f0s) c := x.ToBits().ReshapeToUint8s().BitsToInt8().OnesCount() // per-byte counts ce := c.And(mask).ToBits().ReshapeToUint16s().BitsToInt16() // even-element per-byte counts, as 16-bit elements co := c.AndNot(mask).ToBits().ReshapeToUint16s().BitsToInt16().ShiftAllRight(8) // odd-element per-byte counts, as 16-bit elements, aligned return ce.Add(co) // return their elementwise sum } // OnesCount returns the number of set bits in each vector element func (x Int32x4) OnesCount() Int32x4 { mask := LoadInt8x16Array(&f0s) c := x.ToBits().ReshapeToUint8s().BitsToInt8().OnesCount() // per-byte counts ce := c.And(mask).ToBits().ReshapeToUint16s().BitsToInt16() // even-element per-byte counts, as 16-bit elements co := c.AndNot(mask).ToBits().ReshapeToUint16s().BitsToInt16().ShiftAllRight(8) // odd-element per-byte counts, as 16-bit elements, aligned mask16 := LoadInt16x8Array(&ff00s) y := ce.Add(co) // per int16 counts, etc. ye := y.And(mask16).ToBits().ReshapeToUint32s().BitsToInt32() yo := y.AndNot(mask16).ToBits().ReshapeToUint32s().BitsToInt32().ShiftAllRight(16) return ye.Add(yo) } // OnesCount returns the number of set bits in each vector element func (x Int64x2) OnesCount() Int64x2 { mask := LoadInt8x16Array(&f0s) c := x.ToBits().ReshapeToUint8s().BitsToInt8().OnesCount() ce := c.And(mask).ToBits().ReshapeToUint16s().BitsToInt16() co := c.AndNot(mask).ToBits().ReshapeToUint16s().BitsToInt16().ShiftAllRight(8) mask16 := LoadInt16x8Array(&ff00s) y := ce.Add(co) ye := y.And(mask16).ToBits().ReshapeToUint32s().BitsToInt32() yo := y.AndNot(mask16).ToBits().ReshapeToUint32s().BitsToInt32().ShiftAllRight(16) mask32 := LoadInt32x4Array(&ffff0000s) z := ye.Add(yo) ze := z.And(mask32).ToBits().ReshapeToUint64s().BitsToInt64() zo := z.AndNot(mask32).ToBits().ReshapeToUint64s().BitsToInt64().ShiftAllRight(32) return ze.Add(zo) } // OnesCount returns the number of set bits in each vector element func (x Uint8x16) OnesCount() Uint8x16 { return x.BitsToInt8().OnesCount().ToBits() } // OnesCount returns the number of set bits in each vector element func (x Uint16x8) OnesCount() Uint16x8 { return x.BitsToInt16().OnesCount().ToBits() } // OnesCount returns the number of set bits in each vector element func (x Uint32x4) OnesCount() Uint32x4 { return x.BitsToInt32().OnesCount().ToBits() } // OnesCount returns the number of set bits in each vector element func (x Uint64x2) OnesCount() Uint64x2 { return x.BitsToInt64().OnesCount().ToBits() } // CarrylessMultiplyEven computes the carryless // multiplications of selected even halves of the elements of x and y. // // A carryless multiplication uses bitwise XOR instead of // add-with-carry, for example (in base two): // // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 // // This also models multiplication of polynomials with coefficients // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds // polynomial terms, but coefficients "add" with XOR.) // // Emulated func (x Uint64x2) CarrylessMultiplyEven(y Uint64x2) Uint64x2 { return x.carrylessMultiply(y) } // CarrylessMultiplyOdd computes the carryless // multiplications of selected odd halves of the elements of x and y. // // A carryless multiplication uses bitwise XOR instead of // add-with-carry, for example (in base two): // // 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101 // // This also models multiplication of polynomials with coefficients // from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 = // x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds // polynomial terms, but coefficients "add" with XOR.) // // Emulated func (x Uint64x2) CarrylessMultiplyOdd(y Uint64x2) Uint64x2 { x = x.SetElem(0, x.GetElem(1)) y = y.SetElem(0, x.GetElem(1)) return x.carrylessMultiply(y) }