clmul_emulated.go

     1  // Copyright 2026 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && (arm64 || wasm)
     6  
     7  package archsimd
     8  
     9  func new64x2(lo, hi uint64) Uint64x2 {
    10  	return Uint64x2{}.SetElem(0, lo).SetElem(1, hi)
    11  }
    12  
    13  // These masks all have 4 zeroes between 1s.
    14  var m0 = new64x2(0x1084210842108421, 0x2108421084210842)
    15  var m1 = new64x2(0x2108421084210842, 0x4210842108421084)
    16  var m2 = new64x2(0x4210842108421084, 0x8421084210842108)
    17  var m3 = new64x2(0x8421084210842108, 0x0842108421084210)
    18  var m4 = new64x2(0x0842108421084210, 0x1084210842108421)
    19  
    20  // Selects the middle 64 bits of a 128-bit simd value
    21  var middle = new64x2(0xffffffff00000000, 0x00000000ffffffff)
    22  
    23  // mwl is a 64x64 into 128 multiply that is missing
    24  // some carries that we don't need for CLMUL emulation.
    25  // The high 64 bits of each input are ignored.
    26  // Also just for fun, accumulate sums with Xor.
    27  func (x Uint64x2) mwl(y Uint64x2) Uint64x2 {
    28  	// reshape input into Uint32x4
    29  	// input is  {a b _ _}.mwl{c d _ _}
    30  	// need the sum of
    31  	// ac0_ac1
    32  	//   0 ad0_ad1
    33  	//   0 bc0_bc1
    34  	//   0   0 bd0_bd1
    35  	// This "sum" is where the carries (not propagated
    36  	// across lanes) are lost.
    37  	ab__ := x.ReshapeToUint32s()
    38  	cd__ := y.ReshapeToUint32s()
    39  	ac0_ac1_bd0_bd1 := ab__.MulWidenLo(cd__)
    40  
    41  	dc__ := y.RotateAllLeft(32).ReshapeToUint32s()
    42  	ad0_ad1_bc0_bc1 := ab__.MulWidenLo(dc__)
    43  	//
    44  	// have        ad0, ad1, bc0, bc1
    45  	// want        0, ad0+bc0, ad1+bc1, 0
    46  	// to add to    ac0_ac1_bd0_bd1
    47  	//
    48  	// swap 64-bit halves of ad0_ad1_bc0_bc1
    49  	// to get   bc0_bc1_ad0_ad1
    50  	bc0_bc1_ad0_ad1 := Uint64x2{}.SetElem(0, ad0_ad1_bc0_bc1.GetElem(1)).SetElem(1, ad0_ad1_bc0_bc1.GetElem(0))
    51  
    52  	// added to ad0_ad1_bc0_bc1 yields
    53  	//   bc0+ad0, bc1+ad1, bc0+ad0, bc1+ad1
    54  	// rotate 32 (within the two 64-bit elements) yields
    55  	//   bc1+ad1, bc0+ad0, bc1+ad1, bc0+ad0
    56  	// and then intersect with mask:
    57  	//   0      , bc0+ad0, bc1+ad1, 0
    58  	//
    59  	// use xor to make it a worse multiply
    60  	zzz_adPbc0_adPbc1_zzz := bc0_bc1_ad0_ad1.Xor(ad0_ad1_bc0_bc1).RotateAllLeft(32).And(middle)
    61  	return ac0_ac1_bd0_bd1.Xor(zzz_adPbc0_adPbc1_zzz)
    62  }
    63  
    64  // carrylessMultiply is constant time carrless multiply implemented with an
    65  // absurd number of multiplication given that the emulation platforms only have
    66  // 32x32 into 64, it might make sense to rework this into that primitive, but,
    67  // for now this works and is easily tested in scalar Go.
    68  func (x Uint64x2) carrylessMultiply(y Uint64x2) Uint64x2 {
    69  
    70  	// This by masking the two inputs into 5 thinned inputs, with
    71  	// 4 zeroes separating any 2 set bits.  Multiply will potentially
    72  	// set more bits with addition of overlapping terms, however this
    73  	// technique allows as many as 31 additions (filling all 4 separation
    74  	// positions with 1) without perturbing the bits we care about.  Since
    75  	// there's at most 13 set bits in a thinned input, 31 is not a problem.
    76  	// If there were only 3 set bits, there are 16 1s per thinned input and
    77  	// only 15 additions can be tolerated -- so that's not possible.
    78  
    79  	// This is also discussed at
    80  	// https://timtaubert.de/blog/2017/06/verified-binary-multiplication-for-ghash/
    81  
    82  	x0 := x.And(m0)
    83  	x1 := x.And(m1)
    84  	x2 := x.And(m2)
    85  	x3 := x.And(m3)
    86  	x4 := x.And(m4)
    87  
    88  	y0 := y.And(m0)
    89  	y1 := y.And(m1)
    90  	y2 := y.And(m2)
    91  	y3 := y.And(m3)
    92  	y4 := y.And(m4)
    93  
    94  	var z Uint64x2
    95  	// for a given line, combining (xI).mwl(yJ) terms, I+J == K mod 5; mask index = K
    96  	z = (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
    97  	z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
    98  	z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
    99  	z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
   100  	z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)
   101  
   102  	return z
   103  }
   104
View as plain text