Source file src/simd/internal/simd_test/transpose_test.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build goexperiment.simd && amd64
     6  
     7  package simd_test
     8  
     9  import (
    10  	"fmt"
    11  	"simd"
    12  	"testing"
    13  )
    14  
    15  func Transpose4(a0, a1, a2, a3 simd.Int32x4) (b0, b1, b2, b3 simd.Int32x4) {
    16  	t0, t1 := a0.InterleaveLo(a1), a0.InterleaveHi(a1)
    17  	t2, t3 := a2.InterleaveLo(a3), a2.InterleaveHi(a3)
    18  
    19  	// a0: ABCD ==> t0: A1B2
    20  	// a1: 1234     t1: C3D4
    21  	// a2: EFGH     t2: E5F6
    22  	// a3: 5678     t3: G7H8
    23  
    24  	// need
    25  	// A1E5
    26  	// B2F6
    27  	// C3G7
    28  	// D4H8
    29  
    30  	b0 = t0.SelectFromPair(0, 1, 4, 5, t2) // lower elements from each
    31  	b1 = t0.SelectFromPair(2, 3, 6, 7, t2) // upper elements from each
    32  	b2 = t1.SelectFromPair(0, 1, 4, 5, t3) // lowers
    33  	b3 = t1.SelectFromPair(2, 3, 6, 7, t3) // uppers
    34  	return
    35  }
    36  
    37  func Transpose8(a0, a1, a2, a3, a4, a5, a6, a7 simd.Int32x8) (b0, b1, b2, b3, b4, b5, b6, b7 simd.Int32x8) {
    38  	t0, t1 := a0.InterleaveLoGrouped(a1), a0.InterleaveHiGrouped(a1)
    39  	t2, t3 := a2.InterleaveLoGrouped(a3), a2.InterleaveHiGrouped(a3)
    40  	t4, t5 := a4.InterleaveLoGrouped(a5), a4.InterleaveHiGrouped(a5)
    41  	t6, t7 := a6.InterleaveLoGrouped(a7), a6.InterleaveHiGrouped(a7)
    42  
    43  	// a0: ABCD ==> t0: A1B2
    44  	// a1: 1234     t1: C3D4
    45  	// a2: EFGH     t2: E5F6
    46  	// a3: 5678     t3: G7H8
    47  
    48  	// need
    49  	// A1E5
    50  	// B2F6
    51  	// C3G7
    52  	// D4H8
    53  
    54  	a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2) // lower elements from each
    55  	a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2) // upper elements from each
    56  	a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3) // lowers
    57  	a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3) // uppers
    58  
    59  	a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6) // lower elements from each
    60  	a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6) // upper elements from each
    61  	a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7) // lowers
    62  	a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7) // uppers
    63  
    64  	// next need to swap the upper 128 bits of a0-a3 with the lower 128 bits of a4-a7
    65  
    66  	b0 = a0.Select128FromPair(0, 2, a4)
    67  	b4 = a0.Select128FromPair(1, 3, a4)
    68  
    69  	b1 = a1.Select128FromPair(0, 2, a5)
    70  	b5 = a1.Select128FromPair(1, 3, a5)
    71  
    72  	b2 = a2.Select128FromPair(0, 2, a6)
    73  	b6 = a2.Select128FromPair(1, 3, a6)
    74  
    75  	b3 = a3.Select128FromPair(0, 2, a7)
    76  	b7 = a3.Select128FromPair(1, 3, a7)
    77  
    78  	return
    79  }
    80  
    81  func TestTranspose4(t *testing.T) {
    82  	r := make([]int32, 16, 16)
    83  
    84  	w := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
    85  	x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
    86  	y := simd.LoadInt32x4Slice([]int32{0xE, 0xF, 0x10, 0x11})
    87  	z := simd.LoadInt32x4Slice([]int32{5, 6, 7, 8})
    88  	a, b, c, d := Transpose4(w, x, y, z)
    89  
    90  	a.StoreSlice(r[0:])
    91  	b.StoreSlice(r[4:])
    92  	c.StoreSlice(r[8:])
    93  	d.StoreSlice(r[12:])
    94  
    95  	checkSlices[int32](t, r, []int32{
    96  		0xA, 1, 0xE, 5,
    97  		0xB, 2, 0xF, 6,
    98  		0xC, 3, 0x10, 7,
    99  		0xD, 4, 0x11, 8,
   100  	})
   101  
   102  }
   103  
   104  func TestTranspose8(t *testing.T) {
   105  	m := make([]int32, 8)
   106  
   107  	a := []int32{}
   108  	for i := int32(1); i <= 64; i++ {
   109  		a = append(a, i)
   110  	}
   111  
   112  	p := simd.LoadInt32x8Slice(a[0:])
   113  	q := simd.LoadInt32x8Slice(a[8:])
   114  	r := simd.LoadInt32x8Slice(a[16:])
   115  	s := simd.LoadInt32x8Slice(a[24:])
   116  
   117  	w := simd.LoadInt32x8Slice(a[32:])
   118  	x := simd.LoadInt32x8Slice(a[40:])
   119  	y := simd.LoadInt32x8Slice(a[48:])
   120  	z := simd.LoadInt32x8Slice(a[56:])
   121  
   122  	p, q, r, s, w, x, y, z = Transpose8(p, q, r, s, w, x, y, z)
   123  
   124  	foo := func(a simd.Int32x8, z int32) {
   125  		a.StoreSlice(m)
   126  		var o []int32
   127  		for i := int32(0); i < 8; i++ {
   128  			o = append(o, z+i*8)
   129  		}
   130  		checkSlices[int32](t, m, o)
   131  	}
   132  
   133  	foo(p, 1)
   134  	foo(q, 2)
   135  	foo(r, 3)
   136  	foo(s, 4)
   137  	foo(w, 5)
   138  	foo(x, 6)
   139  	foo(y, 7)
   140  	foo(z, 8)
   141  
   142  }
   143  
   144  const BIG = 20000
   145  
   146  var bigMatrix [][]int32
   147  
   148  // 9x9 is smallest matrix with diagonal and off-diagonal tiles, plus a fringe.
   149  var nineMatrix [][]int32
   150  
   151  var thirtyMatrix [][]int32
   152  
   153  func fill(m [][]int32) {
   154  	for i := range m {
   155  		m[i] = make([]int32, len(m))
   156  		for j := range m[i] {
   157  			m[i][j] = int32(-i<<16 + j)
   158  		}
   159  	}
   160  }
   161  
   162  func isTransposed(m [][]int32) bool {
   163  	for i, mi := range m {
   164  		for j, a := range mi {
   165  			if a != int32(-j<<16+i) {
   166  				return false
   167  			}
   168  		}
   169  	}
   170  	return true
   171  }
   172  
   173  func dupe(m [][]int32) [][]int32 {
   174  	n := len(m)
   175  	p := make([][]int32, n, n)
   176  	for i := range p {
   177  		t := make([]int32, n)
   178  		for j, a := range m[i] {
   179  			t[j] = a
   180  		}
   181  		p[i] = t
   182  	}
   183  	return p
   184  }
   185  
   186  func init() {
   187  	bigMatrix = make([][]int32, BIG, BIG)
   188  	fill(bigMatrix)
   189  	nineMatrix = make([][]int32, 9, 9)
   190  	fill(nineMatrix)
   191  	thirtyMatrix = make([][]int32, 30, 30)
   192  	fill(thirtyMatrix)
   193  }
   194  
   195  func BenchmarkPlainTranspose(b *testing.B) {
   196  	d := dupe(bigMatrix)
   197  	for b.Loop() {
   198  		transposePlain(d)
   199  	}
   200  }
   201  
   202  func BenchmarkTiled4Transpose(b *testing.B) {
   203  	d := dupe(bigMatrix)
   204  	for b.Loop() {
   205  		transposeTiled4(d)
   206  	}
   207  }
   208  
   209  func BenchmarkTiled8Transpose(b *testing.B) {
   210  	d := dupe(bigMatrix)
   211  	for b.Loop() {
   212  		transposeTiled8(d)
   213  	}
   214  }
   215  
   216  func Benchmark2BlockedTranspose(b *testing.B) {
   217  	d := dupe(bigMatrix)
   218  	for b.Loop() {
   219  		transpose2Blocked(d)
   220  	}
   221  }
   222  func Benchmark3BlockedTranspose(b *testing.B) {
   223  	d := dupe(bigMatrix)
   224  	for b.Loop() {
   225  		transpose3Blocked(d)
   226  	}
   227  }
   228  func Benchmark4BlockedTranspose(b *testing.B) {
   229  	d := dupe(bigMatrix)
   230  	for b.Loop() {
   231  		transpose4Blocked(d)
   232  	}
   233  }
   234  func Benchmark5aBlockedTranspose(b *testing.B) {
   235  	d := dupe(bigMatrix)
   236  	for b.Loop() {
   237  		transpose5aBlocked(d)
   238  	}
   239  }
   240  
   241  func Benchmark5bBlockedTranspose(b *testing.B) {
   242  	d := dupe(bigMatrix)
   243  	for b.Loop() {
   244  		transpose5bBlocked(d)
   245  	}
   246  }
   247  
   248  func transposePlain(m [][]int32) {
   249  	for i := range m {
   250  		for j := 0; j < i; j++ {
   251  			t := m[i][j]
   252  			m[i][j] = m[j][i]
   253  			m[j][i] = t
   254  		}
   255  	}
   256  }
   257  
   258  func TestTransposePlain(t *testing.T) {
   259  	d := dupe(nineMatrix)
   260  	t.Logf("Input matrix is %s", formatMatrix(d))
   261  	transposePlain(d)
   262  	if !isTransposed(d) {
   263  		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
   264  	} else {
   265  		t.Logf("Transposed plain matrix = %s", formatMatrix(d))
   266  	}
   267  }
   268  
   269  func TestTranspose2Blocked(t *testing.T) {
   270  	d := dupe(nineMatrix)
   271  	t.Logf("Input matrix is %s", formatMatrix(d))
   272  	transpose2Blocked(d)
   273  	if !isTransposed(d) {
   274  		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
   275  	}
   276  }
   277  
   278  func TestTranspose3Blocked(t *testing.T) {
   279  	d := dupe(nineMatrix)
   280  	t.Logf("Input matrix is %s", formatMatrix(d))
   281  	transpose3Blocked(d)
   282  	if !isTransposed(d) {
   283  		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
   284  	}
   285  }
   286  
   287  func TestTranspose4Blocked(t *testing.T) {
   288  	d := dupe(nineMatrix)
   289  	t.Logf("Input matrix is %s", formatMatrix(d))
   290  	transpose4Blocked(d)
   291  	if !isTransposed(d) {
   292  		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
   293  	}
   294  }
   295  
   296  func TestTranspose5aBlocked(t *testing.T) {
   297  	d := dupe(nineMatrix)
   298  	t.Logf("Input matrix is %s", formatMatrix(d))
   299  	transpose5aBlocked(d)
   300  	if !isTransposed(d) {
   301  		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
   302  	}
   303  }
   304  
   305  func TestTranspose5bBlocked(t *testing.T) {
   306  	d := dupe(nineMatrix)
   307  	t.Logf("Input matrix is %s", formatMatrix(d))
   308  	transpose5bBlocked(d)
   309  	if !isTransposed(d) {
   310  		t.Errorf("d is not transposed, d = %s", formatMatrix(d))
   311  	}
   312  }
   313  
   314  func TestTransposeTiled4(t *testing.T) {
   315  	d := dupe(nineMatrix)
   316  	transposeTiled4(d)
   317  	if !isTransposed(d) {
   318  		t.Errorf("d is not transposed, d = %v", d)
   319  	}
   320  }
   321  
   322  func TestTransposeTiled8(t *testing.T) {
   323  	d := dupe(thirtyMatrix)
   324  	transposeTiled8(d)
   325  	if !isTransposed(d) {
   326  		t.Errorf("d is not transposed, d = %v", d)
   327  	}
   328  }
   329  
   330  func formatMatrix(m [][]int32) string {
   331  	s := ""
   332  	for _, mi := range m {
   333  		s += "\n["
   334  		for _, t := range mi {
   335  			h := t >> 16
   336  			l := t & 0xffff
   337  			s += fmt.Sprintf(" (%d %d)", h, l)
   338  		}
   339  		s += " ]"
   340  	}
   341  	return s
   342  }
   343  
   344  func transpose2Blocked(m [][]int32) {
   345  	const B = 2
   346  	N := len(m)
   347  	i := 0
   348  	for ; i <= len(m)-B; i += B {
   349  		r0, r1 := m[i], m[i+1]
   350  		if len(r0) < N || len(r1) < N {
   351  			panic("Early bounds check failure")
   352  		}
   353  		// transpose around diagonal
   354  		d01, d10 := r0[i+1], r1[i]
   355  		r0[i+1], r1[i] = d10, d01
   356  
   357  		// transpose across diagonal
   358  		j := 0
   359  		for ; j < i; j += B {
   360  			a0, a1 := m[j], m[j+1]
   361  
   362  			b00, b01 := a0[i], a0[i+1]
   363  			b10, b11 := a1[i], a1[i+1]
   364  
   365  			a0[i], a0[i+1] = r0[j], r1[j]
   366  			a1[i], a1[i+1] = r0[j+1], r1[j+1]
   367  
   368  			r0[j], r0[j+1] = b00, b10
   369  			r1[j], r1[j+1] = b01, b11
   370  		}
   371  	}
   372  
   373  	// Do the fringe
   374  	for ; i < len(m); i++ {
   375  		j := 0
   376  		r := m[i]
   377  		for ; j < i; j++ {
   378  			t := r[j]
   379  			r[j] = m[j][i]
   380  			m[j][i] = t
   381  		}
   382  	}
   383  }
   384  
   385  func transpose3Blocked(m [][]int32) {
   386  	const B = 3
   387  	N := len(m)
   388  	i := 0
   389  	for ; i <= len(m)-B; i += B {
   390  		r0, r1, r2 := m[i], m[i+1], m[i+2]
   391  		if len(r0) < N || len(r1) < N {
   392  			panic("Early bounds check failure")
   393  		}
   394  		// transpose around diagonal
   395  		d01, d10 := r0[i+1], r1[i]
   396  		d02, d20 := r0[i+2], r2[i]
   397  		d12, d21 := r1[i+2], r2[i+1]
   398  
   399  		r0[i+1], r1[i] = d10, d01
   400  		r0[i+2], r2[i] = d20, d02
   401  		r1[i+2], r2[i+1] = d21, d12
   402  
   403  		// transpose across diagonal
   404  		j := 0
   405  		for ; j < i; j += B {
   406  			a0, a1, a2 := m[j], m[j+1], m[j+2]
   407  
   408  			b00, b01, b02 := a0[i], a0[i+1], a0[i+2]
   409  			b10, b11, b12 := a1[i], a1[i+1], a1[i+2]
   410  			b20, b21, b22 := a2[i], a2[i+1], a2[i+2]
   411  
   412  			a0[i], a0[i+1], a0[i+2] = r0[j], r1[j], r2[j]
   413  			a1[i], a1[i+1], a1[i+2] = r0[j+1], r1[j+1], r2[j+1]
   414  			a2[i], a2[i+1], a2[i+2] = r0[j+2], r1[j+2], r2[j+2]
   415  
   416  			r0[j], r0[j+1], r0[j+2] = b00, b10, b20
   417  			r1[j], r1[j+1], r1[j+2] = b01, b11, b21
   418  			r2[j], r2[j+1], r2[j+2] = b02, b12, b22
   419  		}
   420  	}
   421  
   422  	// Do the fringe
   423  	for ; i < len(m); i++ {
   424  		j := 0
   425  		r := m[i]
   426  		for ; j < i; j++ {
   427  			t := r[j]
   428  			r[j] = m[j][i]
   429  			m[j][i] = t
   430  		}
   431  	}
   432  }
   433  
   434  func transpose4Blocked(m [][]int32) {
   435  	const B = 4
   436  	N := len(m)
   437  	i := 0
   438  	for ; i <= len(m)-B; i += B {
   439  		r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
   440  		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
   441  			panic("Early bounds check failure")
   442  		}
   443  		// transpose around diagonal
   444  		d01, d10 := r0[i+1], r1[i]
   445  		d02, d20 := r0[i+2], r2[i]
   446  		d03, d30 := r0[i+3], r3[i]
   447  		d12, d21 := r1[i+2], r2[i+1]
   448  		d13, d31 := r1[i+3], r3[i+1]
   449  		d23, d32 := r2[i+3], r3[i+2]
   450  
   451  		r0[i+1], r1[i] = d10, d01
   452  		r0[i+2], r2[i] = d20, d02
   453  		r0[i+3], r3[i] = d30, d03
   454  		r1[i+2], r2[i+1] = d21, d12
   455  		r1[i+3], r3[i+1] = d31, d13
   456  		r2[i+3], r3[i+2] = d32, d23
   457  
   458  		// transpose across diagonal
   459  		j := 0
   460  		for ; j < i; j += B {
   461  			a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
   462  
   463  			b00, b01, b02, b03 := a0[i], a0[i+1], a0[i+2], a0[i+3]
   464  			b10, b11, b12, b13 := a1[i], a1[i+1], a1[i+2], a1[i+3]
   465  			b20, b21, b22, b23 := a2[i], a2[i+1], a2[i+2], a2[i+3]
   466  			b30, b31, b32, b33 := a3[i], a3[i+1], a3[i+2], a3[i+3]
   467  
   468  			a0[i], a0[i+1], a0[i+2], a0[i+3] = r0[j], r1[j], r2[j], r3[j]
   469  			a1[i], a1[i+1], a1[i+2], a1[i+3] = r0[j+1], r1[j+1], r2[j+1], r3[j+1]
   470  			a2[i], a2[i+1], a2[i+2], a2[i+3] = r0[j+2], r1[j+2], r2[j+2], r3[j+2]
   471  			a3[i], a3[i+1], a3[i+2], a3[i+3] = r0[j+3], r1[j+3], r2[j+3], r3[j+3]
   472  
   473  			r0[j], r0[j+1], r0[j+2], r0[j+3] = b00, b10, b20, b30
   474  			r1[j], r1[j+1], r1[j+2], r1[j+3] = b01, b11, b21, b31
   475  			r2[j], r2[j+1], r2[j+2], r2[j+3] = b02, b12, b22, b32
   476  			r3[j], r3[j+1], r3[j+2], r3[j+3] = b03, b13, b23, b33
   477  		}
   478  	}
   479  
   480  	// Do the fringe
   481  	for ; i < len(m); i++ {
   482  		j := 0
   483  		r := m[i]
   484  		for ; j < i; j++ {
   485  			t := r[j]
   486  			r[j] = m[j][i]
   487  			m[j][i] = t
   488  		}
   489  	}
   490  }
   491  
   492  func transpose5aBlocked(m [][]int32) {
   493  	const B = 5
   494  	N := len(m)
   495  	i := 0
   496  	for ; i <= len(m)-B; i += B {
   497  		r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
   498  		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
   499  			panic("Early bounds check failure")
   500  		}
   501  		// transpose around diagonal
   502  		d01, d10 := r0[i+1], r1[i]
   503  		d02, d20 := r0[i+2], r2[i]
   504  		d03, d30 := r0[i+3], r3[i]
   505  		d04, d40 := r0[i+4], r4[i]
   506  
   507  		d12, d21 := r1[i+2], r2[i+1]
   508  		d13, d31 := r1[i+3], r3[i+1]
   509  		d14, d41 := r1[i+4], r4[i+1]
   510  
   511  		d23, d32 := r2[i+3], r3[i+2]
   512  		d24, d42 := r2[i+4], r4[i+2]
   513  
   514  		d34, d43 := r3[i+4], r4[i+3]
   515  
   516  		r0[i+1], r1[i] = d10, d01
   517  		r0[i+2], r2[i] = d20, d02
   518  		r0[i+3], r3[i] = d30, d03
   519  		r0[i+4], r4[i] = d40, d04
   520  
   521  		r1[i+2], r2[i+1] = d21, d12
   522  		r1[i+3], r3[i+1] = d31, d13
   523  		r1[i+4], r4[i+1] = d41, d14
   524  
   525  		r2[i+3], r3[i+2] = d32, d23
   526  		r2[i+4], r4[i+2] = d42, d24
   527  
   528  		r3[i+4], r4[i+3] = d43, d34
   529  
   530  		// transpose across diagonal
   531  		j := 0
   532  		for ; j < i; j += B {
   533  			a0, a1, a2, a3, a4 := m[j], m[j+1], m[j+2], m[j+3], m[j+4]
   534  
   535  			b00, b01, b02, b03, b04 := a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4]
   536  			b10, b11, b12, b13, b14 := a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4]
   537  			b20, b21, b22, b23, b24 := a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4]
   538  			b30, b31, b32, b33, b34 := a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4]
   539  			b40, b41, b42, b43, b44 := a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4]
   540  
   541  			a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4] = r0[j], r1[j], r2[j], r3[j], r4[j]
   542  			a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4] = r0[j+1], r1[j+1], r2[j+1], r3[j+1], r4[j+1]
   543  			a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4] = r0[j+2], r1[j+2], r2[j+2], r3[j+2], r4[j+2]
   544  			a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4] = r0[j+3], r1[j+3], r2[j+3], r3[j+3], r4[j+3]
   545  			a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4] = r0[j+4], r1[j+4], r2[j+4], r3[j+4], r4[j+4]
   546  
   547  			r0[j], r0[j+1], r0[j+2], r0[j+3], r0[j+4] = b00, b10, b20, b30, b40
   548  			r1[j], r1[j+1], r1[j+2], r1[j+3], r1[j+4] = b01, b11, b21, b31, b41
   549  			r2[j], r2[j+1], r2[j+2], r2[j+3], r2[j+4] = b02, b12, b22, b32, b42
   550  			r3[j], r3[j+1], r3[j+2], r3[j+3], r3[j+4] = b03, b13, b23, b33, b43
   551  			r4[j], r4[j+1], r4[j+2], r4[j+3], r4[j+4] = b04, b14, b24, b34, b44
   552  		}
   553  	}
   554  
   555  	// Do the fringe
   556  	for ; i < len(m); i++ {
   557  		j := 0
   558  		r := m[i]
   559  		for ; j < i; j++ {
   560  			t := r[j]
   561  			r[j] = m[j][i]
   562  			m[j][i] = t
   563  		}
   564  	}
   565  }
   566  
   567  // transpose5bBlocked is just like transpose5aBlocked
   568  // but rewritten to reduce register pressure in the
   569  // inner loop.
   570  func transpose5bBlocked(m [][]int32) {
   571  	const B = 5
   572  	N := len(m)
   573  	i := 0
   574  	for ; i <= len(m)-B; i += B {
   575  		r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
   576  		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
   577  			panic("Early bounds check failure")
   578  		}
   579  		// transpose around diagonal
   580  		d01, d10 := r0[i+1], r1[i]
   581  		d02, d20 := r0[i+2], r2[i]
   582  		d03, d30 := r0[i+3], r3[i]
   583  		d04, d40 := r0[i+4], r4[i]
   584  		r0[i+1], r1[i] = d10, d01
   585  		r0[i+2], r2[i] = d20, d02
   586  		r0[i+3], r3[i] = d30, d03
   587  		r0[i+4], r4[i] = d40, d04
   588  
   589  		d12, d21 := r1[i+2], r2[i+1]
   590  		d13, d31 := r1[i+3], r3[i+1]
   591  		d14, d41 := r1[i+4], r4[i+1]
   592  		r1[i+2], r2[i+1] = d21, d12
   593  		r1[i+3], r3[i+1] = d31, d13
   594  		r1[i+4], r4[i+1] = d41, d14
   595  
   596  		d23, d32 := r2[i+3], r3[i+2]
   597  		d24, d42 := r2[i+4], r4[i+2]
   598  		r2[i+3], r3[i+2] = d32, d23
   599  		r2[i+4], r4[i+2] = d42, d24
   600  
   601  		d34, d43 := r3[i+4], r4[i+3]
   602  		r3[i+4], r4[i+3] = d43, d34
   603  
   604  		// transpose across diagonal
   605  		j := 0
   606  		for ; j < i; j += B {
   607  			a4, a0, a1, a2, a3 := m[j+4], m[j], m[j+1], m[j+2], m[j+3]
   608  
   609  			// Process column i+4
   610  			temp0 := a0[i+4]
   611  			temp1 := a1[i+4]
   612  			temp2 := a2[i+4]
   613  			temp3 := a3[i+4]
   614  			temp4 := a4[i+4]
   615  
   616  			a4[i+4] = r4[j+4]
   617  			a0[i+4] = r4[j]
   618  			a1[i+4] = r4[j+1]
   619  			a2[i+4] = r4[j+2]
   620  			a3[i+4] = r4[j+3]
   621  
   622  			r0[j+4] = temp0
   623  			r1[j+4] = temp1
   624  			r2[j+4] = temp2
   625  			r3[j+4] = temp3
   626  			r4[j+4] = temp4
   627  
   628  			// Process column i
   629  			temp0 = a0[i]
   630  			temp1 = a1[i]
   631  			temp2 = a2[i]
   632  			temp3 = a3[i]
   633  			temp4 = a4[i]
   634  
   635  			a4[i] = r0[j+4]
   636  			a0[i] = r0[j]
   637  			a1[i] = r0[j+1]
   638  			a2[i] = r0[j+2]
   639  			a3[i] = r0[j+3]
   640  
   641  			r0[j] = temp0
   642  			r1[j] = temp1
   643  			r2[j] = temp2
   644  			r3[j] = temp3
   645  			r4[j] = temp4
   646  
   647  			// Process column i+1
   648  			temp0 = a0[i+1]
   649  			temp1 = a1[i+1]
   650  			temp2 = a2[i+1]
   651  			temp3 = a3[i+1]
   652  			temp4 = a4[i+1]
   653  
   654  			a4[i+1] = r1[j+4]
   655  			a0[i+1] = r1[j]
   656  			a1[i+1] = r1[j+1]
   657  			a2[i+1] = r1[j+2]
   658  			a3[i+1] = r1[j+3]
   659  
   660  			r0[j+1] = temp0
   661  			r1[j+1] = temp1
   662  			r2[j+1] = temp2
   663  			r3[j+1] = temp3
   664  			r4[j+1] = temp4
   665  
   666  			// Process column i+2
   667  			temp0 = a0[i+2]
   668  			temp1 = a1[i+2]
   669  			temp2 = a2[i+2]
   670  			temp3 = a3[i+2]
   671  			temp4 = a4[i+2]
   672  
   673  			a4[i+2] = r2[j+4]
   674  			a0[i+2] = r2[j]
   675  			a1[i+2] = r2[j+1]
   676  			a2[i+2] = r2[j+2]
   677  			a3[i+2] = r2[j+3]
   678  
   679  			r0[j+2] = temp0
   680  			r1[j+2] = temp1
   681  			r2[j+2] = temp2
   682  			r3[j+2] = temp3
   683  			r4[j+2] = temp4
   684  
   685  			// Process column i+3
   686  			temp0 = a0[i+3]
   687  			temp1 = a1[i+3]
   688  			temp2 = a2[i+3]
   689  			temp3 = a3[i+3]
   690  			temp4 = a4[i+3]
   691  
   692  			a4[i+3] = r3[j+4]
   693  			a0[i+3] = r3[j]
   694  			a1[i+3] = r3[j+1]
   695  			a2[i+3] = r3[j+2]
   696  			a3[i+3] = r3[j+3]
   697  
   698  			r0[j+3] = temp0
   699  			r1[j+3] = temp1
   700  			r2[j+3] = temp2
   701  			r3[j+3] = temp3
   702  			r4[j+3] = temp4
   703  		}
   704  	}
   705  
   706  	// Do the fringe
   707  	for ; i < len(m); i++ {
   708  		j := 0
   709  		r := m[i]
   710  		for ; j < i; j++ {
   711  			t := r[j]
   712  			r[j] = m[j][i]
   713  			m[j][i] = t
   714  		}
   715  	}
   716  }
   717  
   718  func transposeTiled4(m [][]int32) {
   719  	const B = 4
   720  	N := len(m)
   721  	i := 0
   722  	for ; i < len(m)-(B-1); i += B {
   723  		r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
   724  		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
   725  			panic("Early bounds check failure")
   726  		}
   727  		// transpose diagonal
   728  		d0, d1, d2, d3 :=
   729  			simd.LoadInt32x4Slice(r0[i:]),
   730  			simd.LoadInt32x4Slice(r1[i:]),
   731  			simd.LoadInt32x4Slice(r2[i:]),
   732  			simd.LoadInt32x4Slice(r3[i:])
   733  
   734  		d0, d1, d2, d3 = Transpose4(d0, d1, d2, d3)
   735  
   736  		d0.StoreSlice(r0[i:])
   737  		d1.StoreSlice(r1[i:])
   738  		d2.StoreSlice(r2[i:])
   739  		d3.StoreSlice(r3[i:])
   740  
   741  		// transpose across diagonal
   742  		j := 0
   743  		for ; j < i; j += B {
   744  			a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
   745  			u0, u1, u2, u3 :=
   746  				simd.LoadInt32x4Slice(a0[i:]),
   747  				simd.LoadInt32x4Slice(a1[i:]),
   748  				simd.LoadInt32x4Slice(a2[i:]),
   749  				simd.LoadInt32x4Slice(a3[i:])
   750  
   751  			u0, u1, u2, u3 = Transpose4(u0, u1, u2, u3)
   752  
   753  			l0 := simd.LoadInt32x4Slice(r0[j:])
   754  			u0.StoreSlice(r0[j:])
   755  			l1 := simd.LoadInt32x4Slice(r1[j:])
   756  			u1.StoreSlice(r1[j:])
   757  			l2 := simd.LoadInt32x4Slice(r2[j:])
   758  			u2.StoreSlice(r2[j:])
   759  			l3 := simd.LoadInt32x4Slice(r3[j:])
   760  			u3.StoreSlice(r3[j:])
   761  
   762  			u0, u1, u2, u3 = Transpose4(l0, l1, l2, l3)
   763  
   764  			u0.StoreSlice(a0[i:])
   765  			u1.StoreSlice(a1[i:])
   766  			u2.StoreSlice(a2[i:])
   767  			u3.StoreSlice(a3[i:])
   768  		}
   769  	}
   770  	// Do the fringe
   771  	for ; i < len(m); i++ {
   772  		j := 0
   773  		r := m[i]
   774  		for ; j < i; j++ {
   775  			t := r[j]
   776  			r[j] = m[j][i]
   777  			m[j][i] = t
   778  		}
   779  	}
   780  }
   781  
   782  func transposeTiled8(m [][]int32) {
   783  	const B = 8
   784  	N := len(m)
   785  	i := 0
   786  	for ; i < len(m)-(B-1); i += B {
   787  		r0, r1, r2, r3, r4, r5, r6, r7 := m[i], m[i+1], m[i+2], m[i+3], m[i+4], m[i+5], m[i+6], m[i+7]
   788  		if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N || len(r5) < N || len(r6) < N || len(r7) < N {
   789  			panic("Early bounds check failure")
   790  		}
   791  		// transpose diagonal
   792  		d0, d1, d2, d3, d4, d5, d6, d7 :=
   793  			simd.LoadInt32x8Slice(r0[i:]),
   794  			simd.LoadInt32x8Slice(r1[i:]),
   795  			simd.LoadInt32x8Slice(r2[i:]),
   796  			simd.LoadInt32x8Slice(r3[i:]),
   797  			simd.LoadInt32x8Slice(r4[i:]),
   798  			simd.LoadInt32x8Slice(r5[i:]),
   799  			simd.LoadInt32x8Slice(r6[i:]),
   800  			simd.LoadInt32x8Slice(r7[i:])
   801  
   802  		d0, d1, d2, d3, d4, d5, d6, d7 = Transpose8(d0, d1, d2, d3, d4, d5, d6, d7)
   803  
   804  		d0.StoreSlice(r0[i:])
   805  		d1.StoreSlice(r1[i:])
   806  		d2.StoreSlice(r2[i:])
   807  		d3.StoreSlice(r3[i:])
   808  		d4.StoreSlice(r4[i:])
   809  		d5.StoreSlice(r5[i:])
   810  		d6.StoreSlice(r6[i:])
   811  		d7.StoreSlice(r7[i:])
   812  
   813  		// transpose across diagonal
   814  		j := 0
   815  		for ; j < i; j += B {
   816  			a7, a0, a1, a2, a3, a4, a5, a6 := m[j+7], m[j], m[j+1], m[j+2], m[j+3], m[j+4], m[j+5], m[j+6]
   817  			u0, u1, u2, u3, u4, u5, u6, u7 :=
   818  				simd.LoadInt32x8Slice(a0[i:]),
   819  				simd.LoadInt32x8Slice(a1[i:]),
   820  				simd.LoadInt32x8Slice(a2[i:]),
   821  				simd.LoadInt32x8Slice(a3[i:]),
   822  				simd.LoadInt32x8Slice(a4[i:]),
   823  				simd.LoadInt32x8Slice(a5[i:]),
   824  				simd.LoadInt32x8Slice(a6[i:]),
   825  				simd.LoadInt32x8Slice(a7[i:])
   826  
   827  			u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(u0, u1, u2, u3, u4, u5, u6, u7)
   828  
   829  			l0 := simd.LoadInt32x8Slice(r0[j:])
   830  			u0.StoreSlice(r0[j:])
   831  			l1 := simd.LoadInt32x8Slice(r1[j:])
   832  			u1.StoreSlice(r1[j:])
   833  			l2 := simd.LoadInt32x8Slice(r2[j:])
   834  			u2.StoreSlice(r2[j:])
   835  			l3 := simd.LoadInt32x8Slice(r3[j:])
   836  			u3.StoreSlice(r3[j:])
   837  			l4 := simd.LoadInt32x8Slice(r4[j:])
   838  			u4.StoreSlice(r4[j:])
   839  			l5 := simd.LoadInt32x8Slice(r5[j:])
   840  			u5.StoreSlice(r5[j:])
   841  			l6 := simd.LoadInt32x8Slice(r6[j:])
   842  			u6.StoreSlice(r6[j:])
   843  			l7 := simd.LoadInt32x8Slice(r7[j:])
   844  			u7.StoreSlice(r7[j:])
   845  
   846  			u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(l0, l1, l2, l3, l4, l5, l6, l7)
   847  
   848  			u0.StoreSlice(a0[i:])
   849  			u1.StoreSlice(a1[i:])
   850  			u2.StoreSlice(a2[i:])
   851  			u3.StoreSlice(a3[i:])
   852  			u4.StoreSlice(a4[i:])
   853  			u5.StoreSlice(a5[i:])
   854  			u6.StoreSlice(a6[i:])
   855  			u7.StoreSlice(a7[i:])
   856  		}
   857  	}
   858  	// Do the fringe
   859  	for ; i < len(m); i++ {
   860  		j := 0
   861  		r := m[i]
   862  		for ; j < i; j++ {
   863  			t := r[j]
   864  			r[j] = m[j][i]
   865  			m[j][i] = t
   866  		}
   867  	}
   868  }
   869  

View as plain text