1
2
3
4
5
6
7 package simd_test
8
9 import (
10 "fmt"
11 "simd"
12 "testing"
13 )
14
15 func Transpose4(a0, a1, a2, a3 simd.Int32x4) (b0, b1, b2, b3 simd.Int32x4) {
16 t0, t1 := a0.InterleaveLo(a1), a0.InterleaveHi(a1)
17 t2, t3 := a2.InterleaveLo(a3), a2.InterleaveHi(a3)
18
19
20
21
22
23
24
25
26
27
28
29
30 b0 = t0.SelectFromPair(0, 1, 4, 5, t2)
31 b1 = t0.SelectFromPair(2, 3, 6, 7, t2)
32 b2 = t1.SelectFromPair(0, 1, 4, 5, t3)
33 b3 = t1.SelectFromPair(2, 3, 6, 7, t3)
34 return
35 }
36
37 func Transpose8(a0, a1, a2, a3, a4, a5, a6, a7 simd.Int32x8) (b0, b1, b2, b3, b4, b5, b6, b7 simd.Int32x8) {
38 t0, t1 := a0.InterleaveLoGrouped(a1), a0.InterleaveHiGrouped(a1)
39 t2, t3 := a2.InterleaveLoGrouped(a3), a2.InterleaveHiGrouped(a3)
40 t4, t5 := a4.InterleaveLoGrouped(a5), a4.InterleaveHiGrouped(a5)
41 t6, t7 := a6.InterleaveLoGrouped(a7), a6.InterleaveHiGrouped(a7)
42
43
44
45
46
47
48
49
50
51
52
53
54 a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2)
55 a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2)
56 a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3)
57 a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3)
58
59 a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6)
60 a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6)
61 a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7)
62 a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7)
63
64
65
66 b0 = a0.Select128FromPair(0, 2, a4)
67 b4 = a0.Select128FromPair(1, 3, a4)
68
69 b1 = a1.Select128FromPair(0, 2, a5)
70 b5 = a1.Select128FromPair(1, 3, a5)
71
72 b2 = a2.Select128FromPair(0, 2, a6)
73 b6 = a2.Select128FromPair(1, 3, a6)
74
75 b3 = a3.Select128FromPair(0, 2, a7)
76 b7 = a3.Select128FromPair(1, 3, a7)
77
78 return
79 }
80
81 func TestTranspose4(t *testing.T) {
82 r := make([]int32, 16, 16)
83
84 w := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
85 x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
86 y := simd.LoadInt32x4Slice([]int32{0xE, 0xF, 0x10, 0x11})
87 z := simd.LoadInt32x4Slice([]int32{5, 6, 7, 8})
88 a, b, c, d := Transpose4(w, x, y, z)
89
90 a.StoreSlice(r[0:])
91 b.StoreSlice(r[4:])
92 c.StoreSlice(r[8:])
93 d.StoreSlice(r[12:])
94
95 checkSlices[int32](t, r, []int32{
96 0xA, 1, 0xE, 5,
97 0xB, 2, 0xF, 6,
98 0xC, 3, 0x10, 7,
99 0xD, 4, 0x11, 8,
100 })
101
102 }
103
104 func TestTranspose8(t *testing.T) {
105 m := make([]int32, 8)
106
107 a := []int32{}
108 for i := int32(1); i <= 64; i++ {
109 a = append(a, i)
110 }
111
112 p := simd.LoadInt32x8Slice(a[0:])
113 q := simd.LoadInt32x8Slice(a[8:])
114 r := simd.LoadInt32x8Slice(a[16:])
115 s := simd.LoadInt32x8Slice(a[24:])
116
117 w := simd.LoadInt32x8Slice(a[32:])
118 x := simd.LoadInt32x8Slice(a[40:])
119 y := simd.LoadInt32x8Slice(a[48:])
120 z := simd.LoadInt32x8Slice(a[56:])
121
122 p, q, r, s, w, x, y, z = Transpose8(p, q, r, s, w, x, y, z)
123
124 foo := func(a simd.Int32x8, z int32) {
125 a.StoreSlice(m)
126 var o []int32
127 for i := int32(0); i < 8; i++ {
128 o = append(o, z+i*8)
129 }
130 checkSlices[int32](t, m, o)
131 }
132
133 foo(p, 1)
134 foo(q, 2)
135 foo(r, 3)
136 foo(s, 4)
137 foo(w, 5)
138 foo(x, 6)
139 foo(y, 7)
140 foo(z, 8)
141
142 }
143
144 const BIG = 20000
145
146 var bigMatrix [][]int32
147
148
149 var nineMatrix [][]int32
150
151 var thirtyMatrix [][]int32
152
153 func fill(m [][]int32) {
154 for i := range m {
155 m[i] = make([]int32, len(m))
156 for j := range m[i] {
157 m[i][j] = int32(-i<<16 + j)
158 }
159 }
160 }
161
162 func isTransposed(m [][]int32) bool {
163 for i, mi := range m {
164 for j, a := range mi {
165 if a != int32(-j<<16+i) {
166 return false
167 }
168 }
169 }
170 return true
171 }
172
173 func dupe(m [][]int32) [][]int32 {
174 n := len(m)
175 p := make([][]int32, n, n)
176 for i := range p {
177 t := make([]int32, n)
178 for j, a := range m[i] {
179 t[j] = a
180 }
181 p[i] = t
182 }
183 return p
184 }
185
186 func init() {
187 bigMatrix = make([][]int32, BIG, BIG)
188 fill(bigMatrix)
189 nineMatrix = make([][]int32, 9, 9)
190 fill(nineMatrix)
191 thirtyMatrix = make([][]int32, 30, 30)
192 fill(thirtyMatrix)
193 }
194
195 func BenchmarkPlainTranspose(b *testing.B) {
196 d := dupe(bigMatrix)
197 for b.Loop() {
198 transposePlain(d)
199 }
200 }
201
202 func BenchmarkTiled4Transpose(b *testing.B) {
203 d := dupe(bigMatrix)
204 for b.Loop() {
205 transposeTiled4(d)
206 }
207 }
208
209 func BenchmarkTiled8Transpose(b *testing.B) {
210 d := dupe(bigMatrix)
211 for b.Loop() {
212 transposeTiled8(d)
213 }
214 }
215
216 func Benchmark2BlockedTranspose(b *testing.B) {
217 d := dupe(bigMatrix)
218 for b.Loop() {
219 transpose2Blocked(d)
220 }
221 }
222 func Benchmark3BlockedTranspose(b *testing.B) {
223 d := dupe(bigMatrix)
224 for b.Loop() {
225 transpose3Blocked(d)
226 }
227 }
228 func Benchmark4BlockedTranspose(b *testing.B) {
229 d := dupe(bigMatrix)
230 for b.Loop() {
231 transpose4Blocked(d)
232 }
233 }
234 func Benchmark5aBlockedTranspose(b *testing.B) {
235 d := dupe(bigMatrix)
236 for b.Loop() {
237 transpose5aBlocked(d)
238 }
239 }
240
241 func Benchmark5bBlockedTranspose(b *testing.B) {
242 d := dupe(bigMatrix)
243 for b.Loop() {
244 transpose5bBlocked(d)
245 }
246 }
247
248 func transposePlain(m [][]int32) {
249 for i := range m {
250 for j := 0; j < i; j++ {
251 t := m[i][j]
252 m[i][j] = m[j][i]
253 m[j][i] = t
254 }
255 }
256 }
257
258 func TestTransposePlain(t *testing.T) {
259 d := dupe(nineMatrix)
260 t.Logf("Input matrix is %s", formatMatrix(d))
261 transposePlain(d)
262 if !isTransposed(d) {
263 t.Errorf("d is not transposed, d = %s", formatMatrix(d))
264 } else {
265 t.Logf("Transposed plain matrix = %s", formatMatrix(d))
266 }
267 }
268
269 func TestTranspose2Blocked(t *testing.T) {
270 d := dupe(nineMatrix)
271 t.Logf("Input matrix is %s", formatMatrix(d))
272 transpose2Blocked(d)
273 if !isTransposed(d) {
274 t.Errorf("d is not transposed, d = %s", formatMatrix(d))
275 }
276 }
277
278 func TestTranspose3Blocked(t *testing.T) {
279 d := dupe(nineMatrix)
280 t.Logf("Input matrix is %s", formatMatrix(d))
281 transpose3Blocked(d)
282 if !isTransposed(d) {
283 t.Errorf("d is not transposed, d = %s", formatMatrix(d))
284 }
285 }
286
287 func TestTranspose4Blocked(t *testing.T) {
288 d := dupe(nineMatrix)
289 t.Logf("Input matrix is %s", formatMatrix(d))
290 transpose4Blocked(d)
291 if !isTransposed(d) {
292 t.Errorf("d is not transposed, d = %s", formatMatrix(d))
293 }
294 }
295
296 func TestTranspose5aBlocked(t *testing.T) {
297 d := dupe(nineMatrix)
298 t.Logf("Input matrix is %s", formatMatrix(d))
299 transpose5aBlocked(d)
300 if !isTransposed(d) {
301 t.Errorf("d is not transposed, d = %s", formatMatrix(d))
302 }
303 }
304
305 func TestTranspose5bBlocked(t *testing.T) {
306 d := dupe(nineMatrix)
307 t.Logf("Input matrix is %s", formatMatrix(d))
308 transpose5bBlocked(d)
309 if !isTransposed(d) {
310 t.Errorf("d is not transposed, d = %s", formatMatrix(d))
311 }
312 }
313
314 func TestTransposeTiled4(t *testing.T) {
315 d := dupe(nineMatrix)
316 transposeTiled4(d)
317 if !isTransposed(d) {
318 t.Errorf("d is not transposed, d = %v", d)
319 }
320 }
321
322 func TestTransposeTiled8(t *testing.T) {
323 d := dupe(thirtyMatrix)
324 transposeTiled8(d)
325 if !isTransposed(d) {
326 t.Errorf("d is not transposed, d = %v", d)
327 }
328 }
329
330 func formatMatrix(m [][]int32) string {
331 s := ""
332 for _, mi := range m {
333 s += "\n["
334 for _, t := range mi {
335 h := t >> 16
336 l := t & 0xffff
337 s += fmt.Sprintf(" (%d %d)", h, l)
338 }
339 s += " ]"
340 }
341 return s
342 }
343
344 func transpose2Blocked(m [][]int32) {
345 const B = 2
346 N := len(m)
347 i := 0
348 for ; i <= len(m)-B; i += B {
349 r0, r1 := m[i], m[i+1]
350 if len(r0) < N || len(r1) < N {
351 panic("Early bounds check failure")
352 }
353
354 d01, d10 := r0[i+1], r1[i]
355 r0[i+1], r1[i] = d10, d01
356
357
358 j := 0
359 for ; j < i; j += B {
360 a0, a1 := m[j], m[j+1]
361
362 b00, b01 := a0[i], a0[i+1]
363 b10, b11 := a1[i], a1[i+1]
364
365 a0[i], a0[i+1] = r0[j], r1[j]
366 a1[i], a1[i+1] = r0[j+1], r1[j+1]
367
368 r0[j], r0[j+1] = b00, b10
369 r1[j], r1[j+1] = b01, b11
370 }
371 }
372
373
374 for ; i < len(m); i++ {
375 j := 0
376 r := m[i]
377 for ; j < i; j++ {
378 t := r[j]
379 r[j] = m[j][i]
380 m[j][i] = t
381 }
382 }
383 }
384
385 func transpose3Blocked(m [][]int32) {
386 const B = 3
387 N := len(m)
388 i := 0
389 for ; i <= len(m)-B; i += B {
390 r0, r1, r2 := m[i], m[i+1], m[i+2]
391 if len(r0) < N || len(r1) < N {
392 panic("Early bounds check failure")
393 }
394
395 d01, d10 := r0[i+1], r1[i]
396 d02, d20 := r0[i+2], r2[i]
397 d12, d21 := r1[i+2], r2[i+1]
398
399 r0[i+1], r1[i] = d10, d01
400 r0[i+2], r2[i] = d20, d02
401 r1[i+2], r2[i+1] = d21, d12
402
403
404 j := 0
405 for ; j < i; j += B {
406 a0, a1, a2 := m[j], m[j+1], m[j+2]
407
408 b00, b01, b02 := a0[i], a0[i+1], a0[i+2]
409 b10, b11, b12 := a1[i], a1[i+1], a1[i+2]
410 b20, b21, b22 := a2[i], a2[i+1], a2[i+2]
411
412 a0[i], a0[i+1], a0[i+2] = r0[j], r1[j], r2[j]
413 a1[i], a1[i+1], a1[i+2] = r0[j+1], r1[j+1], r2[j+1]
414 a2[i], a2[i+1], a2[i+2] = r0[j+2], r1[j+2], r2[j+2]
415
416 r0[j], r0[j+1], r0[j+2] = b00, b10, b20
417 r1[j], r1[j+1], r1[j+2] = b01, b11, b21
418 r2[j], r2[j+1], r2[j+2] = b02, b12, b22
419 }
420 }
421
422
423 for ; i < len(m); i++ {
424 j := 0
425 r := m[i]
426 for ; j < i; j++ {
427 t := r[j]
428 r[j] = m[j][i]
429 m[j][i] = t
430 }
431 }
432 }
433
434 func transpose4Blocked(m [][]int32) {
435 const B = 4
436 N := len(m)
437 i := 0
438 for ; i <= len(m)-B; i += B {
439 r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
440 if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
441 panic("Early bounds check failure")
442 }
443
444 d01, d10 := r0[i+1], r1[i]
445 d02, d20 := r0[i+2], r2[i]
446 d03, d30 := r0[i+3], r3[i]
447 d12, d21 := r1[i+2], r2[i+1]
448 d13, d31 := r1[i+3], r3[i+1]
449 d23, d32 := r2[i+3], r3[i+2]
450
451 r0[i+1], r1[i] = d10, d01
452 r0[i+2], r2[i] = d20, d02
453 r0[i+3], r3[i] = d30, d03
454 r1[i+2], r2[i+1] = d21, d12
455 r1[i+3], r3[i+1] = d31, d13
456 r2[i+3], r3[i+2] = d32, d23
457
458
459 j := 0
460 for ; j < i; j += B {
461 a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
462
463 b00, b01, b02, b03 := a0[i], a0[i+1], a0[i+2], a0[i+3]
464 b10, b11, b12, b13 := a1[i], a1[i+1], a1[i+2], a1[i+3]
465 b20, b21, b22, b23 := a2[i], a2[i+1], a2[i+2], a2[i+3]
466 b30, b31, b32, b33 := a3[i], a3[i+1], a3[i+2], a3[i+3]
467
468 a0[i], a0[i+1], a0[i+2], a0[i+3] = r0[j], r1[j], r2[j], r3[j]
469 a1[i], a1[i+1], a1[i+2], a1[i+3] = r0[j+1], r1[j+1], r2[j+1], r3[j+1]
470 a2[i], a2[i+1], a2[i+2], a2[i+3] = r0[j+2], r1[j+2], r2[j+2], r3[j+2]
471 a3[i], a3[i+1], a3[i+2], a3[i+3] = r0[j+3], r1[j+3], r2[j+3], r3[j+3]
472
473 r0[j], r0[j+1], r0[j+2], r0[j+3] = b00, b10, b20, b30
474 r1[j], r1[j+1], r1[j+2], r1[j+3] = b01, b11, b21, b31
475 r2[j], r2[j+1], r2[j+2], r2[j+3] = b02, b12, b22, b32
476 r3[j], r3[j+1], r3[j+2], r3[j+3] = b03, b13, b23, b33
477 }
478 }
479
480
481 for ; i < len(m); i++ {
482 j := 0
483 r := m[i]
484 for ; j < i; j++ {
485 t := r[j]
486 r[j] = m[j][i]
487 m[j][i] = t
488 }
489 }
490 }
491
492 func transpose5aBlocked(m [][]int32) {
493 const B = 5
494 N := len(m)
495 i := 0
496 for ; i <= len(m)-B; i += B {
497 r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
498 if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
499 panic("Early bounds check failure")
500 }
501
502 d01, d10 := r0[i+1], r1[i]
503 d02, d20 := r0[i+2], r2[i]
504 d03, d30 := r0[i+3], r3[i]
505 d04, d40 := r0[i+4], r4[i]
506
507 d12, d21 := r1[i+2], r2[i+1]
508 d13, d31 := r1[i+3], r3[i+1]
509 d14, d41 := r1[i+4], r4[i+1]
510
511 d23, d32 := r2[i+3], r3[i+2]
512 d24, d42 := r2[i+4], r4[i+2]
513
514 d34, d43 := r3[i+4], r4[i+3]
515
516 r0[i+1], r1[i] = d10, d01
517 r0[i+2], r2[i] = d20, d02
518 r0[i+3], r3[i] = d30, d03
519 r0[i+4], r4[i] = d40, d04
520
521 r1[i+2], r2[i+1] = d21, d12
522 r1[i+3], r3[i+1] = d31, d13
523 r1[i+4], r4[i+1] = d41, d14
524
525 r2[i+3], r3[i+2] = d32, d23
526 r2[i+4], r4[i+2] = d42, d24
527
528 r3[i+4], r4[i+3] = d43, d34
529
530
531 j := 0
532 for ; j < i; j += B {
533 a0, a1, a2, a3, a4 := m[j], m[j+1], m[j+2], m[j+3], m[j+4]
534
535 b00, b01, b02, b03, b04 := a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4]
536 b10, b11, b12, b13, b14 := a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4]
537 b20, b21, b22, b23, b24 := a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4]
538 b30, b31, b32, b33, b34 := a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4]
539 b40, b41, b42, b43, b44 := a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4]
540
541 a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4] = r0[j], r1[j], r2[j], r3[j], r4[j]
542 a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4] = r0[j+1], r1[j+1], r2[j+1], r3[j+1], r4[j+1]
543 a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4] = r0[j+2], r1[j+2], r2[j+2], r3[j+2], r4[j+2]
544 a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4] = r0[j+3], r1[j+3], r2[j+3], r3[j+3], r4[j+3]
545 a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4] = r0[j+4], r1[j+4], r2[j+4], r3[j+4], r4[j+4]
546
547 r0[j], r0[j+1], r0[j+2], r0[j+3], r0[j+4] = b00, b10, b20, b30, b40
548 r1[j], r1[j+1], r1[j+2], r1[j+3], r1[j+4] = b01, b11, b21, b31, b41
549 r2[j], r2[j+1], r2[j+2], r2[j+3], r2[j+4] = b02, b12, b22, b32, b42
550 r3[j], r3[j+1], r3[j+2], r3[j+3], r3[j+4] = b03, b13, b23, b33, b43
551 r4[j], r4[j+1], r4[j+2], r4[j+3], r4[j+4] = b04, b14, b24, b34, b44
552 }
553 }
554
555
556 for ; i < len(m); i++ {
557 j := 0
558 r := m[i]
559 for ; j < i; j++ {
560 t := r[j]
561 r[j] = m[j][i]
562 m[j][i] = t
563 }
564 }
565 }
566
567
568
569
570 func transpose5bBlocked(m [][]int32) {
571 const B = 5
572 N := len(m)
573 i := 0
574 for ; i <= len(m)-B; i += B {
575 r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
576 if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
577 panic("Early bounds check failure")
578 }
579
580 d01, d10 := r0[i+1], r1[i]
581 d02, d20 := r0[i+2], r2[i]
582 d03, d30 := r0[i+3], r3[i]
583 d04, d40 := r0[i+4], r4[i]
584 r0[i+1], r1[i] = d10, d01
585 r0[i+2], r2[i] = d20, d02
586 r0[i+3], r3[i] = d30, d03
587 r0[i+4], r4[i] = d40, d04
588
589 d12, d21 := r1[i+2], r2[i+1]
590 d13, d31 := r1[i+3], r3[i+1]
591 d14, d41 := r1[i+4], r4[i+1]
592 r1[i+2], r2[i+1] = d21, d12
593 r1[i+3], r3[i+1] = d31, d13
594 r1[i+4], r4[i+1] = d41, d14
595
596 d23, d32 := r2[i+3], r3[i+2]
597 d24, d42 := r2[i+4], r4[i+2]
598 r2[i+3], r3[i+2] = d32, d23
599 r2[i+4], r4[i+2] = d42, d24
600
601 d34, d43 := r3[i+4], r4[i+3]
602 r3[i+4], r4[i+3] = d43, d34
603
604
605 j := 0
606 for ; j < i; j += B {
607 a4, a0, a1, a2, a3 := m[j+4], m[j], m[j+1], m[j+2], m[j+3]
608
609
610 temp0 := a0[i+4]
611 temp1 := a1[i+4]
612 temp2 := a2[i+4]
613 temp3 := a3[i+4]
614 temp4 := a4[i+4]
615
616 a4[i+4] = r4[j+4]
617 a0[i+4] = r4[j]
618 a1[i+4] = r4[j+1]
619 a2[i+4] = r4[j+2]
620 a3[i+4] = r4[j+3]
621
622 r0[j+4] = temp0
623 r1[j+4] = temp1
624 r2[j+4] = temp2
625 r3[j+4] = temp3
626 r4[j+4] = temp4
627
628
629 temp0 = a0[i]
630 temp1 = a1[i]
631 temp2 = a2[i]
632 temp3 = a3[i]
633 temp4 = a4[i]
634
635 a4[i] = r0[j+4]
636 a0[i] = r0[j]
637 a1[i] = r0[j+1]
638 a2[i] = r0[j+2]
639 a3[i] = r0[j+3]
640
641 r0[j] = temp0
642 r1[j] = temp1
643 r2[j] = temp2
644 r3[j] = temp3
645 r4[j] = temp4
646
647
648 temp0 = a0[i+1]
649 temp1 = a1[i+1]
650 temp2 = a2[i+1]
651 temp3 = a3[i+1]
652 temp4 = a4[i+1]
653
654 a4[i+1] = r1[j+4]
655 a0[i+1] = r1[j]
656 a1[i+1] = r1[j+1]
657 a2[i+1] = r1[j+2]
658 a3[i+1] = r1[j+3]
659
660 r0[j+1] = temp0
661 r1[j+1] = temp1
662 r2[j+1] = temp2
663 r3[j+1] = temp3
664 r4[j+1] = temp4
665
666
667 temp0 = a0[i+2]
668 temp1 = a1[i+2]
669 temp2 = a2[i+2]
670 temp3 = a3[i+2]
671 temp4 = a4[i+2]
672
673 a4[i+2] = r2[j+4]
674 a0[i+2] = r2[j]
675 a1[i+2] = r2[j+1]
676 a2[i+2] = r2[j+2]
677 a3[i+2] = r2[j+3]
678
679 r0[j+2] = temp0
680 r1[j+2] = temp1
681 r2[j+2] = temp2
682 r3[j+2] = temp3
683 r4[j+2] = temp4
684
685
686 temp0 = a0[i+3]
687 temp1 = a1[i+3]
688 temp2 = a2[i+3]
689 temp3 = a3[i+3]
690 temp4 = a4[i+3]
691
692 a4[i+3] = r3[j+4]
693 a0[i+3] = r3[j]
694 a1[i+3] = r3[j+1]
695 a2[i+3] = r3[j+2]
696 a3[i+3] = r3[j+3]
697
698 r0[j+3] = temp0
699 r1[j+3] = temp1
700 r2[j+3] = temp2
701 r3[j+3] = temp3
702 r4[j+3] = temp4
703 }
704 }
705
706
707 for ; i < len(m); i++ {
708 j := 0
709 r := m[i]
710 for ; j < i; j++ {
711 t := r[j]
712 r[j] = m[j][i]
713 m[j][i] = t
714 }
715 }
716 }
717
718 func transposeTiled4(m [][]int32) {
719 const B = 4
720 N := len(m)
721 i := 0
722 for ; i < len(m)-(B-1); i += B {
723 r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
724 if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
725 panic("Early bounds check failure")
726 }
727
728 d0, d1, d2, d3 :=
729 simd.LoadInt32x4Slice(r0[i:]),
730 simd.LoadInt32x4Slice(r1[i:]),
731 simd.LoadInt32x4Slice(r2[i:]),
732 simd.LoadInt32x4Slice(r3[i:])
733
734 d0, d1, d2, d3 = Transpose4(d0, d1, d2, d3)
735
736 d0.StoreSlice(r0[i:])
737 d1.StoreSlice(r1[i:])
738 d2.StoreSlice(r2[i:])
739 d3.StoreSlice(r3[i:])
740
741
742 j := 0
743 for ; j < i; j += B {
744 a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
745 u0, u1, u2, u3 :=
746 simd.LoadInt32x4Slice(a0[i:]),
747 simd.LoadInt32x4Slice(a1[i:]),
748 simd.LoadInt32x4Slice(a2[i:]),
749 simd.LoadInt32x4Slice(a3[i:])
750
751 u0, u1, u2, u3 = Transpose4(u0, u1, u2, u3)
752
753 l0 := simd.LoadInt32x4Slice(r0[j:])
754 u0.StoreSlice(r0[j:])
755 l1 := simd.LoadInt32x4Slice(r1[j:])
756 u1.StoreSlice(r1[j:])
757 l2 := simd.LoadInt32x4Slice(r2[j:])
758 u2.StoreSlice(r2[j:])
759 l3 := simd.LoadInt32x4Slice(r3[j:])
760 u3.StoreSlice(r3[j:])
761
762 u0, u1, u2, u3 = Transpose4(l0, l1, l2, l3)
763
764 u0.StoreSlice(a0[i:])
765 u1.StoreSlice(a1[i:])
766 u2.StoreSlice(a2[i:])
767 u3.StoreSlice(a3[i:])
768 }
769 }
770
771 for ; i < len(m); i++ {
772 j := 0
773 r := m[i]
774 for ; j < i; j++ {
775 t := r[j]
776 r[j] = m[j][i]
777 m[j][i] = t
778 }
779 }
780 }
781
782 func transposeTiled8(m [][]int32) {
783 const B = 8
784 N := len(m)
785 i := 0
786 for ; i < len(m)-(B-1); i += B {
787 r0, r1, r2, r3, r4, r5, r6, r7 := m[i], m[i+1], m[i+2], m[i+3], m[i+4], m[i+5], m[i+6], m[i+7]
788 if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N || len(r5) < N || len(r6) < N || len(r7) < N {
789 panic("Early bounds check failure")
790 }
791
792 d0, d1, d2, d3, d4, d5, d6, d7 :=
793 simd.LoadInt32x8Slice(r0[i:]),
794 simd.LoadInt32x8Slice(r1[i:]),
795 simd.LoadInt32x8Slice(r2[i:]),
796 simd.LoadInt32x8Slice(r3[i:]),
797 simd.LoadInt32x8Slice(r4[i:]),
798 simd.LoadInt32x8Slice(r5[i:]),
799 simd.LoadInt32x8Slice(r6[i:]),
800 simd.LoadInt32x8Slice(r7[i:])
801
802 d0, d1, d2, d3, d4, d5, d6, d7 = Transpose8(d0, d1, d2, d3, d4, d5, d6, d7)
803
804 d0.StoreSlice(r0[i:])
805 d1.StoreSlice(r1[i:])
806 d2.StoreSlice(r2[i:])
807 d3.StoreSlice(r3[i:])
808 d4.StoreSlice(r4[i:])
809 d5.StoreSlice(r5[i:])
810 d6.StoreSlice(r6[i:])
811 d7.StoreSlice(r7[i:])
812
813
814 j := 0
815 for ; j < i; j += B {
816 a7, a0, a1, a2, a3, a4, a5, a6 := m[j+7], m[j], m[j+1], m[j+2], m[j+3], m[j+4], m[j+5], m[j+6]
817 u0, u1, u2, u3, u4, u5, u6, u7 :=
818 simd.LoadInt32x8Slice(a0[i:]),
819 simd.LoadInt32x8Slice(a1[i:]),
820 simd.LoadInt32x8Slice(a2[i:]),
821 simd.LoadInt32x8Slice(a3[i:]),
822 simd.LoadInt32x8Slice(a4[i:]),
823 simd.LoadInt32x8Slice(a5[i:]),
824 simd.LoadInt32x8Slice(a6[i:]),
825 simd.LoadInt32x8Slice(a7[i:])
826
827 u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(u0, u1, u2, u3, u4, u5, u6, u7)
828
829 l0 := simd.LoadInt32x8Slice(r0[j:])
830 u0.StoreSlice(r0[j:])
831 l1 := simd.LoadInt32x8Slice(r1[j:])
832 u1.StoreSlice(r1[j:])
833 l2 := simd.LoadInt32x8Slice(r2[j:])
834 u2.StoreSlice(r2[j:])
835 l3 := simd.LoadInt32x8Slice(r3[j:])
836 u3.StoreSlice(r3[j:])
837 l4 := simd.LoadInt32x8Slice(r4[j:])
838 u4.StoreSlice(r4[j:])
839 l5 := simd.LoadInt32x8Slice(r5[j:])
840 u5.StoreSlice(r5[j:])
841 l6 := simd.LoadInt32x8Slice(r6[j:])
842 u6.StoreSlice(r6[j:])
843 l7 := simd.LoadInt32x8Slice(r7[j:])
844 u7.StoreSlice(r7[j:])
845
846 u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(l0, l1, l2, l3, l4, l5, l6, l7)
847
848 u0.StoreSlice(a0[i:])
849 u1.StoreSlice(a1[i:])
850 u2.StoreSlice(a2[i:])
851 u3.StoreSlice(a3[i:])
852 u4.StoreSlice(a4[i:])
853 u5.StoreSlice(a5[i:])
854 u6.StoreSlice(a6[i:])
855 u7.StoreSlice(a7[i:])
856 }
857 }
858
859 for ; i < len(m); i++ {
860 j := 0
861 r := m[i]
862 for ; j < i; j++ {
863 t := r[j]
864 r[j] = m[j][i]
865 m[j][i] = t
866 }
867 }
868 }
869
View as plain text