Source file
src/simd/simd_emulated.go
1
2
3
4
5
6
7 package simd
8
9 import (
10 "fmt"
11 "math"
12 "math/bits"
13 )
14
15
// VectorBitSize reports the width of a vector in bits. This implementation
// always uses 128-bit vectors.
func VectorBitSize() int {
	const vectorBits = 128
	return vectorBits
}
19
20
// Emulated reports whether the vector operations are emulated in software.
// This implementation always reports true.
func Emulated() bool {
	const emulated = true
	return emulated
}
24
25
26
27
28
29
30
// HasHardwareCarrylessMultiply reports whether a hardware carry-less multiply
// is available. This implementation always reports false.
func HasHardwareCarrylessMultiply() bool {
	const available = false
	return available
}
34
// _simd is a marker type placed as a blank field in the vector types of this
// file. Its only field is a zero-length array of a func type: it occupies no
// storage, and because func values are not comparable in Go, a struct that
// contains _simd cannot be compared with == or used as a map key.
// NOTE(review): presumably that is the intent of the marker; confirm.
type _simd struct {
	_ [0]func(*_simd) *_simd
}
38
39
40 type Int8s struct {
41 _ _simd
42 a, b uint64
43 }
44
45
46 func LoadInt8s(s []int8) Int8s {
47 var a, b uint64
48 for i := 0; i < 16; i++ {
49 val := uint64(uint8(s[i]))
50 if i < 8 {
51 a |= val << (8 * i)
52 } else {
53 b |= val << (8 * (i - 8))
54 }
55 }
56 return Int8s{a: a, b: b}
57 }
58
59
60 func LoadInt8sPart(s []int8) (Int8s, int) {
61 var a, b uint64
62 n := len(s)
63 if n > 16 {
64 n = 16
65 }
66 for i := 0; i < n; i++ {
67 val := uint64(uint8(s[i]))
68 if i < 8 {
69 a |= val << (8 * i)
70 } else {
71 b |= val << (8 * (i - 8))
72 }
73 }
74 return Int8s{a: a, b: b}, n
75 }
76
77 func (x Int8s) get(i int) int8 {
78 if i < 8 {
79 return int8(x.a >> (8 * i))
80 }
81 return int8(x.b >> (8 * (i - 8)))
82 }
83
84 func (x *Int8s) set(i int, v int8) {
85 val := uint64(uint8(v))
86 if i < 8 {
87 mask := uint64(0xff) << (8 * i)
88 x.a = (x.a &^ mask) | (val << (8 * i))
89 } else {
90 mask := uint64(0xff) << (8 * (i - 8))
91 x.b = (x.b &^ mask) | (val << (8 * (i - 8)))
92 }
93 }
94
95
96 func (x Int8s) Abs() Int8s {
97 var res Int8s
98 for i := 0; i < 16; i++ {
99 v := x.get(i)
100 if v < 0 {
101 res.set(i, -v)
102 } else {
103 res.set(i, v)
104 }
105 }
106 return res
107 }
108
109
110 func (x Int8s) Add(y Int8s) Int8s {
111 var res Int8s
112 for i := 0; i < 16; i++ {
113 res.set(i, x.get(i)+y.get(i))
114 }
115 return res
116 }
117
118
119 func (x Int8s) AddSaturated(y Int8s) Int8s {
120 var res Int8s
121 for i := 0; i < 16; i++ {
122 sum := int(x.get(i)) + int(y.get(i))
123 if sum > math.MaxInt8 {
124 res.set(i, math.MaxInt8)
125 } else if sum < math.MinInt8 {
126 res.set(i, math.MinInt8)
127 } else {
128 res.set(i, int8(sum))
129 }
130 }
131 return res
132 }
133
134
135 func (x Int8s) And(y Int8s) Int8s {
136 return Int8s{a: x.a & y.a, b: x.b & y.b}
137 }
138
139
140 func (x Int8s) AndNot(y Int8s) Int8s {
141 return Int8s{a: x.a &^ y.a, b: x.b &^ y.b}
142 }
143
144
145 func (x Int8s) Equal(y Int8s) Mask8s {
146 var res Mask8s
147 for i := 0; i < 16; i++ {
148 if x.get(i) == y.get(i) {
149 res.set(i, true)
150 }
151 }
152 return res
153 }
154
155
156 func (x Int8s) Greater(y Int8s) Mask8s {
157 var res Mask8s
158 for i := 0; i < 16; i++ {
159 if x.get(i) > y.get(i) {
160 res.set(i, true)
161 }
162 }
163 return res
164 }
165
166
167 func (x Int8s) GreaterEqual(y Int8s) Mask8s {
168 var res Mask8s
169 for i := 0; i < 16; i++ {
170 if x.get(i) >= y.get(i) {
171 res.set(i, true)
172 }
173 }
174 return res
175 }
176
177
178 func (x Int8s) Less(y Int8s) Mask8s {
179 var res Mask8s
180 for i := 0; i < 16; i++ {
181 if x.get(i) < y.get(i) {
182 res.set(i, true)
183 }
184 }
185 return res
186 }
187
188
189 func (x Int8s) LessEqual(y Int8s) Mask8s {
190 var res Mask8s
191 for i := 0; i < 16; i++ {
192 if x.get(i) <= y.get(i) {
193 res.set(i, true)
194 }
195 }
196 return res
197 }
198
199
200 func (x Int8s) NotEqual(y Int8s) Mask8s {
201 var res Mask8s
202 for i := 0; i < 16; i++ {
203 if x.get(i) != y.get(i) {
204 res.set(i, true)
205 }
206 }
207 return res
208 }
209
210
211 func (x Int8s) Len() int {
212 return 16
213 }
214
215
216 func (x Int8s) Masked(mask Mask8s) Int8s {
217 return Int8s{a: x.a & mask.a, b: x.b & mask.b}
218 }
219
220
221 func (x Int8s) Max(y Int8s) Int8s {
222 var res Int8s
223 for i := 0; i < 16; i++ {
224 vx := x.get(i)
225 vy := y.get(i)
226 if vx > vy {
227 res.set(i, vx)
228 } else {
229 res.set(i, vy)
230 }
231 }
232 return res
233 }
234
235
236 func (x Int8s) Mul(y Int8s) Int8s {
237 var res Int8s
238 for i := 0; i < 16; i++ {
239 res.set(i, x.get(i)*y.get(i))
240 }
241 return res
242 }
243
244
245 func (x Int8s) IfElse(mask Mask8s, y Int8s) Int8s {
246 return Int8s{
247 a: (x.a & mask.a) | (y.a &^ mask.a),
248 b: (x.b & mask.b) | (y.b &^ mask.b),
249 }
250 }
251
252
253 func (x Int8s) Min(y Int8s) Int8s {
254 var res Int8s
255 for i := 0; i < 16; i++ {
256 vx := x.get(i)
257 vy := y.get(i)
258 if vx < vy {
259 res.set(i, vx)
260 } else {
261 res.set(i, vy)
262 }
263 }
264 return res
265 }
266
267
268 func (x Int8s) Neg() Int8s {
269 var res Int8s
270 for i := 0; i < 16; i++ {
271 res.set(i, -x.get(i))
272 }
273 return res
274 }
275
276
277 func (x Int8s) Not() Int8s {
278 return Int8s{a: ^x.a, b: ^x.b}
279 }
280
281
282 func (x Int8s) Or(y Int8s) Int8s {
283 return Int8s{a: x.a | y.a, b: x.b | y.b}
284 }
285
286
287 func (x Int8s) Store(s []int8) {
288 for i := 0; i < 16 && i < len(s); i++ {
289 s[i] = x.get(i)
290 }
291 }
292
293
294 func (x Int8s) StorePart(s []int8) int {
295 x.Store(s)
296 return min(len(s), x.Len())
297 }
298
299
300 func (x Int8s) String() string {
301 var parts [16]int8
302 for i := 0; i < 16; i++ {
303 parts[i] = x.get(i)
304 }
305 return fmt.Sprint(parts)
306 }
307
308
309 func (x Int8s) Sub(y Int8s) Int8s {
310 var res Int8s
311 for i := 0; i < 16; i++ {
312 res.set(i, x.get(i)-y.get(i))
313 }
314 return res
315 }
316
317
318 func (x Int8s) SubSaturated(y Int8s) Int8s {
319 var res Int8s
320 for i := 0; i < 16; i++ {
321 diff := int(x.get(i)) - int(y.get(i))
322 if diff > math.MaxInt8 {
323 res.set(i, math.MaxInt8)
324 } else if diff < math.MinInt8 {
325 res.set(i, math.MinInt8)
326 } else {
327 res.set(i, int8(diff))
328 }
329 }
330 return res
331 }
332
333
334 func (x Int8s) ToMask() Mask8s {
335 var res Mask8s
336 for i := 0; i < 16; i++ {
337 if x.get(i) != 0 {
338 res.set(i, true)
339 }
340 }
341 return res
342 }
343
344
345 func (x Int8s) Xor(y Int8s) Int8s {
346 return Int8s{a: x.a ^ y.a, b: x.b ^ y.b}
347 }
348
349
350 func (x Int8s) ConvertToUint8() Uint8s {
351 return Uint8s{a: x.a, b: x.b}
352 }
353
354
355 func (x Int8s) ToBits() Uint8s {
356 return Uint8s{a: x.a, b: x.b}
357 }
358
359
360 type Int16s struct {
361 _ _simd
362 a, b uint64
363 }
364
365
366 func LoadInt16s(s []int16) Int16s {
367 var a, b uint64
368 for i := 0; i < 8; i++ {
369 val := uint64(uint16(s[i]))
370 if i < 4 {
371 a |= val << (16 * i)
372 } else {
373 b |= val << (16 * (i - 4))
374 }
375 }
376 return Int16s{a: a, b: b}
377 }
378
379
380 func LoadInt16sPart(s []int16) (Int16s, int) {
381 var a, b uint64
382 n := len(s)
383 if n > 8 {
384 n = 8
385 }
386 for i := 0; i < n; i++ {
387 val := uint64(uint16(s[i]))
388 if i < 4 {
389 a |= val << (16 * i)
390 } else {
391 b |= val << (16 * (i - 4))
392 }
393 }
394 return Int16s{a: a, b: b}, n
395 }
396
397 func (x Int16s) get(i int) int16 {
398 if i < 4 {
399 return int16(x.a >> (16 * i))
400 }
401 return int16(x.b >> (16 * (i - 4)))
402 }
403
404 func (x *Int16s) set(i int, v int16) {
405 val := uint64(uint16(v))
406 if i < 4 {
407 mask := uint64(0xffff) << (16 * i)
408 x.a = (x.a &^ mask) | (val << (16 * i))
409 } else {
410 mask := uint64(0xffff) << (16 * (i - 4))
411 x.b = (x.b &^ mask) | (val << (16 * (i - 4)))
412 }
413 }
414
415
416 func (x Int16s) Abs() Int16s {
417 var res Int16s
418 for i := 0; i < 8; i++ {
419 v := x.get(i)
420 if v < 0 {
421 res.set(i, -v)
422 } else {
423 res.set(i, v)
424 }
425 }
426 return res
427 }
428
429
430 func (x Int16s) Add(y Int16s) Int16s {
431 var res Int16s
432 for i := 0; i < 8; i++ {
433 res.set(i, x.get(i)+y.get(i))
434 }
435 return res
436 }
437
438
439 func (x Int16s) AddSaturated(y Int16s) Int16s {
440 var res Int16s
441 for i := 0; i < 8; i++ {
442 sum := int(x.get(i)) + int(y.get(i))
443 if sum > math.MaxInt16 {
444 res.set(i, math.MaxInt16)
445 } else if sum < math.MinInt16 {
446 res.set(i, math.MinInt16)
447 } else {
448 res.set(i, int16(sum))
449 }
450 }
451 return res
452 }
453
454
455 func (x Int16s) And(y Int16s) Int16s {
456 return Int16s{a: x.a & y.a, b: x.b & y.b}
457 }
458
459
460 func (x Int16s) AndNot(y Int16s) Int16s {
461 return Int16s{a: x.a &^ y.a, b: x.b &^ y.b}
462 }
463
464
465 func (x Int16s) Equal(y Int16s) Mask16s {
466 var res Mask16s
467 for i := 0; i < 8; i++ {
468 if x.get(i) == y.get(i) {
469 res.set(i, true)
470 }
471 }
472 return res
473 }
474
475
476 func (x Int16s) Greater(y Int16s) Mask16s {
477 var res Mask16s
478 for i := 0; i < 8; i++ {
479 if x.get(i) > y.get(i) {
480 res.set(i, true)
481 }
482 }
483 return res
484 }
485
486
487 func (x Int16s) GreaterEqual(y Int16s) Mask16s {
488 var res Mask16s
489 for i := 0; i < 8; i++ {
490 if x.get(i) >= y.get(i) {
491 res.set(i, true)
492 }
493 }
494 return res
495 }
496
497
498 func (x Int16s) Less(y Int16s) Mask16s {
499 var res Mask16s
500 for i := 0; i < 8; i++ {
501 if x.get(i) < y.get(i) {
502 res.set(i, true)
503 }
504 }
505 return res
506 }
507
508
509 func (x Int16s) LessEqual(y Int16s) Mask16s {
510 var res Mask16s
511 for i := 0; i < 8; i++ {
512 if x.get(i) <= y.get(i) {
513 res.set(i, true)
514 }
515 }
516 return res
517 }
518
519
520 func (x Int16s) NotEqual(y Int16s) Mask16s {
521 var res Mask16s
522 for i := 0; i < 8; i++ {
523 if x.get(i) != y.get(i) {
524 res.set(i, true)
525 }
526 }
527 return res
528 }
529
530
531 func (x Int16s) Len() int {
532 return 8
533 }
534
535
536 func (x Int16s) Masked(mask Mask16s) Int16s {
537 return Int16s{a: x.a & mask.a, b: x.b & mask.b}
538 }
539
540
541 func (x Int16s) Max(y Int16s) Int16s {
542 var res Int16s
543 for i := 0; i < 8; i++ {
544 vx := x.get(i)
545 vy := y.get(i)
546 if vx > vy {
547 res.set(i, vx)
548 } else {
549 res.set(i, vy)
550 }
551 }
552 return res
553 }
554
555
556 func (x Int16s) IfElse(mask Mask16s, y Int16s) Int16s {
557 return Int16s{
558 a: (x.a & mask.a) | (y.a &^ mask.a),
559 b: (x.b & mask.b) | (y.b &^ mask.b),
560 }
561 }
562
563
564 func (x Int16s) Min(y Int16s) Int16s {
565 var res Int16s
566 for i := 0; i < 8; i++ {
567 vx := x.get(i)
568 vy := y.get(i)
569 if vx < vy {
570 res.set(i, vx)
571 } else {
572 res.set(i, vy)
573 }
574 }
575 return res
576 }
577
578
579 func (x Int16s) Mul(y Int16s) Int16s {
580 var res Int16s
581 for i := 0; i < 8; i++ {
582 res.set(i, x.get(i)*y.get(i))
583 }
584 return res
585 }
586
587
588 func (x Int16s) Neg() Int16s {
589 var res Int16s
590 for i := 0; i < 8; i++ {
591 res.set(i, -x.get(i))
592 }
593 return res
594 }
595
596
597 func (x Int16s) Not() Int16s {
598 return Int16s{a: ^x.a, b: ^x.b}
599 }
600
601
602 func (x Int16s) Or(y Int16s) Int16s {
603 return Int16s{a: x.a | y.a, b: x.b | y.b}
604 }
605
606
607 func (x Int16s) ShiftAllLeft(y uint8) Int16s {
608 var res Int16s
609 for i := 0; i < 8; i++ {
610 res.set(i, x.get(i)<<y)
611 }
612 return res
613 }
614
615
616 func (x Int16s) ShiftAllRight(y uint8) Int16s {
617 var res Int16s
618 for i := 0; i < 8; i++ {
619 res.set(i, x.get(i)>>y)
620 }
621 return res
622 }
623
624
625 func (x Int16s) RotateAllLeft(dist uint64) Int16s {
626 var res Int16s
627 d := dist & 15
628 for i := 0; i < 8; i++ {
629 u := uint16(x.get(i))
630 r := (u << d) | (u >> ((16 - d) & 15))
631 res.set(i, int16(r))
632 }
633 return res
634 }
635
636
637 func (x Int16s) RotateAllRight(dist uint64) Int16s {
638 var res Int16s
639 d := dist & 15
640 for i := 0; i < 8; i++ {
641 u := uint16(x.get(i))
642 r := (u >> d) | (u << ((16 - d) & 15))
643 res.set(i, int16(r))
644 }
645 return res
646 }
647
648
649 func (x Int16s) Store(s []int16) {
650 for i := 0; i < 8 && i < len(s); i++ {
651 s[i] = x.get(i)
652 }
653 }
654
655
656 func (x Int16s) StorePart(s []int16) int {
657 x.Store(s)
658 return min(len(s), x.Len())
659 }
660
661
662 func (x Int16s) String() string {
663 var parts [8]int16
664 for i := 0; i < 8; i++ {
665 parts[i] = x.get(i)
666 }
667 return fmt.Sprint(parts)
668 }
669
670
671 func (x Int16s) Sub(y Int16s) Int16s {
672 var res Int16s
673 for i := 0; i < 8; i++ {
674 res.set(i, x.get(i)-y.get(i))
675 }
676 return res
677 }
678
679
680 func (x Int16s) SubSaturated(y Int16s) Int16s {
681 var res Int16s
682 for i := 0; i < 8; i++ {
683 diff := int(x.get(i)) - int(y.get(i))
684 if diff > math.MaxInt16 {
685 res.set(i, math.MaxInt16)
686 } else if diff < math.MinInt16 {
687 res.set(i, math.MinInt16)
688 } else {
689 res.set(i, int16(diff))
690 }
691 }
692 return res
693 }
694
695
696 func (x Int16s) ToMask() Mask16s {
697 var res Mask16s
698 for i := 0; i < 8; i++ {
699 if x.get(i) != 0 {
700 res.set(i, true)
701 }
702 }
703 return res
704 }
705
706
707 func (x Int16s) Xor(y Int16s) Int16s {
708 return Int16s{a: x.a ^ y.a, b: x.b ^ y.b}
709 }
710
711
712 func (x Int16s) ConvertToUint16() Uint16s {
713 return Uint16s{a: x.a, b: x.b}
714 }
715
716
717 func (x Int16s) ToBits() Uint16s {
718 return Uint16s{a: x.a, b: x.b}
719 }
720
721
722 type Int32s struct {
723 _ _simd
724 a, b uint64
725 }
726
727
728 func LoadInt32s(s []int32) Int32s {
729 var a, b uint64
730 for i := 0; i < 4; i++ {
731 val := uint64(uint32(s[i]))
732 if i < 2 {
733 a |= val << (32 * i)
734 } else {
735 b |= val << (32 * (i - 2))
736 }
737 }
738 return Int32s{a: a, b: b}
739 }
740
741
742 func LoadInt32sPart(s []int32) (Int32s, int) {
743 var a, b uint64
744 n := len(s)
745 if n > 4 {
746 n = 4
747 }
748 for i := 0; i < n; i++ {
749 val := uint64(uint32(s[i]))
750 if i < 2 {
751 a |= val << (32 * i)
752 } else {
753 b |= val << (32 * (i - 2))
754 }
755 }
756 return Int32s{a: a, b: b}, n
757 }
758
759 func (x Int32s) get(i int) int32 {
760 if i < 2 {
761 return int32(x.a >> (32 * i))
762 }
763 return int32(x.b >> (32 * (i - 2)))
764 }
765
766 func (x *Int32s) set(i int, v int32) {
767 val := uint64(uint32(v))
768 if i < 2 {
769 mask := uint64(0xffffffff) << (32 * i)
770 x.a = (x.a &^ mask) | (val << (32 * i))
771 } else {
772 mask := uint64(0xffffffff) << (32 * (i - 2))
773 x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
774 }
775 }
776
777
778 func (x Int32s) Abs() Int32s {
779 var res Int32s
780 for i := 0; i < 4; i++ {
781 v := x.get(i)
782 if v < 0 {
783 res.set(i, -v)
784 } else {
785 res.set(i, v)
786 }
787 }
788 return res
789 }
790
791
792 func (x Int32s) Add(y Int32s) Int32s {
793 var res Int32s
794 for i := 0; i < 4; i++ {
795 res.set(i, x.get(i)+y.get(i))
796 }
797 return res
798 }
799
800
801 func (x Int32s) And(y Int32s) Int32s {
802 return Int32s{a: x.a & y.a, b: x.b & y.b}
803 }
804
805
806 func (x Int32s) AndNot(y Int32s) Int32s {
807 return Int32s{a: x.a &^ y.a, b: x.b &^ y.b}
808 }
809
810
811 func (x Int32s) ConvertToFloat32() Float32s {
812 var res Float32s
813 for i := 0; i < 4; i++ {
814 res.set(i, float32(x.get(i)))
815 }
816 return res
817 }
818
819
820 func (x Int32s) Equal(y Int32s) Mask32s {
821 var res Mask32s
822 for i := 0; i < 4; i++ {
823 if x.get(i) == y.get(i) {
824 res.set(i, true)
825 }
826 }
827 return res
828 }
829
830
831 func (x Int32s) Greater(y Int32s) Mask32s {
832 var res Mask32s
833 for i := 0; i < 4; i++ {
834 if x.get(i) > y.get(i) {
835 res.set(i, true)
836 }
837 }
838 return res
839 }
840
841
842 func (x Int32s) GreaterEqual(y Int32s) Mask32s {
843 var res Mask32s
844 for i := 0; i < 4; i++ {
845 if x.get(i) >= y.get(i) {
846 res.set(i, true)
847 }
848 }
849 return res
850 }
851
852
853 func (x Int32s) Less(y Int32s) Mask32s {
854 var res Mask32s
855 for i := 0; i < 4; i++ {
856 if x.get(i) < y.get(i) {
857 res.set(i, true)
858 }
859 }
860 return res
861 }
862
863
864 func (x Int32s) LessEqual(y Int32s) Mask32s {
865 var res Mask32s
866 for i := 0; i < 4; i++ {
867 if x.get(i) <= y.get(i) {
868 res.set(i, true)
869 }
870 }
871 return res
872 }
873
874
875 func (x Int32s) NotEqual(y Int32s) Mask32s {
876 var res Mask32s
877 for i := 0; i < 4; i++ {
878 if x.get(i) != y.get(i) {
879 res.set(i, true)
880 }
881 }
882 return res
883 }
884
885
886 func (x Int32s) Len() int {
887 return 4
888 }
889
890
891 func (x Int32s) Masked(mask Mask32s) Int32s {
892 return Int32s{a: x.a & mask.a, b: x.b & mask.b}
893 }
894
895
896 func (x Int32s) Max(y Int32s) Int32s {
897 var res Int32s
898 for i := 0; i < 4; i++ {
899 vx := x.get(i)
900 vy := y.get(i)
901 if vx > vy {
902 res.set(i, vx)
903 } else {
904 res.set(i, vy)
905 }
906 }
907 return res
908 }
909
910
911 func (x Int32s) IfElse(mask Mask32s, y Int32s) Int32s {
912 return Int32s{
913 a: (x.a & mask.a) | (y.a &^ mask.a),
914 b: (x.b & mask.b) | (y.b &^ mask.b),
915 }
916 }
917
918
919 func (x Int32s) Min(y Int32s) Int32s {
920 var res Int32s
921 for i := 0; i < 4; i++ {
922 vx := x.get(i)
923 vy := y.get(i)
924 if vx < vy {
925 res.set(i, vx)
926 } else {
927 res.set(i, vy)
928 }
929 }
930 return res
931 }
932
933
934 func (x Int32s) Mul(y Int32s) Int32s {
935 var res Int32s
936 for i := 0; i < 4; i++ {
937 res.set(i, x.get(i)*y.get(i))
938 }
939 return res
940 }
941
942
943 func (x Int32s) Neg() Int32s {
944 var res Int32s
945 for i := 0; i < 4; i++ {
946 res.set(i, -x.get(i))
947 }
948 return res
949 }
950
951
952 func (x Int32s) Not() Int32s {
953 return Int32s{a: ^x.a, b: ^x.b}
954 }
955
956
957 func (x Int32s) Or(y Int32s) Int32s {
958 return Int32s{a: x.a | y.a, b: x.b | y.b}
959 }
960
961
962 func (x Int32s) ShiftAllLeft(y uint8) Int32s {
963 var res Int32s
964 for i := 0; i < 4; i++ {
965 res.set(i, x.get(i)<<y)
966 }
967 return res
968 }
969
970
971 func (x Int32s) ShiftAllRight(y uint8) Int32s {
972 var res Int32s
973 for i := 0; i < 4; i++ {
974 res.set(i, x.get(i)>>y)
975 }
976 return res
977 }
978
979
980 func (x Int32s) RotateAllLeft(dist uint64) Int32s {
981 var res Int32s
982 d := dist & 31
983 for i := 0; i < 4; i++ {
984 u := uint32(x.get(i))
985 r := (u << d) | (u >> ((32 - d) & 31))
986 res.set(i, int32(r))
987 }
988 return res
989 }
990
991
992 func (x Int32s) RotateAllRight(dist uint64) Int32s {
993 var res Int32s
994 d := dist & 31
995 for i := 0; i < 4; i++ {
996 u := uint32(x.get(i))
997 r := (u >> d) | (u << ((32 - d) & 31))
998 res.set(i, int32(r))
999 }
1000 return res
1001 }
1002
1003
1004 func (x Int32s) Store(s []int32) {
1005 for i := 0; i < 4 && i < len(s); i++ {
1006 s[i] = x.get(i)
1007 }
1008 }
1009
1010
1011 func (x Int32s) StorePart(s []int32) int {
1012 x.Store(s)
1013 return min(len(s), x.Len())
1014 }
1015
1016
1017 func (x Int32s) String() string {
1018 var parts [4]int32
1019 for i := 0; i < 4; i++ {
1020 parts[i] = x.get(i)
1021 }
1022 return fmt.Sprint(parts)
1023 }
1024
1025
1026 func (x Int32s) Sub(y Int32s) Int32s {
1027 var res Int32s
1028 for i := 0; i < 4; i++ {
1029 res.set(i, x.get(i)-y.get(i))
1030 }
1031 return res
1032 }
1033
1034
1035 func (x Int32s) ToMask() Mask32s {
1036 var res Mask32s
1037 for i := 0; i < 4; i++ {
1038 if x.get(i) != 0 {
1039 res.set(i, true)
1040 }
1041 }
1042 return res
1043 }
1044
1045
1046 func (x Int32s) Xor(y Int32s) Int32s {
1047 return Int32s{a: x.a ^ y.a, b: x.b ^ y.b}
1048 }
1049
1050
1051 func (x Int32s) ConvertToUint32() Uint32s {
1052 return Uint32s{a: x.a, b: x.b}
1053 }
1054
1055
1056 func (x Int32s) ToBits() Uint32s {
1057 return Uint32s{a: x.a, b: x.b}
1058 }
1059
1060
1061 type Int64s struct {
1062 _ _simd
1063 a, b uint64
1064 }
1065
1066
1067 func LoadInt64s(s []int64) Int64s {
1068 var a, b uint64
1069 a = uint64(s[0])
1070 b = uint64(s[1])
1071 return Int64s{a: a, b: b}
1072 }
1073
1074
1075 func LoadInt64sPart(s []int64) (Int64s, int) {
1076 var a, b uint64
1077 if len(s) > 0 {
1078 a = uint64(s[0])
1079 }
1080 if len(s) > 1 {
1081 b = uint64(s[1])
1082 }
1083 return Int64s{a: a, b: b}, len(s)
1084 }
1085
1086 func (x Int64s) get(i int) int64 {
1087 if i == 0 {
1088 return int64(x.a)
1089 }
1090 return int64(x.b)
1091 }
1092
1093 func (x *Int64s) set(i int, v int64) {
1094 if i == 0 {
1095 x.a = uint64(v)
1096 } else {
1097 x.b = uint64(v)
1098 }
1099 }
1100
1101
1102 func (x Int64s) Add(y Int64s) Int64s {
1103 return Int64s{a: x.a + y.a, b: x.b + y.b}
1104 }
1105
1106
1107 func (x Int64s) And(y Int64s) Int64s {
1108 return Int64s{a: x.a & y.a, b: x.b & y.b}
1109 }
1110
1111
1112 func (x Int64s) AndNot(y Int64s) Int64s {
1113 return Int64s{a: x.a &^ y.a, b: x.b &^ y.b}
1114 }
1115
1116
1117 func (x Int64s) Equal(y Int64s) Mask64s {
1118 var res Mask64s
1119 if x.a == y.a {
1120 res.a = ^uint64(0)
1121 }
1122 if x.b == y.b {
1123 res.b = ^uint64(0)
1124 }
1125 return res
1126 }
1127
1128
1129 func (x Int64s) Greater(y Int64s) Mask64s {
1130 var res Mask64s
1131 if int64(x.a) > int64(y.a) {
1132 res.a = ^uint64(0)
1133 }
1134 if int64(x.b) > int64(y.b) {
1135 res.b = ^uint64(0)
1136 }
1137 return res
1138 }
1139
1140
1141 func (x Int64s) GreaterEqual(y Int64s) Mask64s {
1142 var res Mask64s
1143 if int64(x.a) >= int64(y.a) {
1144 res.a = ^uint64(0)
1145 }
1146 if int64(x.b) >= int64(y.b) {
1147 res.b = ^uint64(0)
1148 }
1149 return res
1150 }
1151
1152
1153 func (x Int64s) Less(y Int64s) Mask64s {
1154 var res Mask64s
1155 if int64(x.a) < int64(y.a) {
1156 res.a = ^uint64(0)
1157 }
1158 if int64(x.b) < int64(y.b) {
1159 res.b = ^uint64(0)
1160 }
1161 return res
1162 }
1163
1164
1165 func (x Int64s) LessEqual(y Int64s) Mask64s {
1166 var res Mask64s
1167 if int64(x.a) <= int64(y.a) {
1168 res.a = ^uint64(0)
1169 }
1170 if int64(x.b) <= int64(y.b) {
1171 res.b = ^uint64(0)
1172 }
1173 return res
1174 }
1175
1176
1177 func (x Int64s) NotEqual(y Int64s) Mask64s {
1178 var res Mask64s
1179 if x.a != y.a {
1180 res.a = ^uint64(0)
1181 }
1182 if x.b != y.b {
1183 res.b = ^uint64(0)
1184 }
1185 return res
1186 }
1187
1188
1189 func (x Int64s) Len() int {
1190 return 2
1191 }
1192
1193
1194 func (x Int64s) Masked(mask Mask64s) Int64s {
1195 return Int64s{a: x.a & mask.a, b: x.b & mask.b}
1196 }
1197
1198
1199 func (x Int64s) IfElse(mask Mask64s, y Int64s) Int64s {
1200 return Int64s{
1201 a: (x.a & mask.a) | (y.a &^ mask.a),
1202 b: (x.b & mask.b) | (y.b &^ mask.b),
1203 }
1204 }
1205
1206
1207 func (x Int64s) Neg() Int64s {
1208 return Int64s{a: uint64(-int64(x.a)), b: uint64(-int64(x.b))}
1209 }
1210
1211
1212 func (x Int64s) Not() Int64s {
1213 return Int64s{a: ^x.a, b: ^x.b}
1214 }
1215
1216
1217 func (x Int64s) Or(y Int64s) Int64s {
1218 return Int64s{a: x.a | y.a, b: x.b | y.b}
1219 }
1220
1221
1222 func (x Int64s) ShiftAllLeft(y uint8) Int64s {
1223 return Int64s{a: x.a << y, b: x.b << y}
1224 }
1225
1226
1227 func (x Int64s) RotateAllLeft(dist uint64) Int64s {
1228 d := dist & 63
1229 return Int64s{
1230 a: (x.a << d) | (x.a >> ((64 - d) & 63)),
1231 b: (x.b << d) | (x.b >> ((64 - d) & 63)),
1232 }
1233 }
1234
1235
1236 func (x Int64s) RotateAllRight(dist uint64) Int64s {
1237 d := dist & 63
1238 return Int64s{
1239 a: (x.a >> d) | (x.a << ((64 - d) & 63)),
1240 b: (x.b >> d) | (x.b << ((64 - d) & 63)),
1241 }
1242 }
1243
1244
1245 func (x Int64s) Store(s []int64) {
1246 if len(s) > 0 {
1247 s[0] = int64(x.a)
1248 }
1249 if len(s) > 1 {
1250 s[1] = int64(x.b)
1251 }
1252 }
1253
1254
1255 func (x Int64s) StorePart(s []int64) int {
1256 x.Store(s)
1257 return min(len(s), x.Len())
1258 }
1259
1260
1261 func (x Int64s) String() string {
1262 return fmt.Sprint([2]int64{int64(x.a), int64(x.b)})
1263 }
1264
1265
1266 func (x Int64s) Sub(y Int64s) Int64s {
1267 return Int64s{a: x.a - y.a, b: x.b - y.b}
1268 }
1269
1270
1271 func (x Int64s) ToMask() Mask64s {
1272 var res Mask64s
1273 if x.a != 0 {
1274 res.a = ^uint64(0)
1275 }
1276 if x.b != 0 {
1277 res.b = ^uint64(0)
1278 }
1279 return res
1280 }
1281
1282
1283 func (x Int64s) Xor(y Int64s) Int64s {
1284 return Int64s{a: x.a ^ y.a, b: x.b ^ y.b}
1285 }
1286
1287
1288 func (x Int64s) ConvertToUint64() Uint64s {
1289 return Uint64s{a: x.a, b: x.b}
1290 }
1291
1292
1293 func (x Int64s) ToBits() Uint64s {
1294 return Uint64s{a: x.a, b: x.b}
1295 }
1296
1297
1298 type Uint8s struct {
1299 _ _simd
1300 a, b uint64
1301 }
1302
1303
1304 func LoadUint8s(s []uint8) Uint8s {
1305 var a, b uint64
1306 for i := 0; i < 16; i++ {
1307 val := uint64(s[i])
1308 if i < 8 {
1309 a |= val << (8 * i)
1310 } else {
1311 b |= val << (8 * (i - 8))
1312 }
1313 }
1314 return Uint8s{a: a, b: b}
1315 }
1316
1317
1318 func LoadUint8sPart(s []uint8) (Uint8s, int) {
1319 var a, b uint64
1320 n := len(s)
1321 if n > 16 {
1322 n = 16
1323 }
1324 for i := 0; i < n; i++ {
1325 val := uint64(s[i])
1326 if i < 8 {
1327 a |= val << (8 * i)
1328 } else {
1329 b |= val << (8 * (i - 8))
1330 }
1331 }
1332 return Uint8s{a: a, b: b}, n
1333 }
1334
1335 func (x Uint8s) get(i int) uint8 {
1336 if i < 8 {
1337 return uint8(x.a >> (8 * i))
1338 }
1339 return uint8(x.b >> (8 * (i - 8)))
1340 }
1341
1342 func (x *Uint8s) set(i int, v uint8) {
1343 val := uint64(v)
1344 if i < 8 {
1345 mask := uint64(0xff) << (8 * i)
1346 x.a = (x.a &^ mask) | (val << (8 * i))
1347 } else {
1348 mask := uint64(0xff) << (8 * (i - 8))
1349 x.b = (x.b &^ mask) | (val << (8 * (i - 8)))
1350 }
1351 }
1352
1353
1354 func (x Uint8s) Add(y Uint8s) Uint8s {
1355 var res Uint8s
1356 for i := 0; i < 16; i++ {
1357 res.set(i, x.get(i)+y.get(i))
1358 }
1359 return res
1360 }
1361
1362
1363 func (x Uint8s) AddSaturated(y Uint8s) Uint8s {
1364 var res Uint8s
1365 for i := 0; i < 16; i++ {
1366 sum := int(x.get(i)) + int(y.get(i))
1367 if sum > math.MaxUint8 {
1368 res.set(i, math.MaxUint8)
1369 } else {
1370 res.set(i, uint8(sum))
1371 }
1372 }
1373 return res
1374 }
1375
1376
1377 func (x Uint8s) And(y Uint8s) Uint8s {
1378 return Uint8s{a: x.a & y.a, b: x.b & y.b}
1379 }
1380
1381
1382 func (x Uint8s) AndNot(y Uint8s) Uint8s {
1383 return Uint8s{a: x.a &^ y.a, b: x.b &^ y.b}
1384 }
1385
1386
1387 func (x Uint8s) Average(y Uint8s) Uint8s {
1388 var res Uint8s
1389 for i := 0; i < 16; i++ {
1390 res.set(i, uint8((int(x.get(i))+int(y.get(i))+1)>>1))
1391 }
1392 return res
1393 }
1394
1395
1396 func (x Uint8s) Equal(y Uint8s) Mask8s {
1397 var res Mask8s
1398 for i := 0; i < 16; i++ {
1399 if x.get(i) == y.get(i) {
1400 res.set(i, true)
1401 }
1402 }
1403 return res
1404 }
1405
1406
1407 func (x Uint8s) NotEqual(y Uint8s) Mask8s {
1408 var res Mask8s
1409 for i := 0; i < 16; i++ {
1410 if x.get(i) != y.get(i) {
1411 res.set(i, true)
1412 }
1413 }
1414 return res
1415 }
1416
1417
1418 func (x Uint8s) Len() int {
1419 return 16
1420 }
1421
1422
1423 func (x Uint8s) Masked(mask Mask8s) Uint8s {
1424 return Uint8s{a: x.a & mask.a, b: x.b & mask.b}
1425 }
1426
1427
1428 func (x Uint8s) Max(y Uint8s) Uint8s {
1429 var res Uint8s
1430 for i := 0; i < 16; i++ {
1431 vx := x.get(i)
1432 vy := y.get(i)
1433 if vx > vy {
1434 res.set(i, vx)
1435 } else {
1436 res.set(i, vy)
1437 }
1438 }
1439 return res
1440 }
1441
1442
1443 func (x Uint8s) IfElse(mask Mask8s, y Uint8s) Uint8s {
1444 return Uint8s{
1445 a: (x.a & mask.a) | (y.a &^ mask.a),
1446 b: (x.b & mask.b) | (y.b &^ mask.b),
1447 }
1448 }
1449
1450
1451 func (x Uint8s) Min(y Uint8s) Uint8s {
1452 var res Uint8s
1453 for i := 0; i < 16; i++ {
1454 vx := x.get(i)
1455 vy := y.get(i)
1456 if vx < vy {
1457 res.set(i, vx)
1458 } else {
1459 res.set(i, vy)
1460 }
1461 }
1462 return res
1463 }
1464
1465
1466 func (x Uint8s) Mul(y Uint8s) Uint8s {
1467 var res Uint8s
1468 for i := 0; i < 16; i++ {
1469 res.set(i, x.get(i)*y.get(i))
1470 }
1471 return res
1472 }
1473
1474
1475 func (x Uint8s) Not() Uint8s {
1476 return Uint8s{a: ^x.a, b: ^x.b}
1477 }
1478
1479
1480 func (x Uint8s) Or(y Uint8s) Uint8s {
1481 return Uint8s{a: x.a | y.a, b: x.b | y.b}
1482 }
1483
1484
1485 func (x Uint8s) Store(s []uint8) {
1486 for i := 0; i < 16 && i < len(s); i++ {
1487 s[i] = x.get(i)
1488 }
1489 }
1490
1491
1492 func (x Uint8s) StorePart(s []uint8) int {
1493 x.Store(s)
1494 return min(len(s), x.Len())
1495 }
1496
1497
1498 func (x Uint8s) String() string {
1499 var parts [16]uint8
1500 for i := 0; i < 16; i++ {
1501 parts[i] = x.get(i)
1502 }
1503 return fmt.Sprint(parts)
1504 }
1505
1506
1507 func (x Uint8s) Sub(y Uint8s) Uint8s {
1508 var res Uint8s
1509 for i := 0; i < 16; i++ {
1510 res.set(i, x.get(i)-y.get(i))
1511 }
1512 return res
1513 }
1514
1515
1516 func (x Uint8s) SubSaturated(y Uint8s) Uint8s {
1517 var res Uint8s
1518 for i := 0; i < 16; i++ {
1519 vx := x.get(i)
1520 vy := y.get(i)
1521 if vx < vy {
1522 res.set(i, 0)
1523 } else {
1524 res.set(i, vx-vy)
1525 }
1526 }
1527 return res
1528 }
1529
1530
1531 func (x Uint8s) Xor(y Uint8s) Uint8s {
1532 return Uint8s{a: x.a ^ y.a, b: x.b ^ y.b}
1533 }
1534
1535
1536 func (x Uint8s) BitsToInt8() Int8s {
1537 return Int8s{a: x.a, b: x.b}
1538 }
1539
1540
1541 func (x Uint8s) ConvertToInt8() Int8s {
1542 return Int8s{a: x.a, b: x.b}
1543 }
1544
1545
1546 func (x Uint8s) ReshapeToUint16s() Uint16s {
1547 return Uint16s{a: x.a, b: x.b}
1548 }
1549
1550
1551 func (x Uint8s) ReshapeToUint32s() Uint32s {
1552 return Uint32s{a: x.a, b: x.b}
1553 }
1554
1555
1556 func (x Uint8s) ReshapeToUint64s() Uint64s {
1557 return Uint64s{a: x.a, b: x.b}
1558 }
1559
1560
1561 type Uint16s struct {
1562 _ _simd
1563 a, b uint64
1564 }
1565
1566
1567 func LoadUint16s(s []uint16) Uint16s {
1568 var a, b uint64
1569 for i := 0; i < 8; i++ {
1570 val := uint64(s[i])
1571 if i < 4 {
1572 a |= val << (16 * i)
1573 } else {
1574 b |= val << (16 * (i - 4))
1575 }
1576 }
1577 return Uint16s{a: a, b: b}
1578 }
1579
1580
1581 func LoadUint16sPart(s []uint16) (Uint16s, int) {
1582 var a, b uint64
1583 n := len(s)
1584 if n > 8 {
1585 n = 8
1586 }
1587 for i := 0; i < n; i++ {
1588 val := uint64(s[i])
1589 if i < 4 {
1590 a |= val << (16 * i)
1591 } else {
1592 b |= val << (16 * (i - 4))
1593 }
1594 }
1595 return Uint16s{a: a, b: b}, n
1596 }
1597
1598 func (x Uint16s) get(i int) uint16 {
1599 if i < 4 {
1600 return uint16(x.a >> (16 * i))
1601 }
1602 return uint16(x.b >> (16 * (i - 4)))
1603 }
1604
1605 func (x *Uint16s) set(i int, v uint16) {
1606 val := uint64(v)
1607 if i < 4 {
1608 mask := uint64(0xffff) << (16 * i)
1609 x.a = (x.a &^ mask) | (val << (16 * i))
1610 } else {
1611 mask := uint64(0xffff) << (16 * (i - 4))
1612 x.b = (x.b &^ mask) | (val << (16 * (i - 4)))
1613 }
1614 }
1615
1616
1617 func (x Uint16s) Add(y Uint16s) Uint16s {
1618 var res Uint16s
1619 for i := 0; i < 8; i++ {
1620 res.set(i, x.get(i)+y.get(i))
1621 }
1622 return res
1623 }
1624
1625
1626 func (x Uint16s) AddSaturated(y Uint16s) Uint16s {
1627 var res Uint16s
1628 for i := 0; i < 8; i++ {
1629 sum := int(x.get(i)) + int(y.get(i))
1630 if sum > math.MaxUint16 {
1631 res.set(i, math.MaxUint16)
1632 } else {
1633 res.set(i, uint16(sum))
1634 }
1635 }
1636 return res
1637 }
1638
1639
1640 func (x Uint16s) And(y Uint16s) Uint16s {
1641 return Uint16s{a: x.a & y.a, b: x.b & y.b}
1642 }
1643
1644
1645 func (x Uint16s) AndNot(y Uint16s) Uint16s {
1646 return Uint16s{a: x.a &^ y.a, b: x.b &^ y.b}
1647 }
1648
1649
1650 func (x Uint16s) Average(y Uint16s) Uint16s {
1651 var res Uint16s
1652 for i := 0; i < 8; i++ {
1653 res.set(i, uint16((int(x.get(i))+int(y.get(i))+1)>>1))
1654 }
1655 return res
1656 }
1657
1658
1659 func (x Uint16s) Equal(y Uint16s) Mask16s {
1660 var res Mask16s
1661 for i := 0; i < 8; i++ {
1662 if x.get(i) == y.get(i) {
1663 res.set(i, true)
1664 }
1665 }
1666 return res
1667 }
1668
1669
1670 func (x Uint16s) Greater(y Uint16s) Mask16s {
1671 var res Mask16s
1672 for i := 0; i < 8; i++ {
1673 if x.get(i) > y.get(i) {
1674 res.set(i, true)
1675 }
1676 }
1677 return res
1678 }
1679
1680
1681 func (x Uint16s) GreaterEqual(y Uint16s) Mask16s {
1682 var res Mask16s
1683 for i := 0; i < 8; i++ {
1684 if x.get(i) >= y.get(i) {
1685 res.set(i, true)
1686 }
1687 }
1688 return res
1689 }
1690
1691
1692 func (x Uint16s) Less(y Uint16s) Mask16s {
1693 var res Mask16s
1694 for i := 0; i < 8; i++ {
1695 if x.get(i) < y.get(i) {
1696 res.set(i, true)
1697 }
1698 }
1699 return res
1700 }
1701
1702
1703 func (x Uint16s) LessEqual(y Uint16s) Mask16s {
1704 var res Mask16s
1705 for i := 0; i < 8; i++ {
1706 if x.get(i) <= y.get(i) {
1707 res.set(i, true)
1708 }
1709 }
1710 return res
1711 }
1712
1713
1714 func (x Uint16s) NotEqual(y Uint16s) Mask16s {
1715 var res Mask16s
1716 for i := 0; i < 8; i++ {
1717 if x.get(i) != y.get(i) {
1718 res.set(i, true)
1719 }
1720 }
1721 return res
1722 }
1723
1724
1725 func (x Uint16s) Len() int {
1726 return 8
1727 }
1728
1729
1730 func (x Uint16s) Masked(mask Mask16s) Uint16s {
1731 return Uint16s{a: x.a & mask.a, b: x.b & mask.b}
1732 }
1733
1734
1735 func (x Uint16s) Max(y Uint16s) Uint16s {
1736 var res Uint16s
1737 for i := 0; i < 8; i++ {
1738 vx := x.get(i)
1739 vy := y.get(i)
1740 if vx > vy {
1741 res.set(i, vx)
1742 } else {
1743 res.set(i, vy)
1744 }
1745 }
1746 return res
1747 }
1748
1749
1750 func (x Uint16s) IfElse(mask Mask16s, y Uint16s) Uint16s {
1751 return Uint16s{
1752 a: (x.a & mask.a) | (y.a &^ mask.a),
1753 b: (x.b & mask.b) | (y.b &^ mask.b),
1754 }
1755 }
1756
1757
1758 func (x Uint16s) Min(y Uint16s) Uint16s {
1759 var res Uint16s
1760 for i := 0; i < 8; i++ {
1761 vx := x.get(i)
1762 vy := y.get(i)
1763 if vx < vy {
1764 res.set(i, vx)
1765 } else {
1766 res.set(i, vy)
1767 }
1768 }
1769 return res
1770 }
1771
1772
1773 func (x Uint16s) Mul(y Uint16s) Uint16s {
1774 var res Uint16s
1775 for i := 0; i < 8; i++ {
1776 res.set(i, x.get(i)*y.get(i))
1777 }
1778 return res
1779 }
1780
1781
1782 func (x Uint16s) Not() Uint16s {
1783 return Uint16s{a: ^x.a, b: ^x.b}
1784 }
1785
1786
1787 func (x Uint16s) Or(y Uint16s) Uint16s {
1788 return Uint16s{a: x.a | y.a, b: x.b | y.b}
1789 }
1790
1791
1792 func (x Uint16s) ShiftAllLeft(y uint8) Uint16s {
1793 var res Uint16s
1794 for i := 0; i < 8; i++ {
1795 res.set(i, x.get(i)<<y)
1796 }
1797 return res
1798 }
1799
1800
1801 func (x Uint16s) ShiftAllRight(y uint8) Uint16s {
1802 var res Uint16s
1803 for i := 0; i < 8; i++ {
1804 res.set(i, x.get(i)>>y)
1805 }
1806 return res
1807 }
1808
1809
1810 func (x Uint16s) RotateAllLeft(dist uint64) Uint16s {
1811 var res Uint16s
1812 d := dist & 15
1813 for i := 0; i < 8; i++ {
1814 u := x.get(i)
1815 r := (u << d) | (u >> ((16 - d) & 15))
1816 res.set(i, r)
1817 }
1818 return res
1819 }
1820
1821
1822 func (x Uint16s) RotateAllRight(dist uint64) Uint16s {
1823 var res Uint16s
1824 d := dist & 15
1825 for i := 0; i < 8; i++ {
1826 u := x.get(i)
1827 r := (u >> d) | (u << ((16 - d) & 15))
1828 res.set(i, r)
1829 }
1830 return res
1831 }
1832
1833
1834 func (x Uint16s) Store(s []uint16) {
1835 for i := 0; i < 8 && i < len(s); i++ {
1836 s[i] = x.get(i)
1837 }
1838 }
1839
1840
1841 func (x Uint16s) StorePart(s []uint16) int {
1842 x.Store(s)
1843 return min(len(s), x.Len())
1844 }
1845
1846
1847 func (x Uint16s) String() string {
1848 var parts [8]uint16
1849 for i := 0; i < 8; i++ {
1850 parts[i] = x.get(i)
1851 }
1852 return fmt.Sprint(parts)
1853 }
1854
1855
1856 func (x Uint16s) Sub(y Uint16s) Uint16s {
1857 var res Uint16s
1858 for i := 0; i < 8; i++ {
1859 res.set(i, x.get(i)-y.get(i))
1860 }
1861 return res
1862 }
1863
1864
1865 func (x Uint16s) SubSaturated(y Uint16s) Uint16s {
1866 var res Uint16s
1867 for i := 0; i < 8; i++ {
1868 vx := x.get(i)
1869 vy := y.get(i)
1870 if vx < vy {
1871 res.set(i, 0)
1872 } else {
1873 res.set(i, vx-vy)
1874 }
1875 }
1876 return res
1877 }
1878
1879
1880 func (x Uint16s) Xor(y Uint16s) Uint16s {
1881 return Uint16s{a: x.a ^ y.a, b: x.b ^ y.b}
1882 }
1883
1884
1885 func (x Uint16s) BitsToInt16() Int16s {
1886 return Int16s{a: x.a, b: x.b}
1887 }
1888
1889
1890 func (x Uint16s) ConvertToInt16() Int16s {
1891 return Int16s{a: x.a, b: x.b}
1892 }
1893
1894
1895 func (x Uint16s) ReshapeToUint32s() Uint32s {
1896 return Uint32s{a: x.a, b: x.b}
1897 }
1898
1899
1900 func (x Uint16s) ReshapeToUint64s() Uint64s {
1901 return Uint64s{a: x.a, b: x.b}
1902 }
1903
1904
1905 func (x Uint16s) ReshapeToUint8s() Uint8s {
1906 return Uint8s{a: x.a, b: x.b}
1907 }
1908
1909
1910 type Uint32s struct {
1911 _ _simd
1912 a, b uint64
1913 }
1914
1915
1916 func LoadUint32s(s []uint32) Uint32s {
1917 var a, b uint64
1918 for i := 0; i < 4; i++ {
1919 val := uint64(s[i])
1920 if i < 2 {
1921 a |= val << (32 * i)
1922 } else {
1923 b |= val << (32 * (i - 2))
1924 }
1925 }
1926 return Uint32s{a: a, b: b}
1927 }
1928
1929
1930 func LoadUint32sPart(s []uint32) (Uint32s, int) {
1931 var a, b uint64
1932 n := len(s)
1933 if n > 4 {
1934 n = 4
1935 }
1936 for i := 0; i < n; i++ {
1937 val := uint64(s[i])
1938 if i < 2 {
1939 a |= val << (32 * i)
1940 } else {
1941 b |= val << (32 * (i - 2))
1942 }
1943 }
1944 return Uint32s{a: a, b: b}, n
1945 }
1946
1947 func (x Uint32s) get(i int) uint32 {
1948 if i < 2 {
1949 return uint32(x.a >> (32 * i))
1950 }
1951 return uint32(x.b >> (32 * (i - 2)))
1952 }
1953
1954 func (x *Uint32s) set(i int, v uint32) {
1955 val := uint64(v)
1956 if i < 2 {
1957 mask := uint64(0xffffffff) << (32 * i)
1958 x.a = (x.a &^ mask) | (val << (32 * i))
1959 } else {
1960 mask := uint64(0xffffffff) << (32 * (i - 2))
1961 x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
1962 }
1963 }
1964
1965
1966 func (x Uint32s) Add(y Uint32s) Uint32s {
1967 var res Uint32s
1968 for i := 0; i < 4; i++ {
1969 res.set(i, x.get(i)+y.get(i))
1970 }
1971 return res
1972 }
1973
1974
1975 func (x Uint32s) And(y Uint32s) Uint32s {
1976 return Uint32s{a: x.a & y.a, b: x.b & y.b}
1977 }
1978
1979
1980 func (x Uint32s) AndNot(y Uint32s) Uint32s {
1981 return Uint32s{a: x.a &^ y.a, b: x.b &^ y.b}
1982 }
1983
1984
1985 func (x Uint32s) Equal(y Uint32s) Mask32s {
1986 var res Mask32s
1987 for i := 0; i < 4; i++ {
1988 if x.get(i) == y.get(i) {
1989 res.set(i, true)
1990 }
1991 }
1992 return res
1993 }
1994
1995
1996 func (x Uint32s) Greater(y Uint32s) Mask32s {
1997 var res Mask32s
1998 for i := 0; i < 4; i++ {
1999 if x.get(i) > y.get(i) {
2000 res.set(i, true)
2001 }
2002 }
2003 return res
2004 }
2005
2006
2007 func (x Uint32s) GreaterEqual(y Uint32s) Mask32s {
2008 var res Mask32s
2009 for i := 0; i < 4; i++ {
2010 if x.get(i) >= y.get(i) {
2011 res.set(i, true)
2012 }
2013 }
2014 return res
2015 }
2016
2017
2018 func (x Uint32s) Less(y Uint32s) Mask32s {
2019 var res Mask32s
2020 for i := 0; i < 4; i++ {
2021 if x.get(i) < y.get(i) {
2022 res.set(i, true)
2023 }
2024 }
2025 return res
2026 }
2027
2028
2029 func (x Uint32s) LessEqual(y Uint32s) Mask32s {
2030 var res Mask32s
2031 for i := 0; i < 4; i++ {
2032 if x.get(i) <= y.get(i) {
2033 res.set(i, true)
2034 }
2035 }
2036 return res
2037 }
2038
2039
2040 func (x Uint32s) NotEqual(y Uint32s) Mask32s {
2041 var res Mask32s
2042 for i := 0; i < 4; i++ {
2043 if x.get(i) != y.get(i) {
2044 res.set(i, true)
2045 }
2046 }
2047 return res
2048 }
2049
2050
2051 func (x Uint32s) Len() int {
2052 return 4
2053 }
2054
2055
2056 func (x Uint32s) Masked(mask Mask32s) Uint32s {
2057 return Uint32s{a: x.a & mask.a, b: x.b & mask.b}
2058 }
2059
2060
2061 func (x Uint32s) Max(y Uint32s) Uint32s {
2062 var res Uint32s
2063 for i := 0; i < 4; i++ {
2064 vx := x.get(i)
2065 vy := y.get(i)
2066 if vx > vy {
2067 res.set(i, vx)
2068 } else {
2069 res.set(i, vy)
2070 }
2071 }
2072 return res
2073 }
2074
2075
2076 func (x Uint32s) IfElse(mask Mask32s, y Uint32s) Uint32s {
2077 return Uint32s{
2078 a: (x.a & mask.a) | (y.a &^ mask.a),
2079 b: (x.b & mask.b) | (y.b &^ mask.b),
2080 }
2081 }
2082
2083
2084 func (x Uint32s) Min(y Uint32s) Uint32s {
2085 var res Uint32s
2086 for i := 0; i < 4; i++ {
2087 vx := x.get(i)
2088 vy := y.get(i)
2089 if vx < vy {
2090 res.set(i, vx)
2091 } else {
2092 res.set(i, vy)
2093 }
2094 }
2095 return res
2096 }
2097
2098
2099 func (x Uint32s) Mul(y Uint32s) Uint32s {
2100 var res Uint32s
2101 for i := 0; i < 4; i++ {
2102 res.set(i, x.get(i)*y.get(i))
2103 }
2104 return res
2105 }
2106
2107
2108 func (x Uint32s) Not() Uint32s {
2109 return Uint32s{a: ^x.a, b: ^x.b}
2110 }
2111
2112
2113 func (x Uint32s) Or(y Uint32s) Uint32s {
2114 return Uint32s{a: x.a | y.a, b: x.b | y.b}
2115 }
2116
2117
2118 func (x Uint32s) ShiftAllLeft(y uint8) Uint32s {
2119 var res Uint32s
2120 for i := 0; i < 4; i++ {
2121 res.set(i, x.get(i)<<y)
2122 }
2123 return res
2124 }
2125
2126
2127 func (x Uint32s) ShiftAllRight(y uint8) Uint32s {
2128 var res Uint32s
2129 for i := 0; i < 4; i++ {
2130 res.set(i, x.get(i)>>y)
2131 }
2132 return res
2133 }
2134
2135
2136 func (x Uint32s) RotateAllLeft(dist uint64) Uint32s {
2137 var res Uint32s
2138 d := dist & 31
2139 for i := 0; i < 4; i++ {
2140 u := x.get(i)
2141 r := (u << d) | (u >> ((32 - d) & 31))
2142 res.set(i, r)
2143 }
2144 return res
2145 }
2146
2147
2148 func (x Uint32s) RotateAllRight(dist uint64) Uint32s {
2149 var res Uint32s
2150 d := dist & 31
2151 for i := 0; i < 4; i++ {
2152 u := x.get(i)
2153 r := (u >> d) | (u << ((32 - d) & 31))
2154 res.set(i, r)
2155 }
2156 return res
2157 }
2158
2159
2160 func (x Uint32s) Store(s []uint32) {
2161 for i := 0; i < 4 && i < len(s); i++ {
2162 s[i] = x.get(i)
2163 }
2164 }
2165
2166
2167 func (x Uint32s) StorePart(s []uint32) int {
2168 x.Store(s)
2169 return min(len(s), x.Len())
2170 }
2171
2172
2173 func (x Uint32s) String() string {
2174 var parts [4]uint32
2175 for i := 0; i < 4; i++ {
2176 parts[i] = x.get(i)
2177 }
2178 return fmt.Sprint(parts)
2179 }
2180
2181
2182 func (x Uint32s) Sub(y Uint32s) Uint32s {
2183 var res Uint32s
2184 for i := 0; i < 4; i++ {
2185 res.set(i, x.get(i)-y.get(i))
2186 }
2187 return res
2188 }
2189
2190
2191 func (x Uint32s) Xor(y Uint32s) Uint32s {
2192 return Uint32s{a: x.a ^ y.a, b: x.b ^ y.b}
2193 }
2194
2195
2196 func (x Uint32s) BitsToFloat32() Float32s {
2197 return Float32s{a: x.a, b: x.b}
2198 }
2199
2200
2201 func (x Uint32s) BitsToInt32() Int32s {
2202 return Int32s{a: x.a, b: x.b}
2203 }
2204
2205
2206 func (x Uint32s) ConvertToInt32() Int32s {
2207 return Int32s{a: x.a, b: x.b}
2208 }
2209
2210
2211 func (x Uint32s) ReshapeToUint16s() Uint16s {
2212 return Uint16s{a: x.a, b: x.b}
2213 }
2214
2215
2216 func (x Uint32s) ReshapeToUint64s() Uint64s {
2217 return Uint64s{a: x.a, b: x.b}
2218 }
2219
2220
2221 func (x Uint32s) ReshapeToUint8s() Uint8s {
2222 return Uint8s{a: x.a, b: x.b}
2223 }
2224
2225
2226 type Uint64s struct {
2227 _ _simd
2228 a, b uint64
2229 }
2230
2231
2232 func LoadUint64s(s []uint64) Uint64s {
2233 var a, b uint64
2234 a = s[0]
2235 b = s[1]
2236 return Uint64s{a: a, b: b}
2237 }
2238
2239
2240 func LoadUint64sPart(s []uint64) (Uint64s, int) {
2241 n := len(s)
2242 var a, b uint64
2243 if n > 0 {
2244 a = s[0]
2245 }
2246 if n > 1 {
2247 b = s[1]
2248 }
2249 return Uint64s{a: a, b: b}, n
2250 }
2251
2252 func (x Uint64s) get(i int) uint64 {
2253 if i == 0 {
2254 return x.a
2255 }
2256 return x.b
2257 }
2258
2259 func (x *Uint64s) set(i int, v uint64) {
2260 if i == 0 {
2261 x.a = v
2262 } else {
2263 x.b = v
2264 }
2265 }
2266
2267
2268 func (x Uint64s) Add(y Uint64s) Uint64s {
2269 return Uint64s{a: x.a + y.a, b: x.b + y.b}
2270 }
2271
2272
2273 func (x Uint64s) And(y Uint64s) Uint64s {
2274 return Uint64s{a: x.a & y.a, b: x.b & y.b}
2275 }
2276
2277
2278 func (x Uint64s) AndNot(y Uint64s) Uint64s {
2279 return Uint64s{a: x.a &^ y.a, b: x.b &^ y.b}
2280 }
2281
2282
2283 func (x Uint64s) Equal(y Uint64s) Mask64s {
2284 var res Mask64s
2285 if x.a == y.a {
2286 res.a = ^uint64(0)
2287 }
2288 if x.b == y.b {
2289 res.b = ^uint64(0)
2290 }
2291 return res
2292 }
2293
2294
2295 func (x Uint64s) Greater(y Uint64s) Mask64s {
2296 var res Mask64s
2297 for i := 0; i < 2; i++ {
2298 if x.get(i) > y.get(i) {
2299 res.set(i, true)
2300 }
2301 }
2302 return res
2303 }
2304
2305
2306 func (x Uint64s) GreaterEqual(y Uint64s) Mask64s {
2307 var res Mask64s
2308 for i := 0; i < 2; i++ {
2309 if x.get(i) >= y.get(i) {
2310 res.set(i, true)
2311 }
2312 }
2313 return res
2314 }
2315
2316
2317 func (x Uint64s) Less(y Uint64s) Mask64s {
2318 var res Mask64s
2319 for i := 0; i < 2; i++ {
2320 if x.get(i) < y.get(i) {
2321 res.set(i, true)
2322 }
2323 }
2324 return res
2325 }
2326
2327
2328 func (x Uint64s) LessEqual(y Uint64s) Mask64s {
2329 var res Mask64s
2330 for i := 0; i < 2; i++ {
2331 if x.get(i) <= y.get(i) {
2332 res.set(i, true)
2333 }
2334 }
2335 return res
2336 }
2337
2338
2339 func (x Uint64s) NotEqual(y Uint64s) Mask64s {
2340 var res Mask64s
2341 if x.a != y.a {
2342 res.a = ^uint64(0)
2343 }
2344 if x.b != y.b {
2345 res.b = ^uint64(0)
2346 }
2347 return res
2348 }
2349
2350
2351 func (x Uint64s) Len() int {
2352 return 2
2353 }
2354
2355
2356 func (x Uint64s) Masked(mask Mask64s) Uint64s {
2357 return Uint64s{a: x.a & mask.a, b: x.b & mask.b}
2358 }
2359
2360
2361 func (x Uint64s) IfElse(mask Mask64s, y Uint64s) Uint64s {
2362 return Uint64s{
2363 a: (x.a & mask.a) | (y.a &^ mask.a),
2364 b: (x.b & mask.b) | (y.b &^ mask.b),
2365 }
2366 }
2367
2368
2369 func (x Uint64s) Not() Uint64s {
2370 return Uint64s{a: ^x.a, b: ^x.b}
2371 }
2372
2373
2374 func (x Uint64s) Or(y Uint64s) Uint64s {
2375 return Uint64s{a: x.a | y.a, b: x.b | y.b}
2376 }
2377
2378
2379 func (x Uint64s) ShiftAllLeft(y uint8) Uint64s {
2380 return Uint64s{a: x.a << y, b: x.b << y}
2381 }
2382
2383
2384 func (x Uint64s) ShiftAllRight(y uint8) Uint64s {
2385 return Uint64s{a: x.a >> y, b: x.b >> y}
2386 }
2387
2388
2389 func (x Uint64s) RotateAllLeft(dist uint64) Uint64s {
2390 d := dist & 63
2391 return Uint64s{
2392 a: (x.a << d) | (x.a >> ((64 - d) & 63)),
2393 b: (x.b << d) | (x.b >> ((64 - d) & 63)),
2394 }
2395 }
2396
2397
2398 func (x Uint64s) RotateAllRight(dist uint64) Uint64s {
2399 d := dist & 63
2400 return Uint64s{
2401 a: (x.a >> d) | (x.a << ((64 - d) & 63)),
2402 b: (x.b >> d) | (x.b << ((64 - d) & 63)),
2403 }
2404 }
2405
2406
2407 func (x Uint64s) Store(s []uint64) {
2408 if len(s) > 0 {
2409 s[0] = x.a
2410 }
2411 if len(s) > 1 {
2412 s[1] = x.b
2413 }
2414 }
2415
2416
2417 func (x Uint64s) StorePart(s []uint64) int {
2418 x.Store(s)
2419 return min(len(s), x.Len())
2420 }
2421
2422
2423 func (x Uint64s) String() string {
2424 return fmt.Sprint([2]uint64{x.a, x.b})
2425 }
2426
2427
2428 func (x Uint64s) Sub(y Uint64s) Uint64s {
2429 return Uint64s{a: x.a - y.a, b: x.b - y.b}
2430 }
2431
2432
2433 func (x Uint64s) Xor(y Uint64s) Uint64s {
2434 return Uint64s{a: x.a ^ y.a, b: x.b ^ y.b}
2435 }
2436
2437
2438 func (x Uint64s) BitsToFloat64() Float64s {
2439 return Float64s{a: x.a, b: x.b}
2440 }
2441
2442
2443 func (x Uint64s) BitsToInt64() Int64s {
2444 return Int64s{a: x.a, b: x.b}
2445 }
2446
2447
2448 func (x Uint64s) ConvertToInt64() Int64s {
2449 return Int64s{a: x.a, b: x.b}
2450 }
2451
2452
2453 func (x Uint64s) ReshapeToUint16s() Uint16s {
2454 return Uint16s{a: x.a, b: x.b}
2455 }
2456
2457
2458 func (x Uint64s) ReshapeToUint32s() Uint32s {
2459 return Uint32s{a: x.a, b: x.b}
2460 }
2461
2462
2463 func (x Uint64s) ReshapeToUint8s() Uint8s {
2464 return Uint8s{a: x.a, b: x.b}
2465 }
2466
2467
2468 type Float32s struct {
2469 _ _simd
2470 a, b uint64
2471 }
2472
2473
2474 func LoadFloat32s(s []float32) Float32s {
2475 var a, b uint64
2476 for i := 0; i < 4; i++ {
2477 val := uint64(math.Float32bits(s[i]))
2478 if i < 2 {
2479 a |= val << (32 * i)
2480 } else {
2481 b |= val << (32 * (i - 2))
2482 }
2483 }
2484 return Float32s{a: a, b: b}
2485 }
2486
2487
2488 func LoadFloat32sPart(s []float32) (Float32s, int) {
2489 var a, b uint64
2490 n := len(s)
2491 if n > 4 {
2492 n = 4
2493 }
2494 for i := 0; i < n; i++ {
2495 val := uint64(math.Float32bits(s[i]))
2496 if i < 2 {
2497 a |= val << (32 * i)
2498 } else {
2499 b |= val << (32 * (i - 2))
2500 }
2501 }
2502 return Float32s{a: a, b: b}, n
2503 }
2504
2505 func (x Float32s) get(i int) float32 {
2506 if i < 2 {
2507 return math.Float32frombits(uint32(x.a >> (32 * i)))
2508 }
2509 return math.Float32frombits(uint32(x.b >> (32 * (i - 2))))
2510 }
2511
2512 func (x *Float32s) set(i int, v float32) {
2513 val := uint64(math.Float32bits(v))
2514 if i < 2 {
2515 mask := uint64(0xffffffff) << (32 * i)
2516 x.a = (x.a &^ mask) | (val << (32 * i))
2517 } else {
2518 mask := uint64(0xffffffff) << (32 * (i - 2))
2519 x.b = (x.b &^ mask) | (val << (32 * (i - 2)))
2520 }
2521 }
2522
2523
2524 func (x Float32s) Abs() Float32s {
2525 var res Float32s
2526 for i := 0; i < 4; i++ {
2527 v := x.get(i)
2528 if v < 0 {
2529 res.set(i, -v)
2530 } else {
2531 res.set(i, v)
2532 }
2533 }
2534 return res
2535 }
2536
2537
2538 func (x Float32s) Add(y Float32s) Float32s {
2539 var res Float32s
2540 for i := 0; i < 4; i++ {
2541 res.set(i, x.get(i)+y.get(i))
2542 }
2543 return res
2544 }
2545
2546
2547 func (x Float32s) ConvertToInt32() Int32s {
2548 var res Int32s
2549 for i := 0; i < 4; i++ {
2550 res.set(i, int32(x.get(i)))
2551 }
2552 return res
2553 }
2554
2555
2556 func (x Float32s) Div(y Float32s) Float32s {
2557 var res Float32s
2558 for i := 0; i < 4; i++ {
2559 res.set(i, x.get(i)/y.get(i))
2560 }
2561 return res
2562 }
2563
2564
2565 func (x Float32s) Equal(y Float32s) Mask32s {
2566 var res Mask32s
2567 for i := 0; i < 4; i++ {
2568 if x.get(i) == y.get(i) {
2569 res.set(i, true)
2570 }
2571 }
2572 return res
2573 }
2574
2575
2576 func (x Float32s) Greater(y Float32s) Mask32s {
2577 var res Mask32s
2578 for i := 0; i < 4; i++ {
2579 if x.get(i) > y.get(i) {
2580 res.set(i, true)
2581 }
2582 }
2583 return res
2584 }
2585
2586
2587 func (x Float32s) GreaterEqual(y Float32s) Mask32s {
2588 var res Mask32s
2589 for i := 0; i < 4; i++ {
2590 if x.get(i) >= y.get(i) {
2591 res.set(i, true)
2592 }
2593 }
2594 return res
2595 }
2596
2597
2598 func (x Float32s) Len() int {
2599 return 4
2600 }
2601
2602
2603 func (x Float32s) Less(y Float32s) Mask32s {
2604 var res Mask32s
2605 for i := 0; i < 4; i++ {
2606 if x.get(i) < y.get(i) {
2607 res.set(i, true)
2608 }
2609 }
2610 return res
2611 }
2612
2613
2614 func (x Float32s) LessEqual(y Float32s) Mask32s {
2615 var res Mask32s
2616 for i := 0; i < 4; i++ {
2617 if x.get(i) <= y.get(i) {
2618 res.set(i, true)
2619 }
2620 }
2621 return res
2622 }
2623
2624
2625 func (x Float32s) Masked(mask Mask32s) Float32s {
2626 return Float32s{a: x.a & mask.a, b: x.b & mask.b}
2627 }
2628
2629
2630 func (x Float32s) Max(y Float32s) Float32s {
2631 var res Float32s
2632 for i := 0; i < 4; i++ {
2633 vx := x.get(i)
2634 vy := y.get(i)
2635 if vx > vy {
2636 res.set(i, vx)
2637 } else {
2638 res.set(i, vy)
2639 }
2640 }
2641 return res
2642 }
2643
2644
2645 func (x Float32s) IfElse(mask Mask32s, y Float32s) Float32s {
2646 return Float32s{
2647 a: (x.a & mask.a) | (y.a &^ mask.a),
2648 b: (x.b & mask.b) | (y.b &^ mask.b),
2649 }
2650 }
2651
2652
2653 func (x Float32s) Min(y Float32s) Float32s {
2654 var res Float32s
2655 for i := 0; i < 4; i++ {
2656 vx := x.get(i)
2657 vy := y.get(i)
2658 if vx < vy {
2659 res.set(i, vx)
2660 } else {
2661 res.set(i, vy)
2662 }
2663 }
2664 return res
2665 }
2666
2667
2668 func (x Float32s) Mul(y Float32s) Float32s {
2669 var res Float32s
2670 for i := 0; i < 4; i++ {
2671 res.set(i, x.get(i)*y.get(i))
2672 }
2673 return res
2674 }
2675
2676
2677 func (x Float32s) MulAdd(y, z Float32s) Float32s {
2678 var res Float32s
2679 for i := 0; i < 4; i++ {
2680 res.set(i, x.get(i)+y.get(i)*z.get(i))
2681 }
2682 return res
2683 }
2684
2685
2686 func (x Float32s) Neg() Float32s {
2687 var res Float32s
2688 for i := 0; i < 4; i++ {
2689 res.set(i, -(x.get(i)))
2690 }
2691 return res
2692 }
2693
2694
2695 func (x Float32s) NotEqual(y Float32s) Mask32s {
2696 var res Mask32s
2697 for i := 0; i < 4; i++ {
2698 if x.get(i) != y.get(i) {
2699 res.set(i, true)
2700 }
2701 }
2702 return res
2703 }
2704
2705
2706 func (x Float32s) Sqrt() Float32s {
2707 var res Float32s
2708 for i := 0; i < 4; i++ {
2709 res.set(i, float32(math.Sqrt(float64(x.get(i)))))
2710 }
2711 return res
2712 }
2713
2714
2715 func (x Float32s) Store(s []float32) {
2716 for i := 0; i < 4 && i < len(s); i++ {
2717 s[i] = x.get(i)
2718 }
2719 }
2720
2721
2722 func (x Float32s) StorePart(s []float32) int {
2723 x.Store(s)
2724 return min(len(s), x.Len())
2725 }
2726
2727
2728 func (x Float32s) String() string {
2729 var parts [4]float32
2730 for i := 0; i < 4; i++ {
2731 parts[i] = x.get(i)
2732 }
2733 return fmt.Sprint(parts)
2734 }
2735
2736
2737 func (x Float32s) Sub(y Float32s) Float32s {
2738 var res Float32s
2739 for i := 0; i < 4; i++ {
2740 res.set(i, x.get(i)-y.get(i))
2741 }
2742 return res
2743 }
2744
2745
2746 func (x Float32s) ToBits() Uint32s {
2747 return Uint32s{a: x.a, b: x.b}
2748 }
2749
2750
2751 type Float64s struct {
2752 _ _simd
2753 a, b uint64
2754 }
2755
2756
2757 func LoadFloat64s(s []float64) Float64s {
2758 var a, b uint64
2759 a = math.Float64bits(s[0])
2760 b = math.Float64bits(s[1])
2761 return Float64s{a: a, b: b}
2762 }
2763
2764
2765 func LoadFloat64sPart(s []float64) (Float64s, int) {
2766 n := len(s)
2767 var a, b uint64
2768 if n > 0 {
2769 a = math.Float64bits(s[0])
2770 }
2771 if n > 1 {
2772 b = math.Float64bits(s[1])
2773 }
2774 return Float64s{a: a, b: b}, n
2775 }
2776
2777 func (x Float64s) get(i int) float64 {
2778 if i == 0 {
2779 return math.Float64frombits(x.a)
2780 }
2781 return math.Float64frombits(x.b)
2782 }
2783
2784 func (x *Float64s) set(i int, v float64) {
2785 if i == 0 {
2786 x.a = math.Float64bits(v)
2787 } else {
2788 x.b = math.Float64bits(v)
2789 }
2790 }
2791
2792
2793 func (x Float64s) Abs() Float64s {
2794 var res Float64s
2795 for i := 0; i < 4; i++ {
2796 v := x.get(i)
2797 if v < 0 {
2798 res.set(i, -v)
2799 } else {
2800 res.set(i, v)
2801 }
2802 }
2803 return res
2804 }
2805
2806
2807 func (x Float64s) Add(y Float64s) Float64s {
2808 var res Float64s
2809 res.set(0, x.get(0)+y.get(0))
2810 res.set(1, x.get(1)+y.get(1))
2811 return res
2812 }
2813
2814
2815 func (x Float64s) Div(y Float64s) Float64s {
2816 var res Float64s
2817 res.set(0, x.get(0)/y.get(0))
2818 res.set(1, x.get(1)/y.get(1))
2819 return res
2820 }
2821
2822
2823 func (x Float64s) Equal(y Float64s) Mask64s {
2824 var res Mask64s
2825 if x.get(0) == y.get(0) {
2826 res.a = ^uint64(0)
2827 }
2828 if x.get(1) == y.get(1) {
2829 res.b = ^uint64(0)
2830 }
2831 return res
2832 }
2833
2834
2835 func (x Float64s) Greater(y Float64s) Mask64s {
2836 var res Mask64s
2837 if x.get(0) > y.get(0) {
2838 res.a = ^uint64(0)
2839 }
2840 if x.get(1) > y.get(1) {
2841 res.b = ^uint64(0)
2842 }
2843 return res
2844 }
2845
2846
2847 func (x Float64s) GreaterEqual(y Float64s) Mask64s {
2848 var res Mask64s
2849 if x.get(0) >= y.get(0) {
2850 res.a = ^uint64(0)
2851 }
2852 if x.get(1) >= y.get(1) {
2853 res.b = ^uint64(0)
2854 }
2855 return res
2856 }
2857
2858
2859 func (x Float64s) Len() int {
2860 return 2
2861 }
2862
2863
2864 func (x Float64s) Less(y Float64s) Mask64s {
2865 var res Mask64s
2866 if x.get(0) < y.get(0) {
2867 res.a = ^uint64(0)
2868 }
2869 if x.get(1) < y.get(1) {
2870 res.b = ^uint64(0)
2871 }
2872 return res
2873 }
2874
2875
2876 func (x Float64s) LessEqual(y Float64s) Mask64s {
2877 var res Mask64s
2878 if x.get(0) <= y.get(0) {
2879 res.a = ^uint64(0)
2880 }
2881 if x.get(1) <= y.get(1) {
2882 res.b = ^uint64(0)
2883 }
2884 return res
2885 }
2886
2887
2888 func (x Float64s) Masked(mask Mask64s) Float64s {
2889 return Float64s{a: x.a & mask.a, b: x.b & mask.b}
2890 }
2891
2892
2893 func (x Float64s) Max(y Float64s) Float64s {
2894 var res Float64s
2895 vx := x.get(0)
2896 vy := y.get(0)
2897 if vx > vy {
2898 res.set(0, vx)
2899 } else {
2900 res.set(0, vy)
2901 }
2902 vx = x.get(1)
2903 vy = y.get(1)
2904 if vx > vy {
2905 res.set(1, vx)
2906 } else {
2907 res.set(1, vy)
2908 }
2909 return res
2910 }
2911
2912
2913 func (x Float64s) IfElse(mask Mask64s, y Float64s) Float64s {
2914 return Float64s{
2915 a: (x.a & mask.a) | (y.a &^ mask.a),
2916 b: (x.b & mask.b) | (y.b &^ mask.b),
2917 }
2918 }
2919
2920
2921 func (x Float64s) Min(y Float64s) Float64s {
2922 var res Float64s
2923 vx := x.get(0)
2924 vy := y.get(0)
2925 if vx < vy {
2926 res.set(0, vx)
2927 } else {
2928 res.set(0, vy)
2929 }
2930 vx = x.get(1)
2931 vy = y.get(1)
2932 if vx < vy {
2933 res.set(1, vx)
2934 } else {
2935 res.set(1, vy)
2936 }
2937 return res
2938 }
2939
2940
2941 func (x Float64s) Mul(y Float64s) Float64s {
2942 var res Float64s
2943 res.set(0, x.get(0)*y.get(0))
2944 res.set(1, x.get(1)*y.get(1))
2945 return res
2946 }
2947
2948
2949 func (x Float64s) MulAdd(y, z Float64s) Float64s {
2950 var res Float64s
2951 res.set(0, x.get(0)+y.get(0)*z.get(0))
2952 res.set(1, x.get(1)+y.get(1)*z.get(1))
2953 return res
2954 }
2955
2956
2957 func (x Float64s) Neg() Float64s {
2958 var res Float64s
2959 for i := 0; i < 4; i++ {
2960 res.set(i, -(x.get(i)))
2961 }
2962 return res
2963 }
2964
2965
2966 func (x Float64s) NotEqual(y Float64s) Mask64s {
2967 var res Mask64s
2968 if x.get(0) != y.get(0) {
2969 res.a = ^uint64(0)
2970 }
2971 if x.get(1) != y.get(1) {
2972 res.b = ^uint64(0)
2973 }
2974 return res
2975 }
2976
2977
2978 func (x Float64s) Sqrt() Float64s {
2979 var res Float64s
2980 res.set(0, math.Sqrt(x.get(0)))
2981 res.set(1, math.Sqrt(x.get(1)))
2982 return res
2983 }
2984
2985
2986 func (x Float64s) Store(s []float64) {
2987 if len(s) > 0 {
2988 s[0] = x.get(0)
2989 }
2990 if len(s) > 1 {
2991 s[1] = x.get(1)
2992 }
2993 }
2994
2995
2996 func (x Float64s) StorePart(s []float64) int {
2997 x.Store(s)
2998 return min(len(s), x.Len())
2999 }
3000
3001
3002 func (x Float64s) String() string {
3003 return fmt.Sprint([2]float64{x.get(0), x.get(1)})
3004 }
3005
3006
3007 func (x Float64s) Sub(y Float64s) Float64s {
3008 var res Float64s
3009 res.set(0, x.get(0)-y.get(0))
3010 res.set(1, x.get(1)-y.get(1))
3011 return res
3012 }
3013
3014
3015 func (x Float64s) ToBits() Uint64s {
3016 return Uint64s{a: x.a, b: x.b}
3017 }
3018
3019
3020 type Mask8s struct {
3021 _ _simd
3022 a, b uint64
3023 }
3024
3025 func (x *Mask8s) set(i int, v bool) {
3026 if v {
3027 if i < 8 {
3028 mask := uint64(0xff) << (8 * i)
3029 x.a |= mask
3030 } else {
3031 mask := uint64(0xff) << (8 * (i - 8))
3032 x.b |= mask
3033 }
3034 }
3035 }
3036
3037
3038 func (x Mask8s) And(y Mask8s) Mask8s {
3039 return Mask8s{a: x.a & y.a, b: x.b & y.b}
3040 }
3041
3042
3043 func (x Mask8s) Or(y Mask8s) Mask8s {
3044 return Mask8s{a: x.a | y.a, b: x.b | y.b}
3045 }
3046
3047
3048 func (x Mask8s) String() string {
3049 return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
3050 }
3051
3052
3053 func (x Mask8s) ToInt8s() Int8s {
3054 return Int8s{a: x.a, b: x.b}
3055 }
3056
3057
3058 type Mask16s struct {
3059 _ _simd
3060 a, b uint64
3061 }
3062
3063 func (x *Mask16s) set(i int, v bool) {
3064 if v {
3065 if i < 4 {
3066 mask := uint64(0xffff) << (16 * i)
3067 x.a |= mask
3068 } else {
3069 mask := uint64(0xffff) << (16 * (i - 4))
3070 x.b |= mask
3071 }
3072 }
3073 }
3074
3075
3076 func (x Mask16s) And(y Mask16s) Mask16s {
3077 return Mask16s{a: x.a & y.a, b: x.b & y.b}
3078 }
3079
3080
3081 func (x Mask16s) Or(y Mask16s) Mask16s {
3082 return Mask16s{a: x.a | y.a, b: x.b | y.b}
3083 }
3084
3085
3086 func (x Mask16s) String() string {
3087 return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
3088 }
3089
3090
3091 func (x Mask16s) ToInt16s() Int16s {
3092 return Int16s{a: x.a, b: x.b}
3093 }
3094
3095
3096 type Mask32s struct {
3097 _ _simd
3098 a, b uint64
3099 }
3100
3101 func (x *Mask32s) set(i int, v bool) {
3102 if v {
3103 if i < 2 {
3104 mask := uint64(0xffffffff) << (32 * i)
3105 x.a |= mask
3106 } else {
3107 mask := uint64(0xffffffff) << (32 * (i - 2))
3108 x.b |= mask
3109 }
3110 }
3111 }
3112
3113
3114 func (x Mask32s) And(y Mask32s) Mask32s {
3115 return Mask32s{a: x.a & y.a, b: x.b & y.b}
3116 }
3117
3118
3119 func (x Mask32s) Or(y Mask32s) Mask32s {
3120 return Mask32s{a: x.a | y.a, b: x.b | y.b}
3121 }
3122
3123
3124 func (x Mask32s) String() string {
3125 return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
3126 }
3127
3128
3129 func (x Mask32s) ToInt32s() Int32s {
3130 return Int32s{a: x.a, b: x.b}
3131 }
3132
3133
3134 type Mask64s struct {
3135 _ _simd
3136 a, b uint64
3137 }
3138
3139 func (x *Mask64s) set(i int, v bool) {
3140 if v {
3141 if i == 0 {
3142 x.a = ^uint64(0)
3143 } else {
3144 x.b = ^uint64(0)
3145 }
3146 }
3147 }
3148
3149
3150 func (x Mask64s) And(y Mask64s) Mask64s {
3151 return Mask64s{a: x.a & y.a, b: x.b & y.b}
3152 }
3153
3154
3155 func (x Mask64s) Or(y Mask64s) Mask64s {
3156 return Mask64s{a: x.a | y.a, b: x.b | y.b}
3157 }
3158
3159
3160 func (x Mask64s) String() string {
3161 return fmt.Sprintf("{a:%#x, b:%#x}", x.a, x.b)
3162 }
3163
3164
3165 func (x Mask64s) ToInt64s() Int64s {
3166 return Int64s{a: x.a, b: x.b}
3167 }
3168
3169 func newT(lo, hi uint64) Uint64s {
3170 return Uint64s{a: lo, b: hi}
3171 }
3172
3173
3174 func (x Uint64s) mwl(y Uint64s) Uint64s {
3175 hi, lo := bits.Mul64(x.a, y.a)
3176 return Uint64s{a: lo, b: hi}
3177 }
3178
3179 var (
3180
3181 m0 = newT(0x1084210842108421, 0x2108421084210842)
3182 m1 = newT(0x2108421084210842, 0x4210842108421084)
3183 m2 = newT(0x4210842108421084, 0x8421084210842108)
3184 m3 = newT(0x8421084210842108, 0x0842108421084210)
3185 m4 = newT(0x0842108421084210, 0x1084210842108421)
3186 )
3187
3188 func (x Uint64s) clmul(y Uint64s) Uint64s {
3189 x0 := x.And(m0)
3190 x1 := x.And(m1)
3191 x2 := x.And(m2)
3192 x3 := x.And(m3)
3193 x4 := x.And(m4)
3194
3195 y0 := y.And(m0)
3196 y1 := y.And(m1)
3197 y2 := y.And(m2)
3198 y3 := y.And(m3)
3199 y4 := y.And(m4)
3200
3201
3202 z := (x0.mwl(y0)).Xor(x1.mwl(y4)).Xor(x4.mwl(y1)).Xor(x2.mwl(y3)).Xor(x3.mwl(y2)).And(m0)
3203 z = (x3.mwl(y3)).Xor(x2.mwl(y4)).Xor(x4.mwl(y2)).Xor(x0.mwl(y1)).Xor(x1.mwl(y0)).And(m1).Or(z)
3204 z = (x1.mwl(y1)).Xor(x3.mwl(y4)).Xor(x4.mwl(y3)).Xor(x0.mwl(y2)).Xor(x2.mwl(y0)).And(m2).Or(z)
3205 z = (x4.mwl(y4)).Xor(x0.mwl(y3)).Xor(x3.mwl(y0)).Xor(x1.mwl(y2)).Xor(x2.mwl(y1)).And(m3).Or(z)
3206 z = (x2.mwl(y2)).Xor(x0.mwl(y4)).Xor(x4.mwl(y0)).Xor(x1.mwl(y3)).Xor(x3.mwl(y1)).And(m4).Or(z)
3207
3208 return z
3209 }
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224 func (x Uint64s) CarrylessMultiplyEven(y Uint64s) Uint64s {
3225 return x.clmul(y)
3226 }
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241 func (x Uint64s) CarrylessMultiplyOdd(y Uint64s) Uint64s {
3242 x.a = x.b
3243 y.a = y.b
3244 return x.clmul(y)
3245 }
3246
// Replication multipliers: multiplying a value that fits in one lane by
// the constant copies it into every lane of a 64-bit word.
const (
	// by8 has the lowest bit of each of the eight bytes set.
	by8 = 0x0101010101010101
	// by16 has the lowest bit of each of the four 16-bit groups set.
	by16 = 0x0001000100010001
)
3251
3252
3253 func BroadcastInt8s(x int8) Int8s {
3254 v := (255 & uint64(x)) * by8
3255 return Int8s{a: v, b: v}
3256 }
3257
3258
3259 func BroadcastInt16s(x int16) Int16s {
3260 v := (65535 & uint64(x)) * by16
3261 return Int16s{a: v, b: v}
3262 }
3263
3264
3265 func BroadcastInt32s(x int32) Int32s {
3266 v := uint64(x) & 0xffffffff
3267 v = v<<32 | v
3268 return Int32s{a: v, b: v}
3269 }
3270
3271
3272 func BroadcastInt64s(x int64) Int64s {
3273 v := uint64(x)
3274 return Int64s{a: v, b: v}
3275 }
3276
3277
3278 func BroadcastUint8s(x uint8) Uint8s {
3279 v := uint64(x) * by8
3280 return Uint8s{a: v, b: v}
3281
3282 }
3283
3284
3285 func BroadcastUint16s(x uint16) Uint16s {
3286 v := uint64(x) * by16
3287 return Uint16s{a: v, b: v}
3288
3289 }
3290
3291
3292 func BroadcastUint32s(x uint32) Uint32s {
3293 v := uint64(x)
3294 v = v<<32 | v
3295 return Uint32s{a: v, b: v}
3296 }
3297
3298
3299 func BroadcastUint64s(x uint64) Uint64s {
3300 return Uint64s{a: x, b: x}
3301 }
3302
3303
3304 func BroadcastFloat32s(x float32) Float32s {
3305 v := uint64(math.Float32bits(x))
3306 v = v<<32 | v
3307 return Float32s{a: v, b: v}
3308 }
3309
3310
3311 func BroadcastFloat64s(x float64) Float64s {
3312 v := math.Float64bits(x)
3313 return Float64s{a: v, b: v}
3314 }
3315
View as plain text