Text file
src/math/big/arith_ppc64x.s
1 // Copyright 2013 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 //go:build !math_big_pure_go && (ppc64 || ppc64le)
6
7 #include "textflag.h"
8
9 // This file provides fast assembly versions for the elementary
10 // arithmetic operations on vectors implemented in arith.go.
11
12 // func addVV(z, x, y []Word) (c Word)
13 // z[i] = x[i] + y[i] for all i, carrying; c receives the final carry-out (0 or 1)
14 TEXT ·addVV(SB), NOSPLIT, $0
15 MOVD z_len+8(FP), R7 // R7 = z_len
16 MOVD x+24(FP), R8 // R8 = x[]
17 MOVD y+48(FP), R9 // R9 = y[]
18 MOVD z+0(FP), R10 // R10 = z[]
19
20 // If z_len = 0, we are done
21 CMP R0, R7
22 MOVD R0, R4 // R4 = c = 0 (R0 serves as the constant zero here)
23 BEQ done
24
25 // Process the first iteration out of the loop so we can
26 // use MOVDU and avoid 3 index registers updates.
27 MOVD 0(R8), R11 // R11 = x[i]
28 MOVD 0(R9), R12 // R12 = y[i]
29 ADD $-1, R7 // R7 = z_len - 1
30 ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
31 CMP R0, R7
32 MOVD R15, 0(R10) // z[i]
33 BEQ final // If z_len was 1, we are done
34
35 SRD $2, R7, R5 // R5 = (z_len-1)/4, the number of 4-word loop iterations
36 CMP R0, R5
37 MOVD R5, CTR // Set up loop counter
38 BEQ tail // If R5 = 0, we can't use the loop
39
40 // Process 4 elements per iteration. Unrolling this loop
41 // means a performance trade-off: we will lose performance
42 // for small values of z_len (0.90x in the worst case), but
43 // gain significant performance as z_len increases (up to
44 // 1.45x).
45 // The carry chain relies on the ADD/CMP/MOVD in between leaving CA untouched.
46 PCALIGN $16
47 loop:
48 MOVD 8(R8), R11 // R11 = x[i]
49 MOVD 16(R8), R12 // R12 = x[i+1]
50 MOVD 24(R8), R14 // R14 = x[i+2]
51 MOVDU 32(R8), R15 // R15 = x[i+3], R8 += 32
52 MOVD 8(R9), R16 // R16 = y[i]
53 MOVD 16(R9), R17 // R17 = y[i+1]
54 MOVD 24(R9), R18 // R18 = y[i+2]
55 MOVDU 32(R9), R19 // R19 = y[i+3], R9 += 32
56 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
57 ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
58 ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
59 ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
60 MOVD R20, 8(R10) // z[i]
61 MOVD R21, 16(R10) // z[i+1]
62 MOVD R22, 24(R10) // z[i+2]
63 MOVDU R23, 32(R10) // z[i+3], R10 += 32
64 ADD $-4, R7 // R7 = remaining element count - 4
65 BC 16, 0, loop // bdnz: decrement CTR, branch while CTR != 0
66
67 // Up to 3 elements may remain; R7 = 0 means none are left
68 CMP R0, R7
69 BEQ final
70
71 // Process the remaining elements, one at a time
72 tail:
73 MOVDU 8(R8), R11 // R11 = x[i], R8 += 8
74 MOVDU 8(R9), R16 // R16 = y[i], R9 += 8
75 ADD $-1, R7 // R7 = remaining element count - 1
76 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
77 CMP R0, R7
78 MOVDU R20, 8(R10) // z[i], R10 += 8
79 BEQ final // If R7 = 0, we are done
80
81 MOVDU 8(R8), R11 // second leftover element: R11 = x[i], R8 += 8
82 MOVDU 8(R9), R16 // R16 = y[i], R9 += 8
83 ADD $-1, R7 // R7 = remaining element count - 1
84 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
85 CMP R0, R7
86 MOVDU R20, 8(R10) // z[i], R10 += 8
87 BEQ final // If R7 = 0, we are done
88
89 MOVD 8(R8), R11 // third and last leftover element: R11 = x[i]
90 MOVD 8(R9), R16 // R16 = y[i]
91 ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
92 MOVD R20, 8(R10) // z[i]
93
94 final:
95 ADDZE R4 // R4 = 0 + CA: capture the final carry
96
97 done:
98 MOVD R4, c+72(FP) // return c
99 RET
100
101 // func subVV(z, x, y []Word) (c Word)
102 // z[i] = x[i] - y[i] for all i, carrying; c receives the final borrow (0 or 1)
103 TEXT ·subVV(SB), NOSPLIT, $0
104 MOVD z_len+8(FP), R7 // R7 = z_len
105 MOVD x+24(FP), R8 // R8 = x[]
106 MOVD y+48(FP), R9 // R9 = y[]
107 MOVD z+0(FP), R10 // R10 = z[]
108
109 // If z_len = 0, we are done
110 CMP R0, R7
111 MOVD R0, R4 // R4 = c = 0 (R0 serves as the constant zero here)
112 BEQ done
113
114 // Process the first iteration out of the loop so we can
115 // use MOVDU and avoid 3 index registers updates.
116 MOVD 0(R8), R11 // R11 = x[i]
117 MOVD 0(R9), R12 // R12 = y[i]
118 ADD $-1, R7 // R7 = z_len - 1
119 SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA (CA = 1 means no borrow, see final)
120 CMP R0, R7
121 MOVD R15, 0(R10) // z[i]
122 BEQ final // If z_len was 1, we are done
123
124 SRD $2, R7, R5 // R5 = (z_len-1)/4, the number of 4-word loop iterations
125 CMP R0, R5
126 MOVD R5, CTR // Set up loop counter
127 BEQ tail // If R5 = 0, we can't use the loop
128
129 // Process 4 elements per iteration. Unrolling this loop
130 // means a performance trade-off: we will lose performance
131 // for small values of z_len (0.92x in the worst case), but
132 // gain significant performance as z_len increases (up to
133 // 1.45x).
134 // The borrow chain relies on the ADD/CMP/MOVD in between leaving CA untouched.
135 PCALIGN $16
136 loop:
137 MOVD 8(R8), R11 // R11 = x[i]
138 MOVD 16(R8), R12 // R12 = x[i+1]
139 MOVD 24(R8), R14 // R14 = x[i+2]
140 MOVDU 32(R8), R15 // R15 = x[i+3], R8 += 32
141 MOVD 8(R9), R16 // R16 = y[i]
142 MOVD 16(R9), R17 // R17 = y[i+1]
143 MOVD 24(R9), R18 // R18 = y[i+2]
144 MOVDU 32(R9), R19 // R19 = y[i+3], R9 += 32
145 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow (borrow = !CA)
146 SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] - borrow
147 SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] - borrow
148 SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] - borrow
149 MOVD R20, 8(R10) // z[i]
150 MOVD R21, 16(R10) // z[i+1]
151 MOVD R22, 24(R10) // z[i+2]
152 MOVDU R23, 32(R10) // z[i+3], R10 += 32
153 ADD $-4, R7 // R7 = remaining element count - 4
154 BC 16, 0, loop // bdnz: decrement CTR, branch while CTR != 0
155
156 // Up to 3 elements may remain; R7 = 0 means none are left
157 CMP R0, R7
158 BEQ final
159
160 // Process the remaining elements, one at a time
161 tail:
162 MOVDU 8(R8), R11 // R11 = x[i], R8 += 8
163 MOVDU 8(R9), R16 // R16 = y[i], R9 += 8
164 ADD $-1, R7 // R7 = remaining element count - 1
165 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow (borrow = !CA)
166 CMP R0, R7
167 MOVDU R20, 8(R10) // z[i], R10 += 8
168 BEQ final // If R7 = 0, we are done
169
170 MOVDU 8(R8), R11 // second leftover element: R11 = x[i], R8 += 8
171 MOVDU 8(R9), R16 // R16 = y[i], R9 += 8
172 ADD $-1, R7 // R7 = remaining element count - 1
173 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow
174 CMP R0, R7
175 MOVDU R20, 8(R10) // z[i], R10 += 8
176 BEQ final // If R7 = 0, we are done
177
178 MOVD 8(R8), R11 // third and last leftover element: R11 = x[i]
179 MOVD 8(R9), R16 // R16 = y[i]
180 SUBE R16, R11, R20 // R20 = x[i] - y[i] - borrow
181 MOVD R20, 8(R10) // z[i]
182
183 final:
184 ADDZE R4 // R4 = 0 + CA
185 XOR $1, R4 // c = CA ^ 1: the returned borrow is the inverse of CA
186
187 done:
188 MOVD R4, c+72(FP) // return c
189 RET
190
191 // func addVW(z, x []Word, y Word) (c Word)
192 TEXT ·addVW(SB), NOSPLIT, $0 // z = x + y (y is a single word), c = final carry-out; c = y when len(z) = 0
193 MOVD z+0(FP), R10 // R10 = z[]
194 MOVD x+24(FP), R8 // R8 = x[]
195 MOVD y+48(FP), R4 // R4 = y = c
196 MOVD z_len+8(FP), R11 // R11 = z_len
197
198 CMP R0, R11 // If z_len is zero, return
199 BEQ done
200
201 // We will process the first iteration out of the loop so we capture
202 // the value of c. In the subsequent iterations, we will rely on the
203 // value of CA set here.
204 MOVD 0(R8), R20 // R20 = x[i]
205 ADD $-1, R11 // R11 = z_len - 1
206 ADDC R20, R4, R6 // R6 = x[i] + c, set CA
207 CMP R0, R11 // If z_len was 1, we are done
208 MOVD R6, 0(R10) // z[i]
209 BEQ final
210
211 // We will read 4 elements per iteration
212 SRD $2, R11, R9 // R9 = (z_len-1)/4, the number of 4-word loop iterations
213 DCBT (R8) // data cache block touch: prefetch hint for x[]
214 CMP R0, R9
215 MOVD R9, CTR // Set up the loop counter
216 BEQ tail // If R9 = 0, we can't use the loop
217 PCALIGN $16
218
219 loop:
220 MOVD 8(R8), R20 // R20 = x[i]
221 MOVD 16(R8), R21 // R21 = x[i+1]
222 MOVD 24(R8), R22 // R22 = x[i+2]
223 MOVDU 32(R8), R23 // R23 = x[i+3], R8 += 32
224 ADDZE R20, R24 // R24 = x[i] + CA
225 ADDZE R21, R25 // R25 = x[i+1] + CA
226 ADDZE R22, R26 // R26 = x[i+2] + CA
227 ADDZE R23, R27 // R27 = x[i+3] + CA
228 MOVD R24, 8(R10) // z[i]
229 MOVD R25, 16(R10) // z[i+1]
230 MOVD R26, 24(R10) // z[i+2]
231 MOVDU R27, 32(R10) // z[i+3], R10 += 32
232 ADD $-4, R11 // R11 = remaining element count - 4
233 BC 16, 0, loop // bdnz: decrement CTR, branch while CTR != 0
234
235 // Up to 3 elements may remain; R11 = 0 means none are left
236 CMP R0, R11
237 BEQ final
238
239 tail:
240 MOVDU 8(R8), R20 // R20 = x[i], R8 += 8
241 ADDZE R20, R24 // R24 = x[i] + CA
242 ADD $-1, R11 // R11 = remaining element count - 1
243 MOVDU R24, 8(R10) // z[i], R10 += 8
244 CMP R0, R11
245 BEQ final // If R11 = 0, we are done
246
247 MOVDU 8(R8), R20 // second leftover element: R20 = x[i], R8 += 8
248 ADDZE R20, R24 // R24 = x[i] + CA
249 ADD $-1, R11 // R11 = remaining element count - 1
250 MOVDU R24, 8(R10) // z[i], R10 += 8
251 CMP R0, R11
252 BEQ final // If R11 = 0, we are done
253
254 MOVD 8(R8), R20 // third and last leftover element: R20 = x[i]
255 ADDZE R20, R24 // R24 = x[i] + CA
256 MOVD R24, 8(R10) // z[i]
257
258 final:
259 ADDZE R0, R4 // c = CA
260 done:
261 MOVD R4, c+56(FP) // return c (still y if z_len was 0)
262 RET
263
264 // func subVW(z, x []Word, y Word) (c Word)
265 TEXT ·subVW(SB), NOSPLIT, $0 // z = x - y (y is a single word), c = final borrow; c = y when len(z) = 0
266 MOVD z+0(FP), R10 // R10 = z[]
267 MOVD x+24(FP), R8 // R8 = x[]
268 MOVD y+48(FP), R4 // R4 = y = c
269 MOVD z_len+8(FP), R11 // R11 = z_len
270
271 CMP R0, R11 // If z_len is zero, return
272 BEQ done
273
274 // We will process the first iteration out of the loop so we capture
275 // the value of c. In the subsequent iterations, we will rely on the
276 // value of CA set here.
277 MOVD 0(R8), R20 // R20 = x[i]
278 ADD $-1, R11 // R11 = z_len - 1
279 SUBC R4, R20, R6 // R6 = x[i] - c, set CA (CA = 1 means no borrow, see final)
280 CMP R0, R11 // If z_len was 1, we are done
281 MOVD R6, 0(R10) // z[i]
282 BEQ final
283
284 // We will read 4 elements per iteration
285 SRD $2, R11, R9 // R9 = (z_len-1)/4, the number of 4-word loop iterations
286 DCBT (R8) // data cache block touch: prefetch hint for x[]
287 CMP R0, R9
288 MOVD R9, CTR // Set up the loop counter
289 BEQ tail // If R9 = 0, we can't use the loop
290
291 // The loop here is almost the same as the one used in s390x, but
292 // we don't need to capture CA every iteration because we've already
293 // done that above.
294
295 PCALIGN $16
296 loop:
297 MOVD 8(R8), R20 // R20 = x[i]
298 MOVD 16(R8), R21 // R21 = x[i+1]
299 MOVD 24(R8), R22 // R22 = x[i+2]
300 MOVDU 32(R8), R23 // R23 = x[i+3], R8 += 32
301 SUBE R0, R20 // R20 = x[i] - 0 - borrow (borrow = !CA)
302 SUBE R0, R21 // R21 = x[i+1] - 0 - borrow
303 SUBE R0, R22 // R22 = x[i+2] - 0 - borrow
304 SUBE R0, R23 // R23 = x[i+3] - 0 - borrow
305 MOVD R20, 8(R10) // z[i]
306 MOVD R21, 16(R10) // z[i+1]
307 MOVD R22, 24(R10) // z[i+2]
308 MOVDU R23, 32(R10) // z[i+3], R10 += 32
309 ADD $-4, R11 // R11 = remaining element count - 4
310 BC 16, 0, loop // bdnz: decrement CTR, branch while CTR != 0
311
312 // Up to 3 elements may remain; R11 = 0 means none are left
313 CMP R0, R11
314 BEQ final
315
316 tail:
317 MOVDU 8(R8), R20 // R20 = x[i], R8 += 8
318 SUBE R0, R20 // R20 = x[i] - 0 - borrow
319 ADD $-1, R11 // R11 = remaining element count - 1
320 MOVDU R20, 8(R10) // z[i], R10 += 8
321 CMP R0, R11
322 BEQ final // If R11 = 0, we are done
323
324 MOVDU 8(R8), R20 // second leftover element: R20 = x[i], R8 += 8
325 SUBE R0, R20 // R20 = x[i] - 0 - borrow
326 ADD $-1, R11 // R11 = remaining element count - 1
327 MOVDU R20, 8(R10) // z[i], R10 += 8
328 CMP R0, R11
329 BEQ final // If R11 = 0, we are done
330
331 MOVD 8(R8), R20 // third and last leftover element: R20 = x[i]
332 SUBE R0, R20 // R20 = x[i] - 0 - borrow
333 MOVD R20, 8(R10) // z[i]
334
335 final:
336 // Capture CA
337 SUBE R4, R4 // R4 = R4 - R4 - borrow: 0 if no borrow, -1 if borrow
338 NEG R4, R4 // c = -R4: the borrow as 0 or 1
339
340 done:
341 MOVD R4, c+56(FP) // return c (still y if z_len was 0)
342 RET
343
344 //func shlVU(z, x []Word, s uint) (c Word)
345 TEXT ·shlVU(SB), NOSPLIT, $0 // z = x << s, c = x[len(z)-1] >> (64-s); s == 0 degenerates to copy(z, x) with c = 0
346 MOVD z+0(FP), R3 // R3 = z[]
347 MOVD x+24(FP), R6 // R6 = x[]
348 MOVD s+48(FP), R9 // R9 = s
349 MOVD z_len+8(FP), R4 // R4 = len(z)
350 MOVD x_len+32(FP), R7 // R7 = len(x), only read on the zeroshift path
351 CMP R9, R0 // s==0 copy(z,x)
352 BEQ zeroshift
353 CMP R4, R0 // len(z)==0 return
354 BEQ done
355
356 ADD $-1, R4, R5 // len(z)-1
357 SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
358 SLD $3, R5, R7 // R7 = (len(z)-1)*8, byte offset of the last element
359 ADD R6, R7, R15 // save starting address &x[len(z)-1]
360 ADD R3, R7, R16 // save starting address &z[len(z)-1]
361 MOVD (R6)(R7), R14 // R14 = x[len(z)-1]
362 SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7
363 CMP R5, R0 // iterate from i=len(z)-1 to 0
364 BEQ loopexit // Already at end?
365 MOVD 0(R15),R10 // x[i]
366 PCALIGN $16
367 shloop:
368 SLD R9, R10, R10 // x[i]<<s
369 MOVDU -8(R15), R14 // R14 = x[i-1], R15 -= 8
370 SRD R4, R14, R11 // x[i-1]>>ŝ
371 OR R11, R10, R10 // R10 = x[i]<<s | x[i-1]>>ŝ
372 MOVD R10, 0(R16) // z[i]=x[i]<<s | x[i-1]>>ŝ (R16 points at z[i])
373 MOVD R14, R10 // reuse x[i-1] for next iteration
374 ADD $-8, R16 // i--
375 CMP R15, R6 // &x[i-1]>&x[0]?
376 BGT shloop
377 loopexit:
378 MOVD 0(R6), R4 // R4 = x[0]
379 SLD R9, R4, R4 // R4 = x[0]<<s
380 MOVD R4, 0(R3) // z[0]=x[0]<<s
381 MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c
382 RET
383
384 zeroshift:
385 CMP R6, R0 // x is null, nothing to copy
386 BEQ done
387 CMP R6, R3 // if x is same as z, nothing to copy
388 BEQ done
389 CMP R7, R4 // compare len(x) with len(z)
390 ISEL $0, R7, R4, R7 // Take the lower bound of lengths of x,z
391 SLD $3, R7, R7 // R7 = number of bytes to copy
392 SUB R6, R3, R11 // dest - src
393 CMPU R11, R7, CR2 // < len?
394 BLT CR2, backward // there is overlap, copy backwards
395 MOVD $0, R14 // R14 = byte offset, starting at the front
396 // shlVU processes backwards, but added a forward copy option
397 // since it's faster on POWER
398 repeat: // NOTE(review): one word is copied before the bound test, so a zero length seems to be assumed impossible here — confirm
399 MOVD (R6)(R14), R15 // Copy 8 bytes at a time
400 MOVD R15, (R3)(R14)
401 ADD $8, R14
402 CMP R14, R7 // More 8 bytes left?
403 BLT repeat
404 BR done
405 backward:
406 ADD $-8,R7, R14 // R14 = byte offset of the last word to copy
407 repeatback:
408 MOVD (R6)(R14), R15 // copy x into z backwards
409 MOVD R15, (R3)(R14) // copy 8 bytes at a time
410 SUB $8, R14
411 CMP R14, $-8 // More 8 bytes left?
412 BGT repeatback
413
414 done:
415 MOVD R0, c+56(FP) // c=0
416 RET
417
418 //func shrVU(z, x []Word, s uint) (c Word)
419 TEXT ·shrVU(SB), NOSPLIT, $0 // z = x >> s, c = x[0] << (64-s); s == 0 degenerates to copy(z, x) with c = 0
420 MOVD z+0(FP), R3 // R3 = z[]
421 MOVD x+24(FP), R6 // R6 = x[]
422 MOVD s+48(FP), R9 // R9 = s
423 MOVD z_len+8(FP), R4 // R4 = len(z)
424 MOVD x_len+32(FP), R7 // R7 = len(x), only read on the zeroshift path
425
426 CMP R9, R0 // s==0, copy(z,x)
427 BEQ zeroshift
428 CMP R4, R0 // len(z)==0 return
429 BEQ done
430 SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
431
432 MOVD 0(R6), R7 // R7 = x[0]
433 SLD R5, R7, R7 // compute x[0]<<ŝ
434 MOVD $1, R8 // iterate from i=1 to i<len(z)
435 CMP R8, R4
436 BGE loopexit // Already at end?
437
438 // vectorize if len(z) is >=3, else jump to scalar loop
439 CMP R4, $3
440 BLT scalar
441 MTVSRD R9, VS38 // s
442 VSPLTB $7, V6, V4 // V4 = byte 7 of V6 replicated to every byte; used as the shift count s
443 MTVSRD R5, VS39 // ŝ
444 VSPLTB $7, V7, V2 // V2 = byte 7 of V7 replicated to every byte; used as the shift count ŝ
445 ADD $-2, R4, R16 // R16 = len(z)-2, last i at which a full pair can start
446 PCALIGN $16
447 loopback:
448 ADD $-1, R8, R10 // R10 = i-1
449 SLD $3, R10 // R10 = (i-1)*8, byte offset of x[i-1] and z[i-1]
450 LXVD2X (R6)(R10), VS32 // load x[i-1], x[i]
451 SLD $3, R8, R12 // R12 = i*8, byte offset of x[i]
452 LXVD2X (R6)(R12), VS33 // load x[i], x[i+1]
453
454 VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s
455 VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ
456 VOR V3, V5, V5 // Or(|) the two registers together
457 STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
458 ADD $2, R8 // Done processing 2 entries, i and i+1
459 CMP R8, R16 // Are there at least a couple of more entries left?
460 BLE loopback
461 CMP R8, R4 // i == len(z)? then only z[len(z)-1] is left
462 BEQ loopexit
463 scalar:
464 ADD $-1, R8, R10 // R10 = i-1 (i = len(z)-1 here)
465 SLD $3, R10 // R10 = byte offset of element len(z)-2
466 MOVD (R6)(R10),R11 // R11 = x[len(z)-2]
467 SRD R9, R11, R11 // x[len(z)-2] >> s
468 SLD $3, R8, R12 // R12 = byte offset of element len(z)-1
469 MOVD (R6)(R12), R12 // R12 = x[len(z)-1]
470 SLD R5, R12, R12 // x[len(z)-1]<<ŝ
471 OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
472 MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
473 loopexit:
474 ADD $-1, R4 // R4 = len(z)-1
475 SLD $3, R4 // R4 = byte offset of the last element
476 MOVD (R6)(R4), R5 // R5 = x[len(z)-1]
477 SRD R9, R5, R5 // x[len(z)-1]>>s
478 MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s
479 MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c
480 RET
481
482 zeroshift:
483 CMP R6, R0 // x is null, nothing to copy
484 BEQ done
485 CMP R6, R3 // if x is same as z, nothing to copy
486 BEQ done
487 CMP R7, R4 // compare len(x) with len(z)
488 ISEL $0, R7, R4, R7 // Take the lower bounds of lengths of x, z
489 SLD $3, R7, R7 // R7 = number of bytes to copy
490 MOVD $0, R14 // R14 = byte offset, starting at the front
491 repeat: // NOTE(review): one word is copied before the bound test, so a zero length seems to be assumed impossible here — confirm
492 MOVD (R6)(R14), R15 // copy 8 bytes at a time
493 MOVD R15, (R3)(R14) // shrVU processes bytes only forwards
494 ADD $8, R14
495 CMP R14, R7 // More 8 bytes left?
496 BLT repeat
497 done:
498 MOVD R0, c+56(FP) // c=0
499 RET
500
501 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
502 TEXT ·mulAddVWW(SB), NOSPLIT, $0 // z = x*y + r, c = final carry word; c = r when len(z) = 0
503 MOVD z+0(FP), R10 // R10 = z[]
504 MOVD x+24(FP), R8 // R8 = x[]
505 MOVD y+48(FP), R9 // R9 = y
506 MOVD r+56(FP), R4 // R4 = r = c
507 MOVD z_len+8(FP), R11 // R11 = z_len
508
509 CMP R0, R11 // If z_len is zero, return
510 BEQ done
511 // First element is handled outside the loop so the loop can use MOVDU addressing.
512 MOVD 0(R8), R20 // R20 = x[i]
513 ADD $-1, R11 // R11 = z_len - 1
514 MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
515 MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
516 ADDC R4, R6 // R6 = z0 + r, set CA
517 ADDZE R7, R4 // R4 = z1 + CA
518 CMP R0, R11
519 MOVD R6, 0(R10) // z[i]
520 BEQ done // If z_len was 1, we are done
521
522 // We will read 4 elements per iteration
523 SRD $2, R11, R14 // R14 = (z_len-1)/4, the number of 4-word loop iterations
524 DCBT (R8) // data cache block touch: prefetch hint for x[]
525 CMP R0, R14
526 MOVD R14, CTR // Set up the loop counter
527 BEQ tail // If R14 = 0, we can't use the loop
528 PCALIGN $16
529 // In the loop the interleaved multiplies are relied on not to disturb CA.
530 loop:
531 MOVD 8(R8), R20 // R20 = x[i]
532 MOVD 16(R8), R21 // R21 = x[i+1]
533 MOVD 24(R8), R22 // R22 = x[i+2]
534 MOVDU 32(R8), R23 // R23 = x[i+3], R8 += 32
535 MULLD R9, R20, R24 // R24 = z0[i] = low(x[i]*y)
536 MULHDU R9, R20, R20 // R20 = z1[i] = high(x[i]*y)
537 ADDC R4, R24 // R24 = z0[i] + c, set CA
538 MULLD R9, R21, R25 // R25 = low(x[i+1]*y)
539 MULHDU R9, R21, R21 // R21 = high(x[i+1]*y)
540 ADDE R20, R25 // R25 += high(x[i]*y) + CA
541 MULLD R9, R22, R26 // R26 = low(x[i+2]*y)
542 MULHDU R9, R22, R22 // R22 = high(x[i+2]*y)
543 MULLD R9, R23, R27 // R27 = low(x[i+3]*y)
544 MULHDU R9, R23, R23 // R23 = high(x[i+3]*y)
545 ADDE R21, R26 // R26 += high(x[i+1]*y) + CA
546 MOVD R24, 8(R10) // z[i]
547 MOVD R25, 16(R10) // z[i+1]
548 ADDE R22, R27 // R27 += high(x[i+2]*y) + CA
549 ADDZE R23,R4 // update carry: c = high(x[i+3]*y) + CA
550 MOVD R26, 24(R10) // z[i+2]
551 MOVDU R27, 32(R10) // z[i+3], R10 += 32
552 ADD $-4, R11 // R11 = remaining element count - 4
553 BC 16, 0, loop // bdnz: decrement CTR, branch while CTR != 0
554
555 // Up to 3 elements may remain; R11 = 0 means none are left
556 CMP R0, R11
557 BEQ done
558
559 // Process the remaining elements, one at a time
560 tail:
561 MOVDU 8(R8), R20 // R20 = x[i], R8 += 8
562 MULLD R9, R20, R24 // R24 = z0[i]
563 MULHDU R9, R20, R25 // R25 = z1[i]
564 ADD $-1, R11 // R11 = remaining element count - 1
565 ADDC R4, R24 // R24 = z0[i] + c, set CA
566 ADDZE R25, R4 // c = z1[i] + CA
567 MOVDU R24, 8(R10) // z[i], R10 += 8
568 CMP R0, R11
569 BEQ done // If R11 = 0, we are done
570
571 MOVDU 8(R8), R20 // second leftover element: R20 = x[i], R8 += 8
572 MULLD R9, R20, R24 // R24 = z0[i]
573 MULHDU R9, R20, R25 // R25 = z1[i]
574 ADD $-1, R11 // R11 = remaining element count - 1
575 ADDC R4, R24 // R24 = z0[i] + c, set CA
576 ADDZE R25, R4 // c = z1[i] + CA
577 MOVDU R24, 8(R10) // z[i], R10 += 8
578 CMP R0, R11
579 BEQ done // If R11 = 0, we are done
580
581 MOVD 8(R8), R20 // third and last leftover element: R20 = x[i]
582 MULLD R9, R20, R24 // R24 = z0[i]
583 MULHDU R9, R20, R25 // R25 = z1[i]
584 ADD $-1, R11 // R11 is not read again after this point
585 ADDC R4, R24 // R24 = z0[i] + c, set CA
586 ADDZE R25,R4 // c = z1[i] + CA
587 MOVD R24, 8(R10) // z[i]
588
589 done:
590 MOVD R4, c+64(FP) // return c (still r if z_len was 0)
591 RET
592
593 // func addMulVVW(z, x []Word, y Word) (c Word)
594 TEXT ·addMulVVW(SB), NOSPLIT, $0 // z[i] += x[i]*y for all i, propagating the carry word; c = final carry word
595 MOVD z+0(FP), R3 // R3 = z[]
596 MOVD x+24(FP), R4 // R4 = x[]
597 MOVD y+48(FP), R5 // R5 = y
598 MOVD z_len+8(FP), R6 // R6 = z_len
599
600 CMP R6, $4
601 MOVD R0, R9 // R9 = c = 0
602 BLT tail // Fewer than 4 elements: skip the unrolled loop
603 SRD $2, R6, R7 // R7 = z_len/4, the number of 4-word loop iterations
604 MOVD R7, CTR // Initialize loop counter
605 PCALIGN $16
606
607 loop:
608 MOVD 0(R4), R14 // x[i]
609 MOVD 8(R4), R16 // x[i+1]
610 MOVD 16(R4), R18 // x[i+2]
611 MOVD 24(R4), R20 // x[i+3]
612 MOVD 0(R3), R15 // z[i]
613 MOVD 8(R3), R17 // z[i+1]
614 MOVD 16(R3), R19 // z[i+2]
615 MOVD 24(R3), R21 // z[i+3]
616 MULLD R5, R14, R10 // low x[i]*y
617 MULHDU R5, R14, R11 // high x[i]*y
618 ADDC R15, R10 // low += z[i], set CA
619 ADDZE R11 // high += CA
620 ADDC R9, R10 // low += c, set CA
621 ADDZE R11, R9 // c = high + CA
622 MULLD R5, R16, R14 // low x[i+1]*y
623 MULHDU R5, R16, R15 // high x[i+1]*y
624 ADDC R17, R14 // low += z[i+1], set CA
625 ADDZE R15 // high += CA
626 ADDC R9, R14 // low += c, set CA
627 ADDZE R15, R9 // c = high + CA
628 MULLD R5, R18, R16 // low x[i+2]*y
629 MULHDU R5, R18, R17 // high x[i+2]*y
630 ADDC R19, R16 // low += z[i+2], set CA
631 ADDZE R17 // high += CA
632 ADDC R9, R16 // low += c, set CA
633 ADDZE R17, R9 // c = high + CA
634 MULLD R5, R20, R18 // low x[i+3]*y
635 MULHDU R5, R20, R19 // high x[i+3]*y
636 ADDC R21, R18 // low += z[i+3], set CA
637 ADDZE R19 // high += CA
638 ADDC R9, R18 // low += c, set CA
639 ADDZE R19, R9 // c = high + CA
640 MOVD R10, 0(R3) // z[i]
641 MOVD R14, 8(R3) // z[i+1]
642 MOVD R16, 16(R3) // z[i+2]
643 MOVD R18, 24(R3) // z[i+3]
644 ADD $32, R3 // advance z by 4 words
645 ADD $32, R4 // advance x by 4 words
646 BDNZ loop // decrement CTR, branch while CTR != 0
647
648 ANDCC $3, R6 // R6 = z_len % 4, the leftover element count
649 tail:
650 CMP R0, R6
651 BEQ done // Nothing left to process
652 MOVD R6, CTR // One tailloop iteration per remaining element
653 PCALIGN $16
654 tailloop:
655 MOVD 0(R4), R14 // x[i]
656 MOVD 0(R3), R15 // z[i]
657 MULLD R5, R14, R10 // low x[i]*y
658 MULHDU R5, R14, R11 // high x[i]*y
659 ADDC R15, R10 // low += z[i], set CA
660 ADDZE R11 // high += CA
661 ADDC R9, R10 // low += c, set CA
662 ADDZE R11, R9 // c = high + CA
663 MOVD R10, 0(R3) // z[i]
664 ADD $8, R3 // advance z by 1 word
665 ADD $8, R4 // advance x by 1 word
666 BDNZ tailloop // decrement CTR, branch while CTR != 0
667
668 done:
669 MOVD R9, c+56(FP) // return c
670 RET
671
672
View as plain text