Text file
src/math/big/arith_loong64.s
1 // Copyright 2025 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
6
7 //go:build !math_big_pure_go
8
9 #include "textflag.h"
10
11 // func addVV(z, x, y []Word) (c Word)
//
// addVV adds the word vectors x and y word by word, feeding the carry out of
// each word into the next one, stores the sums in z and returns the final
// carry in c. Only len(z) is consulted for the length.
// NOTE(review): presumably x and y hold at least len(z) words - confirm against callers.
//
// The words are handled in two phases: len(z)%4 words one at a time (loop1),
// then len(z)/4 groups of four words (loop4). The carry lives in R28; each
// add-with-carry is built from ADDVU/SGTU pairs, and the trailing "ADCS"
// annotations name the operation each five-instruction sequence stands for.
12 TEXT ·addVV(SB), NOSPLIT, $0
13 MOVV z_len+8(FP), R4 // R4 = len(z)
14 MOVV x_base+24(FP), R5 // R5 = read pointer into x
15 MOVV y_base+48(FP), R6 // R6 = read pointer into y
16 MOVV z_base+0(FP), R7 // R7 = write pointer into z
17 // compute unrolled loop lengths
18 AND $3, R4, R8 // R8 = len(z) % 4: iterations of the 1X loop
19 SRLV $2, R4 // R4 = len(z) / 4: iterations of the 4X loop
20 XOR R28, R28 // clear carry
21 loop1:
22 BEQ R8, loop1done // skip the 1X loop when there are no leftover words
23 loop1cont:
24 // unroll 1X
25 MOVV 0(R5), R9 // R9 = current word of x
26 MOVV 0(R6), R10 // R10 = current word of y
27 ADDVU R10, R9 // ADCS R10, R9, R9 (cr=R28): R9 = x word + y word
28 SGTU R10, R9, R30 // R30 = 1 if the y word exceeds the sum, i.e. the add wrapped
29 ADDVU R28, R9 // add the incoming carry
30 SGTU R28, R9, R28 // R28 = 1 if adding the carry wrapped
31 ADDVU R30, R28 // outgoing carry = sum of the two wrap flags
32 MOVV R9, 0(R7) // store the sum word into z
33 ADDVU $8, R5 // advance all three pointers by one 8-byte word
34 ADDVU $8, R6
35 ADDVU $8, R7
36 SUBVU $1, R8
37 BNE R8, loop1cont
38 loop1done:
39 loop4:
40 BEQ R4, loop4done // skip the 4X loop when fewer than four words remain
41 loop4cont:
42 // unroll 4X: the same carry sequence as the 1X loop, applied to four words.
// R8 (the finished 1X counter) is reused here as a data register.
43 MOVV 0(R5), R8
44 MOVV 8(R5), R9
45 MOVV 16(R5), R10
46 MOVV 24(R5), R11
47 MOVV 0(R6), R12
48 MOVV 8(R6), R13
49 MOVV 16(R6), R14
50 MOVV 24(R6), R15
51 ADDVU R12, R8 // ADCS R12, R8, R8 (cr=R28)
52 SGTU R12, R8, R30 // ...
53 ADDVU R28, R8 // ...
54 SGTU R28, R8, R28 // ...
55 ADDVU R30, R28 // ...
56 ADDVU R13, R9 // ADCS R13, R9, R9 (cr=R28)
57 SGTU R13, R9, R30 // ...
58 ADDVU R28, R9 // ...
59 SGTU R28, R9, R28 // ...
60 ADDVU R30, R28 // ...
61 ADDVU R14, R10 // ADCS R14, R10, R10 (cr=R28)
62 SGTU R14, R10, R30 // ...
63 ADDVU R28, R10 // ...
64 SGTU R28, R10, R28 // ...
65 ADDVU R30, R28 // ...
66 ADDVU R15, R11 // ADCS R15, R11, R11 (cr=R28)
67 SGTU R15, R11, R30 // ...
68 ADDVU R28, R11 // ...
69 SGTU R28, R11, R28 // ...
70 ADDVU R30, R28 // ...
71 MOVV R8, 0(R7) // store the four sum words into z
72 MOVV R9, 8(R7)
73 MOVV R10, 16(R7)
74 MOVV R11, 24(R7)
75 ADDVU $32, R5 // advance all three pointers by four words
76 ADDVU $32, R6
77 ADDVU $32, R7
78 SUBVU $1, R4
79 BNE R4, loop4cont
80 loop4done:
81 MOVV R28, c+72(FP) // return the final carry
82 RET
83
84 // func subVV(z, x, y []Word) (c Word)
//
// subVV subtracts the word vector y from x word by word, feeding the borrow
// out of each word into the next one, stores the differences in z and
// returns the final borrow in c. Only len(z) is consulted for the length.
// NOTE(review): presumably x and y hold at least len(z) words - confirm against callers.
//
// The words are handled in two phases: len(z)%4 words one at a time (loop1),
// then len(z)/4 groups of four words (loop4). The borrow lives in R28; each
// subtract-with-borrow is built from SGTU/SUBVU pairs, and the trailing
// "SBCS" annotations name the operation each sequence stands for.
85 TEXT ·subVV(SB), NOSPLIT, $0
86 MOVV z_len+8(FP), R4 // R4 = len(z)
87 MOVV x_base+24(FP), R5 // R5 = read pointer into x
88 MOVV y_base+48(FP), R6 // R6 = read pointer into y
89 MOVV z_base+0(FP), R7 // R7 = write pointer into z
90 // compute unrolled loop lengths
91 AND $3, R4, R8 // R8 = len(z) % 4: iterations of the 1X loop
92 SRLV $2, R4 // R4 = len(z) / 4: iterations of the 4X loop
93 XOR R28, R28 // clear carry
94 loop1:
95 BEQ R8, loop1done // skip the 1X loop when there are no leftover words
96 loop1cont:
97 // unroll 1X
98 MOVV 0(R5), R9 // R9 = current word of x
99 MOVV 0(R6), R10 // R10 = current word of y
100 SGTU R28, R9, R30 // SBCS R10, R9, R9: R30 = 1 if the incoming borrow exceeds the x word
101 SUBVU R28, R9 // subtract the incoming borrow
102 SGTU R10, R9, R28 // R28 = 1 if the y word exceeds what is left, so the next subtract wraps
103 SUBVU R10, R9 // subtract the y word
104 ADDVU R30, R28 // outgoing borrow = sum of the two wrap flags
105 MOVV R9, 0(R7) // store the difference word into z
106 ADDVU $8, R5 // advance all three pointers by one 8-byte word
107 ADDVU $8, R6
108 ADDVU $8, R7
109 SUBVU $1, R8
110 BNE R8, loop1cont
111 loop1done:
112 loop4:
113 BEQ R4, loop4done // skip the 4X loop when fewer than four words remain
114 loop4cont:
115 // unroll 4X: the same borrow sequence as the 1X loop, applied to four words.
// R8 (the finished 1X counter) is reused here as a data register.
116 MOVV 0(R5), R8
117 MOVV 8(R5), R9
118 MOVV 16(R5), R10
119 MOVV 24(R5), R11
120 MOVV 0(R6), R12
121 MOVV 8(R6), R13
122 MOVV 16(R6), R14
123 MOVV 24(R6), R15
124 SGTU R28, R8, R30 // SBCS R12, R8, R8
125 SUBVU R28, R8 // ...
126 SGTU R12, R8, R28 // ...
127 SUBVU R12, R8 // ...
128 ADDVU R30, R28 // ...
129 SGTU R28, R9, R30 // SBCS R13, R9, R9
130 SUBVU R28, R9 // ...
131 SGTU R13, R9, R28 // ...
132 SUBVU R13, R9 // ...
133 ADDVU R30, R28 // ...
134 SGTU R28, R10, R30 // SBCS R14, R10, R10
135 SUBVU R28, R10 // ...
136 SGTU R14, R10, R28 // ...
137 SUBVU R14, R10 // ...
138 ADDVU R30, R28 // ...
139 SGTU R28, R11, R30 // SBCS R15, R11, R11
140 SUBVU R28, R11 // ...
141 SGTU R15, R11, R28 // ...
142 SUBVU R15, R11 // ...
143 ADDVU R30, R28 // ...
144 MOVV R8, 0(R7) // store the four difference words into z
145 MOVV R9, 8(R7)
146 MOVV R10, 16(R7)
147 MOVV R11, 24(R7)
148 ADDVU $32, R5 // advance all three pointers by four words
149 ADDVU $32, R6
150 ADDVU $32, R7
151 SUBVU $1, R4
152 BNE R4, loop4cont
153 loop4done:
154 MOVV R28, c+72(FP) // return the final borrow
155 RET
156
157 // func lshVU(z, x []Word, s uint) (c Word)
//
// lshVU shifts the len(z)-word vector x left by s bits, writes the result to
// z and returns in c the bits shifted out of the highest word
// (x[len(z)-1] >> (64-s)). When len(z) is 0 it stores c = 0 and returns
// without touching z.
// NOTE(review): the complementary shift count is computed as 64-s, so the
// code presumably expects 0 < s < 64 - confirm against the Go declaration.
//
// The vector is walked from the highest word down to the lowest. R8 always
// holds the previous (higher) word already shifted left by s, waiting to be
// OR-ed with the top bits of the next lower word.
158 TEXT ·lshVU(SB), NOSPLIT, $0
159 MOVV z_len+8(FP), R4 // R4 = len(z)
160 BEQ R4, ret0 // empty vector: just return c = 0
161 MOVV s+48(FP), R5 // R5 = shift count s
162 MOVV x_base+24(FP), R6 // R6 = base of x
163 MOVV z_base+0(FP), R7 // R7 = base of z
164 // run loop backward
165 ALSLV $3, R4, R6, R6 // R6 = x base + len(z)*8, one word past the last x word
166 ALSLV $3, R4, R7, R7 // R7 = z base + len(z)*8, one word past the last z word
167 // shift first word into carry
168 MOVV -8(R6), R8 // R8 = highest word of x
169 MOVV $64, R9
170 SUBVU R5, R9 // R9 = 64 - s, the complementary shift count
171 SRLV R9, R8, R10 // R10 = bits of the highest word that fall off the top
172 SLLV R5, R8 // R8 = highest word << s, pending its low bits
173 MOVV R10, c+56(FP) // the shifted-out bits are the result c
174 // shift remaining words
175 SUBVU $1, R4 // one word has been consumed already
176 // compute unrolled loop lengths
177 AND $3, R4, R10 // R10 = remaining % 4: iterations of the 1X loop
178 SRLV $2, R4 // R4 = remaining / 4: iterations of the 4X loop
179 loop1:
180 BEQ R10, loop1done
181 loop1cont:
182 // unroll 1X
183 MOVV -16(R6), R11 // R11 = next lower word of x
184 SRLV R9, R11, R12 // R12 = its top s bits, moved to the bottom
185 OR R8, R12 // combine with the pending higher word: finished output word
186 SLLV R5, R11, R8 // R8 = lower word << s, pending for the next round
187 MOVV R12, -8(R7) // store the finished word into z
188 ADDVU $-8, R6 // step both pointers down by one word
189 ADDVU $-8, R7
190 SUBVU $1, R10
191 BNE R10, loop1cont
192 loop1done:
193 loop4:
194 BEQ R4, loop4done
195 loop4cont:
196 // unroll 4X: the same shift/OR step as the 1X loop for four words.
// Each input register is reused for an output word once it has been consumed.
197 MOVV -16(R6), R10
198 MOVV -24(R6), R11
199 MOVV -32(R6), R12
200 MOVV -40(R6), R13
201 SRLV R9, R10, R14
202 OR R8, R14
203 SLLV R5, R10, R8
204 SRLV R9, R11, R10
205 OR R8, R10
206 SLLV R5, R11, R8
207 SRLV R9, R12, R11
208 OR R8, R11
209 SLLV R5, R12, R8
210 SRLV R9, R13, R12
211 OR R8, R12
212 SLLV R5, R13, R8
213 MOVV R14, -8(R7) // store the four finished words, highest first
214 MOVV R10, -16(R7)
215 MOVV R11, -24(R7)
216 MOVV R12, -32(R7)
217 ADDVU $-32, R6 // step both pointers down by four words
218 ADDVU $-32, R7
219 SUBVU $1, R4
220 BNE R4, loop4cont
221 loop4done:
222 // store final shifted bits
223 MOVV R8, -8(R7) // lowest output word: lowest x word << s, nothing OR-ed in below
224 RET
225 ret0:
226 MOVV R0, c+56(FP) // len(z) == 0: c = 0
227 RET
228
229 // func rshVU(z, x []Word, s uint) (c Word)
//
// rshVU shifts the len(z)-word vector x right by s bits, writes the result
// to z and returns in c the bits shifted out of the lowest word, positioned
// at the top of c (x[0] << (64-s)). When len(z) is 0 it stores c = 0 and
// returns without touching z.
// NOTE(review): the complementary shift count is computed as 64-s, so the
// code presumably expects 0 < s < 64 - confirm against the Go declaration.
//
// The vector is walked from the lowest word up to the highest. R8 always
// holds the previous (lower) word already shifted right by s, waiting to be
// OR-ed with the bottom bits of the next higher word.
230 TEXT ·rshVU(SB), NOSPLIT, $0
231 MOVV z_len+8(FP), R4 // R4 = len(z)
232 BEQ R4, ret0 // empty vector: just return c = 0
233 MOVV s+48(FP), R5 // R5 = shift count s
234 MOVV x_base+24(FP), R6 // R6 = read pointer into x
235 MOVV z_base+0(FP), R7 // R7 = write pointer into z
236 // shift first word into carry
237 MOVV 0(R6), R8 // R8 = lowest word of x
238 MOVV $64, R9
239 SUBVU R5, R9 // R9 = 64 - s, the complementary shift count
240 SLLV R9, R8, R10 // R10 = bits of the lowest word that fall off the bottom
241 SRLV R5, R8 // R8 = lowest word >> s, pending its high bits
242 MOVV R10, c+56(FP) // the shifted-out bits are the result c
243 // shift remaining words
244 SUBVU $1, R4 // one word has been consumed already
245 // compute unrolled loop lengths
246 AND $3, R4, R10 // R10 = remaining % 4: iterations of the 1X loop
247 SRLV $2, R4 // R4 = remaining / 4: iterations of the 4X loop
248 loop1:
249 BEQ R10, loop1done
250 loop1cont:
251 // unroll 1X
252 MOVV 8(R6), R11 // R11 = next higher word of x
253 SLLV R9, R11, R12 // R12 = its bottom s bits, moved to the top
254 OR R8, R12 // combine with the pending lower word: finished output word
255 SRLV R5, R11, R8 // R8 = higher word >> s, pending for the next round
256 MOVV R12, 0(R7) // store the finished word into z
257 ADDVU $8, R6 // step both pointers up by one word
258 ADDVU $8, R7
259 SUBVU $1, R10
260 BNE R10, loop1cont
261 loop1done:
262 loop4:
263 BEQ R4, loop4done
264 loop4cont:
265 // unroll 4X: the same shift/OR step as the 1X loop for four words.
// Each input register is reused for an output word once it has been consumed.
266 MOVV 8(R6), R10
267 MOVV 16(R6), R11
268 MOVV 24(R6), R12
269 MOVV 32(R6), R13
270 SLLV R9, R10, R14
271 OR R8, R14
272 SRLV R5, R10, R8
273 SLLV R9, R11, R10
274 OR R8, R10
275 SRLV R5, R11, R8
276 SLLV R9, R12, R11
277 OR R8, R11
278 SRLV R5, R12, R8
279 SLLV R9, R13, R12
280 OR R8, R12
281 SRLV R5, R13, R8
282 MOVV R14, 0(R7) // store the four finished words, lowest first
283 MOVV R10, 8(R7)
284 MOVV R11, 16(R7)
285 MOVV R12, 24(R7)
286 ADDVU $32, R6 // step both pointers up by four words
287 ADDVU $32, R7
288 SUBVU $1, R4
289 BNE R4, loop4cont
290 loop4done:
291 // store final shifted bits
292 MOVV R8, 0(R7) // highest output word: highest x word >> s, nothing OR-ed in above
293 RET
294 ret0:
295 MOVV R0, c+56(FP) // len(z) == 0: c = 0
296 RET
297
298 // func mulAddVWW(z, x []Word, m, a Word) (c Word)
//
// mulAddVWW multiplies each word of x by the single word m, adds a running
// carry that starts out as a, stores the low word of each result in z and
// returns the final carry word in c. Only len(z) is consulted for the length.
// NOTE(review): presumably x holds at least len(z) words - confirm against callers.
//
// R5 is the running carry: it enters each step as the word to add and leaves
// as the high product word plus the carry out of that addition.
// The words are handled in two phases: len(z)%4 words one at a time (loop1),
// then len(z)/4 groups of four words (loop4).
299 TEXT ·mulAddVWW(SB), NOSPLIT, $0
300 MOVV m+48(FP), R4 // R4 = multiplier m
301 MOVV a+56(FP), R5 // R5 = running carry, initially a
302 MOVV z_len+8(FP), R6 // R6 = len(z)
303 MOVV x_base+24(FP), R7 // R7 = read pointer into x
304 MOVV z_base+0(FP), R8 // R8 = write pointer into z
305 // compute unrolled loop lengths
306 AND $3, R6, R9 // R9 = len(z) % 4: iterations of the 1X loop
307 SRLV $2, R6 // R6 = len(z) / 4: iterations of the 4X loop
308 loop1:
309 BEQ R9, loop1done
310 loop1cont:
311 // unroll 1X
312 MOVV 0(R7), R10 // R10 = current word of x
313 // synthetic carry, one column at a time
314 MULV R4, R10, R11 // R11 = low word of the product with m
315 MULHVU R4, R10, R12 // R12 = high word of the unsigned product with m
316 ADDVU R5, R11, R10 // ADDS R5, R11, R10 (cr=R28): low word + incoming carry
317 SGTU R5, R10, R28 // R28 = 1 if that addition wrapped
318 ADDVU R28, R12, R5 // ADC $0, R12, R5: next carry = high word + wrap flag
319 MOVV R10, 0(R8) // store the result word into z
320 ADDVU $8, R7 // advance both pointers by one 8-byte word
321 ADDVU $8, R8
322 SUBVU $1, R9
323 BNE R9, loop1cont
324 loop1done:
325 loop4:
326 BEQ R6, loop4done
327 loop4cont:
328 // unroll 4X: the same multiply/add step as the 1X loop for four words.
// R13/R14 are the scratch low/high product words for every column.
329 MOVV 0(R7), R9
330 MOVV 8(R7), R10
331 MOVV 16(R7), R11
332 MOVV 24(R7), R12
333 // synthetic carry, one column at a time
334 MULV R4, R9, R13
335 MULHVU R4, R9, R14
336 ADDVU R5, R13, R9 // ADDS R5, R13, R9 (cr=R28)
337 SGTU R5, R9, R28 // ...
338 ADDVU R28, R14, R5 // ADC $0, R14, R5
339 MULV R4, R10, R13
340 MULHVU R4, R10, R14
341 ADDVU R5, R13, R10 // ADDS R5, R13, R10 (cr=R28)
342 SGTU R5, R10, R28 // ...
343 ADDVU R28, R14, R5 // ADC $0, R14, R5
344 MULV R4, R11, R13
345 MULHVU R4, R11, R14
346 ADDVU R5, R13, R11 // ADDS R5, R13, R11 (cr=R28)
347 SGTU R5, R11, R28 // ...
348 ADDVU R28, R14, R5 // ADC $0, R14, R5
349 MULV R4, R12, R13
350 MULHVU R4, R12, R14
351 ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28)
352 SGTU R5, R12, R28 // ...
353 ADDVU R28, R14, R5 // ADC $0, R14, R5
354 MOVV R9, 0(R8) // store the four result words into z
355 MOVV R10, 8(R8)
356 MOVV R11, 16(R8)
357 MOVV R12, 24(R8)
358 ADDVU $32, R7 // advance both pointers by four words
359 ADDVU $32, R8
360 SUBVU $1, R6
361 BNE R6, loop4cont
362 loop4done:
363 MOVV R5, c+64(FP) // return the final carry word
364 RET
365
366 // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
//
// addMulVVWW computes, word by word, x plus y times the single word m plus a
// running carry that starts out as a, stores the low word of each result in
// z and returns the final carry word in c. Only len(z) is consulted for the
// length.
// NOTE(review): presumably x and y hold at least len(z) words - confirm against callers.
//
// R5 is the running carry: each step adds the x word and then R5 to the low
// product word, folding the carry out of both additions into the high
// product word, which becomes the next R5.
// The words are handled in two phases: len(z)%4 words one at a time (loop1),
// then len(z)/4 groups of four words (loop4).
367 TEXT ·addMulVVWW(SB), NOSPLIT, $0
368 MOVV m+72(FP), R4 // R4 = multiplier m
369 MOVV a+80(FP), R5 // R5 = running carry, initially a
370 MOVV z_len+8(FP), R6 // R6 = len(z)
371 MOVV x_base+24(FP), R7 // R7 = read pointer into x
372 MOVV y_base+48(FP), R8 // R8 = read pointer into y
373 MOVV z_base+0(FP), R9 // R9 = write pointer into z
374 // compute unrolled loop lengths
375 AND $3, R6, R10 // R10 = len(z) % 4: iterations of the 1X loop
376 SRLV $2, R6 // R6 = len(z) / 4: iterations of the 4X loop
377 loop1:
378 BEQ R10, loop1done
379 loop1cont:
380 // unroll 1X
381 MOVV 0(R7), R11 // R11 = current word of x
382 MOVV 0(R8), R12 // R12 = current word of y
383 // synthetic carry, one column at a time
384 MULV R4, R12, R13 // R13 = low word of the y word times m
385 MULHVU R4, R12, R14 // R14 = high word of the unsigned product
386 ADDVU R11, R13 // ADDS R11, R13, R13 (cr=R28): add the x word to the low word
387 SGTU R11, R13, R28 // R28 = 1 if that addition wrapped
388 ADDVU R28, R14 // ADC $0, R14, R14: fold the wrap into the high word
389 ADDVU R5, R13, R12 // ADDS R5, R13, R12 (cr=R28): add the incoming carry
390 SGTU R5, R12, R28 // R28 = 1 if that addition wrapped
391 ADDVU R28, R14, R5 // ADC $0, R14, R5: next carry = high word + wrap flag
392 MOVV R12, 0(R9) // store the result word into z
393 ADDVU $8, R7 // advance all three pointers by one 8-byte word
394 ADDVU $8, R8
395 ADDVU $8, R9
396 SUBVU $1, R10
397 BNE R10, loop1cont
398 loop1done:
399 loop4:
400 BEQ R6, loop4done
401 loop4cont:
402 // unroll 4X: the same multiply/add step as the 1X loop for four words.
// R10-R13 hold the x words, R14-R17 the y words (overwritten with the results),
// and R18/R19 are the scratch low/high product words for every column.
403 MOVV 0(R7), R10
404 MOVV 8(R7), R11
405 MOVV 16(R7), R12
406 MOVV 24(R7), R13
407 MOVV 0(R8), R14
408 MOVV 8(R8), R15
409 MOVV 16(R8), R16
410 MOVV 24(R8), R17
411 // synthetic carry, one column at a time
412 MULV R4, R14, R18
413 MULHVU R4, R14, R19
414 ADDVU R10, R18 // ADDS R10, R18, R18 (cr=R28)
415 SGTU R10, R18, R28 // ...
416 ADDVU R28, R19 // ADC $0, R19, R19
417 ADDVU R5, R18, R14 // ADDS R5, R18, R14 (cr=R28)
418 SGTU R5, R14, R28 // ...
419 ADDVU R28, R19, R5 // ADC $0, R19, R5
420 MULV R4, R15, R18
421 MULHVU R4, R15, R19
422 ADDVU R11, R18 // ADDS R11, R18, R18 (cr=R28)
423 SGTU R11, R18, R28 // ...
424 ADDVU R28, R19 // ADC $0, R19, R19
425 ADDVU R5, R18, R15 // ADDS R5, R18, R15 (cr=R28)
426 SGTU R5, R15, R28 // ...
427 ADDVU R28, R19, R5 // ADC $0, R19, R5
428 MULV R4, R16, R18
429 MULHVU R4, R16, R19
430 ADDVU R12, R18 // ADDS R12, R18, R18 (cr=R28)
431 SGTU R12, R18, R28 // ...
432 ADDVU R28, R19 // ADC $0, R19, R19
433 ADDVU R5, R18, R16 // ADDS R5, R18, R16 (cr=R28)
434 SGTU R5, R16, R28 // ...
435 ADDVU R28, R19, R5 // ADC $0, R19, R5
436 MULV R4, R17, R18
437 MULHVU R4, R17, R19
438 ADDVU R13, R18 // ADDS R13, R18, R18 (cr=R28)
439 SGTU R13, R18, R28 // ...
440 ADDVU R28, R19 // ADC $0, R19, R19
441 ADDVU R5, R18, R17 // ADDS R5, R18, R17 (cr=R28)
442 SGTU R5, R17, R28 // ...
443 ADDVU R28, R19, R5 // ADC $0, R19, R5
444 MOVV R14, 0(R9) // store the four result words into z
445 MOVV R15, 8(R9)
446 MOVV R16, 16(R9)
447 MOVV R17, 24(R9)
448 ADDVU $32, R7 // advance all three pointers by four words
449 ADDVU $32, R8
450 ADDVU $32, R9
451 SUBVU $1, R6
452 BNE R6, loop4cont
453 loop4done:
454 MOVV R5, c+88(FP) // return the final carry word
455 RET
456
View as plain text