Text file src/math/big/arith_loong64.s

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Code generated by 'go generate' (with ./internal/asmgen). DO NOT EDIT.
     6  
     7  //go:build !math_big_pure_go
     8  
     9  #include "textflag.h"
    10  
    11  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV stores x[i] + y[i] + carry into z[i] for each of the len(z) words,
        // walking upward from index 0, and returns the carry out of the last word
        // in c (0 when len(z) == 0). Only z's length is read; x and y are indexed
        // with the same count, so callers presumably guarantee they are at least
        // as long as z.
        //
        // The carry is kept in R28 (see the "cr=R28" notes) and is rebuilt after
        // every add with an unsigned compare; R30 is scratch for the partial carry.
        // The len(z)&3 leftover words are processed first (loop1), followed by
        // len(z)>>2 groups of four words (loop4).
    12  TEXT ·addVV(SB), NOSPLIT, $0
    13  	MOVV z_len+8(FP), R4	// R4 = len(z)
    14  	MOVV x_base+24(FP), R5	// R5 = cursor into x
    15  	MOVV y_base+48(FP), R6	// R6 = cursor into y
    16  	MOVV z_base+0(FP), R7	// R7 = cursor into z
    17  	// compute unrolled loop lengths
    18  	AND $3, R4, R8	// R8 = len(z) & 3: iterations of the 1X loop
    19  	SRLV $2, R4	// R4 = len(z) >> 2: iterations of the 4X loop
    20  	XOR R28, R28	// clear carry
    21  loop1:
    22  	BEQ R8, loop1done	// no leftover words: skip the 1X loop
    23  loop1cont:
    24  	// unroll 1X
    25  	MOVV 0(R5), R9
    26  	MOVV 0(R6), R10
    27  	ADDVU R10, R9	// ADCS R10, R9, R9 (cr=R28): R9 = x[i] + y[i]
    28  	SGTU R10, R9, R30	// R30 = 1 if y[i] > sum, i.e. the add wrapped
    29  	ADDVU R28, R9	// R9 += carry-in
    30  	SGTU R28, R9, R28	// R28 = 1 if carry-in > sum, i.e. that add wrapped
    31  	ADDVU R30, R28	// carry-out = R30 + R28 (they cannot both be 1)
    32  	MOVV R9, 0(R7)
    33  	ADDVU $8, R5
    34  	ADDVU $8, R6
    35  	ADDVU $8, R7
    36  	SUBVU $1, R8
    37  	BNE R8, loop1cont
    38  loop1done:
    39  loop4:
    40  	BEQ R4, loop4done	// no full groups of four: skip the 4X loop
    41  loop4cont:
    42  	// unroll 4X
        	// Four x words go to R8-R11 and four y words to R12-R15; the same
        	// five-instruction add-with-carry sequence as in the 1X loop is then
        	// applied to each pair in ascending order, chaining the carry via R28.
    43  	MOVV 0(R5), R8
    44  	MOVV 8(R5), R9
    45  	MOVV 16(R5), R10
    46  	MOVV 24(R5), R11
    47  	MOVV 0(R6), R12
    48  	MOVV 8(R6), R13
    49  	MOVV 16(R6), R14
    50  	MOVV 24(R6), R15
    51  	ADDVU R12, R8	// ADCS R12, R8, R8 (cr=R28)
    52  	SGTU R12, R8, R30	// ...
    53  	ADDVU R28, R8	// ...
    54  	SGTU R28, R8, R28	// ...
    55  	ADDVU R30, R28	// ...
    56  	ADDVU R13, R9	// ADCS R13, R9, R9 (cr=R28)
    57  	SGTU R13, R9, R30	// ...
    58  	ADDVU R28, R9	// ...
    59  	SGTU R28, R9, R28	// ...
    60  	ADDVU R30, R28	// ...
    61  	ADDVU R14, R10	// ADCS R14, R10, R10 (cr=R28)
    62  	SGTU R14, R10, R30	// ...
    63  	ADDVU R28, R10	// ...
    64  	SGTU R28, R10, R28	// ...
    65  	ADDVU R30, R28	// ...
    66  	ADDVU R15, R11	// ADCS R15, R11, R11 (cr=R28)
    67  	SGTU R15, R11, R30	// ...
    68  	ADDVU R28, R11	// ...
    69  	SGTU R28, R11, R28	// ...
    70  	ADDVU R30, R28	// ...
    71  	MOVV R8, 0(R7)
    72  	MOVV R9, 8(R7)
    73  	MOVV R10, 16(R7)
    74  	MOVV R11, 24(R7)
    75  	ADDVU $32, R5
    76  	ADDVU $32, R6
    77  	ADDVU $32, R7
    78  	SUBVU $1, R4
    79  	BNE R4, loop4cont
    80  loop4done:
    81  	MOVV R28, c+72(FP)	// return the final carry
    82  	RET
    83  
    84  // func subVV(z, x, y []Word) (c Word)
        //
        // subVV stores x[i] - y[i] - borrow into z[i] for each of the len(z) words,
        // walking upward from index 0, and returns the borrow out of the last word
        // in c (0 when len(z) == 0). Only z's length is read; x and y are indexed
        // with the same count, so callers presumably guarantee they are at least
        // as long as z.
        //
        // The borrow is kept in R28 and is rebuilt for every word with unsigned
        // compares taken BEFORE each subtract; R30 is scratch for the partial
        // borrow. The len(z)&3 leftover words are processed first (loop1),
        // followed by len(z)>>2 groups of four words (loop4).
    85  TEXT ·subVV(SB), NOSPLIT, $0
    86  	MOVV z_len+8(FP), R4	// R4 = len(z)
    87  	MOVV x_base+24(FP), R5	// R5 = cursor into x
    88  	MOVV y_base+48(FP), R6	// R6 = cursor into y
    89  	MOVV z_base+0(FP), R7	// R7 = cursor into z
    90  	// compute unrolled loop lengths
    91  	AND $3, R4, R8	// R8 = len(z) & 3: iterations of the 1X loop
    92  	SRLV $2, R4	// R4 = len(z) >> 2: iterations of the 4X loop
    93  	XOR R28, R28	// clear carry
    94  loop1:
    95  	BEQ R8, loop1done	// no leftover words: skip the 1X loop
    96  loop1cont:
    97  	// unroll 1X
    98  	MOVV 0(R5), R9
    99  	MOVV 0(R6), R10
   100  	SGTU R28, R9, R30	// SBCS R10, R9, R9: R30 = 1 if borrow-in > x[i] (next subtract wraps)
   101  	SUBVU R28, R9	// R9 = x[i] - borrow-in
   102  	SGTU R10, R9, R28	// R28 = 1 if y[i] > R9 (next subtract wraps)
   103  	SUBVU R10, R9	// R9 = x[i] - borrow-in - y[i]
   104  	ADDVU R30, R28	// borrow-out = R30 + R28 (they cannot both be 1)
   105  	MOVV R9, 0(R7)
   106  	ADDVU $8, R5
   107  	ADDVU $8, R6
   108  	ADDVU $8, R7
   109  	SUBVU $1, R8
   110  	BNE R8, loop1cont
   111  loop1done:
   112  loop4:
   113  	BEQ R4, loop4done	// no full groups of four: skip the 4X loop
   114  loop4cont:
   115  	// unroll 4X
        	// Four x words go to R8-R11 and four y words to R12-R15; the same
        	// five-instruction subtract-with-borrow sequence as in the 1X loop is
        	// then applied to each pair in ascending order, chaining the borrow
        	// via R28.
   116  	MOVV 0(R5), R8
   117  	MOVV 8(R5), R9
   118  	MOVV 16(R5), R10
   119  	MOVV 24(R5), R11
   120  	MOVV 0(R6), R12
   121  	MOVV 8(R6), R13
   122  	MOVV 16(R6), R14
   123  	MOVV 24(R6), R15
   124  	SGTU R28, R8, R30	// SBCS R12, R8, R8
   125  	SUBVU R28, R8	// ...
   126  	SGTU R12, R8, R28	// ...
   127  	SUBVU R12, R8	// ...
   128  	ADDVU R30, R28	// ...
   129  	SGTU R28, R9, R30	// SBCS R13, R9, R9
   130  	SUBVU R28, R9	// ...
   131  	SGTU R13, R9, R28	// ...
   132  	SUBVU R13, R9	// ...
   133  	ADDVU R30, R28	// ...
   134  	SGTU R28, R10, R30	// SBCS R14, R10, R10
   135  	SUBVU R28, R10	// ...
   136  	SGTU R14, R10, R28	// ...
   137  	SUBVU R14, R10	// ...
   138  	ADDVU R30, R28	// ...
   139  	SGTU R28, R11, R30	// SBCS R15, R11, R11
   140  	SUBVU R28, R11	// ...
   141  	SGTU R15, R11, R28	// ...
   142  	SUBVU R15, R11	// ...
   143  	ADDVU R30, R28	// ...
   144  	MOVV R8, 0(R7)
   145  	MOVV R9, 8(R7)
   146  	MOVV R10, 16(R7)
   147  	MOVV R11, 24(R7)
   148  	ADDVU $32, R5
   149  	ADDVU $32, R6
   150  	ADDVU $32, R7
   151  	SUBVU $1, R4
   152  	BNE R4, loop4cont
   153  loop4done:
   154  	MOVV R28, c+72(FP)	// return the final borrow
   155  	RET
   156  
   157  // func lshVU(z, x []Word, s uint) (c Word)
        //
        // lshVU writes x shifted left by s bits into z, treating the len(z) words
        // as one multi-word value with word 0 least significant, and returns in c
        // the bits shifted out of the top word (x[len(z)-1] >> (64-s)). When
        // len(z) == 0 nothing is read or written and c is 0.
        //
        // The words are visited from the highest index down to index 0. R8 always
        // holds the already-shifted low part of the previously read word, waiting
        // to be OR'ed with the high bits of the next lower word.
        //
        // NOTE(review): s is used as a shift count, and 64-s as the complementary
        // count, without any range check; presumably callers guarantee 0 < s < 64
        // - confirm against the Go-side wrapper.
   158  TEXT ·lshVU(SB), NOSPLIT, $0
   159  	MOVV z_len+8(FP), R4	// R4 = len(z)
   160  	BEQ R4, ret0	// empty vector: just return c = 0
   161  	MOVV s+48(FP), R5	// R5 = s
   162  	MOVV x_base+24(FP), R6
   163  	MOVV z_base+0(FP), R7
   164  	// run loop backward
   165  	ALSLV $3, R4, R6, R6	// R6 = x_base + 8*len(z): just past the last x word
   166  	ALSLV $3, R4, R7, R7	// R7 = z_base + 8*len(z): just past the last z word
   167  	// shift first word into carry
   168  	MOVV -8(R6), R8	// R8 = top word of x
   169  	MOVV $64, R9
   170  	SUBVU R5, R9	// R9 = 64 - s
   171  	SRLV R9, R8, R10	// R10 = bits shifted out of the top word
   172  	SLLV R5, R8	// R8 = top word << s (pending, not yet stored)
   173  	MOVV R10, c+56(FP)	// result c is stored up front; R10 is reused below
   174  	// shift remaining words
   175  	SUBVU $1, R4	// R4 = len(z) - 1 words still to read
   176  	// compute unrolled loop lengths
   177  	AND $3, R4, R10	// R10 = iterations of the 1X loop
   178  	SRLV $2, R4	// R4 = iterations of the 4X loop
   179  loop1:
   180  	BEQ R10, loop1done
   181  loop1cont:
   182  	// unroll 1X
   183  	MOVV -16(R6), R11	// next lower word of x
   184  	SRLV R9, R11, R12	// its high s bits, moved to the bottom
   185  	OR R8, R12	// combine with the pending bits of the word above
   186  	SLLV R5, R11, R8	// new pending value = this word << s
   187  	MOVV R12, -8(R7)	// store the completed word one slot above the one just read
   188  	ADDVU $-8, R6
   189  	ADDVU $-8, R7
   190  	SUBVU $1, R10
   191  	BNE R10, loop1cont
   192  loop1done:
   193  loop4:
   194  	BEQ R4, loop4done
   195  loop4cont:
   196  	// unroll 4X
        	// Same step as the 1X loop applied to four successively lower words
        	// (loaded into R10-R13). Each source register is overwritten by a
        	// later result once it is no longer needed, so the four output words
        	// end up in R14, R10, R11, R12 (highest first).
   197  	MOVV -16(R6), R10
   198  	MOVV -24(R6), R11
   199  	MOVV -32(R6), R12
   200  	MOVV -40(R6), R13
   201  	SRLV R9, R10, R14
   202  	OR R8, R14
   203  	SLLV R5, R10, R8
   204  	SRLV R9, R11, R10
   205  	OR R8, R10
   206  	SLLV R5, R11, R8
   207  	SRLV R9, R12, R11
   208  	OR R8, R11
   209  	SLLV R5, R12, R8
   210  	SRLV R9, R13, R12
   211  	OR R8, R12
   212  	SLLV R5, R13, R8
   213  	MOVV R14, -8(R7)
   214  	MOVV R10, -16(R7)
   215  	MOVV R11, -24(R7)
   216  	MOVV R12, -32(R7)
   217  	ADDVU $-32, R6
   218  	ADDVU $-32, R7
   219  	SUBVU $1, R4
   220  	BNE R4, loop4cont
   221  loop4done:
   222  	// store final shifted bits
        	// R7 has moved down len(z)-1 words, so -8(R7) is z[0]; its low s
        	// bits are zero because nothing is shifted in from below.
   223  	MOVV R8, -8(R7)
   224  	RET
   225  ret0:
   226  	MOVV R0, c+56(FP)	// R0 supplies the zero result for the empty case
   227  	RET
   228  
   229  // func rshVU(z, x []Word, s uint) (c Word)
        //
        // rshVU writes x shifted right by s bits into z, treating the len(z) words
        // as one multi-word value with word 0 least significant, and returns in c
        // the bits shifted out of the bottom word (x[0] << (64-s)). When
        // len(z) == 0 nothing is read or written and c is 0.
        //
        // The words are visited from index 0 upward. R8 always holds the
        // already-shifted high part of the previously read word, waiting to be
        // OR'ed with the low bits of the next higher word.
        //
        // NOTE(review): s is used as a shift count, and 64-s as the complementary
        // count, without any range check; presumably callers guarantee 0 < s < 64
        // - confirm against the Go-side wrapper.
   230  TEXT ·rshVU(SB), NOSPLIT, $0
   231  	MOVV z_len+8(FP), R4	// R4 = len(z)
   232  	BEQ R4, ret0	// empty vector: just return c = 0
   233  	MOVV s+48(FP), R5	// R5 = s
   234  	MOVV x_base+24(FP), R6	// R6 = cursor into x
   235  	MOVV z_base+0(FP), R7	// R7 = cursor into z
   236  	// shift first word into carry
   237  	MOVV 0(R6), R8	// R8 = x[0]
   238  	MOVV $64, R9
   239  	SUBVU R5, R9	// R9 = 64 - s
   240  	SLLV R9, R8, R10	// R10 = bits shifted out of x[0], left-aligned
   241  	SRLV R5, R8	// R8 = x[0] >> s (pending, not yet stored)
   242  	MOVV R10, c+56(FP)	// result c is stored up front; R10 is reused below
   243  	// shift remaining words
   244  	SUBVU $1, R4	// R4 = len(z) - 1 words still to read
   245  	// compute unrolled loop lengths
   246  	AND $3, R4, R10	// R10 = iterations of the 1X loop
   247  	SRLV $2, R4	// R4 = iterations of the 4X loop
   248  loop1:
   249  	BEQ R10, loop1done
   250  loop1cont:
   251  	// unroll 1X
   252  	MOVV 8(R6), R11	// next higher word of x
   253  	SLLV R9, R11, R12	// its low s bits, moved to the top
   254  	OR R8, R12	// combine with the pending bits of the word below
   255  	SRLV R5, R11, R8	// new pending value = this word >> s
   256  	MOVV R12, 0(R7)	// store the completed word one slot below the one just read
   257  	ADDVU $8, R6
   258  	ADDVU $8, R7
   259  	SUBVU $1, R10
   260  	BNE R10, loop1cont
   261  loop1done:
   262  loop4:
   263  	BEQ R4, loop4done
   264  loop4cont:
   265  	// unroll 4X
        	// Same step as the 1X loop applied to four successively higher words
        	// (loaded into R10-R13). Each source register is overwritten by a
        	// later result once it is no longer needed, so the four output words
        	// end up in R14, R10, R11, R12 (lowest first).
   266  	MOVV 8(R6), R10
   267  	MOVV 16(R6), R11
   268  	MOVV 24(R6), R12
   269  	MOVV 32(R6), R13
   270  	SLLV R9, R10, R14
   271  	OR R8, R14
   272  	SRLV R5, R10, R8
   273  	SLLV R9, R11, R10
   274  	OR R8, R10
   275  	SRLV R5, R11, R8
   276  	SLLV R9, R12, R11
   277  	OR R8, R11
   278  	SRLV R5, R12, R8
   279  	SLLV R9, R13, R12
   280  	OR R8, R12
   281  	SRLV R5, R13, R8
   282  	MOVV R14, 0(R7)
   283  	MOVV R10, 8(R7)
   284  	MOVV R11, 16(R7)
   285  	MOVV R12, 24(R7)
   286  	ADDVU $32, R6
   287  	ADDVU $32, R7
   288  	SUBVU $1, R4
   289  	BNE R4, loop4cont
   290  loop4done:
   291  	// store final shifted bits
        	// R7 has advanced len(z)-1 words, so 0(R7) is z[len(z)-1]; its high
        	// s bits are zero because nothing is shifted in from above.
   292  	MOVV R8, 0(R7)
   293  	RET
   294  ret0:
   295  	MOVV R0, c+56(FP)	// R0 supplies the zero result for the empty case
   296  	RET
   297  
   298  // func mulAddVWW(z, x []Word, m, a Word) (c Word)
        //
        // mulAddVWW computes the multi-word value x*m + a: for each of the len(z)
        // words, walking upward from index 0, it stores the low 64 bits of
        // x[i]*m + carry into z[i] and keeps the high part as the next carry.
        // The carry starts as a and its final value is returned in c (so c == a
        // when len(z) == 0). Only z's length is read; x is indexed with the same
        // count, so callers presumably guarantee it is at least as long as z.
        //
        // R5 holds the running carry word; R28 holds the single carry bit out of
        // each low-word addition. The len(z)&3 leftover words are processed first
        // (loop1), followed by len(z)>>2 groups of four words (loop4).
   299  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   300  	MOVV m+48(FP), R4	// R4 = multiplier m
   301  	MOVV a+56(FP), R5	// R5 = running carry, seeded with a
   302  	MOVV z_len+8(FP), R6	// R6 = len(z)
   303  	MOVV x_base+24(FP), R7	// R7 = cursor into x
   304  	MOVV z_base+0(FP), R8	// R8 = cursor into z
   305  	// compute unrolled loop lengths
   306  	AND $3, R6, R9	// R9 = len(z) & 3: iterations of the 1X loop
   307  	SRLV $2, R6	// R6 = len(z) >> 2: iterations of the 4X loop
   308  loop1:
   309  	BEQ R9, loop1done
   310  loop1cont:
   311  	// unroll 1X
   312  	MOVV 0(R7), R10
   313  	// synthetic carry, one column at a time
   314  	MULV R4, R10, R11	// R11 = low 64 bits of m * x[i]
   315  	MULHVU R4, R10, R12	// R12 = high 64 bits of m * x[i] (unsigned)
   316  	ADDVU R5, R11, R10	// ADDS R5, R11, R10 (cr=R28)
   317  	SGTU R5, R10, R28	// R28 = 1 if carry-in > sum, i.e. the add wrapped
   318  	ADDVU R28, R12, R5	// ADC $0, R12, R5
   319  	MOVV R10, 0(R8)
   320  	ADDVU $8, R7
   321  	ADDVU $8, R8
   322  	SUBVU $1, R9
   323  	BNE R9, loop1cont
   324  loop1done:
   325  loop4:
   326  	BEQ R6, loop4done
   327  loop4cont:
   328  	// unroll 4X
        	// Four x words go to R9-R12; each is then replaced in place by its
        	// result word using the same multiply / add / carry sequence as the
        	// 1X loop, with R13/R14 as the low/high product scratch registers.
   329  	MOVV 0(R7), R9
   330  	MOVV 8(R7), R10
   331  	MOVV 16(R7), R11
   332  	MOVV 24(R7), R12
   333  	// synthetic carry, one column at a time
   334  	MULV R4, R9, R13
   335  	MULHVU R4, R9, R14
   336  	ADDVU R5, R13, R9	// ADDS R5, R13, R9 (cr=R28)
   337  	SGTU R5, R9, R28	// ...
   338  	ADDVU R28, R14, R5	// ADC $0, R14, R5
   339  	MULV R4, R10, R13
   340  	MULHVU R4, R10, R14
   341  	ADDVU R5, R13, R10	// ADDS R5, R13, R10 (cr=R28)
   342  	SGTU R5, R10, R28	// ...
   343  	ADDVU R28, R14, R5	// ADC $0, R14, R5
   344  	MULV R4, R11, R13
   345  	MULHVU R4, R11, R14
   346  	ADDVU R5, R13, R11	// ADDS R5, R13, R11 (cr=R28)
   347  	SGTU R5, R11, R28	// ...
   348  	ADDVU R28, R14, R5	// ADC $0, R14, R5
   349  	MULV R4, R12, R13
   350  	MULHVU R4, R12, R14
   351  	ADDVU R5, R13, R12	// ADDS R5, R13, R12 (cr=R28)
   352  	SGTU R5, R12, R28	// ...
   353  	ADDVU R28, R14, R5	// ADC $0, R14, R5
   354  	MOVV R9, 0(R8)
   355  	MOVV R10, 8(R8)
   356  	MOVV R11, 16(R8)
   357  	MOVV R12, 24(R8)
   358  	ADDVU $32, R7
   359  	ADDVU $32, R8
   360  	SUBVU $1, R6
   361  	BNE R6, loop4cont
   362  loop4done:
   363  	MOVV R5, c+64(FP)	// return the final carry word
   364  	RET
   365  
   366  // func addMulVVWW(z, x, y []Word, m, a Word) (c Word)
        //
        // addMulVVWW computes the multi-word value x + y*m + a: for each of the
        // len(z) words, walking upward from index 0, it stores the low 64 bits of
        // x[i] + y[i]*m + carry into z[i] and keeps the high part as the next
        // carry. The carry starts as a and its final value is returned in c (so
        // c == a when len(z) == 0). Only z's length is read; x and y are indexed
        // with the same count, so callers presumably guarantee they are at least
        // as long as z.
        //
        // R5 holds the running carry word; R28 holds the single carry bit out of
        // each of the two low-word additions, which is folded into the high
        // product word. The len(z)&3 leftover words are processed first (loop1),
        // followed by len(z)>>2 groups of four words (loop4).
   367  TEXT ·addMulVVWW(SB), NOSPLIT, $0
   368  	MOVV m+72(FP), R4	// R4 = multiplier m
   369  	MOVV a+80(FP), R5	// R5 = running carry, seeded with a
   370  	MOVV z_len+8(FP), R6	// R6 = len(z)
   371  	MOVV x_base+24(FP), R7	// R7 = cursor into x
   372  	MOVV y_base+48(FP), R8	// R8 = cursor into y
   373  	MOVV z_base+0(FP), R9	// R9 = cursor into z
   374  	// compute unrolled loop lengths
   375  	AND $3, R6, R10	// R10 = len(z) & 3: iterations of the 1X loop
   376  	SRLV $2, R6	// R6 = len(z) >> 2: iterations of the 4X loop
   377  loop1:
   378  	BEQ R10, loop1done
   379  loop1cont:
   380  	// unroll 1X
   381  	MOVV 0(R7), R11
   382  	MOVV 0(R8), R12
   383  	// synthetic carry, one column at a time
   384  	MULV R4, R12, R13	// R13 = low 64 bits of m * y[i]
   385  	MULHVU R4, R12, R14	// R14 = high 64 bits of m * y[i] (unsigned)
   386  	ADDVU R11, R13	// ADDS R11, R13, R13 (cr=R28)
   387  	SGTU R11, R13, R28	// R28 = 1 if x[i] > sum, i.e. the add wrapped
   388  	ADDVU R28, R14	// ADC $0, R14, R14
   389  	ADDVU R5, R13, R12	// ADDS R5, R13, R12 (cr=R28)
   390  	SGTU R5, R12, R28	// R28 = 1 if carry-in > sum, i.e. the add wrapped
   391  	ADDVU R28, R14, R5	// ADC $0, R14, R5
   392  	MOVV R12, 0(R9)
   393  	ADDVU $8, R7
   394  	ADDVU $8, R8
   395  	ADDVU $8, R9
   396  	SUBVU $1, R10
   397  	BNE R10, loop1cont
   398  loop1done:
   399  loop4:
   400  	BEQ R6, loop4done
   401  loop4cont:
   402  	// unroll 4X
        	// Four x words go to R10-R13 and four y words to R14-R17; each y
        	// register is then replaced in place by its result word using the
        	// same sequence as the 1X loop, with R18/R19 as the low/high product
        	// scratch registers.
   403  	MOVV 0(R7), R10
   404  	MOVV 8(R7), R11
   405  	MOVV 16(R7), R12
   406  	MOVV 24(R7), R13
   407  	MOVV 0(R8), R14
   408  	MOVV 8(R8), R15
   409  	MOVV 16(R8), R16
   410  	MOVV 24(R8), R17
   411  	// synthetic carry, one column at a time
   412  	MULV R4, R14, R18
   413  	MULHVU R4, R14, R19
   414  	ADDVU R10, R18	// ADDS R10, R18, R18 (cr=R28)
   415  	SGTU R10, R18, R28	// ...
   416  	ADDVU R28, R19	// ADC $0, R19, R19
   417  	ADDVU R5, R18, R14	// ADDS R5, R18, R14 (cr=R28)
   418  	SGTU R5, R14, R28	// ...
   419  	ADDVU R28, R19, R5	// ADC $0, R19, R5
   420  	MULV R4, R15, R18
   421  	MULHVU R4, R15, R19
   422  	ADDVU R11, R18	// ADDS R11, R18, R18 (cr=R28)
   423  	SGTU R11, R18, R28	// ...
   424  	ADDVU R28, R19	// ADC $0, R19, R19
   425  	ADDVU R5, R18, R15	// ADDS R5, R18, R15 (cr=R28)
   426  	SGTU R5, R15, R28	// ...
   427  	ADDVU R28, R19, R5	// ADC $0, R19, R5
   428  	MULV R4, R16, R18
   429  	MULHVU R4, R16, R19
   430  	ADDVU R12, R18	// ADDS R12, R18, R18 (cr=R28)
   431  	SGTU R12, R18, R28	// ...
   432  	ADDVU R28, R19	// ADC $0, R19, R19
   433  	ADDVU R5, R18, R16	// ADDS R5, R18, R16 (cr=R28)
   434  	SGTU R5, R16, R28	// ...
   435  	ADDVU R28, R19, R5	// ADC $0, R19, R5
   436  	MULV R4, R17, R18
   437  	MULHVU R4, R17, R19
   438  	ADDVU R13, R18	// ADDS R13, R18, R18 (cr=R28)
   439  	SGTU R13, R18, R28	// ...
   440  	ADDVU R28, R19	// ADC $0, R19, R19
   441  	ADDVU R5, R18, R17	// ADDS R5, R18, R17 (cr=R28)
   442  	SGTU R5, R17, R28	// ...
   443  	ADDVU R28, R19, R5	// ADC $0, R19, R5
   444  	MOVV R14, 0(R9)
   445  	MOVV R15, 8(R9)
   446  	MOVV R16, 16(R9)
   447  	MOVV R17, 24(R9)
   448  	ADDVU $32, R7
   449  	ADDVU $32, R8
   450  	ADDVU $32, R9
   451  	SUBVU $1, R6
   452  	BNE R6, loop4cont
   453  loop4done:
   454  	MOVV R5, c+88(FP)	// return the final carry word
   455  	RET
   456  

View as plain text