// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"

// func xorBytesRISCV64(dst, a, b *byte, n int, hasV bool)
//
// Computes dst[i] = a[i] ^ b[i] for 0 <= i < n.
//
// Register usage:
//	X10 = dst pointer
//	X11 = a pointer
//	X12 = b pointer
//	X13 = bytes remaining
//	X15 = size-threshold constant / vector length (vl) from VSETVLI
TEXT ·xorBytesRISCV64(SB), NOSPLIT|NOFRAME, $0
	MOV	dst+0(FP), X10
	MOV	a+8(FP), X11
	MOV	b+16(FP), X12
	MOV	n+24(FP), X13

	// Inputs under 32 bytes are not worth vectorising or aligning;
	// go straight to the small scalar loops.
	MOV	$32, X15
	BLT	X13, X15, loop4_check

#ifndef hasV
	MOVB	hasV+32(FP), X5
	BEQZ	X5, xorbytes_scalar
#endif

	// Use vector if not 8 byte aligned.
	OR	X10, X11, X5
	AND	$7, X5
	BNEZ	X5, vector_loop

	// Use scalar if 8 byte aligned and <= 64 bytes.
	// NOTE: compare the remaining length n (X13), not the b pointer
	// (X12) — using X12 here would make this test nearly always fail
	// and send small aligned inputs through the vector loop.
	SUB	$64, X13, X6
	BLEZ	X6, loop64_check

	PCALIGN	$16
vector_loop:
	// Configure vl = min(X13, VLMAX) 8-bit elements (LMUL=8);
	// X15 receives the element count actually processed this pass.
	VSETVLI	X13, E8, M8, TU, MU, X15
	VLE8V	(X11), V8
	VLE8V	(X12), V16
	VXORVV	V8, V16, V24
	VSE8V	V24, (X10)
	// Advance all three pointers by vl and reduce the remaining count.
	ADD	X15, X10
	ADD	X15, X11
	ADD	X15, X12
	SUB	X15, X13
	BNEZ	X13, vector_loop
	RET

xorbytes_scalar:
	// Check alignment - if the three pointers have different offsets
	// within an 8-byte word we have to do one byte at a time.
	AND	$7, X10, X5
	AND	$7, X11, X6
	AND	$7, X12, X7
	BNE	X5, X6, loop4_check
	BNE	X5, X7, loop4_check
	BEQZ	X5, loop64_check

	// Same misalignment on all three: XOR one byte at a time until we
	// reach 8 byte alignment. X8 = 8 - (dst & 7) bytes to process.
	MOV	$8, X8
	SUB	X5, X8
	SUB	X8, X13
align:
	MOVBU	0(X11), X16
	MOVBU	0(X12), X17
	XOR	X16, X17
	MOVB	X17, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	ADD	$1, X12
	SUB	$1, X8
	BNEZ	X8, align

loop64_check:
	// Main scalar loop: 64 bytes per iteration as 8 word-sized XORs.
	MOV	$64, X15
	BLT	X13, X15, tail32_check
	PCALIGN	$16
loop64:
	MOV	0(X11), X16
	MOV	0(X12), X17
	MOV	8(X11), X18
	MOV	8(X12), X19
	XOR	X16, X17
	XOR	X18, X19
	MOV	X17, 0(X10)
	MOV	X19, 8(X10)
	MOV	16(X11), X20
	MOV	16(X12), X21
	MOV	24(X11), X22
	MOV	24(X12), X23
	XOR	X20, X21
	XOR	X22, X23
	MOV	X21, 16(X10)
	MOV	X23, 24(X10)
	MOV	32(X11), X16
	MOV	32(X12), X17
	MOV	40(X11), X18
	MOV	40(X12), X19
	XOR	X16, X17
	XOR	X18, X19
	MOV	X17, 32(X10)
	MOV	X19, 40(X10)
	MOV	48(X11), X20
	MOV	48(X12), X21
	MOV	56(X11), X22
	MOV	56(X12), X23
	XOR	X20, X21
	XOR	X22, X23
	MOV	X21, 48(X10)
	MOV	X23, 56(X10)
	ADD	$64, X10
	ADD	$64, X11
	ADD	$64, X12
	SUB	$64, X13
	BGE	X13, X15, loop64
	BEQZ	X13, done

tail32_check:
	// At most 63 bytes remain; peel a 32-byte chunk if present.
	MOV	$32, X15
	BLT	X13, X15, tail16_check
	MOV	0(X11), X16
	MOV	0(X12), X17
	MOV	8(X11), X18
	MOV	8(X12), X19
	XOR	X16, X17
	XOR	X18, X19
	MOV	X17, 0(X10)
	MOV	X19, 8(X10)
	MOV	16(X11), X20
	MOV	16(X12), X21
	MOV	24(X11), X22
	MOV	24(X12), X23
	XOR	X20, X21
	XOR	X22, X23
	MOV	X21, 16(X10)
	MOV	X23, 24(X10)
	ADD	$32, X10
	ADD	$32, X11
	ADD	$32, X12
	SUB	$32, X13
	BEQZ	X13, done

tail16_check:
	// At most 31 bytes remain; peel a 16-byte chunk if present.
	MOV	$16, X15
	BLT	X13, X15, loop4_check
	MOV	0(X11), X16
	MOV	0(X12), X17
	MOV	8(X11), X18
	MOV	8(X12), X19
	XOR	X16, X17
	XOR	X18, X19
	MOV	X17, 0(X10)
	MOV	X19, 8(X10)
	ADD	$16, X10
	ADD	$16, X11
	ADD	$16, X12
	SUB	$16, X13
	BEQZ	X13, done

loop4_check:
	// Byte loops for small or arbitrarily-aligned remainders:
	// 4 bytes per iteration while possible.
	MOV	$4, X15
	BLT	X13, X15, loop1
	PCALIGN	$16
loop4:
	MOVBU	0(X11), X16
	MOVBU	0(X12), X17
	MOVBU	1(X11), X18
	MOVBU	1(X12), X19
	XOR	X16, X17
	XOR	X18, X19
	MOVB	X17, 0(X10)
	MOVB	X19, 1(X10)
	MOVBU	2(X11), X20
	MOVBU	2(X12), X21
	MOVBU	3(X11), X22
	MOVBU	3(X12), X23
	XOR	X20, X21
	XOR	X22, X23
	MOVB	X21, 2(X10)
	MOVB	X23, 3(X10)
	ADD	$4, X10
	ADD	$4, X11
	ADD	$4, X12
	SUB	$4, X13
	BGE	X13, X15, loop4

	PCALIGN	$16
loop1:
	// Final 0-3 bytes, one at a time.
	BEQZ	X13, done
	MOVBU	0(X11), X16
	MOVBU	0(X12), X17
	XOR	X16, X17
	MOVB	X17, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	ADD	$1, X12
	SUB	$1, X13
	JMP	loop1

done:
	RET