// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Test-only.
9 TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
10 MOVQ sizeClass+0(FP), CX
11 MOVQ packed+8(FP), AX
12
13 // Call the expander for this size class
14 LEAQ ·gcExpandersAVX512(SB), BX
15 MOVQ (BX)(CX*8), DX // Move to register first so -spectre works
16 CALL DX
17
18 MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
19 VMOVDQU64 Z1, 0(DI)
20 VMOVDQU64 Z2, 64(DI)
21 VZEROUPPER
22 RET
23
24 TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
25 // Z1+Z2 = Expand the grey object mask into a grey word mask
26 MOVQ objMarks+16(FP), AX
27 MOVQ sizeClass+24(FP), CX
28 LEAQ ·gcExpandersAVX512(SB), BX
29 MOVQ (BX)(CX*8), DX // Move to register first so -spectre works
30 CALL DX
31
32 // Z3+Z4 = Load the pointer mask
33 MOVQ ptrMask+32(FP), AX
34 VMOVDQU64 0(AX), Z3
35 VMOVDQU64 64(AX), Z4
36
37 // Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
38 VPANDQ Z1, Z3, Z1
39 VPANDQ Z2, Z4, Z2
40
41 // Now each bit of Z1+Z2 represents one word of the span.
42 // Thus, each byte covers 64 bytes of memory, which is also how
43 // much we can fix in a Z register.
44 //
45 // We do a load/compress for each 64 byte frame.
46 //
47 // Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
48 VPOPCNTB Z1, Z3 // Requires BITALG
49 VPOPCNTB Z2, Z4
50
51 // Store the scan mask and word counts at 0(SP) and 128(SP).
52 //
53 // TODO: Is it better to read directly from the registers?
54 VMOVDQU64 Z1, 0(SP)
55 VMOVDQU64 Z2, 64(SP)
56 VMOVDQU64 Z3, 128(SP)
57 VMOVDQU64 Z4, 192(SP)
58
59 // SI = Current address in span
60 MOVQ mem+0(FP), SI
61 // DI = Scan buffer base
62 MOVQ bufp+8(FP), DI
63 // DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
64 MOVQ $0, DX
65
66 // AX = address in scan mask, 128(AX) = address in popcount
67 LEAQ 0(SP), AX
68
69 // Loop over the 64 byte frames in this span.
70 // BX = 1 past the end of the scan mask
71 LEAQ 128(SP), BX
72
73 // Align loop to a cache line so that performance is less sensitive
74 // to how this function ends up laid out in memory. This is a hot
75 // function in the GC, and this is a tight loop. We don't want
76 // performance to waver wildly due to unrelated changes.
77 PCALIGN $64
78 loop:
79 // CX = Fetch the mask of words to load from this frame.
80 MOVBQZX 0(AX), CX
81 // Skip empty frames.
82 TESTQ CX, CX
83 JZ skip
84
85 // Load the 64 byte frame.
86 KMOVB CX, K1
87 VMOVDQA64 0(SI), Z1
88
89 // Collect just the pointers from the greyed objects into the scan buffer,
90 // i.e., copy the word indices in the mask from Z1 into contiguous memory.
91 //
92 // N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
93 // AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
94 // imposes a severe performance penalty of around an order of magnitude
95 // compared to a register destination.
96 //
97 // This workaround is unfortunate on other microarchitectures, where a
98 // memory destination is slightly faster than adding an additional move
99 // instruction, but no where near an order of magnitude. It would be
100 // nice to have a Genoa-only variant here.
101 //
102 // AMD Turin / Zen 5 fixes this issue.
103 //
104 // See
105 // https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
106 VPCOMPRESSQ Z1, K1, Z2
107 VMOVDQU64 Z2, (DI)(DX*8)
108
109 // Advance the scan buffer position by the number of pointers.
110 MOVBQZX 128(AX), CX
111 ADDQ CX, DX
112
113 skip:
114 ADDQ $64, SI
115 ADDQ $1, AX
116 CMPQ AX, BX
117 JB loop
118
119 end:
120 MOVL DX, count+40(FP)
121 VZEROUPPER
122 RET