// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Test-only.
9 TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
10 MOVQ sizeClass+0(FP), CX
11 MOVQ packed+8(FP), AX
12
13 // Call the expander for this size class
14 LEAQ ·gcExpandersAVX512(SB), BX
15 MOVQ (BX)(CX*8), DX // Move to register first so -spectre works
16 CALL DX
17
18 MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
19 VMOVDQU64 Z1, 0(DI)
20 VMOVDQU64 Z2, 64(DI)
21 VZEROUPPER
22 RET
23
24 TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
25 // Z1+Z2 = Expand the grey object mask into a grey word mask
26 MOVQ objMarks+16(FP), AX
27 MOVQ sizeClass+24(FP), CX
28 LEAQ ·gcExpandersAVX512(SB), BX
29 MOVQ (BX)(CX*8), DX // Move to register first so -spectre works
30 CALL DX
31
32 // Z3+Z4 = Load the pointer mask
33 MOVQ ptrMask+32(FP), AX
34 VMOVDQU64 0(AX), Z3
35 VMOVDQU64 64(AX), Z4
36
37 // Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
38 VPANDQ Z1, Z3, Z1
39 VPANDQ Z2, Z4, Z2
40
41 // Now each bit of Z1+Z2 represents one word of the span.
42 // Thus, each byte covers 64 bytes of memory, which is also how
43 // much we can fix in a Z register.
44 //
45 // We do a load/compress for each 64 byte frame.
46 //
47 // Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
48 VPOPCNTB Z1, Z3 // Requires BITALG
49 VPOPCNTB Z2, Z4
50
51 // Store the scan mask and word counts at 0(SP) and 128(SP).
52 //
53 // TODO: Is it better to read directly from the registers?
54 VMOVDQU64 Z1, 0(SP)
55 VMOVDQU64 Z2, 64(SP)
56 VMOVDQU64 Z3, 128(SP)
57 VMOVDQU64 Z4, 192(SP)
58
59 // SI = Current address in span
60 MOVQ mem+0(FP), SI
61 // DI = Scan buffer base
62 MOVQ bufp+8(FP), DI
63 // DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
64 MOVQ $0, DX
65
66 // AX = address in scan mask, 128(AX) = address in popcount
67 LEAQ 0(SP), AX
68
69 // Loop over the 64 byte frames in this span.
70 // BX = 1 past the end of the scan mask
71 LEAQ 128(SP), BX
72
73 // Align loop to a cache line so that performance is less sensitive
74 // to how this function ends up laid out in memory. This is a hot
75 // function in the GC, and this is a tight loop. We don't want
76 // performance to waver wildly due to unrelated changes.
77 PCALIGN $64
78 loop:
79 // CX = Fetch the mask of words to load from this frame.
80 MOVBQZX 0(AX), CX
81 // Skip empty frames.
82 TESTQ CX, CX
83 JZ skip
84
85 // Load the 64 byte frame.
86 KMOVB CX, K1
87 VMOVDQA64 0(SI), Z1
88
89 // Collect just the pointers from the greyed objects into the scan buffer,
90 // i.e., copy the word indices in the mask from Z1 into contiguous memory.
91 //
92 // N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
93 // AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
94 // imposes a severe performance penalty of around an order of magnitude
95 // compared to a register destination.
96 //
97 // This workaround is unfortunate on other microarchitectures, where a
98 // memory destination is slightly faster than adding an additional move
99 // instruction, but no where near an order of magnitude. It would be
100 // nice to have a Genoa-only variant here.
101 //
102 // AMD Turin / Zen 5 fixes this issue.
103 //
104 // See
105 // https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
106 VPCOMPRESSQ Z1, K1, Z2
107 VMOVDQU64 Z2, (DI)(DX*8)
108
109 // Advance the scan buffer position by the number of pointers.
110 MOVBQZX 128(AX), CX
111 ADDQ CX, DX
112
113 skip:
114 ADDQ $64, SI
115 ADDQ $1, AX
116 CMPQ AX, BX
117 JB loop
118
119 end:
120 MOVL DX, count+40(FP)
121 VZEROUPPER
122 RET