// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build amd64 || arm64 || loong64

// This provides common support for architectures that use extended register
// state in asynchronous preemption.
//
// While asynchronous preemption stores general-purpose (GP) registers on the
// preempted goroutine's own stack, extended register state can be used to save
// non-GP state off the stack. In particular, this is meant for large vector
// register files. This memory is conservatively scanned to enable using
// non-GP registers for operations that may involve pointers.
//
// For an architecture to support extended register state, it must provide a Go
// definition of an xRegs type for storing the state, and its asyncPreempt
// implementation must write this register state to p.xRegs.scratch.

package runtime

import (
	"internal/abi"
	"internal/runtime/sys"
	"unsafe"
)

// xRegState is long-lived extended register state. It is allocated off-heap and
// manually managed.
//
// Blocks are obtained from xRegAlloc in xRegSave and handed back to it by
// (*xRegPerP).free. The register contents themselves live in regs, whose
// layout is given by the xRegs type.
type xRegState struct {
	_    sys.NotInHeap // Allocated from xRegAlloc
	regs xRegs
}

// xRegPerG stores extended register state while a goroutine is asynchronously
// preempted. The state pointer is nil otherwise, so we can reuse a (likely
// small) pool of xRegState objects.
type xRegPerG struct {
	// state is set by xRegSave and cleared again by xRegRestore.
	state *xRegState
}

// xRegPerP is the per-P side of extended register state handling for
// asynchronous preemption: a scratch save area plus a 1-element cache of
// xRegState blocks.
type xRegPerP struct {
	// scratch is temporary per-P space where [asyncPreempt] saves the register
	// state before entering Go. It's quickly copied to per-G state.
	scratch xRegs

	// cache is a 1-element allocation cache of extended register state used by
	// asynchronous preemption. On entry to preemption, this is used as a simple
	// allocation cache. On exit from preemption, the G's xRegState is always
	// stored here where it can be restored, and later either freed or reused
	// for another preemption. On exit, this serves the dual purpose of
	// delay-freeing the allocated xRegState until after we've definitely
	// restored it.
	cache *xRegState
}

// xRegAlloc allocates xRegState objects.
//
// Both fields are set up by xRegInitAlloc. Every call to alloc.alloc and
// alloc.free in this file is made with lock held.
var xRegAlloc struct {
	lock  mutex
	alloc fixalloc
}

func xRegInitAlloc() {
	lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
	xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
}

// xRegSave saves the extended register state on this P to gp.
//
// The state is read from the P's scratch area (gp.m.p.xRegs.scratch) and
// copied into an xRegState block, which is then attached to gp as
// gp.xRegs.state. The block is taken from the P's 1-element cache when one is
// available and is otherwise allocated from xRegAlloc.
//
// It throws if gp already has saved state attached.
//
// This must run on the system stack because it assumes the P won't change.
//
//go:systemstack
func xRegSave(gp *g) {
	if gp.xRegs.state != nil {
		// Double preempt?
		throw("gp.xRegs.state != nil on async preempt")
	}

	// Get the place to save the register state, preferring the block cached
	// on the P over a fresh allocation.
	pp := gp.m.p.ptr()
	dest := pp.xRegs.cache
	if dest != nil {
		// Take ownership of the cached allocation.
		pp.xRegs.cache = nil
	} else {
		// Allocate a new save block. The allocator is shared, so it is only
		// touched with xRegAlloc.lock held.
		lock(&xRegAlloc.lock)
		dest = (*xRegState)(xRegAlloc.alloc.alloc())
		unlock(&xRegAlloc.lock)
	}

	// Copy state saved in the scratchpad to dest.
	//
	// If we ever need to save less state (e.g., avoid saving vector registers
	// that aren't in use), we could have multiple allocation pools for
	// different size states and copy only the registers we need.
	dest.regs = pp.xRegs.scratch

	// Save on the G.
	gp.xRegs.state = dest
}

// xRegRestore prepares the extended register state on gp to be restored.
//
// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find
// it. This means nothing else may use the cache between this call and the
// return to asyncPreempt. This is not quite symmetric with [xRegSave], which
// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy.
//
// If the P already has a block cached, that block is freed first. On return,
// gp.xRegs.state is nil. It throws if gp has no saved state.
//
// This is called with asyncPreempt on the stack and thus must not grow the
// stack.
//
//go:nosplit
func xRegRestore(gp *g) {
	if gp.xRegs.state == nil {
		throw("gp.xRegs.state == nil on return from async preempt")
	}
	// If the P has a block cached on it, free that so we can replace it.
	pp := gp.m.p.ptr()
	if pp.xRegs.cache != nil {
		// Don't grow the G stack: free takes xRegAlloc.lock and calls into
		// the allocator, so run it on the system stack.
		systemstack(func() {
			pp.xRegs.free()
		})
	}
	// Hand the G's block over to the P; the G no longer owns any state.
	pp.xRegs.cache = gp.xRegs.state
	gp.xRegs.state = nil
}

// free returns the cached xRegState block, if any, to xRegAlloc and clears the
// cache. It is a no-op when nothing is cached.
func (xRegs *xRegPerP) free() {
	if xRegs.cache == nil {
		// Nothing cached, nothing to release.
		return
	}

	// The allocator is shared, so hold its lock while handing the block back.
	lock(&xRegAlloc.lock)
	xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache))
	xRegs.cache = nil
	unlock(&xRegAlloc.lock)
}

// xRegScan conservatively scans the extended register state saved on gp.
//
// If gp has no saved state, the stack is walked looking for a debugCallV2
// frame: when one is found the scan is skipped, otherwise this throws.
//
// This is supposed to be called only by scanstack when it handles async preemption.
func xRegScan(gp *g, gcw *gcWork, state *stackScanState) {
	saved := gp.xRegs.state
	if saved == nil {
		// Regular async preemption always provides the extended register
		// state, so a missing state is only tolerated when the stack
		// contains a debugCallV2 frame.
		var u unwinder
		u.init(gp, 0)
		for ; u.valid(); u.next() {
			fn := u.frame.fn
			if fn.valid() && fn.funcID == abi.FuncID_debugCallV2 {
				return
			}
		}
		println("runtime: gp=", gp, ", goid=", gp.goid)
		throw("gp.xRegs.state == nil on a scanstack attempt during async preemption")
	}

	// Treat the whole saved register block as one conservatively-scanned
	// memory range.
	start := uintptr(unsafe.Pointer(&saved.regs))
	size := unsafe.Sizeof(saved.regs)
	if debugScanConservative {
		print("begin scan xRegs of goroutine ", gp.goid, " at [", hex(start), ",", hex(start+size), ")\n")
	}
	scanConservative(start, size, nil, gcw, state)
	if debugScanConservative {
		print("end scan xRegs of goroutine ", gp.goid, "\n")
	}
}