Source file src/cmd/compile/internal/ssa/pair.go

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssa
     6  
     7  import (
     8  	"cmd/compile/internal/ir"
     9  	"cmd/compile/internal/types"
    10  	"cmd/internal/obj"
    11  	"slices"
    12  )
    13  
    14  // The pair pass finds memory operations that can be paired up
    15  // into single 2-register memory instructions.
    16  func pair(f *Func) {
    17  	// Only arm64 for now. This pass is fairly arch-specific.
    18  	switch f.Config.arch {
    19  	case "arm64":
    20  	default:
    21  		return
    22  	}
    23  	pairLoads(f)
    24  	pairStores(f)
    25  }
    26  
// pairableLoadInfo describes how to combine two adjacent loads of a
// given single-register load op into one 2-register load.
type pairableLoadInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op    // the 2-register op that loads both elements at once
}
    31  
// pairableLoads maps each pairable single-register load op to its
// element width and its paired (2-register) replacement op.
//
// All pairableLoad ops must take 2 arguments, a pointer and a memory.
// They must also take an offset in Aux/AuxInt.
// A zero-value lookup (width == 0) means "not pairable".
var pairableLoads = map[Op]pairableLoadInfo{
	OpARM64MOVDload:  {8, OpARM64LDP},
	OpARM64MOVWUload: {4, OpARM64LDPW},
	OpARM64MOVWload:  {4, OpARM64LDPSW},
	// TODO: conceivably we could pair a signed and unsigned load
	// if we knew the upper bits of one of them weren't being used.
	OpARM64FMOVDload: {8, OpARM64FLDPD},
	OpARM64FMOVSload: {4, OpARM64FLDPS},
}
    43  
// pairableStoreInfo describes how to combine two adjacent stores of a
// given single-register store op into one 2-register store.
type pairableStoreInfo struct {
	width int64 // width of one element in the pair, in bytes
	pair  Op    // the 2-register op that stores both elements at once
}
    48  
// pairableStores maps each pairable single-register store op to its
// element width and its paired (2-register) replacement op.
//
// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
// They must also take an offset in Aux/AuxInt.
// A zero-value lookup (width == 0) means "not pairable".
var pairableStores = map[Op]pairableStoreInfo{
	OpARM64MOVDstore:  {8, OpARM64STP},
	OpARM64MOVWstore:  {4, OpARM64STPW},
	OpARM64FMOVDstore: {8, OpARM64FSTPD},
	OpARM64FMOVSstore: {4, OpARM64FSTPS},
}
    58  
// offsetOk returns true if a pair instruction should be used
// for the offset Aux+off, when the data width (of the
// unpaired instructions) is width.
// This function is best-effort. The compiled function must
// still work if offsetOk always returns true.
// TODO: this is currently arm64-specific.
func offsetOk(aux Aux, off, width int64) bool {
	if true {
		// Seems to generate slightly smaller code if we just
		// always allow this rewrite.
		//
		// Without pairing, we have 2 load instructions, like:
		//   LDR 88(R0), R1
		//   LDR 96(R0), R2
		// with pairing we have, best case:
		//   LDP 88(R0), R1, R2
		// but maybe we need an adjuster if out of range or unaligned:
		//   ADD R0, $88, R27
		//   LDP (R27), R1, R2
		// Even with the adjuster, it is at least no worse.
		//
		// A similar situation occurs when accessing globals.
		// Two loads from globals requires 4 instructions,
		// two ADRP and two LDR. With pairing, we need
		// ADRP+ADD+LDP, three instructions.
		//
		// With pairing, it looks like the critical path might
		// be a little bit longer. But it should never be more
		// instructions.
		// TODO: see if that longer critical path causes any
		// regressions.
		return true
	}
	// NOTE: everything below is currently dead code, deliberately kept
	// for experimenting with a range-limited pairing policy (see the
	// unconditional return above).
	if aux != nil {
		if _, ok := aux.(*ir.Name); !ok {
			// Offset is probably too big (globals).
			return false
		}
		// We let *ir.Names pass here, as
		// they are probably small offsets from SP.
		// There's no guarantee that we're in range
		// in that case though (we don't know the
		// stack frame size yet), so the assembler
		// might need to issue fixup instructions.
		// Assume some small frame size.
		if off >= 0 {
			off += 120
		}
		// TODO: figure out how often this helps vs. hurts.
	}
	// Immediate-offset ranges for the paired instructions:
	// the offset must be a multiple of the element width and
	// fit in the scaled 7-bit signed immediate field.
	switch width {
	case 4:
		if off >= -256 && off <= 252 && off%4 == 0 {
			return true
		}
	case 8:
		if off >= -512 && off <= 504 && off%8 == 0 {
			return true
		}
	}
	return false
}
   121  
// pairLoads rewrites pairs of loads at adjacent offsets (from the same
// pointer, with the same memory argument) into single 2-register load
// instructions. It first pairs loads within each block, then tries to
// pair a load in a block with a matching load in a successor block,
// moving the latter up.
func pairLoads(f *Func) {
	var loads []*Value

	// Registry of aux values for sorting.
	// Aux values aren't ordered, so we assign each distinct one a small
	// integer ID (in first-seen order) to get a deterministic sort key.
	auxIDs := map[Aux]int{}
	auxID := func(aux Aux) int {
		id, ok := auxIDs[aux]
		if !ok {
			id = len(auxIDs)
			auxIDs[aux] = id
		}
		return id
	}

	for _, b := range f.Blocks {
		// Find loads.
		loads = loads[:0]
		clear(auxIDs)
		for _, v := range b.Values {
			info := pairableLoads[v.Op]
			if info.width == 0 {
				continue // not pairable
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // not advisable
			}
			loads = append(loads, v)
		}
		if len(loads) < 2 {
			continue
		}

		// Sort to put pairable loads together.
		// After sorting, candidates for pairing are adjacent in the slice.
		slices.SortFunc(loads, func(x, y *Value) int {
			// First sort by op, ptr, and memory arg.
			if x.Op != y.Op {
				return int(x.Op - y.Op)
			}
			if x.Args[0].ID != y.Args[0].ID {
				return int(x.Args[0].ID - y.Args[0].ID)
			}
			if x.Args[1].ID != y.Args[1].ID {
				return int(x.Args[1].ID - y.Args[1].ID)
			}
			// Then sort by aux. (nil first, then by aux ID)
			if x.Aux != nil {
				if y.Aux == nil {
					return 1
				}
				a, b := auxID(x.Aux), auxID(y.Aux)
				if a != b {
					return a - b
				}
			} else if y.Aux != nil {
				return -1
			}
			// Then sort by offset, low to high.
			return int(x.AuxInt - y.AuxInt)
		})

		// Look for pairable loads.
		// Only adjacent entries need checking, thanks to the sort above.
		for i := 0; i < len(loads)-1; i++ {
			x := loads[i]
			y := loads[i+1]
			if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
				continue
			}
			if x.Aux != y.Aux {
				continue
			}
			// y must be exactly one element-width above x.
			if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
				continue
			}

			// Commit point.

			// Make the 2-register load.
			load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

			// Modify x to be (Select0 load). Similar for y.
			x.reset(OpSelect0)
			x.SetArgs1(load)
			y.reset(OpSelect1)
			y.SetArgs1(load)

			i++ // Skip y next time around the loop.
		}
	}

	// Try to pair a load with a load from a subsequent block.
	// Note that this is always safe to do if the memory arguments match.
	// (But see the memory barrier case below.)
	type nextBlockKey struct {
		op     Op
		ptr    ID
		mem    ID
		auxInt int64
		aux    any
	}
	nextBlock := map[nextBlockKey]*Value{}
	for _, b := range f.Blocks {
		if memoryBarrierTest(b) {
			// TODO: Do we really need to skip write barrier test blocks?
			//     type T struct {
			//         a *byte
			//         b int
			//     }
			//     func f(t *T) int {
			//         r := t.b
			//         t.a = nil
			//         return r
			//     }
			// This would issue a single LDP for both the t.a and t.b fields,
			// *before* we check the write barrier flag. (We load the t.a field
			// to put it in the write barrier buffer.) Not sure if that is ok.
			continue
		}
		// Find loads in the next block(s) that we can move to this one.
		// Only successors with a single predecessor are considered, so
		// moving a load up cannot execute it on a path that previously
		// didn't reach it from some other predecessor.
		// TODO: could maybe look further than just one successor hop.
		clear(nextBlock)
		for _, e := range b.Succs {
			if len(e.b.Preds) > 1 {
				continue
			}
			for _, v := range e.b.Values {
				info := pairableLoads[v.Op]
				if info.width == 0 {
					continue
				}
				if !offsetOk(v.Aux, v.AuxInt, info.width) {
					continue // not advisable
				}
				nextBlock[nextBlockKey{op: v.Op, ptr: v.Args[0].ID, mem: v.Args[1].ID, auxInt: v.AuxInt, aux: v.Aux}] = v
			}
		}
		if len(nextBlock) == 0 {
			continue
		}
		// don't move too many loads. Each requires a register across a basic block boundary.
		const maxMoved = 4
		nMoved := 0
		for i := len(b.Values) - 1; i >= 0 && nMoved < maxMoved; i-- {
			x := b.Values[i]
			info := pairableLoads[x.Op]
			if info.width == 0 {
				continue
			}
			if !offsetOk(x.Aux, x.AuxInt, info.width) {
				continue // not advisable
			}
			// First try: a successor load one element-width above x.
			key := nextBlockKey{op: x.Op, ptr: x.Args[0].ID, mem: x.Args[1].ID, auxInt: x.AuxInt + info.width, aux: x.Aux}
			if y := nextBlock[key]; y != nil {
				delete(nextBlock, key)

				// Make the 2-register load.
				load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])

				// Modify x to be (Select0 load).
				x.reset(OpSelect0)
				x.SetArgs1(load)
				// Modify y to be (Copy (Select1 load)).
				// Note: the Select* needs to live in the load's block, not y's block.
				y.reset(OpCopy)
				y.SetArgs1(b.NewValue1(y.Pos, OpSelect1, y.Type, load))
				nMoved++
				continue
			}
			// Second try: a successor load one element-width below x;
			// the pair then starts at y's (lower) offset.
			key.auxInt = x.AuxInt - info.width
			if y := nextBlock[key]; y != nil {
				delete(nextBlock, key)

				// Make the 2-register load.
				load := b.NewValue2IA(x.Pos, info.pair, types.NewTuple(y.Type, x.Type), y.AuxInt, x.Aux, x.Args[0], x.Args[1])

				// Modify x to be (Select1 load).
				x.reset(OpSelect1)
				x.SetArgs1(load)
				// Modify y to be (Copy (Select0 load)).
				y.reset(OpCopy)
				y.SetArgs1(b.NewValue1(y.Pos, OpSelect0, y.Type, load))
				nMoved++
				continue
			}
		}
	}
}
   308  
   309  func memoryBarrierTest(b *Block) bool {
   310  	if b.Kind != BlockARM64NZW {
   311  		return false
   312  	}
   313  	c := b.Controls[0]
   314  	if c.Op != OpARM64MOVWUload {
   315  		return false
   316  	}
   317  	if globl, ok := c.Aux.(*obj.LSym); ok {
   318  		return globl.Name == "runtime.writeBarrier"
   319  	}
   320  	return false
   321  }
   322  
// pairStores rewrites pairs of stores at adjacent offsets (to the same
// pointer, with the same aux) into single 2-register store
// instructions. It walks each block's store chain from last to first,
// and for each pairable store looks back a bounded distance for a
// mergeable partner, checking that any intervening stores don't alias
// the slot the partner would be moved across.
func pairStores(f *Func) {
	// last[id] will mark the final store in each block's memory chain.
	last := f.Cache.allocBoolSlice(f.NumValues())
	defer f.Cache.freeBoolSlice(last)

	// prevStore returns the previous store in the
	// same block, or nil if there are none.
	prevStore := func(v *Value) *Value {
		if v.Op == OpInitMem || v.Op == OpPhi {
			return nil
		}
		m := v.MemoryArg()
		if m.Block != v.Block {
			return nil
		}
		return m
	}

	for _, b := range f.Blocks {
		// Find last store in block, so we can
		// walk the stores last to first.
		// Last to first helps ensure that the rewrites we
		// perform do not get in the way of subsequent rewrites.
		// First mark every memory-producing value ...
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				last[v.ID] = true
			}
		}
		// ... then unmark each one that has a successor in the chain,
		// leaving only the chain's final store marked.
		for _, v := range b.Values {
			if v.Type.IsMemory() {
				if m := prevStore(v); m != nil {
					last[m.ID] = false
				}
			}
		}
		var lastMem *Value
		for _, v := range b.Values {
			if last[v.ID] {
				lastMem = v
				break
			}
		}

		// Check all stores, from last to first.
	memCheck:
		for v := lastMem; v != nil; v = prevStore(v) {
			info := pairableStores[v.Op]
			if info.width == 0 {
				continue // Not pairable.
			}
			if !offsetOk(v.Aux, v.AuxInt, info.width) {
				continue // Not advisable to pair.
			}
			ptr := v.Args[0]
			val := v.Args[1]
			mem := v.Args[2]
			off := v.AuxInt
			aux := v.Aux

			// Look for earlier store we can combine with.
			// lowerOk/higherOk track whether the slot just below/above
			// v is still known not to be clobbered by stores between
			// v and the candidate partner.
			lowerOk := true
			higherOk := true
			count := 10 // max lookback distance
			for w := prevStore(v); w != nil; w = prevStore(w) {
				if w.Uses != 1 {
					// We can't combine stores if the earlier
					// store has any use besides the next one
					// in the store chain.
					// (Unless we could check the aliasing of
					// all those other uses.)
					continue memCheck
				}
				if w.Op == v.Op &&
					w.Args[0] == ptr &&
					w.Aux == aux &&
					(lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
					// This op is mergeable with v.

					// Commit point.

					// ptr val1 val2 mem
					args := []*Value{ptr, val, w.Args[1], mem}
					if w.AuxInt == off-info.width {
						// w stores the lower slot: swap values and
						// use w's (lower) offset for the pair.
						args[1], args[2] = args[2], args[1]
						off -= info.width
					}
					v.reset(info.pair)
					v.AddArgs(args...)
					v.Aux = aux
					v.AuxInt = off
					v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)

					// Make w just a memory copy.
					wmem := w.MemoryArg()
					w.reset(OpCopy)
					w.SetArgs1(wmem)
					continue memCheck
				}
				if count--; count == 0 {
					// Only look back so far.
					// This keeps us in O(n) territory, and it
					// also prevents us from keeping values
					// in registers for too long (and thus
					// needing to spill them).
					continue memCheck
				}
				// We're now looking at a store w which is currently
				// between the store v that we're intending to merge into,
				// and the store we'll eventually find to merge with it.
				// Make sure this store doesn't alias with the one
				// we'll be moving.
				var width int64
				switch w.Op {
				case OpARM64MOVDstore, OpARM64FMOVDstore:
					width = 8
				case OpARM64MOVWstore, OpARM64FMOVSstore:
					width = 4
				case OpARM64MOVHstore:
					width = 2
				case OpARM64MOVBstore:
					width = 1
				case OpCopy:
					continue // this was a store we merged earlier
				default:
					// Can't reorder with any other memory operations.
					// (atomics, calls, ...)
					continue memCheck
				}

				// We only allow reordering with respect to other
				// writes to the same pointer and aux, so we can
				// compute the exact aliasing relationship.
				if w.Args[0] != ptr || w.Aux != aux {
					continue memCheck
				}
				if overlap(w.AuxInt, width, off-info.width, info.width) {
					// Aliases with slot before v's location.
					lowerOk = false
				}
				if overlap(w.AuxInt, width, off+info.width, info.width) {
					// Aliases with slot after v's location.
					higherOk = false
				}
				if !higherOk && !lowerOk {
					continue memCheck
				}
			}
		}
	}
}
   472  

View as plain text