Source file src/cmd/compile/internal/ssa/rewritetern.go

     1  // Copyright 2025 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssa
     6  
     7  import (
     8  	"internal/goarch"
     9  	"slices"
    10  )
    11  
    12  var truthTableValues [3]uint8 = [3]uint8{0b1111_0000, 0b1100_1100, 0b1010_1010}
    13  
    14  func (slop SIMDLogicalOP) String() string {
    15  	if slop == sloInterior {
    16  		return "leaf"
    17  	}
    18  	interior := ""
    19  	if slop&sloInterior != 0 {
    20  		interior = "+interior"
    21  	}
    22  	switch slop &^ sloInterior {
    23  	case sloAnd:
    24  		return "and" + interior
    25  	case sloXor:
    26  		return "xor" + interior
    27  	case sloOr:
    28  		return "or" + interior
    29  	case sloAndNot:
    30  		return "andNot" + interior
    31  	case sloNot:
    32  		return "not" + interior
    33  	}
    34  	return "wrong"
    35  }
    36  
    37  func rewriteTern(f *Func) {
    38  	if f.maxCPUFeatures == CPUNone {
    39  		return
    40  	}
    41  
    42  	arch := f.Config.Ctxt().Arch.Family
    43  	// TODO there are other SIMD architectures
    44  	if arch != goarch.AMD64 {
    45  		return
    46  	}
    47  
    48  	boolExprTrees := make(map[*Value]SIMDLogicalOP)
    49  
    50  	// Find logical-expr expression trees, including leaves.
    51  	// interior nodes will be marked sloInterior,
    52  	// root nodes will not be marked sloInterior,
    53  	// leaf nodes are only marked sloInterior.
    54  	for _, b := range f.Blocks {
    55  		for _, v := range b.Values {
    56  			slo := classifyBooleanSIMD(v)
    57  			switch slo {
    58  			case sloOr,
    59  				sloAndNot,
    60  				sloXor,
    61  				sloAnd:
    62  				boolExprTrees[v.Args[1]] |= sloInterior
    63  				fallthrough
    64  			case sloNot:
    65  				boolExprTrees[v.Args[0]] |= sloInterior
    66  				boolExprTrees[v] |= slo
    67  			}
    68  		}
    69  	}
    70  
    71  	// get a canonical sorted set of roots
    72  	var roots []*Value
    73  	for v, slo := range boolExprTrees {
    74  		if f.pass.debug > 1 {
    75  			f.Warnl(v.Pos, "%s has SLO %v", v.LongString(), slo)
    76  		}
    77  
    78  		if slo&sloInterior == 0 && v.Block.CPUfeatures.hasFeature(CPUavx512) {
    79  			roots = append(roots, v)
    80  		}
    81  	}
    82  	slices.SortFunc(roots, func(u, v *Value) int { return int(u.ID - v.ID) }) // IDs are small enough to not care about overflow.
    83  
    84  	// This rewrite works by iterating over the root set.
    85  	// For each boolean expression, it walks the expression
    86  	// bottom up accumulating sets of variables mentioned in
    87  	// subexpressions, lazy-greedily finding the largest subexpressions
    88  	// of 3 inputs that can be rewritten to use ternary-truth-table instructions.
    89  
    90  	// rewrite recursively attempts to replace v and v's subexpressions with
    91  	// ternary-logic truth-table operations, returning a set of not more than 3
    92  	// subexpressions within v that may be combined into a parent's replacement.
    93  	// V need not have the CPU features that allow a ternary-logic operation;
    94  	// in that case, v will not be rewritten.  Replacements also require
    95  	// exactly 3 different variable inputs to a boolean expression.
    96  	//
    97  	// Given the CPU feature and 3 inputs, v is replaced in the following
    98  	// cases:
    99  	//
   100  	// 1) v is a root
   101  	// 2) u = NOT(v) and u lacks the CPU feature
   102  	// 3) u = OP(v, w) and u lacks the CPU feature
   103  	// 4) u = OP(v, w) and u has more than 3 variable inputs.	var rewrite func(v *Value) [3]*Value
   104  	var rewrite func(v *Value) [3]*Value
   105  
   106  	// computeTT returns the truth table for a boolean expression
   107  	// over the variables in vars, where vars[0] varies slowest in
   108  	// the truth table and vars[2] varies fastest.
   109  	// e.g. computeTT( "and(x, or(y, not(z)))", {x,y,z} ) returns
   110  	// (bit 0 first) 0 0 0 0 1 0 1 1 = (reversed) 1101_0000 = 0xD0
   111  	//            x: 0 0 0 0 1 1 1 1
   112  	//            y: 0 0 1 1 0 0 1 1
   113  	//            z: 0 1 0 1 0 1 0 1
   114  	var computeTT func(v *Value, vars [3]*Value) uint8
   115  
   116  	// combine two sets of variables into one, returning ok/not
   117  	// if the two sets contained 3 or fewer elements.  Combine
   118  	// ensures that the sets of Values never contain duplicates.
   119  	// (Duplicates would create less-efficient code, not incorrect code.)
   120  	combine := func(a, b [3]*Value) ([3]*Value, bool) {
   121  		var c [3]*Value
   122  		i := 0
   123  		for _, v := range a {
   124  			if v == nil {
   125  				break
   126  			}
   127  			c[i] = v
   128  			i++
   129  		}
   130  	bloop:
   131  		for _, v := range b {
   132  			if v == nil {
   133  				break
   134  			}
   135  			for _, u := range a {
   136  				if v == u {
   137  					continue bloop
   138  				}
   139  			}
   140  			if i == 3 {
   141  				return [3]*Value{}, false
   142  			}
   143  			c[i] = v
   144  			i++
   145  		}
   146  		return c, true
   147  	}
   148  
   149  	computeTT = func(v *Value, vars [3]*Value) uint8 {
   150  		i := 0
   151  		for ; i < len(vars); i++ {
   152  			if vars[i] == v {
   153  				return truthTableValues[i]
   154  			}
   155  		}
   156  		slo := boolExprTrees[v] &^ sloInterior
   157  		a := computeTT(v.Args[0], vars)
   158  		switch slo {
   159  		case sloNot:
   160  			return ^a
   161  		case sloAnd:
   162  			return a & computeTT(v.Args[1], vars)
   163  		case sloXor:
   164  			return a ^ computeTT(v.Args[1], vars)
   165  		case sloOr:
   166  			return a | computeTT(v.Args[1], vars)
   167  		case sloAndNot:
   168  			return a & ^computeTT(v.Args[1], vars)
   169  		}
   170  		panic("switch should have covered all cases, or unknown var in logical expression")
   171  	}
   172  
   173  	replace := func(a0 *Value, vars0 [3]*Value) {
   174  		imm := computeTT(a0, vars0)
   175  		op := ternOpForLogical(a0.Op)
   176  		if op == a0.Op {
   177  			if f.pass.debug > 0 {
   178  				f.Warnl(a0.Pos, "Skipping rewrite for %s, op=%v", a0.LongString(), op)
   179  			}
   180  			return
   181  		}
   182  		if f.pass.debug > 0 {
   183  			f.Warnl(a0.Pos, "Rewriting %s into %v of 0b%b %v %v %v", a0.LongString(), op, imm,
   184  				vars0[0], vars0[1], vars0[2])
   185  		}
   186  		a0.reset(op)
   187  		a0.SetArgs3(vars0[0], vars0[1], vars0[2])
   188  		a0.AuxInt = int64(int8(imm))
   189  	}
   190  
   191  	// addOne ensures the no-duplicates addition of a single value
   192  	// to a set that is not full.  It seems possible that a shared
   193  	// subexpression in tricky combination with blocks lacking the
   194  	// AVX512 feature might permit this.
   195  	addOne := func(vars [3]*Value, v *Value) [3]*Value {
   196  		if vars[2] != nil {
   197  			panic("rewriteTern.addOne, vars[2] should be nil")
   198  		}
   199  		if v == vars[0] || v == vars[1] {
   200  			return vars
   201  		}
   202  		if vars[1] == nil {
   203  			vars[1] = v
   204  		} else {
   205  			vars[2] = v
   206  		}
   207  		return vars
   208  	}
   209  
   210  	rewrite = func(v *Value) [3]*Value {
   211  		slo := boolExprTrees[v]
   212  		if slo == sloInterior { // leaf node, i.e., a "variable"
   213  			return [3]*Value{v, nil, nil}
   214  		}
   215  		var vars [3]*Value
   216  		hasFeature := v.Block.CPUfeatures.hasFeature(CPUavx512)
   217  		if slo&sloNot == sloNot {
   218  			vars = rewrite(v.Args[0])
   219  			if !hasFeature {
   220  				if vars[2] != nil {
   221  					replace(v.Args[0], vars)
   222  					return [3]*Value{v, nil, nil}
   223  				}
   224  				return vars
   225  			}
   226  		} else {
   227  			var ok bool
   228  			a0, a1 := v.Args[0], v.Args[1]
   229  			vars0 := rewrite(a0)
   230  			vars1 := rewrite(a1)
   231  			vars, ok = combine(vars0, vars1)
   232  
   233  			if f.pass.debug > 1 {
   234  				f.Warnl(a0.Pos, "combine(%v, %v) -> %v, %v", vars0, vars1, vars, ok)
   235  			}
   236  
   237  			if !(ok && v.Block.CPUfeatures.hasFeature(CPUavx512)) {
   238  				// too many variables, or cannot rewrite current values.
   239  				// rewrite one or both subtrees if possible
   240  				if vars0[2] != nil && a0.Block.CPUfeatures.hasFeature(CPUavx512) {
   241  					replace(a0, vars0)
   242  				}
   243  				if vars1[2] != nil && a1.Block.CPUfeatures.hasFeature(CPUavx512) {
   244  					replace(a1, vars1)
   245  				}
   246  
   247  				// 3-element var arrays are either rewritten, or unable to be rewritten
   248  				// because of the features in effect in their block.  Either way, they
   249  				// are treated as a "new var" if 3 elements are present.
   250  
   251  				if vars0[2] == nil {
   252  					if vars1[2] == nil {
   253  						// both subtrees are 2-element and were not rewritten.
   254  						//
   255  						// TODO a clever person would look at subtrees of inputs,
   256  						// e.g. rewrite
   257  						//        ((a AND b) XOR b) XOR (d  XOR (c AND d))
   258  						// to    (((a AND b) XOR b) XOR  d) XOR (c AND d)
   259  						// to v = TERNLOG(truthtable, a, b, d) XOR (c AND d)
   260  						// and return the variable set {v, c, d}
   261  						//
   262  						// But for now, just restart with a0 and a1.
   263  						return [3]*Value{a0, a1, nil}
   264  					} else {
   265  						// a1 (maybe) rewrote, a0 has room for another var
   266  						vars = addOne(vars0, a1)
   267  					}
   268  				} else if vars1[2] == nil {
   269  					// a0 (maybe) rewrote, a1 has room for another var
   270  					vars = addOne(vars1, a0)
   271  				} else if !ok {
   272  					// both (maybe) rewrote
   273  					// a0 and a1 are different because otherwise their variable
   274  					// sets would have combined "ok".
   275  					return [3]*Value{a0, a1, nil}
   276  				}
   277  				// continue with either the vars from "ok" or the updated set of vars.
   278  			}
   279  		}
   280  		// if root and 3 vars and hasFeature, rewrite.
   281  		if slo&sloInterior == 0 && vars[2] != nil && hasFeature {
   282  			replace(v, vars)
   283  			return [3]*Value{v, nil, nil}
   284  		}
   285  		return vars
   286  	}
   287  
   288  	for _, v := range roots {
   289  		if f.pass.debug > 1 {
   290  			f.Warnl(v.Pos, "SLO root %s", v.LongString())
   291  		}
   292  		rewrite(v)
   293  	}
   294  }
   295  

View as plain text