Source file src/cmd/compile/internal/ssagen/intrinsics.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssagen
     6  
     7  import (
     8  	"fmt"
     9  	"internal/abi"
    10  	"internal/buildcfg"
    11  
    12  	"cmd/compile/internal/base"
    13  	"cmd/compile/internal/ir"
    14  	"cmd/compile/internal/ssa"
    15  	"cmd/compile/internal/typecheck"
    16  	"cmd/compile/internal/types"
    17  	"cmd/internal/sys"
    18  )
    19  
    20  var intrinsics intrinsicBuilders
    21  
    22  // An intrinsicBuilder converts a call node n into an ssa value that
    23  // implements that call as an intrinsic. args is a list of arguments to the func.
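        //
        // For example, the builder registered below for math.sqrt wraps its single
        // argument in one SSA op:
        //
        //	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
        //		return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
        //	}
        //
        // A builder may instead return nil and update s.vars[memVar] when the call is
        // intrinsified only for its effect on memory (see runtime.KeepAlive below).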
    24  type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
    25  
    26  type intrinsicKey struct {
    27  	arch *sys.Arch
    28  	pkg  string
    29  	fn   string
    30  }
    31  
    32  // intrinsicBuildConfig specifies the config to use for intrinsic building.
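        //
        // A config is normally filled in from buildcfg (see initIntrinsics below), but
        // it can also be constructed directly. A minimal sketch with illustrative
        // values:
        //
        //	cfg := &intrinsicBuildConfig{
        //		goamd64:   2,
        //		goriscv64: 22,
        //	}
        //	initIntrinsics(cfg)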
    33  type intrinsicBuildConfig struct {
    34  	instrumenting bool
    35  
    36  	go386     string
    37  	goamd64   int
    38  	goarm     buildcfg.GoarmFeatures
    39  	goarm64   buildcfg.Goarm64Features
    40  	gomips    string
    41  	gomips64  string
    42  	goppc64   int
    43  	goriscv64 int
    44  }
    45  
    46  type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
    47  
    48  // add adds the intrinsic builder b for pkg.fn for the given architecture.
    49  func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
    50  	if _, found := ib[intrinsicKey{arch, pkg, fn}]; found {
    51  		panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
    52  	}
    53  	ib[intrinsicKey{arch, pkg, fn}] = b
    54  }
    55  
    56  // addForArchs adds the intrinsic builder b for pkg.fn for the given architectures.
    57  func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
    58  	for _, arch := range archs {
    59  		ib.add(arch, pkg, fn, b)
    60  	}
    61  }
    62  
    63  // addForFamilies does the same as addForArchs but operates on architecture families.
    64  func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
    65  	for _, arch := range sys.Archs {
    66  		if arch.InFamily(archFamilies...) {
    67  			ib.add(arch, pkg, fn, b)
    68  		}
    69  	}
    70  }
    71  
    72  // alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
    73  // for which targetPkg.targetFn already exists.
    74  func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
    75  	// TODO(jsing): Consider making this work even if the alias is added
    76  	// before the intrinsic.
    77  	aliased := false
    78  	for _, arch := range archs {
    79  		if b := ib.lookup(arch, targetPkg, targetFn); b != nil {
    80  			ib.add(arch, pkg, fn, b)
    81  			aliased = true
    82  		}
    83  	}
    84  	if !aliased {
    85  		panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
    86  	}
    87  }
    88  
    89  // lookup looks up the intrinsic for a pkg.fn on the specified architecture.
    90  func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
    91  	return ib[intrinsicKey{arch, pkg, fn}]
    92  }
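        // Taken together, these methods form a small registry. Once initIntrinsics has
        // run, lookups behave roughly like this (illustrative only):
        //
        //	intrinsics.lookup(sys.ArchAMD64, "math", "sqrt") // non-nil builder
        //	intrinsics.lookup(sys.ArchAMD64, "math", "Hypot") // nil: not intrinsified
        //
        // and alias(pkg, fn, targetPkg, targetFn, archs...) re-registers the builder of
        // targetPkg.targetFn under pkg.fn on each listed architecture that already has one.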
    93  
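        // initIntrinsics initializes the global intrinsics table. If cfg is nil, the
        // configuration is taken from base.Flag and buildcfg; passing a non-nil cfg
        // builds the table for an explicit configuration instead.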
    94  func initIntrinsics(cfg *intrinsicBuildConfig) {
    95  	if cfg == nil {
    96  		cfg = &intrinsicBuildConfig{
    97  			instrumenting: base.Flag.Cfg.Instrumenting,
    98  			go386:         buildcfg.GO386,
    99  			goamd64:       buildcfg.GOAMD64,
   100  			goarm:         buildcfg.GOARM,
   101  			goarm64:       buildcfg.GOARM64,
   102  			gomips:        buildcfg.GOMIPS,
   103  			gomips64:      buildcfg.GOMIPS64,
   104  			goppc64:       buildcfg.GOPPC64,
   105  			goriscv64:     buildcfg.GORISCV64,
   106  		}
   107  	}
   108  	intrinsics = intrinsicBuilders{}
   109  
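        	// Group the architectures by pointer size (p4/p8). lwatomics collects every
        	// architecture except PPC64, the only one that registers separate
        	// acquire/release atomic intrinsics below.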
   110  	var p4 []*sys.Arch
   111  	var p8 []*sys.Arch
   112  	var lwatomics []*sys.Arch
   113  	for _, a := range sys.Archs {
   114  		if a.PtrSize == 4 {
   115  			p4 = append(p4, a)
   116  		} else {
   117  			p8 = append(p8, a)
   118  		}
   119  		if a.Family != sys.PPC64 {
   120  			lwatomics = append(lwatomics, a)
   121  		}
   122  	}
   123  	all := sys.Archs[:]
   124  
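        	// Helpers for registering intrinsics in the global table: add registers a
        	// builder for an explicit list of architectures, addF for whole architecture
        	// families, and alias re-registers an existing builder under another name.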
   125  	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
   126  		intrinsics.addForArchs(pkg, fn, b, archs...)
   127  	}
   128  	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
   129  		intrinsics.addForFamilies(pkg, fn, b, archFamilies...)
   130  	}
   131  	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
   132  		intrinsics.alias(pkg, fn, pkg2, fn2, archs...)
   133  	}
   134  
   135  	/******** runtime ********/
   136  	if !cfg.instrumenting {
   137  		add("runtime", "slicebytetostringtmp",
   138  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   139  				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
   140  				// for the backend instead of slicebytetostringtmp calls
   141  				// when not instrumenting.
   142  				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
   143  			},
   144  			all...)
   145  	}
   146  	addF("internal/runtime/math", "MulUintptr",
   147  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   148  			if s.config.PtrSize == 4 {
   149  				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   150  			}
   151  			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   152  		},
   153  		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
   154  	add("runtime", "KeepAlive",
   155  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   156  			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
   157  			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
   158  			return nil
   159  		},
   160  		all...)
   161  
   162  	addF("runtime", "publicationBarrier",
   163  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   164  			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
   165  			return nil
   166  		},
   167  		sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64)
   168  
   169  	/******** internal/runtime/sys ********/
   170  	add("internal/runtime/sys", "GetCallerPC",
   171  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   172  			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
   173  		},
   174  		all...)
   175  
   176  	add("internal/runtime/sys", "GetCallerSP",
   177  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   178  			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
   179  		},
   180  		all...)
   181  
   182  	add("internal/runtime/sys", "GetClosurePtr",
   183  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   184  			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
   185  		},
   186  		all...)
   187  
   188  	addF("internal/runtime/sys", "Bswap32",
   189  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   190  			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   191  		},
   192  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   193  	addF("internal/runtime/sys", "Bswap64",
   194  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   195  			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   196  		},
   197  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   198  
   199  	addF("runtime", "memequal",
   200  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   201  			return s.newValue4(ssa.OpMemEq, s.f.Config.Types.Bool, args[0], args[1], args[2], s.mem())
   202  		},
   203  		sys.ARM64)
   204  
   205  	if cfg.goppc64 >= 10 {
   206  		// Only intrinsify these on Power10: the byte-reverse instructions that
   207  		// Power10 provides make them worthwhile as intrinsics.
   208  		addF("internal/runtime/sys", "Bswap32",
   209  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   210  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   211  			},
   212  			sys.PPC64)
   213  		addF("internal/runtime/sys", "Bswap64",
   214  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   215  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   216  			},
   217  			sys.PPC64)
   218  	}
   219  
   220  	if cfg.goriscv64 >= 22 {
   221  		addF("internal/runtime/sys", "Bswap32",
   222  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   223  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   224  			},
   225  			sys.RISCV64)
   226  		addF("internal/runtime/sys", "Bswap64",
   227  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   228  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   229  			},
   230  			sys.RISCV64)
   231  	}
   232  
   233  	/****** Prefetch ******/
   234  	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   235  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   236  			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
   237  			return nil
   238  		}
   239  	}
   240  
   241  	// Make Prefetch intrinsics for supported platforms.
   242  	// On unsupported platforms the stub function will be eliminated.
   243  	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
   244  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   245  	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
   246  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   247  
   248  	/******** internal/runtime/atomic ********/
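        	// An atomicOpEmitter emits the SSA for a single atomic operation. The guarded
        	// builders below choose between a baseline op and a CPU-feature variant, then
        	// pass the chosen op to the emitter, which stores the result in s.vars[n]
        	// when needReturn is set.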
   249  	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
   250  
   251  	addF("internal/runtime/atomic", "Load",
   252  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   253  			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   254  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   255  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   256  		},
   257  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   258  	addF("internal/runtime/atomic", "Load8",
   259  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   260  			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
   261  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   262  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   263  		},
   264  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   265  	addF("internal/runtime/atomic", "Load64",
   266  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   267  			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   268  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   269  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   270  		},
   271  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   272  	addF("internal/runtime/atomic", "LoadAcq",
   273  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   274  			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   275  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   276  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   277  		},
   278  		sys.PPC64)
   279  	addF("internal/runtime/atomic", "LoadAcq64",
   280  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   281  			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   282  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   283  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   284  		},
   285  		sys.PPC64)
   286  	addF("internal/runtime/atomic", "Loadp",
   287  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   288  			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
   289  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   290  			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
   291  		},
   292  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   293  
   294  	addF("internal/runtime/atomic", "Store",
   295  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   296  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
   297  			return nil
   298  		},
   299  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   300  	addF("internal/runtime/atomic", "Store8",
   301  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   302  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
   303  			return nil
   304  		},
   305  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   306  	addF("internal/runtime/atomic", "Store64",
   307  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   308  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
   309  			return nil
   310  		},
   311  		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   312  	addF("internal/runtime/atomic", "StorepNoWB",
   313  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   314  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
   315  			return nil
   316  		},
   317  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
   318  	addF("internal/runtime/atomic", "StoreRel",
   319  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   320  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
   321  			return nil
   322  		},
   323  		sys.PPC64)
   324  	addF("internal/runtime/atomic", "StoreRel64",
   325  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   326  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
   327  			return nil
   328  		},
   329  		sys.PPC64)
   330  
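        	// makeAtomicStoreGuardedIntrinsicLoong64 returns a builder that checks
        	// Loong64HasLAM_BH at run time and emits op1 when the feature is present,
        	// falling back to op0 otherwise. The store emitters produce no result value.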
   331  	makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   332  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   333  			// Whether the target supports the atomic instruction is detected dynamically.
   334  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   335  			v := s.load(types.Types[types.TBOOL], addr)
   336  			b := s.endBlock()
   337  			b.Kind = ssa.BlockIf
   338  			b.SetControl(v)
   339  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   340  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   341  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   342  			b.AddEdgeTo(bTrue)
   343  			b.AddEdgeTo(bFalse)
   344  			b.Likely = ssa.BranchLikely
   345  
   346  			// We have the atomic instructions - use them directly.
   347  			s.startBlock(bTrue)
   348  			emit(s, n, args, op1, typ, false)
   349  			s.endBlock().AddEdgeTo(bEnd)
   350  
   351  			// Use original instruction sequence.
   352  			s.startBlock(bFalse)
   353  			emit(s, n, args, op0, typ, false)
   354  			s.endBlock().AddEdgeTo(bEnd)
   355  
   356  			// Merge results.
   357  			s.startBlock(bEnd)
   358  
   359  			return nil
   360  		}
   361  	}
   362  
   363  	atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   364  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   365  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   366  		if needReturn {
   367  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   368  		}
   369  	}
   370  
   371  	addF("internal/runtime/atomic", "Store8",
   372  		makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
   373  		sys.Loong64)
   374  	addF("internal/runtime/atomic", "Store",
   375  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   376  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
   377  			return nil
   378  		},
   379  		sys.Loong64)
   380  	addF("internal/runtime/atomic", "Store64",
   381  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   382  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
   383  			return nil
   384  		},
   385  		sys.Loong64)
   386  
   387  	addF("internal/runtime/atomic", "Xchg8",
   388  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   389  			v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   390  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   391  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   392  		},
   393  		sys.AMD64, sys.PPC64)
   394  	addF("internal/runtime/atomic", "Xchg",
   395  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   396  			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   397  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   398  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   399  		},
   400  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   401  	addF("internal/runtime/atomic", "Xchg64",
   402  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   403  			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   404  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   405  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   406  		},
   407  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   408  
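        	// makeAtomicGuardedIntrinsicARM64common returns a builder that emits the LSE
        	// variant op1 directly when GOARM64 guarantees LSE, and otherwise selects
        	// between op1 and the baseline op0 at run time via ARM64HasATOMICS.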
   409  	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
   410  
   411  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   412  			if cfg.goarm64.LSE {
   413  				emit(s, n, args, op1, typ, needReturn)
   414  			} else {
   415  				// Whether the target supports the atomic instruction is detected dynamically.
   416  				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
   417  				v := s.load(types.Types[types.TBOOL], addr)
   418  				b := s.endBlock()
   419  				b.Kind = ssa.BlockIf
   420  				b.SetControl(v)
   421  				bTrue := s.f.NewBlock(ssa.BlockPlain)
   422  				bFalse := s.f.NewBlock(ssa.BlockPlain)
   423  				bEnd := s.f.NewBlock(ssa.BlockPlain)
   424  				b.AddEdgeTo(bTrue)
   425  				b.AddEdgeTo(bFalse)
   426  				b.Likely = ssa.BranchLikely
   427  
   428  				// We have the atomic instructions - use them directly.
   429  				s.startBlock(bTrue)
   430  				emit(s, n, args, op1, typ, needReturn)
   431  				s.endBlock().AddEdgeTo(bEnd)
   432  
   433  				// Use original instruction sequence.
   434  				s.startBlock(bFalse)
   435  				emit(s, n, args, op0, typ, needReturn)
   436  				s.endBlock().AddEdgeTo(bEnd)
   437  
   438  				// Merge results.
   439  				s.startBlock(bEnd)
   440  			}
   441  			if needReturn {
   442  				return s.variable(n, types.Types[typ])
   443  			} else {
   444  				return nil
   445  			}
   446  		}
   447  	}
   448  	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   449  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
   450  	}
   451  	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   452  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
   453  	}
   454  
   455  	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   456  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   457  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   458  		if needReturn {
   459  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   460  		}
   461  	}
   462  	addF("internal/runtime/atomic", "Xchg8",
   463  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
   464  		sys.ARM64)
   465  	addF("internal/runtime/atomic", "Xchg",
   466  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
   467  		sys.ARM64)
   468  	addF("internal/runtime/atomic", "Xchg64",
   469  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
   470  		sys.ARM64)
   471  
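        	// makeAtomicXchg8GuardedIntrinsicLoong64 returns a builder that uses the
        	// byte-exchange op when Loong64HasLAM_BH is set at run time and otherwise
        	// falls back to calling the pure Go implementation.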
   472  	makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   473  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   474  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   475  			v := s.load(types.Types[types.TBOOL], addr)
   476  			b := s.endBlock()
   477  			b.Kind = ssa.BlockIf
   478  			b.SetControl(v)
   479  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   480  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   481  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   482  			b.AddEdgeTo(bTrue)
   483  			b.AddEdgeTo(bFalse)
   484  			b.Likely = ssa.BranchLikely // most loong64 machines support amswapdb.b
   485  
   486  			// We have the intrinsic - use it directly.
   487  			s.startBlock(bTrue)
   488  			s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   489  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
   490  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
   491  			s.endBlock().AddEdgeTo(bEnd)
   492  
   493  			// Call the pure Go version.
   494  			s.startBlock(bFalse)
   495  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
   496  			s.endBlock().AddEdgeTo(bEnd)
   497  
   498  			// Merge results.
   499  			s.startBlock(bEnd)
   500  			return s.variable(n, types.Types[types.TUINT8])
   501  		}
   502  	}
   503  	addF("internal/runtime/atomic", "Xchg8",
   504  		makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
   505  		sys.Loong64)
   506  
   507  	addF("internal/runtime/atomic", "Xadd",
   508  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   509  			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   510  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   511  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   512  		},
   513  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   514  	addF("internal/runtime/atomic", "Xadd64",
   515  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   516  			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   517  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   518  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   519  		},
   520  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   521  
   522  	addF("internal/runtime/atomic", "Xadd",
   523  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
   524  		sys.ARM64)
   525  	addF("internal/runtime/atomic", "Xadd64",
   526  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
   527  		sys.ARM64)
   528  
   529  	addF("internal/runtime/atomic", "Cas",
   530  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   531  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   532  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   533  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   534  		},
   535  		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   536  	addF("internal/runtime/atomic", "Cas64",
   537  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   538  			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   539  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   540  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   541  		},
   542  		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   543  	addF("internal/runtime/atomic", "CasRel",
   544  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   545  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   546  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   547  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   548  		},
   549  		sys.PPC64)
   550  
   551  	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   552  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   553  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   554  		if needReturn {
   555  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   556  		}
   557  	}
   558  
   559  	addF("internal/runtime/atomic", "Cas",
   560  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
   561  		sys.ARM64)
   562  	addF("internal/runtime/atomic", "Cas64",
   563  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
   564  		sys.ARM64)
   565  
   566  	atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   567  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   568  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   569  		if needReturn {
   570  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   571  		}
   572  	}
   573  
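        	// makeAtomicCasGuardedIntrinsicLoong64 returns a builder that selects between
        	// the LAMCAS variant op1 and the baseline op0 based on a run-time check of
        	// Loong64HasLAMCAS.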
   574  	makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
   575  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   576  			// Whether the target supports the atomic instruction is detected dynamically.
   577  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
   578  			v := s.load(types.Types[types.TBOOL], addr)
   579  			b := s.endBlock()
   580  			b.Kind = ssa.BlockIf
   581  			b.SetControl(v)
   582  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   583  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   584  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   585  			b.AddEdgeTo(bTrue)
   586  			b.AddEdgeTo(bFalse)
   587  			b.Likely = ssa.BranchLikely
   588  
   589  			// We have the atomic instructions - use them directly.
   590  			s.startBlock(bTrue)
   591  			emit(s, n, args, op1, types.TBOOL, true)
   592  			s.endBlock().AddEdgeTo(bEnd)
   593  
   594  			// Use original instruction sequence.
   595  			s.startBlock(bFalse)
   596  			emit(s, n, args, op0, types.TBOOL, true)
   597  			s.endBlock().AddEdgeTo(bEnd)
   598  
   599  			// Merge results.
   600  			s.startBlock(bEnd)
   601  
   602  			return s.variable(n, types.Types[types.TBOOL])
   603  		}
   604  	}
   605  
   606  	addF("internal/runtime/atomic", "Cas",
   607  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
   608  		sys.Loong64)
   609  	addF("internal/runtime/atomic", "Cas64",
   610  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
   611  		sys.Loong64)
   612  
   613  	// Old-style atomic logical operation API (all supported archs except arm64).
   614  	addF("internal/runtime/atomic", "And8",
   615  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   616  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
   617  			return nil
   618  		},
   619  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   620  	addF("internal/runtime/atomic", "And",
   621  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   622  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
   623  			return nil
   624  		},
   625  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   626  	addF("internal/runtime/atomic", "Or8",
   627  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   628  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
   629  			return nil
   630  		},
   631  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   632  	addF("internal/runtime/atomic", "Or",
   633  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   634  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
   635  			return nil
   636  		},
   637  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   638  
   639  	// arm64 always uses the new-style atomic logical operations, for both the
   640  	// old and new style API.
   641  	addF("internal/runtime/atomic", "And8",
   642  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
   643  		sys.ARM64)
   644  	addF("internal/runtime/atomic", "Or8",
   645  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
   646  		sys.ARM64)
   647  	addF("internal/runtime/atomic", "And64",
   648  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
   649  		sys.ARM64)
   650  	addF("internal/runtime/atomic", "And32",
   651  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   652  		sys.ARM64)
   653  	addF("internal/runtime/atomic", "And",
   654  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   655  		sys.ARM64)
   656  	addF("internal/runtime/atomic", "Or64",
   657  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
   658  		sys.ARM64)
   659  	addF("internal/runtime/atomic", "Or32",
   660  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   661  		sys.ARM64)
   662  	addF("internal/runtime/atomic", "Or",
   663  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   664  		sys.ARM64)
   665  
   666  	// New-style atomic logical operations, which return the old memory value.
   667  	addF("internal/runtime/atomic", "And64",
   668  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   669  			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   670  			p0, p1 := s.split(v)
   671  			s.vars[memVar] = p1
   672  			return p0
   673  		},
   674  		sys.AMD64, sys.Loong64)
   675  	addF("internal/runtime/atomic", "And32",
   676  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   677  			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   678  			p0, p1 := s.split(v)
   679  			s.vars[memVar] = p1
   680  			return p0
   681  		},
   682  		sys.AMD64, sys.Loong64)
   683  	addF("internal/runtime/atomic", "Or64",
   684  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   685  			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   686  			p0, p1 := s.split(v)
   687  			s.vars[memVar] = p1
   688  			return p0
   689  		},
   690  		sys.AMD64, sys.Loong64)
   691  	addF("internal/runtime/atomic", "Or32",
   692  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   693  			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   694  			p0, p1 := s.split(v)
   695  			s.vars[memVar] = p1
   696  			return p0
   697  		},
   698  		sys.AMD64, sys.Loong64)
   699  
   700  	// Aliases for atomic load operations
   701  	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
   702  	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
   703  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
   704  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
   705  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
   706  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
   707  	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
   708  	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
   709  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
   710  	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
   711  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
   712  	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
   713  
   714  	// Aliases for atomic store operations
   715  	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
   716  	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
   717  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
   718  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
   719  	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
   720  	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
   721  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
   722  	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
   723  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
   724  	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
   725  
   726  	// Aliases for atomic swap operations
   727  	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
   728  	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
   729  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
   730  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
   731  
   732  	// Aliases for atomic add operations
   733  	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
   734  	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
   735  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
   736  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
   737  
   738  	// Aliases for atomic CAS operations
   739  	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
   740  	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
   741  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
   742  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
   743  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
   744  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
   745  	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
   746  
   747  	// Aliases for atomic And/Or operations
   748  	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
   749  	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
   750  
   751  	/******** math ********/
   752  	addF("math", "sqrt",
   753  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   754  			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
   755  		},
   756  		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
   757  	addF("math", "Trunc",
   758  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   759  			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
   760  		},
   761  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   762  	addF("math", "Ceil",
   763  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   764  			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
   765  		},
   766  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   767  	addF("math", "Floor",
   768  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   769  			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
   770  		},
   771  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   772  	addF("math", "Round",
   773  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   774  			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
   775  		},
   776  		sys.ARM64, sys.PPC64, sys.S390X)
   777  	addF("math", "RoundToEven",
   778  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   779  			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
   780  		},
   781  		sys.ARM64, sys.S390X, sys.Wasm)
   782  	addF("math", "Abs",
   783  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   784  			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
   785  		},
   786  		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
   787  	addF("math", "Copysign",
   788  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   789  			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
   790  		},
   791  		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
   792  	addF("math", "FMA",
   793  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   794  			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   795  		},
   796  		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
   797  	addF("math", "FMA",
   798  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   799  			if cfg.goamd64 >= 3 {
   800  				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   801  			}
   802  
   803  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
   804  			b := s.endBlock()
   805  			b.Kind = ssa.BlockIf
   806  			b.SetControl(v)
   807  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   808  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   809  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   810  			b.AddEdgeTo(bTrue)
   811  			b.AddEdgeTo(bFalse)
   812  			b.Likely = ssa.BranchLikely // >= haswell cpus are common
   813  
   814  			// We have the intrinsic - use it directly.
   815  			s.startBlock(bTrue)
   816  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   817  			s.endBlock().AddEdgeTo(bEnd)
   818  
   819  			// Call the pure Go version.
   820  			s.startBlock(bFalse)
   821  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   822  			s.endBlock().AddEdgeTo(bEnd)
   823  
   824  			// Merge results.
   825  			s.startBlock(bEnd)
   826  			return s.variable(n, types.Types[types.TFLOAT64])
   827  		},
   828  		sys.AMD64)
   829  	addF("math", "FMA",
   830  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   831  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
   832  			v := s.load(types.Types[types.TBOOL], addr)
   833  			b := s.endBlock()
   834  			b.Kind = ssa.BlockIf
   835  			b.SetControl(v)
   836  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   837  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   838  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   839  			b.AddEdgeTo(bTrue)
   840  			b.AddEdgeTo(bFalse)
   841  			b.Likely = ssa.BranchLikely
   842  
   843  			// We have the intrinsic - use it directly.
   844  			s.startBlock(bTrue)
   845  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   846  			s.endBlock().AddEdgeTo(bEnd)
   847  
   848  			// Call the pure Go version.
   849  			s.startBlock(bFalse)
   850  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   851  			s.endBlock().AddEdgeTo(bEnd)
   852  
   853  			// Merge results.
   854  			s.startBlock(bEnd)
   855  			return s.variable(n, types.Types[types.TFLOAT64])
   856  		},
   857  		sys.ARM)
   858  
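        	// makeRoundAMD64 returns a builder for the SSE4.1 rounding ops. With
        	// GOAMD64 >= v2 the op is emitted unconditionally; otherwise the builder
        	// checks X86HasSSE41 at run time and falls back to calling the pure Go
        	// implementation.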
   859  	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   860  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   861  			if cfg.goamd64 >= 2 {
   862  				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   863  			}
   864  
   865  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
   866  			b := s.endBlock()
   867  			b.Kind = ssa.BlockIf
   868  			b.SetControl(v)
   869  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   870  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   871  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   872  			b.AddEdgeTo(bTrue)
   873  			b.AddEdgeTo(bFalse)
   874  			b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
   875  
   876  			// We have the intrinsic - use it directly.
   877  			s.startBlock(bTrue)
   878  			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   879  			s.endBlock().AddEdgeTo(bEnd)
   880  
   881  			// Call the pure Go version.
   882  			s.startBlock(bFalse)
   883  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   884  			s.endBlock().AddEdgeTo(bEnd)
   885  
   886  			// Merge results.
   887  			s.startBlock(bEnd)
   888  			return s.variable(n, types.Types[types.TFLOAT64])
   889  		}
   890  	}
   891  	addF("math", "RoundToEven",
   892  		makeRoundAMD64(ssa.OpRoundToEven),
   893  		sys.AMD64)
   894  	addF("math", "Floor",
   895  		makeRoundAMD64(ssa.OpFloor),
   896  		sys.AMD64)
   897  	addF("math", "Ceil",
   898  		makeRoundAMD64(ssa.OpCeil),
   899  		sys.AMD64)
   900  	addF("math", "Trunc",
   901  		makeRoundAMD64(ssa.OpTrunc),
   902  		sys.AMD64)
   903  
   904  	/******** math/bits ********/
   905  	addF("math/bits", "TrailingZeros64",
   906  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   907  			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   908  		},
   909  		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   910  	addF("math/bits", "TrailingZeros64",
   911  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   912  			lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
   913  			hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
   914  			return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
   915  		},
   916  		sys.I386)
   917  	addF("math/bits", "TrailingZeros32",
   918  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   919  			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   920  		},
   921  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   922  	addF("math/bits", "TrailingZeros16",
   923  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   924  			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   925  		},
   926  		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
   927  	addF("math/bits", "TrailingZeros8",
   928  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   929  			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   930  		},
   931  		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
   932  
   933  	if cfg.goriscv64 >= 22 {
   934  		addF("math/bits", "TrailingZeros64",
   935  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   936  				return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   937  			},
   938  			sys.RISCV64)
   939  		addF("math/bits", "TrailingZeros32",
   940  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   941  				return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   942  			},
   943  			sys.RISCV64)
   944  		addF("math/bits", "TrailingZeros16",
   945  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   946  				return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   947  			},
   948  			sys.RISCV64)
   949  		addF("math/bits", "TrailingZeros8",
   950  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   951  				return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   952  			},
   953  			sys.RISCV64)
   954  	}
   955  
   956  	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
   957  	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
   958  	// ReverseBytes itself inlines correctly (it just calls ReverseBytes64 or ReverseBytes32), so there is no need to intrinsify it.
   959  	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
   960  	addF("math/bits", "ReverseBytes16",
   961  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   962  			return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   963  		},
   964  		sys.Loong64)
   965  	if cfg.goppc64 >= 10 {
   966  		// A 16-bit rotate is not available on PPC64, so on Power10 use the BRH instruction instead.
   967  		addF("math/bits", "ReverseBytes16",
   968  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   969  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
   970  			},
   971  			sys.PPC64)
   972  	}
   973  	if cfg.goriscv64 >= 22 {
   974  		addF("math/bits", "ReverseBytes16",
   975  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   976  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   977  			},
   978  			sys.RISCV64)
   979  	}
   980  
   981  	addF("math/bits", "Len64",
   982  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   983  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
   984  		},
   985  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   986  	addF("math/bits", "Len32",
   987  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   988  			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
   989  		},
   990  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   991  	addF("math/bits", "Len16",
   992  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   993  			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
   994  		},
   995  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   996  	addF("math/bits", "Len8",
   997  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   998  			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
   999  		},
  1000  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
  1001  
  1002  	if cfg.goriscv64 >= 22 {
  1003  		addF("math/bits", "Len64",
  1004  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1005  				return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
  1006  			},
  1007  			sys.RISCV64)
  1008  		addF("math/bits", "Len32",
  1009  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1010  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
  1011  			},
  1012  			sys.RISCV64)
  1013  		addF("math/bits", "Len16",
  1014  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1015  				return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
  1016  			},
  1017  			sys.RISCV64)
  1018  		addF("math/bits", "Len8",
  1019  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1020  				return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
  1021  			},
  1022  			sys.RISCV64)
  1023  	}
  1024  
  1025  	alias("math/bits", "Len", "math/bits", "Len64", p8...)
  1026  	alias("math/bits", "Len", "math/bits", "Len32", p4...)
  1027  
  1028  	// LeadingZeros is handled because it trivially calls Len.
  1029  	addF("math/bits", "Reverse64",
  1030  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1031  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
  1032  		},
  1033  		sys.ARM64, sys.Loong64)
  1034  	addF("math/bits", "Reverse32",
  1035  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1036  			return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
  1037  		},
  1038  		sys.ARM64, sys.Loong64)
  1039  	addF("math/bits", "Reverse16",
  1040  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1041  			return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
  1042  		},
  1043  		sys.ARM64, sys.Loong64)
  1044  	addF("math/bits", "Reverse8",
  1045  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1046  			return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
  1047  		},
  1048  		sys.ARM64, sys.Loong64)
  1049  	addF("math/bits", "Reverse",
  1050  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1051  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
  1052  		},
  1053  		sys.ARM64, sys.Loong64)
  1054  	addF("math/bits", "RotateLeft8",
  1055  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1056  			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
  1057  		},
  1058  		sys.AMD64, sys.RISCV64)
  1059  	addF("math/bits", "RotateLeft16",
  1060  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1061  			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
  1062  		},
  1063  		sys.AMD64, sys.RISCV64)
  1064  	addF("math/bits", "RotateLeft32",
  1065  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1066  			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
  1067  		},
  1068  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1069  	addF("math/bits", "RotateLeft64",
  1070  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1071  			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
  1072  		},
  1073  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1074  	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
  1075  
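        	// makeOnesCountAMD64 returns a builder for the POPCNT-based ops. With
        	// GOAMD64 >= v2 the op is emitted unconditionally; otherwise the builder
        	// checks X86HasPOPCNT at run time and falls back to calling the pure Go
        	// implementation.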
  1076  	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1077  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1078  			if cfg.goamd64 >= 2 {
  1079  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1080  			}
  1081  
  1082  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
  1083  			b := s.endBlock()
  1084  			b.Kind = ssa.BlockIf
  1085  			b.SetControl(v)
  1086  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1087  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1088  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1089  			b.AddEdgeTo(bTrue)
  1090  			b.AddEdgeTo(bFalse)
  1091  			b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
  1092  
  1093  			// We have the intrinsic - use it directly.
  1094  			s.startBlock(bTrue)
  1095  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1096  			s.endBlock().AddEdgeTo(bEnd)
  1097  
  1098  			// Call the pure Go version.
  1099  			s.startBlock(bFalse)
  1100  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1101  			s.endBlock().AddEdgeTo(bEnd)
  1102  
  1103  			// Merge results.
  1104  			s.startBlock(bEnd)
  1105  			return s.variable(n, types.Types[types.TINT])
  1106  		}
  1107  	}
  1108  
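        	// makeOnesCountLoong64 returns a builder that uses the population-count op
        	// (which requires LSX) when Loong64HasLSX is set at run time and otherwise
        	// falls back to calling the pure Go implementation.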
  1109  	makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1110  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1111  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
  1112  			v := s.load(types.Types[types.TBOOL], addr)
  1113  			b := s.endBlock()
  1114  			b.Kind = ssa.BlockIf
  1115  			b.SetControl(v)
  1116  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1117  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1118  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1119  			b.AddEdgeTo(bTrue)
  1120  			b.AddEdgeTo(bFalse)
  1121  			b.Likely = ssa.BranchLikely // most loong64 machines support LSX
  1122  
  1123  			// We have the intrinsic - use it directly.
  1124  			s.startBlock(bTrue)
  1125  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1126  			s.endBlock().AddEdgeTo(bEnd)
  1127  
  1128  			// Call the pure Go version.
  1129  			s.startBlock(bFalse)
  1130  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1131  			s.endBlock().AddEdgeTo(bEnd)
  1132  
  1133  			// Merge results.
  1134  			s.startBlock(bEnd)
  1135  			return s.variable(n, types.Types[types.TINT])
  1136  		}
  1137  	}
  1138  
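        	// makeOnesCountRISCV64 emits the popcount op directly when
        	// cfg.goriscv64 >= 22; otherwise it branches on a runtime check of
        	// RISCV64HasZbb, falling back to the pure Go implementation.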
  1139  	makeOnesCountRISCV64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1140  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1141  			if cfg.goriscv64 >= 22 {
  1142  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1143  			}
  1144  
  1145  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.RISCV64HasZbb, s.sb)
  1146  			v := s.load(types.Types[types.TBOOL], addr)
  1147  			b := s.endBlock()
  1148  			b.Kind = ssa.BlockIf
  1149  			b.SetControl(v)
  1150  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1151  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1152  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1153  			b.AddEdgeTo(bTrue)
  1154  			b.AddEdgeTo(bFalse)
  1155  			b.Likely = ssa.BranchLikely // The majority of RISC-V processors support Zbb.
  1156  
  1157  			// We have the intrinsic - use it directly.
  1158  			s.startBlock(bTrue)
  1159  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1160  			s.endBlock().AddEdgeTo(bEnd)
  1161  
  1162  			// Call the pure Go version.
  1163  			s.startBlock(bFalse)
  1164  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1165  			s.endBlock().AddEdgeTo(bEnd)
  1166  
  1167  			// Merge results.
  1168  			s.startBlock(bEnd)
  1169  			return s.variable(n, types.Types[types.TINT])
  1170  		}
  1171  	}
  1172  
  1173  	addF("math/bits", "OnesCount64",
  1174  		makeOnesCountAMD64(ssa.OpPopCount64),
  1175  		sys.AMD64)
  1176  	addF("math/bits", "OnesCount64",
  1177  		makeOnesCountLoong64(ssa.OpPopCount64),
  1178  		sys.Loong64)
  1179  	addF("math/bits", "OnesCount64",
  1180  		makeOnesCountRISCV64(ssa.OpPopCount64),
  1181  		sys.RISCV64)
  1182  	addF("math/bits", "OnesCount64",
  1183  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1184  			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
  1185  		},
  1186  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1187  	addF("math/bits", "OnesCount32",
  1188  		makeOnesCountAMD64(ssa.OpPopCount32),
  1189  		sys.AMD64)
  1190  	addF("math/bits", "OnesCount32",
  1191  		makeOnesCountLoong64(ssa.OpPopCount32),
  1192  		sys.Loong64)
  1193  	addF("math/bits", "OnesCount32",
  1194  		makeOnesCountRISCV64(ssa.OpPopCount32),
  1195  		sys.RISCV64)
  1196  	addF("math/bits", "OnesCount32",
  1197  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1198  			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
  1199  		},
  1200  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1201  	addF("math/bits", "OnesCount16",
  1202  		makeOnesCountAMD64(ssa.OpPopCount16),
  1203  		sys.AMD64)
  1204  	addF("math/bits", "OnesCount16",
  1205  		makeOnesCountLoong64(ssa.OpPopCount16),
  1206  		sys.Loong64)
  1207  	addF("math/bits", "OnesCount16",
  1208  		makeOnesCountRISCV64(ssa.OpPopCount16),
  1209  		sys.RISCV64)
  1210  	addF("math/bits", "OnesCount16",
  1211  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1212  			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
  1213  		},
  1214  		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
  1215  	addF("math/bits", "OnesCount8",
  1216  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1217  			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
  1218  		},
  1219  		sys.S390X, sys.PPC64, sys.Wasm)
  1220  
  1221  	if cfg.goriscv64 >= 22 {
  1222  		addF("math/bits", "OnesCount8",
  1223  			makeOnesCountRISCV64(ssa.OpPopCount8),
  1224  			sys.RISCV64)
  1225  	}
  1226  
  1227  	alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...)
  1228  
  1229  	add("math/bits", "Mul64",
  1230  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1231  			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
  1232  		},
  1233  		all...)
  1234  	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
  1235  	alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
  1236  	addF("math/bits", "Add64",
  1237  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1238  			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1239  		},
  1240  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1241  	alias("math/bits", "Add", "math/bits", "Add64", p8...)
  1242  	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
  1243  	addF("math/bits", "Sub64",
  1244  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1245  			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1246  		},
  1247  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1248  	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
  1249  	addF("math/bits", "Div64",
  1250  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1251  			// check for divide-by-zero/overflow and panic with appropriate message
  1252  			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
  1253  			s.check(cmpZero, ir.Syms.Panicdivide)
  1254  			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
  1255  			s.check(cmpOverflow, ir.Syms.Panicoverflow)
  1256  			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1257  		},
  1258  		sys.AMD64)
  1259  	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
  1260  
  1261  	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
  1262  	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
  1263  	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
  1264  	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
  1265  	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
  1266  	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
  1267  
  1268  	/******** sync/atomic ********/
  1269  
  1270  	// Note: these are disabled when the race detector is enabled; see findIntrinsic below.
  1271  	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
  1272  	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
  1273  	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
  1274  	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
  1275  	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
  1276  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
  1277  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
  1278  
  1279  	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
  1280  	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
  1281  	// Note: not StorePointer; that needs a write barrier. Same below for {CompareAnd}Swap.
  1282  	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
  1283  	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
  1284  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
  1285  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
  1286  
  1287  	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
  1288  	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
  1289  	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
  1290  	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
  1291  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
  1292  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
  1293  
  1294  	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
  1295  	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
  1296  	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
  1297  	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
  1298  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
  1299  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
  1300  
  1301  	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
  1302  	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
  1303  	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
  1304  	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
  1305  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
  1306  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
  1307  
  1308  	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1309  	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1310  	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1311  	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1312  	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1313  	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1314  	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1315  	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1316  	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1317  	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1318  
  1319  	/******** math/big ********/
  1320  	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
  1321  
  1322  	/******** internal/runtime/maps ********/
  1323  
  1324  	// Important: The intrinsic implementations below return a packed
  1325  	// bitset, while the portable Go implementation uses an unpacked
  1326  	// representation (one bit set in each byte).
  1327  	//
  1328  	// Thus we must replace most bitset methods with implementations that
  1329  	// work with the packed representation.
  1330  	//
  1331  	// TODO(prattmic): The bitset implementations don't use SIMD, so they
  1332  	// could be handled with build tags (though that would break
  1333  	// -d=ssa/intrinsics/off=1).
  1334  
  1335  	// With a packed representation we no longer need to shift the result
  1336  	// of TrailingZeros64.
  1337  	alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
  1338  
  1339  	addF("internal/runtime/maps", "bitsetRemoveBelow",
  1340  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1341  			b := args[0]
  1342  			i := args[1]
  1343  
  1344  			// Clear the lower i bits in b.
  1345  			//
  1346  			// out = b &^ ((1 << i) - 1)
  1347  
  1348  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1349  
  1350  			mask := s.newValue2(ssa.OpLsh64x8, types.Types[types.TUINT64], one, i)
  1351  			mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
  1352  			mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
  1353  
  1354  			return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
  1355  		},
  1356  		sys.AMD64)
  1357  
  1358  	addF("internal/runtime/maps", "bitsetLowestSet",
  1359  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1360  			b := args[0]
  1361  
  1362  			// Test the lowest bit in b.
  1363  			//
  1364  			// out = (b & 1) == 1
  1365  
  1366  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1367  			and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
  1368  			return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
  1369  		},
  1370  		sys.AMD64)
  1371  
  1372  	addF("internal/runtime/maps", "bitsetShiftOutLowest",
  1373  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1374  			b := args[0]
  1375  
  1376  			// Right shift out the lowest bit in b.
  1377  			//
  1378  			// out = b >> 1
  1379  
  1380  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1381  			return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
  1382  		},
  1383  		sys.AMD64)
  1384  
  1385  	addF("internal/runtime/maps", "ctrlGroupMatchH2",
  1386  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1387  			g := args[0]
  1388  			h := args[1]
  1389  
  1390  			// Explicit copies to fp registers. See
  1391  			// https://go.dev/issue/70451.
  1392  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1393  			hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
  1394  
  1395  			// Broadcast h2 into each byte of a word.
  1396  			var broadcast *ssa.Value
  1397  			if buildcfg.GOAMD64 >= 4 {
  1398  				// VPBROADCASTB saves 1 instruction vs PSHUFB
  1399  				// because the input can come from a GP
  1400  				// register, while PSHUFB requires moving into
  1401  				// an FP register first.
  1402  				//
  1403  				// Nominally PSHUFB would require a second
  1404  				// additional instruction to load the control
  1405  				// mask into an FP register. But broadcast uses
  1406  				// a control mask of 0, and the register ABI
  1407  				// already defines X15 as a zero register.
  1408  				broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
  1409  			} else if buildcfg.GOAMD64 >= 2 {
  1410  				// PSHUFB performs a byte broadcast when given
  1411  				// a control input of 0.
  1412  				broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
  1413  			} else {
  1414  				// No direct byte broadcast. First we must
  1415  				// duplicate the lower byte and then do a
  1416  				// 16-bit broadcast.
  1417  
  1418  				// "Unpack" h2 with itself. This duplicates the
  1419  				// input, resulting in h2 in the lower two
  1420  				// bytes.
  1421  				unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
  1422  
  1423  				// Copy the lower 16-bits of unpack into every
  1424  				// 16-bit slot in the lower 64-bits of the
  1425  				// output register. Note that immediate 0
  1426  				// selects the low word as the source for every
  1427  				// destination slot.
  1428  				broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
  1429  
  1430  				// No need to broadcast into the upper 64-bits,
  1431  				// as we don't use those.
  1432  			}
  1433  
  1434  			// Compare each byte of the control word with h2. Each
  1435  			// matching byte has every bit set.
  1436  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
  1437  
  1438  			// Construct a "byte mask": each output bit is equal to
  1439  			// the sign bit of each input byte.
  1440  			//
  1441  			// This results in a packed output (bit N set means
  1442  			// byte N matched).
  1443  			//
  1444  			// NOTE: See comment above on bitsetFirst.
  1445  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1446  
  1447  			// g is only 64-bits so the upper 64-bits of the
  1448  			// 128-bit register will be zero. If h2 is also zero,
  1449  			// then we'll get matches on those bytes. Truncate the
  1450  			// upper bits to ignore such matches.
  1451  			ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1452  
  1453  			return ret
  1454  		},
  1455  		sys.AMD64)
  1456  
  1457  	addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
  1458  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1459  			// An empty slot is   1000 0000
  1460  			// A deleted slot is  1111 1110
  1461  			// A full slot is     0??? ????
  1462  
  1463  			g := args[0]
  1464  
  1465  			// Explicit copy to fp register. See
  1466  			// https://go.dev/issue/70451.
  1467  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1468  
  1469  			if buildcfg.GOAMD64 >= 2 {
  1470  				// "PSIGNB negates each data element of the
  1471  				// destination operand (the first operand) if
  1472  				// the signed integer value of the
  1473  				// corresponding data element in the source
  1474  				// operand (the second operand) is less than
  1475  				// zero. If the signed integer value of a data
  1476  				// element in the source operand is positive,
  1477  				// the corresponding data element in the
  1478  				// destination operand is unchanged. If a data
  1479  				// element in the source operand is zero, the
  1480  				// corresponding data element in the
  1481  				// destination operand is set to zero" - Intel SDM
  1482  				//
  1483  				// If we pass the group control word as both
  1484  				// arguments:
  1485  				// - Full slots are unchanged.
  1486  				// - Deleted slots are negated, becoming
  1487  				//   0000 0010.
  1488  				// - Empty slots are negated, becoming
  1489  				//   1000 0000 (unchanged!).
  1490  				//
  1491  				// The result is that only empty slots have the
  1492  				// sign bit set. We then use PMOVMSKB to
  1493  				// extract the sign bits.
  1494  				sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
  1495  
  1496  				// Construct a "byte mask": each output bit is
  1497  				// equal to the sign bit of each input byte. The
  1498  				// sign bit is only set for empty or deleted
  1499  				// slots.
  1500  				//
  1501  				// This results in a packed output (bit N set
  1502  				// means byte N matched).
  1503  				//
  1504  				// NOTE: See comment above on bitsetFirst.
  1505  				ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)
  1506  
  1507  				// g is only 64-bits so the upper 64-bits of
  1508  				// the 128-bit register will be zero. PSIGNB
  1509  				// will keep all of these bytes zero, so no
  1510  				// need to truncate.
  1511  
  1512  				return ret
  1513  			}
  1514  
  1515  			// No PSIGNB, simply do byte equality with ctrlEmpty.
  1516  
  1517  			// Load ctrlEmpty into each byte of a control word.
  1518  			var ctrlsEmpty uint64 = abi.MapCtrlEmpty
  1519  			e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
  1520  			// Explicit copy to fp register. See
  1521  			// https://go.dev/issue/70451.
  1522  			efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
  1523  
  1524  			// Compare each byte of the control word with ctrlEmpty. Each
  1525  			// matching byte has every bit set.
  1526  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
  1527  
  1528  			// Construct a "byte mask": each output bit is equal to
  1529  			// the sign bit of each input byte.
  1530  			//
  1531  			// This results in a packed output (bit N set means
  1532  			// byte N matched).
  1533  			//
  1534  			// NOTE: See comment above on bitsetFirst.
  1535  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1536  
  1537  			// g is only 64-bits so the upper 64-bits of the
  1538  			// 128-bit register will be zero. The upper 64-bits of
  1539  			// efp are also zero, so we'll get matches on those
  1540  			// bytes. Truncate the upper bits to ignore such
  1541  			// matches.
  1542  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1543  		},
  1544  		sys.AMD64)
  1545  
  1546  	addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
  1547  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1548  			// An empty slot is   1000 0000
  1549  			// A deleted slot is  1111 1110
  1550  			// A full slot is     0??? ????
  1551  			//
  1552  			// A slot is empty or deleted iff bit 7 (sign bit) is
  1553  			// set.
  1554  
  1555  			g := args[0]
  1556  
  1557  			// Explicit copy to fp register. See
  1558  			// https://go.dev/issue/70451.
  1559  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1560  
  1561  			// Construct a "byte mask": each output bit is equal to
  1562  			// the sign bit of each input byte. The sign bit is only
  1563  			// set for empty or deleted slots.
  1564  			//
  1565  			// This results in a packed output (bit N set means
  1566  			// byte N matched).
  1567  			//
  1568  			// NOTE: See comment above on bitsetFirst.
  1569  			ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1570  
  1571  			// g is only 64-bits so the upper 64-bits of the
  1572  			// 128-bit register will be zero. Zero will never match
  1573  			// ctrlEmpty or ctrlDeleted, so no need to truncate.
  1574  
  1575  			return ret
  1576  		},
  1577  		sys.AMD64)
  1578  
  1579  	addF("internal/runtime/maps", "ctrlGroupMatchFull",
  1580  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1581  			// An empty slot is   1000 0000
  1582  			// A deleted slot is  1111 1110
  1583  			// A full slot is     0??? ????
  1584  			//
  1585  			// A slot is full iff bit 7 (sign bit) is unset.
  1586  
  1587  			g := args[0]
  1588  
  1589  			// Explicit copy to fp register. See
  1590  			// https://go.dev/issue/70451.
  1591  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1592  
  1593  			// Construct a "byte mask": each output bit is equal to
  1594  			// the sign bit of each input byte. The sign bit is only
  1595  			// set for empty or deleted slots.
  1596  			//
  1597  			// This results in a packed output (bit N set means
  1598  			// byte N matched).
  1599  			//
  1600  			// NOTE: See comment above on bitsetFirst.
  1601  			mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1602  
  1603  			// Invert the mask to set the bits for the full slots.
  1604  			out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)
  1605  
  1606  			// g is only 64-bits so the upper 64-bits of the
  1607  			// 128-bit register will be zero, with bit 7 unset.
  1608  			// Truncate the upper bits to ignore these.
  1609  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1610  		},
  1611  		sys.AMD64)
  1612  
  1613  	/******** crypto/internal/constanttime ********/
  1614  	// We implement a superset of the Select promise:
  1615  	// this intrinsic returns x if v != 0 and y if v == 0.
  1616  	add("crypto/internal/constanttime", "Select",
  1617  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1618  			v, x, y := args[0], args[1], args[2]
  1619  
  1620  			var checkOp ssa.Op
  1621  			var zero *ssa.Value
  1622  			switch s.config.PtrSize {
  1623  			case 8:
  1624  				checkOp = ssa.OpNeq64
  1625  				zero = s.constInt64(types.Types[types.TINT], 0)
  1626  			case 4:
  1627  				checkOp = ssa.OpNeq32
  1628  				zero = s.constInt32(types.Types[types.TINT], 0)
  1629  			default:
  1630  				panic("unreachable")
  1631  			}
  1632  			check := s.newValue2(checkOp, types.Types[types.TBOOL], zero, v)
  1633  
  1634  			return s.newValue3(ssa.OpCondSelect, types.Types[types.TINT], x, y, check)
  1635  		},
  1636  		sys.ArchAMD64, sys.ArchARM64, sys.ArchLoong64, sys.ArchPPC64, sys.ArchPPC64LE, sys.ArchWasm) // all with CMOV support.
  1637  	add("crypto/internal/constanttime", "boolToUint8",
  1638  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1639  			return s.newValue1(ssa.OpCvtBoolToUint8, types.Types[types.TUINT8], args[0])
  1640  		},
  1641  		all...)
  1642  
  1643  	if buildcfg.Experiment.SIMD {
  1644  		// Only enable these intrinsics when the SIMD experiment is on.
  1645  		simdIntrinsics(addF)
  1646  
  1647  		addF(simdPackage, "ClearAVXUpperBits",
  1648  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1649  				s.vars[memVar] = s.newValue1(ssa.OpAMD64VZEROUPPER, types.TypeMem, s.mem())
  1650  				return nil
  1651  			},
  1652  			sys.AMD64)
  1653  
  1654  		addF(simdPackage, "Int8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1655  		addF(simdPackage, "Int16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1656  		addF(simdPackage, "Int32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1657  		addF(simdPackage, "Int64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1658  		addF(simdPackage, "Uint8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1659  		addF(simdPackage, "Uint16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1660  		addF(simdPackage, "Uint32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1661  		addF(simdPackage, "Uint64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1662  		addF(simdPackage, "Int8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1663  		addF(simdPackage, "Int16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1664  		addF(simdPackage, "Int32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1665  		addF(simdPackage, "Int64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1666  		addF(simdPackage, "Uint8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1667  		addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1668  		addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1669  		addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1670  
  1671  		// sfp4 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
  1672  		sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) {
  1673  			addF(simdPackage, method,
  1674  				func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1675  					x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5]
  1676  					if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 {
  1677  						z := select4FromPair(x, a, b, c, d, y, s, hwop, vectype)
  1678  						if z != nil {
  1679  							return z
  1680  						}
  1681  					}
  1682  					return s.callResult(n, callNormal)
  1683  				},
  1684  				sys.AMD64)
  1685  		}
  1686  
  1687  		sfp4("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128)
  1688  		sfp4("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128)
  1689  		sfp4("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128)
  1690  
  1691  		sfp4("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256)
  1692  		sfp4("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256)
  1693  		sfp4("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256)
  1694  
  1695  		sfp4("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512)
  1696  		sfp4("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512)
  1697  		sfp4("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512)
  1698  
  1699  		// sfp2 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
  1700  		sfp2 := func(method string, hwop ssa.Op, vectype *types.Type, cscimm func(i, j uint8) int64) {
  1701  			addF(simdPackage, method,
  1702  				func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1703  					x, a, b, y := args[0], args[1], args[2], args[3]
  1704  					if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 {
  1705  						z := select2FromPair(x, a, b, y, s, hwop, vectype, cscimm)
  1706  						if z != nil {
  1707  							return z
  1708  						}
  1709  					}
  1710  					return s.callResult(n, callNormal)
  1711  				},
  1712  				sys.AMD64)
  1713  		}
  1714  
  1715  		sfp2("Uint64x2.SelectFromPair", ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, cscimm2)
  1716  		sfp2("Int64x2.SelectFromPair", ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, cscimm2)
  1717  		sfp2("Float64x2.SelectFromPair", ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, cscimm2)
  1718  
  1719  		sfp2("Uint64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, cscimm2g2)
  1720  		sfp2("Int64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, cscimm2g2)
  1721  		sfp2("Float64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, cscimm2g2)
  1722  
  1723  		sfp2("Uint64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, cscimm2g4)
  1724  		sfp2("Int64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, cscimm2g4)
  1725  		sfp2("Float64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, cscimm2g4)
  1726  
  1727  	}
  1728  }
  1729  
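        // cscimm4 packs four 2-bit lane selectors into an 8-bit shuffle
        // immediate (a in bits 0-1 through d in bits 6-7), sign-extended via se
        // for use as an AuxInt.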
  1730  func cscimm4(a, b, c, d uint8) int64 {
  1731  	return se(a + b<<2 + c<<4 + d<<6)
  1732  }
  1733  
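        // cscimm2 packs two 1-bit lane selectors into a shuffle immediate
        // (a in bit 0, b in bit 1), sign-extended via se for use as an AuxInt.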
  1734  func cscimm2(a, b uint8) int64 {
  1735  	return se(a + b<<1)
  1736  }
  1737  
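        // cscimm2g2 duplicates the two-lane immediate from cscimm2 into the next
        // two bits, so the same selection applies to both 128-bit groups of a
        // 256-bit grouped shuffle.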
  1738  func cscimm2g2(a, b uint8) int64 {
  1739  	g := cscimm2(a, b)
  1740  	return int64(int8(g + g<<2))
  1741  }
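        // cscimm2g4 duplicates the two-group immediate from cscimm2g2 into the
        // next four bits, extending the same selection to all four 128-bit
        // groups of a 512-bit grouped shuffle.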
  1742  
  1743  func cscimm2g4(a, b uint8) int64 {
  1744  	g := cscimm2g2(a, b)
  1745  	return int64(int8(g + g<<4))
  1746  }
  1747  
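        // The constants below name the source patterns for a constant shuffle of
        // the concatenated pair (x, y): each letter, starting with the lowest
        // result lane, records whether that lane selects from the low half
        // (L, an index into x) or the high half (H, an index into y).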
  1748  const (
  1749  	_LLLL = iota
  1750  	_HLLL
  1751  	_LHLL
  1752  	_HHLL
  1753  	_LLHL
  1754  	_HLHL
  1755  	_LHHL
  1756  	_HHHL
  1757  	_LLLH
  1758  	_HLLH
  1759  	_LHLH
  1760  	_HHLH
  1761  	_LLHH
  1762  	_HLHH
  1763  	_LHHH
  1764  	_HHHH
  1765  )
  1766  
  1767  const (
  1768  	_LL = iota
  1769  	_HL
  1770  	_LH
  1771  	_HH
  1772  )
  1773  
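        // select2FromPair builds a two-lane constant shuffle of the pair (x, y).
        // _a and _b hold the constant lane indices (0-3, where 0-1 select from x
        // and 2-3 select from y). It returns nil when an index is out of range,
        // letting the caller fall back to the non-intrinsic call.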
  1774  func select2FromPair(x, _a, _b, y *ssa.Value, s *state, op ssa.Op, t *types.Type, csc func(a, b uint8) int64) *ssa.Value {
  1775  	a, b := uint8(_a.AuxInt8()), uint8(_b.AuxInt8())
  1776  	if a > 3 || b > 3 {
  1777  		return nil
  1778  	}
  1779  	pattern := (a&2)>>1 + (b & 2)
  1780  	a, b = a&1, b&1
  1781  
  1782  	switch pattern {
  1783  	case _LL:
  1784  		return s.newValue2I(op, t, csc(a, b), x, x)
  1785  	case _HH:
  1786  		return s.newValue2I(op, t, csc(a, b), y, y)
  1787  	case _LH:
  1788  		return s.newValue2I(op, t, csc(a, b), x, y)
  1789  	case _HL:
  1790  		return s.newValue2I(op, t, csc(a, b), y, x)
  1791  	}
  1792  	panic("The preceding switch should have been exhaustive")
  1793  }
  1794  
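        // select4FromPair builds a four-lane constant shuffle of the pair (x, y).
        // _a through _d hold the constant lane indices (0-7, where 0-3 select
        // from x and 4-7 select from y). Patterns that a single shuffle
        // immediate cannot express are lowered to a pair of shuffles. It returns
        // nil when an index is out of range.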
  1795  func select4FromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value {
  1796  	a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8())
  1797  	if a > 7 || b > 7 || c > 7 || d > 7 {
  1798  		return nil
  1799  	}
  1800  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
  1801  
  1802  	a, b, c, d = a&3, b&3, c&3, d&3
  1803  
  1804  	switch pattern {
  1805  	case _LLLL:
  1806  		// TODO DETECT 0,1,2,3, 0,0,0,0
  1807  		return s.newValue2I(op, t, cscimm4(a, b, c, d), x, x)
  1808  	case _HHHH:
  1809  		// TODO DETECT 0,1,2,3, 0,0,0,0
  1810  		return s.newValue2I(op, t, cscimm4(a, b, c, d), y, y)
  1811  	case _LLHH:
  1812  		return s.newValue2I(op, t, cscimm4(a, b, c, d), x, y)
  1813  	case _HHLL:
  1814  		return s.newValue2I(op, t, cscimm4(a, b, c, d), y, x)
  1815  
  1816  	case _HLLL:
  1817  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
  1818  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
  1819  	case _LHLL:
  1820  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
  1821  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
  1822  	case _HLHH:
  1823  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
  1824  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
  1825  	case _LHHH:
  1826  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
  1827  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
  1828  
  1829  	case _LLLH:
  1830  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
  1831  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
  1832  	case _LLHL:
  1833  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
  1834  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
  1835  
  1836  	case _HHLH:
  1837  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
  1838  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)
  1839  
  1840  	case _HHHL:
  1841  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
  1842  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)
  1843  
  1844  	case _LHLH:
  1845  		z := s.newValue2I(op, t, cscimm4(a, c, b, d), x, y)
  1846  		return s.newValue2I(op, t, se(0b11_01_10_00), z, z)
  1847  	case _HLHL:
  1848  		z := s.newValue2I(op, t, cscimm4(b, d, a, c), x, y)
  1849  		return s.newValue2I(op, t, se(0b01_11_00_10), z, z)
  1850  	case _HLLH:
  1851  		z := s.newValue2I(op, t, cscimm4(b, c, a, d), x, y)
  1852  		return s.newValue2I(op, t, se(0b11_01_00_10), z, z)
  1853  	case _LHHL:
  1854  		z := s.newValue2I(op, t, cscimm4(a, d, b, c), x, y)
  1855  		return s.newValue2I(op, t, se(0b01_11_10_00), z, z)
  1856  	}
  1857  	panic("The preceding switch should have been exhaustive")
  1858  }
  1859  
  1860  // se smears the not-really-a-sign bit of a uint8 to conform to the conventions
  1861  // for representing AuxInt in ssa.
  1862  func se(x uint8) int64 {
  1863  	return int64(int8(x))
  1864  }
  1865  
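        // opLen1 and the other opLenN helpers return intrinsic builders that
        // emit a single op over the first N call arguments; the variants with a
        // numeric suffix (e.g. _21, _231) pass the arguments to the op in a
        // permuted order.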
  1866  func opLen1(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1867  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1868  		return s.newValue1(op, t, args[0])
  1869  	}
  1870  }
  1871  
  1872  func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1873  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1874  		return s.newValue2(op, t, args[0], args[1])
  1875  	}
  1876  }
  1877  
  1878  func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1879  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1880  		return s.newValue2(op, t, args[1], args[0])
  1881  	}
  1882  }
  1883  
  1884  func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1885  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1886  		return s.newValue3(op, t, args[0], args[1], args[2])
  1887  	}
  1888  }
  1889  
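        // ssaVecBySize maps a SIMD value's size in bytes to the corresponding
        // SSA vector type.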
  1890  var ssaVecBySize = map[int64]*types.Type{
  1891  	16: types.TypeVec128,
  1892  	32: types.TypeVec256,
  1893  	64: types.TypeVec512,
  1894  }
  1895  
  1896  func opLen3_31Zero3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1897  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1898  		if t, ok := ssaVecBySize[args[1].Type.Size()]; !ok {
  1899  			panic("unknown simd vector size")
  1900  		} else {
  1901  			return s.newValue3(op, t, s.newValue0(ssa.OpZeroSIMD, t), args[1], args[0])
  1902  		}
  1903  	}
  1904  }
  1905  
  1906  func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1907  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1908  		return s.newValue3(op, t, args[1], args[0], args[2])
  1909  	}
  1910  }
  1911  
  1912  func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1913  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1914  		return s.newValue3(op, t, args[2], args[0], args[1])
  1915  	}
  1916  }
  1917  
  1918  func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1919  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1920  		return s.newValue4(op, t, args[0], args[1], args[2], args[3])
  1921  	}
  1922  }
  1923  
  1924  func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1925  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1926  		return s.newValue4(op, t, args[2], args[0], args[1], args[3])
  1927  	}
  1928  }
  1929  
  1930  func opLen4_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1931  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1932  		return s.newValue4(op, t, args[2], args[1], args[0], args[3])
  1933  	}
  1934  }
  1935  
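        // immJumpTable lowers an intrinsic whose immediate operand idx is not a
        // compile-time constant: it emits a 256-way jump table on the uint8 idx,
        // invokes genOp to fill in the block for each possible value, and merges
        // the results in a common successor block.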
  1936  func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp func(*state, int)) *ssa.Value {
  1937  	// Make blocks we'll need.
  1938  	bEnd := s.f.NewBlock(ssa.BlockPlain)
  1939  
  1940  	if !idx.Type.IsKind(types.TUINT8) {
  1941  		panic("immJumpTable expects uint8 value")
  1942  	}
  1943  
  1944  	// We will exhaust 0-255, so no need to check the bounds.
  1945  	t := types.Types[types.TUINTPTR]
  1946  	idx = s.conv(nil, idx, idx.Type, t)
  1947  
  1948  	b := s.curBlock
  1949  	b.Kind = ssa.BlockJumpTable
  1950  	b.Pos = intrinsicCall.Pos()
  1951  	if base.Flag.Cfg.SpectreIndex {
  1952  		// Potential Spectre vulnerability hardening?
  1953  		idx = s.newValue2(ssa.OpSpectreSliceIndex, t, idx, s.uintptrConstant(255))
  1954  	}
  1955  	b.SetControl(idx)
  1956  	targets := [256]*ssa.Block{}
  1957  	for i := range 256 {
  1958  		t := s.f.NewBlock(ssa.BlockPlain)
  1959  		targets[i] = t
  1960  		b.AddEdgeTo(t)
  1961  	}
  1962  	s.endBlock()
  1963  
  1964  	for i, t := range targets {
  1965  		s.startBlock(t)
  1966  		genOp(s, i)
  1967  		if t.Kind != ssa.BlockExit {
  1968  			t.AddEdgeTo(bEnd)
  1969  		}
  1970  		s.endBlock()
  1971  	}
  1972  
  1973  	s.startBlock(bEnd)
  1974  	ret := s.variable(intrinsicCall, intrinsicCall.Type())
  1975  	return ret
  1976  }
  1977  
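        // opLen1Imm8 and the other opLenNImm8 helpers build intrinsics that take
        // an 8-bit immediate. A constant immediate is folded into the op's
        // AuxInt (shifted by offset); a non-constant immediate is lowered
        // through immJumpTable.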
  1978  func opLen1Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1979  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1980  		if args[1].Op == ssa.OpConst8 {
  1981  			return s.newValue1I(op, t, args[1].AuxInt<<int64(offset), args[0])
  1982  		}
  1983  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  1984  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  1985  			s.vars[n] = sNew.newValue1I(op, t, int64(int8(idx<<offset)), args[0])
  1986  		})
  1987  	}
  1988  }
  1989  
  1990  func opLen2Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1991  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1992  		if args[1].Op == ssa.OpConst8 {
  1993  			return s.newValue2I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2])
  1994  		}
  1995  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  1996  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  1997  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[2])
  1998  		})
  1999  	}
  2000  }
  2001  
  2002  func opLen3Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2003  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2004  		if args[1].Op == ssa.OpConst8 {
  2005  			return s.newValue3I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3])
  2006  		}
  2007  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  2008  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2009  			s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3])
  2010  		})
  2011  	}
  2012  }
  2013  
  2014  func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2015  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2016  		if args[2].Op == ssa.OpConst8 {
  2017  			return s.newValue2I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1])
  2018  		}
  2019  		return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
  2020  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2021  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[1])
  2022  		})
  2023  	}
  2024  }
  2025  
  2026  // Two immediates instead of just one. The offset is ignored, so it is a _ parameter.
  2027  func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2028  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2029  		if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 {
  2030  			i1, i2 := args[1].AuxInt, args[2].AuxInt
  2031  			return s.newValue2I(op, t, int64(int8(i1+i2<<4)), args[0], args[3])
  2032  		}
  2033  		four := s.constInt64(types.Types[types.TUINT8], 4)
  2034  		shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four)
  2035  		combined := s.newValue2(ssa.OpAdd8, types.Types[types.TUINT8], args[1], shifted)
  2036  		return immJumpTable(s, combined, n, func(sNew *state, idx int) {
  2037  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2038  			// TODO for "zeroing" values, panic instead.
  2039  			if idx & ^(3+3<<4) == 0 {
  2040  				s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3])
  2041  			} else {
  2042  				sNew.rtcall(ir.Syms.PanicSimdImm, false, nil)
  2043  			}
  2044  		})
  2045  	}
  2046  }
  2047  
  2048  // The assembler requires the imm value of a SHA1RNDS4 instruction to be one of 0, 1, 2 or 3, so the immediate is masked to two bits.
  2049  func opLen2Imm8_SHA1RNDS4(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2050  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2051  		if args[1].Op == ssa.OpConst8 {
  2052  			return s.newValue2I(op, t, (args[1].AuxInt<<int64(offset))&0b11, args[0], args[2])
  2053  		}
  2054  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  2055  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2056  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset))&0b11, args[0], args[2])
  2057  		})
  2058  	}
  2059  }
  2060  
  2061  func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2062  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2063  		if args[2].Op == ssa.OpConst8 {
  2064  			return s.newValue3I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1], args[3])
  2065  		}
  2066  		return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
  2067  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2068  			s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[1], args[3])
  2069  		})
  2070  	}
  2071  }
  2072  
  2073  func opLen4Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2074  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2075  		if args[1].Op == ssa.OpConst8 {
  2076  			return s.newValue4I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3], args[4])
  2077  		}
  2078  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
  2079  			// Encode as int8 due to requirement of AuxInt, check its comment for details.
  2080  			s.vars[n] = sNew.newValue4I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3], args[4])
  2081  		})
  2082  	}
  2083  }
  2084  
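        // simdLoad and simdStore build unmasked SIMD loads and stores: simdLoad
        // loads a value of the call's result type from the address in args[0],
        // and simdStore stores args[0] to the address in args[1].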
  2085  func simdLoad() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2086  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2087  		return s.newValue2(ssa.OpLoad, n.Type(), args[0], s.mem())
  2088  	}
  2089  }
  2090  
  2091  func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2092  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2093  		s.store(args[0].Type, args[1], args[0])
  2094  		return nil
  2095  	}
  2096  }
  2097  
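        // cvtVToMaskOpcodes and cvtMaskToVOpcodes map an element bit width and
        // lane count to the ops that convert between vector and mask
        // representations.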
  2098  var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{
  2099  	8:  {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64},
  2100  	16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32},
  2101  	32: {4: ssa.OpCvt8toMask32x4, 8: ssa.OpCvt8toMask32x8, 16: ssa.OpCvt16toMask32x16},
  2102  	64: {2: ssa.OpCvt8toMask64x2, 4: ssa.OpCvt8toMask64x4, 8: ssa.OpCvt8toMask64x8},
  2103  }
  2104  
  2105  var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{
  2106  	8:  {16: ssa.OpCvtMask8x16to16, 32: ssa.OpCvtMask8x32to32, 64: ssa.OpCvtMask8x64to64},
  2107  	16: {8: ssa.OpCvtMask16x8to8, 16: ssa.OpCvtMask16x16to16, 32: ssa.OpCvtMask16x32to32},
  2108  	32: {4: ssa.OpCvtMask32x4to8, 8: ssa.OpCvtMask32x8to8, 16: ssa.OpCvtMask32x16to16},
  2109  	64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8},
  2110  }
  2111  
  2112  func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2113  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2114  		op := cvtVToMaskOpcodes[elemBits][lanes]
  2115  		if op == 0 {
  2116  			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
  2117  		}
  2118  		return s.newValue1(op, types.TypeMask, args[0])
  2119  	}
  2120  }
  2121  
  2122  func simdCvtMaskToV(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2123  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2124  		op := cvtMaskToVOpcodes[elemBits][lanes]
  2125  		if op == 0 {
  2126  			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
  2127  		}
  2128  		return s.newValue1(op, n.Type(), args[0])
  2129  	}
  2130  }
  2131  
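        // simdMaskedLoad and simdMaskedStore build masked SIMD loads and stores
        // for the given op, threading the memory state through s.mem().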
  2132  func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2133  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2134  		return s.newValue3(op, n.Type(), args[0], args[1], s.mem())
  2135  	}
  2136  }
  2137  
  2138  func simdMaskedStore(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2139  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2140  		s.vars[memVar] = s.newValue4A(op, types.TypeMem, args[0].Type, args[1], args[2], args[0], s.mem())
  2141  		return nil
  2142  	}
  2143  }
  2144  
  2145  // findIntrinsic returns a function which builds the SSA equivalent of the
  2146  // function identified by the symbol sym. If sym does not identify an intrinsic, it returns nil.
  2147  func findIntrinsic(sym *types.Sym) intrinsicBuilder {
  2148  	if sym == nil || sym.Pkg == nil {
  2149  		return nil
  2150  	}
  2151  	pkg := sym.Pkg.Path
  2152  	if sym.Pkg == ir.Pkgs.Runtime {
  2153  		pkg = "runtime"
  2154  	}
  2155  	if base.Flag.Race && pkg == "sync/atomic" {
  2156  		// The race detector needs to be able to intercept these calls.
  2157  		// We can't intrinsify them.
  2158  		return nil
  2159  	}
  2160  	// Skip intrinsifying math functions (which may contain hard-float
  2161  	// instructions) when building in soft-float mode.
  2162  	if Arch.SoftFloat && pkg == "math" {
  2163  		return nil
  2164  	}
  2165  
  2166  	fn := sym.Name
  2167  	if ssa.IntrinsicsDisable {
  2168  		if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr") ||
  2169  			pkg == simdPackage {
  2170  			// These runtime functions don't have definitions, must be intrinsics.
  2171  		} else {
  2172  			return nil
  2173  		}
  2174  	}
  2175  	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
  2176  }
  2177  
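        // IsIntrinsicCall reports whether n is a call (including a method
        // expression call) that the compiler can lower as an intrinsic.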
  2178  func IsIntrinsicCall(n *ir.CallExpr) bool {
  2179  	if n == nil {
  2180  		return false
  2181  	}
  2182  	name, ok := n.Fun.(*ir.Name)
  2183  	if !ok {
  2184  		if n.Fun.Op() == ir.OMETHEXPR {
  2185  			if meth := ir.MethodExprName(n.Fun); meth != nil {
  2186  				if fn := meth.Func; fn != nil {
  2187  					return IsIntrinsicSym(fn.Sym())
  2188  				}
  2189  			}
  2190  		}
  2191  		return false
  2192  	}
  2193  	return IsIntrinsicSym(name.Sym())
  2194  }
  2195  
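        // IsIntrinsicSym reports whether sym names a function that is
        // intrinsified on the current architecture.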
  2196  func IsIntrinsicSym(sym *types.Sym) bool {
  2197  	return findIntrinsic(sym) != nil
  2198  }
  2199  
  2200  // GenIntrinsicBody generates the function body for a bodyless intrinsic.
  2201  // This is needed when the intrinsic is used in a non-call context, e.g.
  2202  // as a function pointer, or (for a method) being referenced from the type
  2203  // descriptor.
  2204  //
  2205  // The compiler already recognizes a call to fn as an intrinsic and can
  2206  // directly generate code for it. So we just fill in the body with a call
  2207  // to fn.
  2208  func GenIntrinsicBody(fn *ir.Func) {
  2209  	if ir.CurFunc != nil {
  2210  		base.FatalfAt(fn.Pos(), "enqueueFunc %v inside %v", fn, ir.CurFunc)
  2211  	}
  2212  
  2213  	if base.Flag.LowerR != 0 {
  2214  		fmt.Println("generate intrinsic for", ir.FuncName(fn))
  2215  	}
  2216  
  2217  	pos := fn.Pos()
  2218  	ft := fn.Type()
  2219  	var ret ir.Node
  2220  
  2221  	// For a method, it usually starts with an ODOTMETH (pre-typecheck) or
  2222  	// OMETHEXPR (post-typecheck) referencing the method symbol without the
  2223  	// receiver type, and Walk rewrites it to a call directly to the
  2224  	// type-qualified method symbol, moving the receiver to an argument.
  2225  // Here fn already has the type-qualified method symbol, and it is hard
  2226  	// to get the unqualified symbol. So we just generate the post-Walk form
  2227  	// and mark it typechecked and Walked.
  2228  	call := ir.NewCallExpr(pos, ir.OCALLFUNC, fn.Nname, nil)
  2229  	call.Args = ir.RecvParamNames(ft)
  2230  	call.IsDDD = ft.IsVariadic()
  2231  	typecheck.Exprs(call.Args)
  2232  	call.SetTypecheck(1)
  2233  	call.SetWalked(true)
  2234  	ret = call
  2235  	if ft.NumResults() > 0 {
  2236  		if ft.NumResults() == 1 {
  2237  			call.SetType(ft.Result(0).Type)
  2238  		} else {
  2239  			call.SetType(ft.ResultsTuple())
  2240  		}
  2241  		n := ir.NewReturnStmt(base.Pos, nil)
  2242  		n.Results = []ir.Node{call}
  2243  		ret = n
  2244  	}
  2245  	fn.Body.Append(ret)
  2246  
  2247  	if base.Flag.LowerR != 0 {
  2248  		ir.DumpList("generate intrinsic body", fn.Body)
  2249  	}
  2250  
  2251  	ir.CurFunc = fn
  2252  	typecheck.Stmts(fn.Body)
  2253  	ir.CurFunc = nil // we know CurFunc is nil at entry
  2254  }
  2255  
