Source file src/cmd/compile/internal/ssagen/intrinsics.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssagen
     6  
     7  import (
     8  	"fmt"
     9  	"internal/abi"
    10  	"internal/buildcfg"
    11  
    12  	"cmd/compile/internal/base"
    13  	"cmd/compile/internal/ir"
    14  	"cmd/compile/internal/ssa"
    15  	"cmd/compile/internal/typecheck"
    16  	"cmd/compile/internal/types"
    17  	"cmd/internal/sys"
    18  )
    19  
    20  var intrinsics intrinsicBuilders
    21  
    22  // An intrinsicBuilder converts a call node n into an ssa value that
    23  // implements that call as an intrinsic. args is a list of arguments to the func.
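        //
        // For illustration, a minimal builder (the same shape as the math.sqrt
        // builder registered below) looks like:
        //
        //	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
        //		return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
        //	}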
    24  type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
    25  
    26  type intrinsicKey struct {
    27  	arch *sys.Arch
    28  	pkg  string
    29  	fn   string
    30  }
    31  
    32  // intrinsicBuildConfig specifies the config to use for intrinsic building.
    33  type intrinsicBuildConfig struct {
    34  	instrumenting bool
    35  
    36  	go386     string
    37  	goamd64   int
    38  	goarm     buildcfg.GoarmFeatures
    39  	goarm64   buildcfg.Goarm64Features
    40  	gomips    string
    41  	gomips64  string
    42  	goppc64   int
    43  	goriscv64 int
    44  }
    45  
    46  type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
    47  
    48  // add adds the intrinsic builder b for pkg.fn for the given architecture.
    49  func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
    50  	if _, found := ib[intrinsicKey{arch, pkg, fn}]; found {
    51  		panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
    52  	}
    53  	ib[intrinsicKey{arch, pkg, fn}] = b
    54  }
    55  
    56  // addForArchs adds the intrinsic builder b for pkg.fn for the given architectures.
    57  func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
    58  	for _, arch := range archs {
    59  		ib.add(arch, pkg, fn, b)
    60  	}
    61  }
    62  
    63  // addForFamilies does the same as addForArchs but operates on architecture families.
    64  func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
    65  	for _, arch := range sys.Archs {
    66  		if arch.InFamily(archFamilies...) {
    67  			ib.add(arch, pkg, fn, b)
    68  		}
    69  	}
    70  }
    71  
    72  // alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
    73  // for which targetPkg.targetFn already exists.
    74  func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
    75  	// TODO(jsing): Consider making this work even if the alias is added
    76  	// before the intrinsic.
    77  	aliased := false
    78  	for _, arch := range archs {
    79  		if b := ib.lookup(arch, targetPkg, targetFn); b != nil {
    80  			ib.add(arch, pkg, fn, b)
    81  			aliased = true
    82  		}
    83  	}
    84  	if !aliased {
    85  		panic(fmt.Sprintf("attempted to alias %s.%s to undefined intrinsic %s.%s", pkg, fn, targetPkg, targetFn))
    86  	}
    87  }
    88  
    89  // lookup looks up the intrinsic for a pkg.fn on the specified architecture.
    90  func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
    91  	return intrinsics[intrinsicKey{arch, pkg, fn}]
    92  }
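
        // For illustration only, registration and lookup compose as follows; the
        // real lookup happens when the SSA backend reaches a call to pkg.fn:
        //
        //	intrinsics.add(arch, "math", "sqrt", b) // register a builder
        //	if b := intrinsics.lookup(arch, "math", "sqrt"); b != nil {
        //		return b(s, n, args) // emit SSA in place of the call
        //	}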
    93  
    94  func initIntrinsics(cfg *intrinsicBuildConfig) {
    95  	if cfg == nil {
    96  		cfg = &intrinsicBuildConfig{
    97  			instrumenting: base.Flag.Cfg.Instrumenting,
    98  			go386:         buildcfg.GO386,
    99  			goamd64:       buildcfg.GOAMD64,
   100  			goarm:         buildcfg.GOARM,
   101  			goarm64:       buildcfg.GOARM64,
   102  			gomips:        buildcfg.GOMIPS,
   103  			gomips64:      buildcfg.GOMIPS64,
   104  			goppc64:       buildcfg.GOPPC64,
   105  			goriscv64:     buildcfg.GORISCV64,
   106  		}
   107  	}
   108  	intrinsics = intrinsicBuilders{}
   109  
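        	// p4/p8 partition the architectures by pointer size; lwatomics holds
        	// every architecture except PPC64, which has dedicated acquire/release
        	// atomic operations.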
   110  	var p4 []*sys.Arch
   111  	var p8 []*sys.Arch
   112  	var lwatomics []*sys.Arch
   113  	for _, a := range sys.Archs {
   114  		if a.PtrSize == 4 {
   115  			p4 = append(p4, a)
   116  		} else {
   117  			p8 = append(p8, a)
   118  		}
   119  		if a.Family != sys.PPC64 {
   120  			lwatomics = append(lwatomics, a)
   121  		}
   122  	}
   123  	all := sys.Archs[:]
   124  
   125  	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
   126  		intrinsics.addForArchs(pkg, fn, b, archs...)
   127  	}
   128  	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
   129  		intrinsics.addForFamilies(pkg, fn, b, archFamilies...)
   130  	}
   131  	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
   132  		intrinsics.alias(pkg, fn, pkg2, fn2, archs...)
   133  	}
   134  
   135  	/******** runtime ********/
   136  	if !cfg.instrumenting {
   137  		add("runtime", "slicebytetostringtmp",
   138  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   139  				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
   140  				// for the backend instead of slicebytetostringtmp calls
   141  				// when not instrumenting.
   142  				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
   143  			},
   144  			all...)
   145  	}
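        	// MulUintptr multiplies two uintptrs and reports overflow;
        	// Mul32uover/Mul64uover yield the matching (product, overflow) tuple.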
   146  	addF("internal/runtime/math", "MulUintptr",
   147  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   148  			if s.config.PtrSize == 4 {
   149  				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   150  			}
   151  			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   152  		},
   153  		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
   154  	add("runtime", "KeepAlive",
   155  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   156  			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
   157  			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
   158  			return nil
   159  		},
   160  		all...)
   161  
   162  	addF("runtime", "publicationBarrier",
   163  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   164  			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
   165  			return nil
   166  		},
   167  		sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64)
   168  
   169  	/******** internal/runtime/sys ********/
   170  	add("internal/runtime/sys", "GetCallerPC",
   171  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   172  			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
   173  		},
   174  		all...)
   175  
   176  	add("internal/runtime/sys", "GetCallerSP",
   177  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   178  			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
   179  		},
   180  		all...)
   181  
   182  	add("internal/runtime/sys", "GetClosurePtr",
   183  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   184  			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
   185  		},
   186  		all...)
   187  
   188  	addF("internal/runtime/sys", "Bswap32",
   189  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   190  			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   191  		},
   192  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   193  	addF("internal/runtime/sys", "Bswap64",
   194  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   195  			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   196  		},
   197  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
   198  
   199  	addF("runtime", "memequal",
   200  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   201  			return s.newValue4(ssa.OpMemEq, s.f.Config.Types.Bool, args[0], args[1], args[2], s.mem())
   202  		},
   203  		sys.ARM64)
   204  
   205  	if cfg.goppc64 >= 10 {
   206  		// Only enable on Power10: the byte-reverse instructions it introduces
   207  		// make this worthwhile as an intrinsic.
   208  		addF("internal/runtime/sys", "Bswap32",
   209  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   210  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   211  			},
   212  			sys.PPC64)
   213  		addF("internal/runtime/sys", "Bswap64",
   214  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   215  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   216  			},
   217  			sys.PPC64)
   218  	}
   219  
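        	// GORISCV64 >= 22 (the rva22u64 profile) guarantees the Zbb extension,
        	// whose REV8 instruction provides byte reversal.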
   220  	if cfg.goriscv64 >= 22 {
   221  		addF("internal/runtime/sys", "Bswap32",
   222  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   223  				return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   224  			},
   225  			sys.RISCV64)
   226  		addF("internal/runtime/sys", "Bswap64",
   227  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   228  				return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   229  			},
   230  			sys.RISCV64)
   231  	}
   232  
   233  	/****** Prefetch ******/
   234  	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   235  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   236  			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
   237  			return nil
   238  		}
   239  	}
   240  
   241  	// Make Prefetch intrinsics for supported platforms.
   242  	// On unsupported platforms the stub function will be eliminated.
   243  	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
   244  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   245  	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
   246  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   247  
   248  	/******** internal/runtime/atomic ********/
   249  	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
   250  
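        	// Most atomic builders share one pattern: the op yields a
        	// (result, memory) tuple, OpSelect1 threads the new memory state into
        	// s.vars[memVar], and OpSelect0 extracts the result. Sketch:
        	//
        	//	v := s.newValue2(op, types.NewTuple(resultType, types.TypeMem), ptr, s.mem())
        	//	s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
        	//	return s.newValue1(ssa.OpSelect0, resultType, v)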
   251  	addF("internal/runtime/atomic", "Load",
   252  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   253  			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   254  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   255  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   256  		},
   257  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   258  	addF("internal/runtime/atomic", "Load8",
   259  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   260  			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
   261  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   262  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   263  		},
   264  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   265  	addF("internal/runtime/atomic", "Load64",
   266  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   267  			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   268  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   269  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   270  		},
   271  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   272  	addF("internal/runtime/atomic", "LoadAcq",
   273  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   274  			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   275  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   276  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   277  		},
   278  		sys.PPC64)
   279  	addF("internal/runtime/atomic", "LoadAcq64",
   280  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   281  			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   282  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   283  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   284  		},
   285  		sys.PPC64)
   286  	addF("internal/runtime/atomic", "Loadp",
   287  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   288  			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
   289  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   290  			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
   291  		},
   292  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   293  
   294  	addF("internal/runtime/atomic", "Store",
   295  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   296  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
   297  			return nil
   298  		},
   299  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   300  	addF("internal/runtime/atomic", "Store8",
   301  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   302  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
   303  			return nil
   304  		},
   305  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   306  	addF("internal/runtime/atomic", "Store64",
   307  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   308  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
   309  			return nil
   310  		},
   311  		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   312  	addF("internal/runtime/atomic", "StorepNoWB",
   313  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   314  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
   315  			return nil
   316  		},
   317  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
   318  	addF("internal/runtime/atomic", "StoreRel",
   319  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   320  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
   321  			return nil
   322  		},
   323  		sys.PPC64)
   324  	addF("internal/runtime/atomic", "StoreRel64",
   325  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   326  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
   327  			return nil
   328  		},
   329  		sys.PPC64)
   330  
   331  	makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   332  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   333  			// The target atomic feature is detected dynamically at runtime.
   334  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   335  			v := s.load(types.Types[types.TBOOL], addr)
   336  			b := s.endBlock()
   337  			b.Kind = ssa.BlockIf
   338  			b.SetControl(v)
   339  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   340  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   341  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   342  			b.AddEdgeTo(bTrue)
   343  			b.AddEdgeTo(bFalse)
   344  			b.Likely = ssa.BranchLikely
   345  
   346  			// We have the atomic instructions - use them directly.
   347  			s.startBlock(bTrue)
   348  			emit(s, n, args, op1, typ, false)
   349  			s.endBlock().AddEdgeTo(bEnd)
   350  
   351  			// Use original instruction sequence.
   352  			s.startBlock(bFalse)
   353  			emit(s, n, args, op0, typ, false)
   354  			s.endBlock().AddEdgeTo(bEnd)
   355  
   356  			// Merge results.
   357  			s.startBlock(bEnd)
   358  
   359  			return nil
   360  		}
   361  	}
   362  
   363  	atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   364  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   365  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   366  		if needReturn {
   367  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   368  		}
   369  	}
   370  
   371  	addF("internal/runtime/atomic", "Store8",
   372  		makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
   373  		sys.Loong64)
   374  	addF("internal/runtime/atomic", "Store",
   375  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   376  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
   377  			return nil
   378  		},
   379  		sys.Loong64)
   380  	addF("internal/runtime/atomic", "Store64",
   381  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   382  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
   383  			return nil
   384  		},
   385  		sys.Loong64)
   386  
   387  	addF("internal/runtime/atomic", "Xchg8",
   388  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   389  			v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   390  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   391  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   392  		},
   393  		sys.AMD64, sys.PPC64)
   394  	addF("internal/runtime/atomic", "Xchg",
   395  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   396  			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   397  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   398  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   399  		},
   400  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   401  	addF("internal/runtime/atomic", "Xchg64",
   402  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   403  			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   404  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   405  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   406  		},
   407  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   408  
   409  	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
   410  
   411  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   412  			if cfg.goarm64.LSE {
   413  				emit(s, n, args, op1, typ, needReturn)
   414  			} else {
   415  				// The target atomic feature is detected dynamically at runtime.
   416  				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
   417  				v := s.load(types.Types[types.TBOOL], addr)
   418  				b := s.endBlock()
   419  				b.Kind = ssa.BlockIf
   420  				b.SetControl(v)
   421  				bTrue := s.f.NewBlock(ssa.BlockPlain)
   422  				bFalse := s.f.NewBlock(ssa.BlockPlain)
   423  				bEnd := s.f.NewBlock(ssa.BlockPlain)
   424  				b.AddEdgeTo(bTrue)
   425  				b.AddEdgeTo(bFalse)
   426  				b.Likely = ssa.BranchLikely
   427  
   428  				// We have the atomic instructions - use them directly.
   429  				s.startBlock(bTrue)
   430  				emit(s, n, args, op1, typ, needReturn)
   431  				s.endBlock().AddEdgeTo(bEnd)
   432  
   433  				// Use original instruction sequence.
   434  				s.startBlock(bFalse)
   435  				emit(s, n, args, op0, typ, needReturn)
   436  				s.endBlock().AddEdgeTo(bEnd)
   437  
   438  				// Merge results.
   439  				s.startBlock(bEnd)
   440  			}
   441  			if needReturn {
   442  				return s.variable(n, types.Types[typ])
   443  			} else {
   444  				return nil
   445  			}
   446  		}
   447  	}
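        	// The guarded builder emits a diamond-shaped CFG (sketch):
        	//
        	//	b:      if ARM64HasATOMICS goto bTrue else bFalse
        	//	bTrue:  emit op1 (LSE atomic)  -> bEnd
        	//	bFalse: emit op0 (fallback)    -> bEnd
        	//	bEnd:   merge; s.variable(n, ...) reads the result when needed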
   448  	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   449  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
   450  	}
   451  	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   452  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
   453  	}
   454  
   455  	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   456  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   457  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   458  		if needReturn {
   459  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   460  		}
   461  	}
   462  	addF("internal/runtime/atomic", "Xchg8",
   463  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
   464  		sys.ARM64)
   465  	addF("internal/runtime/atomic", "Xchg",
   466  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
   467  		sys.ARM64)
   468  	addF("internal/runtime/atomic", "Xchg64",
   469  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
   470  		sys.ARM64)
   471  
   472  	makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   473  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   474  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   475  			v := s.load(types.Types[types.TBOOL], addr)
   476  			b := s.endBlock()
   477  			b.Kind = ssa.BlockIf
   478  			b.SetControl(v)
   479  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   480  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   481  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   482  			b.AddEdgeTo(bTrue)
   483  			b.AddEdgeTo(bFalse)
   484  			b.Likely = ssa.BranchLikely // most loong64 machines support amswapdb.b
   485  
   486  			// We have the intrinsic - use it directly.
   487  			s.startBlock(bTrue)
   488  			s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   489  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
   490  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
   491  			s.endBlock().AddEdgeTo(bEnd)
   492  
   493  			// Call the pure Go version.
   494  			s.startBlock(bFalse)
   495  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
   496  			s.endBlock().AddEdgeTo(bEnd)
   497  
   498  			// Merge results.
   499  			s.startBlock(bEnd)
   500  			return s.variable(n, types.Types[types.TUINT8])
   501  		}
   502  	}
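        	// Unlike the guarded store builder above, the false branch here calls
        	// the pure Go implementation rather than a non-variant op.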
   503  	addF("internal/runtime/atomic", "Xchg8",
   504  		makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
   505  		sys.Loong64)
   506  
   507  	addF("internal/runtime/atomic", "Xadd",
   508  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   509  			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   510  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   511  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   512  		},
   513  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   514  	addF("internal/runtime/atomic", "Xadd64",
   515  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   516  			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   517  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   518  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   519  		},
   520  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   521  
   522  	addF("internal/runtime/atomic", "Xadd",
   523  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
   524  		sys.ARM64)
   525  	addF("internal/runtime/atomic", "Xadd64",
   526  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
   527  		sys.ARM64)
   528  
   529  	addF("internal/runtime/atomic", "Cas",
   530  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   531  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   532  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   533  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   534  		},
   535  		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   536  	addF("internal/runtime/atomic", "Cas64",
   537  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   538  			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   539  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   540  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   541  		},
   542  		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   543  	addF("internal/runtime/atomic", "CasRel",
   544  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   545  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   546  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   547  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   548  		},
   549  		sys.PPC64)
   550  
   551  	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   552  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   553  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   554  		if needReturn {
   555  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   556  		}
   557  	}
   558  
   559  	addF("internal/runtime/atomic", "Cas",
   560  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
   561  		sys.ARM64)
   562  	addF("internal/runtime/atomic", "Cas64",
   563  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
   564  		sys.ARM64)
   565  
   566  	atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   567  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   568  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   569  		if needReturn {
   570  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   571  		}
   572  	}
   573  
   574  	makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
   575  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   576  			// The target atomic feature is detected dynamically at runtime.
   577  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
   578  			v := s.load(types.Types[types.TBOOL], addr)
   579  			b := s.endBlock()
   580  			b.Kind = ssa.BlockIf
   581  			b.SetControl(v)
   582  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   583  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   584  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   585  			b.AddEdgeTo(bTrue)
   586  			b.AddEdgeTo(bFalse)
   587  			b.Likely = ssa.BranchLikely
   588  
   589  			// We have the atomic instructions - use them directly.
   590  			s.startBlock(bTrue)
   591  			emit(s, n, args, op1, types.TBOOL, true)
   592  			s.endBlock().AddEdgeTo(bEnd)
   593  
   594  			// Use original instruction sequence.
   595  			s.startBlock(bFalse)
   596  			emit(s, n, args, op0, types.TBOOL, true)
   597  			s.endBlock().AddEdgeTo(bEnd)
   598  
   599  			// Merge results.
   600  			s.startBlock(bEnd)
   601  
   602  			return s.variable(n, types.Types[types.TBOOL])
   603  		}
   604  	}
   605  
   606  	addF("internal/runtime/atomic", "Cas",
   607  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
   608  		sys.Loong64)
   609  	addF("internal/runtime/atomic", "Cas64",
   610  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
   611  		sys.Loong64)
   612  
   613  	// Old-style atomic logical operation API (all supported archs except arm64).
   614  	addF("internal/runtime/atomic", "And8",
   615  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   616  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
   617  			return nil
   618  		},
   619  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   620  	addF("internal/runtime/atomic", "And",
   621  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   622  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
   623  			return nil
   624  		},
   625  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   626  	addF("internal/runtime/atomic", "Or8",
   627  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   628  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
   629  			return nil
   630  		},
   631  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   632  	addF("internal/runtime/atomic", "Or",
   633  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   634  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
   635  			return nil
   636  		},
   637  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   638  
   639  	// arm64 always uses the new-style atomic logical operations, for both the
   640  	// old and new style API.
   641  	addF("internal/runtime/atomic", "And8",
   642  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
   643  		sys.ARM64)
   644  	addF("internal/runtime/atomic", "Or8",
   645  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
   646  		sys.ARM64)
   647  	addF("internal/runtime/atomic", "And64",
   648  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
   649  		sys.ARM64)
   650  	addF("internal/runtime/atomic", "And32",
   651  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   652  		sys.ARM64)
   653  	addF("internal/runtime/atomic", "And",
   654  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   655  		sys.ARM64)
   656  	addF("internal/runtime/atomic", "Or64",
   657  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
   658  		sys.ARM64)
   659  	addF("internal/runtime/atomic", "Or32",
   660  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   661  		sys.ARM64)
   662  	addF("internal/runtime/atomic", "Or",
   663  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   664  		sys.ARM64)
   665  
   666  	// New-style atomic logical operations, which return the old memory value.
   667  	addF("internal/runtime/atomic", "And64",
   668  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   669  			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   670  			p0, p1 := s.split(v)
   671  			s.vars[memVar] = p1
   672  			return p0
   673  		},
   674  		sys.AMD64, sys.Loong64)
   675  	addF("internal/runtime/atomic", "And32",
   676  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   677  			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   678  			p0, p1 := s.split(v)
   679  			s.vars[memVar] = p1
   680  			return p0
   681  		},
   682  		sys.AMD64, sys.Loong64)
   683  	addF("internal/runtime/atomic", "Or64",
   684  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   685  			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   686  			p0, p1 := s.split(v)
   687  			s.vars[memVar] = p1
   688  			return p0
   689  		},
   690  		sys.AMD64, sys.Loong64)
   691  	addF("internal/runtime/atomic", "Or32",
   692  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   693  			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   694  			p0, p1 := s.split(v)
   695  			s.vars[memVar] = p1
   696  			return p0
   697  		},
   698  		sys.AMD64, sys.Loong64)
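        	// (s.split is shorthand for the OpSelect0/OpSelect1 pair used by the
        	// builders above.)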
   699  
   700  	// Aliases for atomic load operations
   701  	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
   702  	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
   703  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
   704  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
   705  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
   706  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
   707  	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
   708  	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
   709  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
   710  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
   711  
   712  	// Aliases for atomic store operations
   713  	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
   714  	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
   715  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
   716  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
   717  	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
   718  	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
   719  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
   720  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
   721  
   722  	// Aliases for atomic swap operations
   723  	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
   724  	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
   725  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
   726  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
   727  
   728  	// Aliases for atomic add operations
   729  	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
   730  	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
   731  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
   732  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
   733  
   734  	// Aliases for atomic CAS operations
   735  	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
   736  	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
   737  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
   738  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
   739  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
   740  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
   741  	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
   742  
   743  	// Aliases for atomic And/Or operations
   744  	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
   745  	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
   746  
   747  	/******** math ********/
   748  	addF("math", "sqrt",
   749  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   750  			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
   751  		},
   752  		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
   753  	addF("math", "Trunc",
   754  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   755  			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
   756  		},
   757  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   758  	addF("math", "Ceil",
   759  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   760  			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
   761  		},
   762  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   763  	addF("math", "Floor",
   764  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   765  			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
   766  		},
   767  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   768  	addF("math", "Round",
   769  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   770  			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
   771  		},
   772  		sys.ARM64, sys.PPC64, sys.S390X)
   773  	addF("math", "RoundToEven",
   774  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   775  			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
   776  		},
   777  		sys.ARM64, sys.S390X, sys.Wasm)
   778  	addF("math", "Abs",
   779  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   780  			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
   781  		},
   782  		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
   783  	addF("math", "Copysign",
   784  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   785  			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
   786  		},
   787  		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
   788  	addF("math", "FMA",
   789  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   790  			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   791  		},
   792  		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
   793  	addF("math", "FMA",
   794  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   795  			if cfg.goamd64 >= 3 {
   796  				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   797  			}
   798  
   799  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
   800  			b := s.endBlock()
   801  			b.Kind = ssa.BlockIf
   802  			b.SetControl(v)
   803  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   804  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   805  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   806  			b.AddEdgeTo(bTrue)
   807  			b.AddEdgeTo(bFalse)
   808  			b.Likely = ssa.BranchLikely // Haswell and newer CPUs are common
   809  
   810  			// We have the intrinsic - use it directly.
   811  			s.startBlock(bTrue)
   812  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   813  			s.endBlock().AddEdgeTo(bEnd)
   814  
   815  			// Call the pure Go version.
   816  			s.startBlock(bFalse)
   817  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   818  			s.endBlock().AddEdgeTo(bEnd)
   819  
   820  			// Merge results.
   821  			s.startBlock(bEnd)
   822  			return s.variable(n, types.Types[types.TFLOAT64])
   823  		},
   824  		sys.AMD64)
   825  	addF("math", "FMA",
   826  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   827  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
   828  			v := s.load(types.Types[types.TBOOL], addr)
   829  			b := s.endBlock()
   830  			b.Kind = ssa.BlockIf
   831  			b.SetControl(v)
   832  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   833  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   834  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   835  			b.AddEdgeTo(bTrue)
   836  			b.AddEdgeTo(bFalse)
   837  			b.Likely = ssa.BranchLikely
   838  
   839  			// We have the intrinsic - use it directly.
   840  			s.startBlock(bTrue)
   841  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   842  			s.endBlock().AddEdgeTo(bEnd)
   843  
   844  			// Call the pure Go version.
   845  			s.startBlock(bFalse)
   846  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   847  			s.endBlock().AddEdgeTo(bEnd)
   848  
   849  			// Merge results.
   850  			s.startBlock(bEnd)
   851  			return s.variable(n, types.Types[types.TFLOAT64])
   852  		},
   853  		sys.ARM)
   854  
   855  	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   856  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   857  			if cfg.goamd64 >= 2 {
   858  				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   859  			}
   860  
   861  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
   862  			b := s.endBlock()
   863  			b.Kind = ssa.BlockIf
   864  			b.SetControl(v)
   865  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   866  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   867  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   868  			b.AddEdgeTo(bTrue)
   869  			b.AddEdgeTo(bFalse)
   870  			b.Likely = ssa.BranchLikely // most machines have SSE4.1 nowadays
   871  
   872  			// We have the intrinsic - use it directly.
   873  			s.startBlock(bTrue)
   874  			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   875  			s.endBlock().AddEdgeTo(bEnd)
   876  
   877  			// Call the pure Go version.
   878  			s.startBlock(bFalse)
   879  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   880  			s.endBlock().AddEdgeTo(bEnd)
   881  
   882  			// Merge results.
   883  			s.startBlock(bEnd)
   884  			return s.variable(n, types.Types[types.TFLOAT64])
   885  		}
   886  	}
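        	// When GOAMD64 >= v2, SSE4.1 is guaranteed at compile time and the
        	// rounding op is emitted unconditionally; otherwise the builder
        	// branches on the runtime X86HasSSE41 check and falls back to the pure
        	// Go implementation, as the FMA builder above does with X86HasFMA.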
   887  	addF("math", "RoundToEven",
   888  		makeRoundAMD64(ssa.OpRoundToEven),
   889  		sys.AMD64)
   890  	addF("math", "Floor",
   891  		makeRoundAMD64(ssa.OpFloor),
   892  		sys.AMD64)
   893  	addF("math", "Ceil",
   894  		makeRoundAMD64(ssa.OpCeil),
   895  		sys.AMD64)
   896  	addF("math", "Trunc",
   897  		makeRoundAMD64(ssa.OpTrunc),
   898  		sys.AMD64)
   899  
   900  	/******** math/bits ********/
   901  	addF("math/bits", "TrailingZeros64",
   902  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   903  			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   904  		},
   905  		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   906  	addF("math/bits", "TrailingZeros64",
   907  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   908  			lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
   909  			hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
   910  			return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
   911  		},
   912  		sys.I386)
   913  	addF("math/bits", "TrailingZeros32",
   914  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   915  			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   916  		},
   917  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   918  	addF("math/bits", "TrailingZeros16",
   919  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   920  			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   921  		},
   922  		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
   923  	addF("math/bits", "TrailingZeros8",
   924  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   925  			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   926  		},
   927  		sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
   928  
   929  	if cfg.goriscv64 >= 22 {
   930  		addF("math/bits", "TrailingZeros64",
   931  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   932  				return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   933  			},
   934  			sys.RISCV64)
   935  		addF("math/bits", "TrailingZeros32",
   936  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   937  				return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   938  			},
   939  			sys.RISCV64)
   940  		addF("math/bits", "TrailingZeros16",
   941  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   942  				return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   943  			},
   944  			sys.RISCV64)
   945  		addF("math/bits", "TrailingZeros8",
   946  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   947  				return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   948  			},
   949  			sys.RISCV64)
   950  	}
   951  
   952  	// ReverseBytes itself inlines correctly (it just calls ReverseBytes64 or ReverseBytes32), so there is no need to intrinsify it.
   953  	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
   954  	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
   955  	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
   956  	addF("math/bits", "ReverseBytes16",
   957  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   958  			return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   959  		},
   960  		sys.Loong64)
   961  	if cfg.goppc64 >= 10 {
   962  		// PPC64 has no 16-bit rotate; Power10's BRH instruction makes this worthwhile.
   963  		addF("math/bits", "ReverseBytes16",
   964  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   965  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   966  			},
   967  			sys.PPC64)
   968  	}
   969  	if cfg.goriscv64 >= 22 {
   970  		addF("math/bits", "ReverseBytes16",
   971  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   972  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   973  			},
   974  			sys.RISCV64)
   975  	}
   976  
   977  	addF("math/bits", "Len64",
   978  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   979  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
   980  		},
   981  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   982  	addF("math/bits", "Len32",
   983  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   984  			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
   985  		},
   986  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   987  	addF("math/bits", "Len16",
   988  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   989  			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
   990  		},
   991  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   992  	addF("math/bits", "Len8",
   993  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   994  			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
   995  		},
   996  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
   997  
   998  	if cfg.goriscv64 >= 22 {
   999  		addF("math/bits", "Len64",
  1000  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1001  				return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
  1002  			},
  1003  			sys.RISCV64)
  1004  		addF("math/bits", "Len32",
  1005  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1006  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
  1007  			},
  1008  			sys.RISCV64)
  1009  		addF("math/bits", "Len16",
  1010  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1011  				return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
  1012  			},
  1013  			sys.RISCV64)
  1014  		addF("math/bits", "Len8",
  1015  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1016  				return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
  1017  			},
  1018  			sys.RISCV64)
  1019  	}
  1020  
  1021  	alias("math/bits", "Len", "math/bits", "Len64", p8...)
  1022  	alias("math/bits", "Len", "math/bits", "Len32", p4...)
  1023  
  1024  	// LeadingZeros needs no intrinsic: it trivially calls Len, which is intrinsified above.
  1025  	addF("math/bits", "Reverse64",
  1026  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1027  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TUINT64], args[0])
  1028  		},
  1029  		sys.ARM64, sys.Loong64)
  1030  	addF("math/bits", "Reverse32",
  1031  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1032  			return s.newValue1(ssa.OpBitRev32, types.Types[types.TUINT32], args[0])
  1033  		},
  1034  		sys.ARM64, sys.Loong64)
  1035  	addF("math/bits", "Reverse16",
  1036  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1037  			return s.newValue1(ssa.OpBitRev16, types.Types[types.TUINT16], args[0])
  1038  		},
  1039  		sys.ARM64, sys.Loong64)
  1040  	addF("math/bits", "Reverse8",
  1041  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1042  			return s.newValue1(ssa.OpBitRev8, types.Types[types.TUINT8], args[0])
  1043  		},
  1044  		sys.ARM64, sys.Loong64)
  1045  	addF("math/bits", "Reverse",
  1046  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1047  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TUINT], args[0])
  1048  		},
  1049  		sys.ARM64, sys.Loong64)
  1050  	addF("math/bits", "RotateLeft8",
  1051  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1052  			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
  1053  		},
  1054  		sys.AMD64, sys.RISCV64)
  1055  	addF("math/bits", "RotateLeft16",
  1056  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1057  			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
  1058  		},
  1059  		sys.AMD64, sys.RISCV64)
  1060  	addF("math/bits", "RotateLeft32",
  1061  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1062  			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
  1063  		},
  1064  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1065  	addF("math/bits", "RotateLeft64",
  1066  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1067  			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
  1068  		},
  1069  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1070  	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
  1071  
  1072  	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1073  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1074  			if cfg.goamd64 >= 2 {
  1075  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1076  			}
  1077  
  1078  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
  1079  			b := s.endBlock()
  1080  			b.Kind = ssa.BlockIf
  1081  			b.SetControl(v)
  1082  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1083  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1084  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1085  			b.AddEdgeTo(bTrue)
  1086  			b.AddEdgeTo(bFalse)
  1087  			b.Likely = ssa.BranchLikely // most machines have POPCNT nowadays
  1088  
  1089  			// We have the intrinsic - use it directly.
  1090  			s.startBlock(bTrue)
  1091  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1092  			s.endBlock().AddEdgeTo(bEnd)
  1093  
  1094  			// Call the pure Go version.
  1095  			s.startBlock(bFalse)
  1096  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1097  			s.endBlock().AddEdgeTo(bEnd)
  1098  
  1099  			// Merge results.
  1100  			s.startBlock(bEnd)
  1101  			return s.variable(n, types.Types[types.TINT])
  1102  		}
  1103  	}
  1104  
  1105  	makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1106  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1107  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
  1108  			v := s.load(types.Types[types.TBOOL], addr)
  1109  			b := s.endBlock()
  1110  			b.Kind = ssa.BlockIf
  1111  			b.SetControl(v)
  1112  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1113  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1114  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1115  			b.AddEdgeTo(bTrue)
  1116  			b.AddEdgeTo(bFalse)
  1117  			b.Likely = ssa.BranchLikely // most loong64 machines support LSX
  1118  
  1119  			// We have the intrinsic - use it directly.
  1120  			s.startBlock(bTrue)
  1121  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1122  			s.endBlock().AddEdgeTo(bEnd)
  1123  
  1124  			// Call the pure Go version.
  1125  			s.startBlock(bFalse)
  1126  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1127  			s.endBlock().AddEdgeTo(bEnd)
  1128  
  1129  			// Merge results.
  1130  			s.startBlock(bEnd)
  1131  			return s.variable(n, types.Types[types.TINT])
  1132  		}
  1133  	}
  1134  
  1135  	makeOnesCountRISCV64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1136  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1137  			if cfg.goriscv64 >= 22 {
  1138  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1139  			}
  1140  
  1141  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.RISCV64HasZbb, s.sb)
  1142  			v := s.load(types.Types[types.TBOOL], addr)
  1143  			b := s.endBlock()
  1144  			b.Kind = ssa.BlockIf
  1145  			b.SetControl(v)
  1146  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1147  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1148  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1149  			b.AddEdgeTo(bTrue)
  1150  			b.AddEdgeTo(bFalse)
   1151  			b.Likely = ssa.BranchLikely // most RISC-V machines support Zbb
  1152  
  1153  			// We have the intrinsic - use it directly.
  1154  			s.startBlock(bTrue)
  1155  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1156  			s.endBlock().AddEdgeTo(bEnd)
  1157  
  1158  			// Call the pure Go version.
  1159  			s.startBlock(bFalse)
  1160  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1161  			s.endBlock().AddEdgeTo(bEnd)
  1162  
  1163  			// Merge results.
  1164  			s.startBlock(bEnd)
  1165  			return s.variable(n, types.Types[types.TINT])
  1166  		}
  1167  	}
  1168  
  1169  	addF("math/bits", "OnesCount64",
  1170  		makeOnesCountAMD64(ssa.OpPopCount64),
  1171  		sys.AMD64)
  1172  	addF("math/bits", "OnesCount64",
  1173  		makeOnesCountLoong64(ssa.OpPopCount64),
  1174  		sys.Loong64)
  1175  	addF("math/bits", "OnesCount64",
  1176  		makeOnesCountRISCV64(ssa.OpPopCount64),
  1177  		sys.RISCV64)
  1178  	addF("math/bits", "OnesCount64",
  1179  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1180  			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
  1181  		},
  1182  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1183  	addF("math/bits", "OnesCount32",
  1184  		makeOnesCountAMD64(ssa.OpPopCount32),
  1185  		sys.AMD64)
  1186  	addF("math/bits", "OnesCount32",
  1187  		makeOnesCountLoong64(ssa.OpPopCount32),
  1188  		sys.Loong64)
  1189  	addF("math/bits", "OnesCount32",
  1190  		makeOnesCountRISCV64(ssa.OpPopCount32),
  1191  		sys.RISCV64)
  1192  	addF("math/bits", "OnesCount32",
  1193  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1194  			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
  1195  		},
  1196  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1197  	addF("math/bits", "OnesCount16",
  1198  		makeOnesCountAMD64(ssa.OpPopCount16),
  1199  		sys.AMD64)
  1200  	addF("math/bits", "OnesCount16",
  1201  		makeOnesCountLoong64(ssa.OpPopCount16),
  1202  		sys.Loong64)
  1203  	addF("math/bits", "OnesCount16",
  1204  		makeOnesCountRISCV64(ssa.OpPopCount16),
  1205  		sys.RISCV64)
  1206  	addF("math/bits", "OnesCount16",
  1207  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1208  			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
  1209  		},
  1210  		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
  1211  	addF("math/bits", "OnesCount8",
  1212  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1213  			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
  1214  		},
  1215  		sys.S390X, sys.PPC64, sys.Wasm)
  1216  
  1217  	if cfg.goriscv64 >= 22 {
  1218  		addF("math/bits", "OnesCount8",
  1219  			makeOnesCountRISCV64(ssa.OpPopCount8),
  1220  			sys.RISCV64)
  1221  	}
  1222  
  1223  	alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...)
  1224  
  1225  	add("math/bits", "Mul64",
  1226  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1227  			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
  1228  		},
  1229  		all...)
  1230  	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
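         	// The Mul64 intrinsic above returns the full 128-bit product as
         	// (hi, lo); e.g. bits.Mul64(3, 1<<63) == (1, 1<<63).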
  1231  	addF("math/bits", "Add64",
  1232  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1233  			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1234  		},
  1235  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1236  	alias("math/bits", "Add", "math/bits", "Add64", p8...)
  1237  	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
  1238  	addF("math/bits", "Sub64",
  1239  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1240  			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1241  		},
  1242  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1243  	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
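         	// The Add64/Sub64 intrinsics above thread a carry/borrow through
         	// multi-word arithmetic; e.g. bits.Add64(^uint64(0), 1, 0) == (0, 1).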
  1244  	addF("math/bits", "Div64",
  1245  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   1246  			// Check for divide-by-zero/overflow and panic with an appropriate message.
  1247  			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
  1248  			s.check(cmpZero, ir.Syms.Panicdivide)
  1249  			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
  1250  			s.check(cmpOverflow, ir.Syms.Panicoverflow)
  1251  			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1252  		},
  1253  		sys.AMD64)
  1254  	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
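         	// For reference, bits.Div64(hi, lo, y) divides the 128-bit value
         	// hi<<64 | lo by y; the checks above make it panic when y == 0 or
         	// y <= hi (quotient overflow). For example, Div64(0, 7, 2) == (3, 1).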
  1255  
  1256  	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
  1257  	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
  1258  	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
  1259  	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
  1260  	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
  1261  	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
  1262  
  1263  	/******** sync/atomic ********/
  1264  
   1265  	// Note: these are disabled by base.Flag.Race in findIntrinsic below.
  1266  	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
  1267  	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
  1268  	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
  1269  	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
  1270  	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
  1271  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
  1272  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
  1273  
  1274  	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
  1275  	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
  1276  	// Note: not StorePointer, that needs a write barrier.  Same below for {CompareAnd}Swap.
  1277  	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
  1278  	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
  1279  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
  1280  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
  1281  
  1282  	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
  1283  	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
  1284  	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
  1285  	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
  1286  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
  1287  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
  1288  
  1289  	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
  1290  	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
  1291  	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
  1292  	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
  1293  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
  1294  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
  1295  
  1296  	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
  1297  	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
  1298  	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
  1299  	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
  1300  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
  1301  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
  1302  
  1303  	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1304  	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1305  	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1306  	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1307  	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1308  	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1309  	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1310  	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1311  	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1312  	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1313  
  1314  	/******** math/big ********/
  1315  	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
  1316  
  1317  	/******** internal/runtime/maps ********/
  1318  
  1319  	// Important: The intrinsic implementations below return a packed
  1320  	// bitset, while the portable Go implementation uses an unpacked
  1321  	// representation (one bit set in each byte).
  1322  	//
  1323  	// Thus we must replace most bitset methods with implementations that
  1324  	// work with the packed representation.
  1325  	//
  1326  	// TODO(prattmic): The bitset implementations don't use SIMD, so they
  1327  	// could be handled with build tags (though that would break
  1328  	// -d=ssa/intrinsics/off=1).
  1329  
  1330  	// With a packed representation we no longer need to shift the result
  1331  	// of TrailingZeros64.
  1332  	alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
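         	// For example, a packed bitset in which bytes 1 and 3 matched is
         	// 0b1010, so bitsetFirst is exactly TrailingZeros64 (here 1). The
         	// unpacked form sets one bit inside each matching byte, so it
         	// must shift the trailing-zero count right by 3 to recover a
         	// byte index.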
  1333  
  1334  	addF("internal/runtime/maps", "bitsetRemoveBelow",
  1335  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1336  			b := args[0]
  1337  			i := args[1]
  1338  
  1339  			// Clear the lower i bits in b.
  1340  			//
  1341  			// out = b &^ ((1 << i) - 1)
  1342  
  1343  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1344  
   1345  			mask := s.newValue2(ssa.OpLsh64x8, types.Types[types.TUINT64], one, i)
  1346  			mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
  1347  			mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
  1348  
  1349  			return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
  1350  		},
  1351  		sys.AMD64)
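         	// For example, with b = 0b1101 and i = 2, the mask is
         	// ^((1<<2)-1) = ...11111100, so the result is 0b1100: bits 0
         	// and 1 are cleared and the remaining bits of b are preserved.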
  1352  
  1353  	addF("internal/runtime/maps", "bitsetLowestSet",
  1354  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1355  			b := args[0]
  1356  
  1357  			// Test the lowest bit in b.
  1358  			//
  1359  			// out = (b & 1) == 1
  1360  
  1361  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1362  			and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
  1363  			return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
  1364  		},
  1365  		sys.AMD64)
  1366  
  1367  	addF("internal/runtime/maps", "bitsetShiftOutLowest",
  1368  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1369  			b := args[0]
  1370  
  1371  			// Right shift out the lowest bit in b.
  1372  			//
  1373  			// out = b >> 1
  1374  
  1375  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1376  			return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
  1377  		},
  1378  		sys.AMD64)
  1379  
  1380  	addF("internal/runtime/maps", "ctrlGroupMatchH2",
  1381  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1382  			g := args[0]
  1383  			h := args[1]
  1384  
  1385  			// Explicit copies to fp registers. See
  1386  			// https://go.dev/issue/70451.
  1387  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1388  			hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
  1389  
  1390  			// Broadcast h2 into each byte of a word.
  1391  			var broadcast *ssa.Value
  1392  			if buildcfg.GOAMD64 >= 4 {
  1393  				// VPBROADCASTB saves 1 instruction vs PSHUFB
  1394  				// because the input can come from a GP
  1395  				// register, while PSHUFB requires moving into
  1396  				// an FP register first.
  1397  				//
   1398  				// Nominally PSHUFB would require an
   1399  				// additional instruction to load the control
   1400  				// mask into an FP register. But broadcast uses
  1401  				// a control mask of 0, and the register ABI
  1402  				// already defines X15 as a zero register.
  1403  				broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
  1404  			} else if buildcfg.GOAMD64 >= 2 {
  1405  				// PSHUFB performs a byte broadcast when given
  1406  				// a control input of 0.
  1407  				broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
  1408  			} else {
  1409  				// No direct byte broadcast. First we must
  1410  				// duplicate the lower byte and then do a
  1411  				// 16-bit broadcast.
  1412  
  1413  				// "Unpack" h2 with itself. This duplicates the
  1414  				// input, resulting in h2 in the lower two
  1415  				// bytes.
  1416  				unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
  1417  
   1418  				// Copy the lower 16 bits of unpack into every
   1419  				// 16-bit slot in the lower 64 bits of the
  1420  				// output register. Note that immediate 0
  1421  				// selects the low word as the source for every
  1422  				// destination slot.
  1423  				broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
  1424  
   1425  				// No need to broadcast into the upper 64 bits,
  1426  				// as we don't use those.
  1427  			}
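
         			// Either way, broadcast now holds h2 replicated
         			// into (at least) the low 8 bytes; e.g. h2 = 0x5a
         			// gives 0x5a5a5a5a5a5a5a5a in the low 64 bits.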
  1428  
  1429  			// Compare each byte of the control word with h2. Each
  1430  			// matching byte has every bit set.
  1431  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
  1432  
  1433  			// Construct a "byte mask": each output bit is equal to
   1434  			// the sign bit of each input byte.
  1435  			//
  1436  			// This results in a packed output (bit N set means
  1437  			// byte N matched).
  1438  			//
  1439  			// NOTE: See comment above on bitsetFirst.
  1440  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT8], eq)
  1441  
   1442  			// g is only 64 bits, so the upper 64 bits of the
  1443  			// 128-bit register will be zero. If h2 is also zero,
  1444  			// then we'll get matches on those bytes. Truncate the
  1445  			// upper bits to ignore such matches.
  1446  			ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1447  
  1448  			return ret
  1449  		},
  1450  		sys.AMD64)
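         	// Net effect: ctrlGroupMatchH2(g, h) returns a uint64 whose
         	// bit N is set exactly when byte N of the control group g
         	// equals h.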
  1451  
  1452  	addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
  1453  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1454  			// An empty slot is   1000 0000
  1455  			// A deleted slot is  1111 1110
  1456  			// A full slot is     0??? ????
  1457  
  1458  			g := args[0]
  1459  
  1460  			// Explicit copy to fp register. See
  1461  			// https://go.dev/issue/70451.
  1462  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1463  
  1464  			if buildcfg.GOAMD64 >= 2 {
  1465  				// "PSIGNB negates each data element of the
  1466  				// destination operand (the first operand) if
  1467  				// the signed integer value of the
  1468  				// corresponding data element in the source
  1469  				// operand (the second operand) is less than
  1470  				// zero. If the signed integer value of a data
  1471  				// element in the source operand is positive,
  1472  				// the corresponding data element in the
  1473  				// destination operand is unchanged. If a data
  1474  				// element in the source operand is zero, the
  1475  				// corresponding data element in the
  1476  				// destination operand is set to zero" - Intel SDM
  1477  				//
  1478  				// If we pass the group control word as both
  1479  				// arguments:
  1480  				// - Full slots are unchanged.
  1481  				// - Deleted slots are negated, becoming
  1482  				//   0000 0010.
  1483  				// - Empty slots are negated, becoming
  1484  				//   1000 0000 (unchanged!).
  1485  				//
  1486  				// The result is that only empty slots have the
  1487  				// sign bit set. We then use PMOVMSKB to
  1488  				// extract the sign bits.
  1489  				sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
  1490  
  1491  				// Construct a "byte mask": each output bit is
   1492  				// equal to the sign bit of each input byte. The
  1493  				// sign bit is only set for empty or deleted
  1494  				// slots.
  1495  				//
  1496  				// This results in a packed output (bit N set
  1497  				// means byte N matched).
  1498  				//
  1499  				// NOTE: See comment above on bitsetFirst.
  1500  				ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT64], sign)
  1501  
   1502  				// g is only 64 bits, so the upper 64 bits of
  1503  				// the 128-bit register will be zero. PSIGNB
  1504  				// will keep all of these bytes zero, so no
  1505  				// need to truncate.
  1506  
  1507  				return ret
  1508  			}
  1509  
   1510  			// No PSIGNB; simply do byte equality with ctrlEmpty.
  1511  
  1512  			// Load ctrlEmpty into each byte of a control word.
  1513  			var ctrlsEmpty uint64 = abi.MapCtrlEmpty
  1514  			e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
  1515  			// Explicit copy to fp register. See
  1516  			// https://go.dev/issue/70451.
  1517  			efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
  1518  
  1519  			// Compare each byte of the control word with ctrlEmpty. Each
  1520  			// matching byte has every bit set.
  1521  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
  1522  
  1523  			// Construct a "byte mask": each output bit is equal to
   1524  			// the sign bit of each input byte.
  1525  			//
  1526  			// This results in a packed output (bit N set means
  1527  			// byte N matched).
  1528  			//
  1529  			// NOTE: See comment above on bitsetFirst.
  1530  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT8], eq)
  1531  
   1532  			// g is only 64 bits, so the upper 64 bits of the
   1533  			// 128-bit register will be zero. The upper 64 bits of
  1534  			// efp are also zero, so we'll get matches on those
  1535  			// bytes. Truncate the upper bits to ignore such
  1536  			// matches.
  1537  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1538  		},
  1539  		sys.AMD64)
  1540  
  1541  	addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
  1542  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1543  			// An empty slot is   1000 0000
  1544  			// A deleted slot is  1111 1110
  1545  			// A full slot is     0??? ????
  1546  			//
  1547  			// A slot is empty or deleted iff bit 7 (sign bit) is
  1548  			// set.
  1549  
  1550  			g := args[0]
  1551  
  1552  			// Explicit copy to fp register. See
  1553  			// https://go.dev/issue/70451.
  1554  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1555  
  1556  			// Construct a "byte mask": each output bit is equal to
   1557  			// the sign bit of each input byte. The sign bit is only
  1558  			// set for empty or deleted slots.
  1559  			//
  1560  			// This results in a packed output (bit N set means
  1561  			// byte N matched).
  1562  			//
  1563  			// NOTE: See comment above on bitsetFirst.
  1564  			ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT64], gfp)
  1565  
   1566  			// g is only 64 bits, so the upper 64 bits of the
  1567  			// 128-bit register will be zero. Zero will never match
  1568  			// ctrlEmpty or ctrlDeleted, so no need to truncate.
  1569  
  1570  			return ret
  1571  		},
  1572  		sys.AMD64)
  1573  
  1574  	addF("internal/runtime/maps", "ctrlGroupMatchFull",
  1575  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1576  			// An empty slot is   1000 0000
  1577  			// A deleted slot is  1111 1110
  1578  			// A full slot is     0??? ????
  1579  			//
  1580  			// A slot is full iff bit 7 (sign bit) is unset.
  1581  
  1582  			g := args[0]
  1583  
  1584  			// Explicit copy to fp register. See
  1585  			// https://go.dev/issue/70451.
  1586  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1587  
  1588  			// Construct a "byte mask": each output bit is equal to
   1589  			// the sign bit of each input byte. The sign bit is only
  1590  			// set for empty or deleted slots.
  1591  			//
  1592  			// This results in a packed output (bit N set means
  1593  			// byte N matched).
  1594  			//
  1595  			// NOTE: See comment above on bitsetFirst.
  1596  			mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT8], gfp)
  1597  
  1598  			// Invert the mask to set the bits for the full slots.
  1599  			out := s.newValue1(ssa.OpCom8, types.Types[types.TUINT8], mask)
  1600  
   1601  			// g is only 64 bits, so the upper 64 bits of the
  1602  			// 128-bit register will be zero, with bit 7 unset.
  1603  			// Truncate the upper bits to ignore these.
  1604  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1605  		},
  1606  		sys.AMD64)
  1607  
  1608  	/******** crypto/internal/constanttime ********/
  1609  	// We implement a superset of the Select promise:
  1610  	// Select returns x if v != 0 and y if v == 0.
  1611  	hasCMOV := []*sys.Arch{sys.ArchAMD64, sys.ArchARM64, sys.ArchLoong64, sys.ArchPPC64, sys.ArchPPC64LE, sys.ArchWasm}
  1612  	if cfg.goriscv64 >= 23 {
  1613  		hasCMOV = append(hasCMOV, sys.ArchRISCV64)
  1614  	}
  1615  	add("crypto/internal/constanttime", "Select",
  1616  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1617  			v, x, y := args[0], args[1], args[2]
  1618  
  1619  			var checkOp ssa.Op
  1620  			var zero *ssa.Value
  1621  			switch s.config.PtrSize {
  1622  			case 8:
  1623  				checkOp = ssa.OpNeq64
  1624  				zero = s.constInt64(types.Types[types.TINT], 0)
  1625  			case 4:
  1626  				checkOp = ssa.OpNeq32
  1627  				zero = s.constInt32(types.Types[types.TINT], 0)
  1628  			default:
  1629  				panic("unreachable")
  1630  			}
  1631  			check := s.newValue2(checkOp, types.Types[types.TBOOL], zero, v)
  1632  
  1633  			return s.newValue3(ssa.OpCondSelect, types.Types[types.TINT], x, y, check)
  1634  		}, hasCMOV...) // all with CMOV support.
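         	// Since OpCondSelect lowers to a conditional move/select on these
         	// architectures, the Select above is branch-free; e.g.
         	// Select(2, x, y) yields x without a data-dependent branch.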
  1635  	add("crypto/internal/constanttime", "boolToUint8",
  1636  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1637  			return s.newValue1(ssa.OpCvtBoolToUint8, types.Types[types.TUINT8], args[0])
  1638  		},
  1639  		all...)
  1640  
  1641  	if buildcfg.Experiment.SIMD {
   1642  		// Only enable SIMD intrinsics if the SIMD experiment is on.
  1643  		simdIntrinsics(addF)
  1644  
  1645  		addF(simdPackage, "ClearAVXUpperBits",
  1646  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1647  				s.vars[memVar] = s.newValue1(ssa.OpAMD64VZEROUPPER, types.TypeMem, s.mem())
  1648  				return nil
  1649  			},
  1650  			sys.AMD64)
  1651  
  1652  		addF(simdPackage, "Int8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1653  		addF(simdPackage, "Int16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1654  		addF(simdPackage, "Int32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1655  		addF(simdPackage, "Int64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1656  		addF(simdPackage, "Uint8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1657  		addF(simdPackage, "Uint16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1658  		addF(simdPackage, "Uint32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1659  		addF(simdPackage, "Uint64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1660  		addF(simdPackage, "Int8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1661  		addF(simdPackage, "Int16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1662  		addF(simdPackage, "Int32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1663  		addF(simdPackage, "Int64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1664  		addF(simdPackage, "Uint8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1665  		addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1666  		addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1667  		addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
  1668  		addF(simdPackage, "Float32x4.IsNaN", opLen1(ssa.OpIsNaNFloat32x4, types.TypeVec128), sys.AMD64)
  1669  		addF(simdPackage, "Float32x8.IsNaN", opLen1(ssa.OpIsNaNFloat32x8, types.TypeVec256), sys.AMD64)
  1670  		addF(simdPackage, "Float32x16.IsNaN", opLen1(ssa.OpIsNaNFloat32x16, types.TypeVec512), sys.AMD64)
  1671  		addF(simdPackage, "Float64x2.IsNaN", opLen1(ssa.OpIsNaNFloat64x2, types.TypeVec128), sys.AMD64)
  1672  		addF(simdPackage, "Float64x4.IsNaN", opLen1(ssa.OpIsNaNFloat64x4, types.TypeVec256), sys.AMD64)
  1673  		addF(simdPackage, "Float64x8.IsNaN", opLen1(ssa.OpIsNaNFloat64x8, types.TypeVec512), sys.AMD64)
  1674  
   1675  		// sfp4 is intrinsified only when its four selectors are constants; otherwise it's complicated enough to just implement in Go.
  1676  		sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) {
  1677  			addF(simdPackage, method,
  1678  				func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1679  					x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5]
  1680  					if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 {
  1681  						z := select4FromPair(x, a, b, c, d, y, s, hwop, vectype)
  1682  						if z != nil {
  1683  							return z
  1684  						}
  1685  					}
  1686  					return s.callResult(n, callNormal)
  1687  				},
  1688  				sys.AMD64)
  1689  		}
  1690  
  1691  		sfp4("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128)
  1692  		sfp4("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128)
  1693  		sfp4("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128)
  1694  
  1695  		sfp4("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256)
  1696  		sfp4("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256)
  1697  		sfp4("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256)
  1698  
  1699  		sfp4("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512)
  1700  		sfp4("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512)
  1701  		sfp4("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512)
  1702  
   1703  		// sfp2 is intrinsified only when its two selectors are constants; otherwise it's complicated enough to just implement in Go.
  1704  		sfp2 := func(method string, hwop ssa.Op, vectype *types.Type, cscimm func(i, j uint8) int64) {
  1705  			addF(simdPackage, method,
  1706  				func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1707  					x, a, b, y := args[0], args[1], args[2], args[3]
  1708  					if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 {
  1709  						z := select2FromPair(x, a, b, y, s, hwop, vectype, cscimm)
  1710  						if z != nil {
  1711  							return z
  1712  						}
  1713  					}
  1714  					return s.callResult(n, callNormal)
  1715  				},
  1716  				sys.AMD64)
  1717  		}
  1718  
  1719  		sfp2("Uint64x2.SelectFromPair", ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, cscimm2)
  1720  		sfp2("Int64x2.SelectFromPair", ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, cscimm2)
  1721  		sfp2("Float64x2.SelectFromPair", ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, cscimm2)
  1722  
  1723  		sfp2("Uint64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, cscimm2g2)
  1724  		sfp2("Int64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, cscimm2g2)
  1725  		sfp2("Float64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, cscimm2g2)
  1726  
  1727  		sfp2("Uint64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, cscimm2g4)
  1728  		sfp2("Int64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, cscimm2g4)
  1729  		sfp2("Float64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, cscimm2g4)
  1730  
  1731  	}
  1732  }
  1733  
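         // cscimm4 packs four 2-bit source-lane selectors into an 8-bit
         // shuffle immediate, element 0 in the low bits. For example,
         // cscimm4(1, 0, 3, 2) == se(0b10_11_00_01) == -79.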
  1734  func cscimm4(a, b, c, d uint8) int64 {
  1735  	return se(a + b<<2 + c<<4 + d<<6)
  1736  }
  1737  
  1738  func cscimm2(a, b uint8) int64 {
  1739  	return se(a + b<<1)
  1740  }
  1741  
  1742  func cscimm2g2(a, b uint8) int64 {
  1743  	g := cscimm2(a, b)
  1744  	return int64(int8(g + g<<2))
  1745  }
  1746  
  1747  func cscimm2g4(a, b uint8) int64 {
  1748  	g := cscimm2g2(a, b)
  1749  	return int64(int8(g + g<<4))
  1750  }
  1751  
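         // The pattern constants below name, for each output element, the
         // half of the logical pair it is selected from: reading left to
         // right, letter i is L if element i comes from the low vector x and
         // H if it comes from the high vector y. Their values match the
         // pattern bits computed in select2FromPair and select4FromPair.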
  1752  const (
  1753  	_LLLL = iota
  1754  	_HLLL
  1755  	_LHLL
  1756  	_HHLL
  1757  	_LLHL
  1758  	_HLHL
  1759  	_LHHL
  1760  	_HHHL
  1761  	_LLLH
  1762  	_HLLH
  1763  	_LHLH
  1764  	_HHLH
  1765  	_LLHH
  1766  	_HLHH
  1767  	_LHHH
  1768  	_HHHH
  1769  )
  1770  
  1771  const (
  1772  	_LL = iota
  1773  	_HL
  1774  	_LH
  1775  	_HH
  1776  )
  1777  
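         // select2FromPair builds a two-lane shuffle of the pair (x, y) from
         // constant selectors in [0, 3]: 0-1 pick a lane of x, 2-3 the
         // corresponding lane of y. It returns nil for out-of-range
         // selectors so the caller can fall back to a normal call.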
  1778  func select2FromPair(x, _a, _b, y *ssa.Value, s *state, op ssa.Op, t *types.Type, csc func(a, b uint8) int64) *ssa.Value {
  1779  	a, b := uint8(_a.AuxInt8()), uint8(_b.AuxInt8())
  1780  	if a > 3 || b > 3 {
  1781  		return nil
  1782  	}
  1783  	pattern := (a&2)>>1 + (b & 2)
  1784  	a, b = a&1, b&1
  1785  
  1786  	switch pattern {
  1787  	case _LL:
  1788  		return s.newValue2I(op, t, csc(a, b), x, x)
  1789  	case _HH:
  1790  		return s.newValue2I(op, t, csc(a, b), y, y)
  1791  	case _LH:
  1792  		return s.newValue2I(op, t, csc(a, b), x, y)
  1793  	case _HL:
  1794  		return s.newValue2I(op, t, csc(a, b), y, x)
  1795  	}
  1796  	panic("The preceding switch should have been exhaustive")
  1797  }
  1798  
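         // select4FromPair is the four-lane analogue of select2FromPair,
         // with selectors in [0, 7] (0-3 from x, 4-7 from y). Patterns that
         // a single two-operand shuffle cannot express are synthesized from
         // two shuffles through the intermediate value z.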
  1799  func select4FromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value {
  1800  	a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8())
  1801  	if a > 7 || b > 7 || c > 7 || d > 7 {
  1802  		return nil
  1803  	}
  1804  	pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
  1805  
  1806  	a, b, c, d = a&3, b&3, c&3, d&3
  1807  
  1808  	switch pattern {
  1809  	case _LLLL:
  1810  		// TODO DETECT 0,1,2,3, 0,0,0,0
  1811  		return s.newValue2I(op, t, cscimm4(a, b, c, d), x, x)
  1812  	case _HHHH:
  1813  		// TODO DETECT 0,1,2,3, 0,0,0,0
  1814  		return s.newValue2I(op, t, cscimm4(a, b, c, d), y, y)
  1815  	case _LLHH:
  1816  		return s.newValue2I(op, t, cscimm4(a, b, c, d), x, y)
  1817  	case _HHLL:
  1818  		return s.newValue2I(op, t, cscimm4(a, b, c, d), y, x)
  1819  
  1820  	case _HLLL:
  1821  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
  1822  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
  1823  	case _LHLL:
  1824  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
  1825  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
  1826  	case _HLHH:
  1827  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
  1828  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
  1829  	case _LHHH:
  1830  		z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
  1831  		return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
  1832  
  1833  	case _LLLH:
  1834  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
  1835  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
  1836  	case _LLHL:
  1837  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
  1838  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
  1839  
  1840  	case _HHLH:
  1841  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
  1842  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)
  1843  
  1844  	case _HHHL:
  1845  		z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
  1846  		return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)
  1847  
  1848  	case _LHLH:
  1849  		z := s.newValue2I(op, t, cscimm4(a, c, b, d), x, y)
  1850  		return s.newValue2I(op, t, se(0b11_01_10_00), z, z)
  1851  	case _HLHL:
  1852  		z := s.newValue2I(op, t, cscimm4(b, d, a, c), x, y)
  1853  		return s.newValue2I(op, t, se(0b01_11_00_10), z, z)
  1854  	case _HLLH:
  1855  		z := s.newValue2I(op, t, cscimm4(b, c, a, d), x, y)
  1856  		return s.newValue2I(op, t, se(0b11_01_00_10), z, z)
  1857  	case _LHHL:
  1858  		z := s.newValue2I(op, t, cscimm4(a, d, b, c), x, y)
  1859  		return s.newValue2I(op, t, se(0b01_11_10_00), z, z)
  1860  	}
  1861  	panic("The preceding switch should have been exhaustive")
  1862  }
  1863  
  1864  // se smears the not-really-a-sign bit of a uint8 to conform to the conventions
  1865  // for representing AuxInt in ssa.
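         // For example, se(0x7f) == 127 and se(0x80) == -128.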
  1866  func se(x uint8) int64 {
  1867  	return int64(int8(x))
  1868  }
  1869  
  1870  func opLen1(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1871  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1872  		return s.newValue1(op, t, args[0])
  1873  	}
  1874  }
  1875  
  1876  func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1877  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1878  		return s.newValue2(op, t, args[0], args[1])
  1879  	}
  1880  }
  1881  
  1882  func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1883  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1884  		return s.newValue2(op, t, args[1], args[0])
  1885  	}
  1886  }
  1887  
  1888  func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1889  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1890  		return s.newValue3(op, t, args[0], args[1], args[2])
  1891  	}
  1892  }
  1893  
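         // ssaVecBySize maps a SIMD value's size in bytes to the SSA type
         // used for vectors of that width.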
  1894  var ssaVecBySize = map[int64]*types.Type{
  1895  	16: types.TypeVec128,
  1896  	32: types.TypeVec256,
  1897  	64: types.TypeVec512,
  1898  }
  1899  
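         // opLen3_31Zero3 emits op with a freshly zeroed vector of args[1]'s
         // width as its first operand (presumably the merge source for
         // unselected lanes), followed by args[1] and args[0].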
  1900  func opLen3_31Zero3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1901  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1902  		if t, ok := ssaVecBySize[args[1].Type.Size()]; !ok {
  1903  			panic("unknown simd vector size")
  1904  		} else {
  1905  			return s.newValue3(op, t, s.newValue0(ssa.OpZeroSIMD, t), args[1], args[0])
  1906  		}
  1907  	}
  1908  }
  1909  
  1910  func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1911  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1912  		return s.newValue3(op, t, args[1], args[0], args[2])
  1913  	}
  1914  }
  1915  
  1916  func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1917  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1918  		return s.newValue3(op, t, args[2], args[0], args[1])
  1919  	}
  1920  }
  1921  
  1922  func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1923  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1924  		return s.newValue4(op, t, args[0], args[1], args[2], args[3])
  1925  	}
  1926  }
  1927  
  1928  func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1929  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1930  		return s.newValue4(op, t, args[2], args[0], args[1], args[3])
  1931  	}
  1932  }
  1933  
  1934  func opLen4_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1935  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1936  		return s.newValue4(op, t, args[2], args[1], args[0], args[3])
  1937  	}
  1938  }
  1939  
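         // immJumpTable lowers an intrinsic whose uint8 immediate operand is
         // not a compile-time constant: it emits a 256-way jump table on idx
         // in which block i runs genOp(s, i) with the constant immediate i,
         // then merges the per-block results in bEnd.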
  1940  func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp func(*state, int)) *ssa.Value {
  1941  	// Make blocks we'll need.
  1942  	bEnd := s.f.NewBlock(ssa.BlockPlain)
  1943  
  1944  	if !idx.Type.IsKind(types.TUINT8) {
  1945  		panic("immJumpTable expects uint8 value")
  1946  	}
  1947  
   1948  	// The jump table covers all of 0-255, so no bounds check is needed.
  1949  	t := types.Types[types.TUINTPTR]
  1950  	idx = s.conv(nil, idx, idx.Type, t)
  1951  
  1952  	b := s.curBlock
  1953  	b.Kind = ssa.BlockJumpTable
  1954  	b.Pos = intrinsicCall.Pos()
  1955  	if base.Flag.Cfg.SpectreIndex {
   1956  		// Harden the table index against Spectre-style speculative out-of-bounds access.
  1957  		idx = s.newValue2(ssa.OpSpectreSliceIndex, t, idx, s.uintptrConstant(255))
  1958  	}
  1959  	b.SetControl(idx)
  1960  	targets := [256]*ssa.Block{}
  1961  	for i := range 256 {
  1962  		t := s.f.NewBlock(ssa.BlockPlain)
  1963  		targets[i] = t
  1964  		b.AddEdgeTo(t)
  1965  	}
  1966  	s.endBlock()
  1967  
  1968  	for i, t := range targets {
  1969  		s.startBlock(t)
  1970  		genOp(s, i)
  1971  		if t.Kind != ssa.BlockExit {
  1972  			t.AddEdgeTo(bEnd)
  1973  		}
  1974  		s.endBlock()
  1975  	}
  1976  
  1977  	s.startBlock(bEnd)
  1978  	ret := s.variable(intrinsicCall, intrinsicCall.Type())
  1979  	return ret
  1980  }
  1981  
  1982  func opLen1Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1983  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1984  		if args[1].Op == ssa.OpConst8 {
  1985  			return s.newValue1I(op, t, args[1].AuxInt<<int64(offset), args[0])
  1986  		}
  1987  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
   1988  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  1989  			s.vars[n] = sNew.newValue1I(op, t, int64(int8(idx<<offset)), args[0])
  1990  		})
  1991  	}
  1992  }
  1993  
  1994  func opLen2Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1995  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1996  		if args[1].Op == ssa.OpConst8 {
  1997  			return s.newValue2I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2])
  1998  		}
  1999  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
   2000  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2001  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[2])
  2002  		})
  2003  	}
  2004  }
  2005  
  2006  func opLen3Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2007  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2008  		if args[1].Op == ssa.OpConst8 {
  2009  			return s.newValue3I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3])
  2010  		}
  2011  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
   2012  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2013  			s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3])
  2014  		})
  2015  	}
  2016  }
  2017  
  2018  func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2019  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2020  		if args[2].Op == ssa.OpConst8 {
  2021  			return s.newValue2I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1])
  2022  		}
  2023  		return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
   2024  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2025  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[1])
  2026  		})
  2027  	}
  2028  }
  2029  
   2030  // Two immediates instead of just one. The offset is ignored, so it is a _ parameter instead.
  2031  func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2032  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2033  		if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 {
  2034  			i1, i2 := args[1].AuxInt, args[2].AuxInt
  2035  			return s.newValue2I(op, t, int64(int8(i1+i2<<4)), args[0], args[3])
  2036  		}
  2037  		four := s.constInt64(types.Types[types.TUINT8], 4)
  2038  		shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four)
  2039  		combined := s.newValue2(ssa.OpAdd8, types.Types[types.TUINT8], args[1], shifted)
  2040  		return immJumpTable(s, combined, n, func(sNew *state, idx int) {
   2041  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2042  			// TODO for "zeroing" values, panic instead.
  2043  			if idx & ^(3+3<<4) == 0 {
  2044  				s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3])
  2045  			} else {
  2046  				sNew.rtcall(ir.Syms.PanicSimdImm, false, nil)
  2047  			}
  2048  		})
  2049  	}
  2050  }
  2051  
   2052  // The assembler requires the imm value of a SHA1RNDS4 instruction to be one of 0, 1, 2, or 3, hence the 0b11 masking below.
  2053  func opLen2Imm8_SHA1RNDS4(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2054  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2055  		if args[1].Op == ssa.OpConst8 {
  2056  			return s.newValue2I(op, t, (args[1].AuxInt<<int64(offset))&0b11, args[0], args[2])
  2057  		}
  2058  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
   2059  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2060  			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset))&0b11, args[0], args[2])
  2061  		})
  2062  	}
  2063  }
  2064  
  2065  func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2066  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2067  		if args[2].Op == ssa.OpConst8 {
  2068  			return s.newValue3I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1], args[3])
  2069  		}
  2070  		return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
   2071  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2072  			s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[1], args[3])
  2073  		})
  2074  	}
  2075  }
  2076  
  2077  func opLen4Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2078  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2079  		if args[1].Op == ssa.OpConst8 {
  2080  			return s.newValue4I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3], args[4])
  2081  		}
  2082  		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
   2083  			// Encode as int8 to satisfy AuxInt's sign-extension requirement; see the AuxInt comment for details.
  2084  			s.vars[n] = sNew.newValue4I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3], args[4])
  2085  		})
  2086  	}
  2087  }
  2088  
  2089  func simdLoad() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2090  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2091  		return s.newValue2(ssa.OpLoad, n.Type(), args[0], s.mem())
  2092  	}
  2093  }
  2094  
  2095  func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2096  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2097  		s.store(args[0].Type, args[1], args[0])
  2098  		return nil
  2099  	}
  2100  }
  2101  
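         // cvtVToMaskOpcodes and cvtMaskToVOpcodes are indexed by element
         // width in bits, then by lane count; e.g. cvtVToMaskOpcodes[8][16]
         // converts a 16-bit integer into a Mask8x16.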
  2102  var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{
  2103  	8:  {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64},
  2104  	16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32},
  2105  	32: {4: ssa.OpCvt8toMask32x4, 8: ssa.OpCvt8toMask32x8, 16: ssa.OpCvt16toMask32x16},
  2106  	64: {2: ssa.OpCvt8toMask64x2, 4: ssa.OpCvt8toMask64x4, 8: ssa.OpCvt8toMask64x8},
  2107  }
  2108  
  2109  var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{
  2110  	8:  {16: ssa.OpCvtMask8x16to16, 32: ssa.OpCvtMask8x32to32, 64: ssa.OpCvtMask8x64to64},
  2111  	16: {8: ssa.OpCvtMask16x8to8, 16: ssa.OpCvtMask16x16to16, 32: ssa.OpCvtMask16x32to32},
  2112  	32: {4: ssa.OpCvtMask32x4to8, 8: ssa.OpCvtMask32x8to8, 16: ssa.OpCvtMask32x16to16},
  2113  	64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8},
  2114  }
  2115  
  2116  func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2117  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2118  		op := cvtVToMaskOpcodes[elemBits][lanes]
  2119  		if op == 0 {
  2120  			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
  2121  		}
  2122  		return s.newValue1(op, types.TypeMask, args[0])
  2123  	}
  2124  }
  2125  
  2126  func simdCvtMaskToV(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2127  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2128  		op := cvtMaskToVOpcodes[elemBits][lanes]
  2129  		if op == 0 {
  2130  			panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
  2131  		}
  2132  		return s.newValue1(op, n.Type(), args[0])
  2133  	}
  2134  }
  2135  
  2136  func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2137  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2138  		return s.newValue3(op, n.Type(), args[0], args[1], s.mem())
  2139  	}
  2140  }
  2141  
  2142  func simdMaskedStore(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2143  	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  2144  		s.vars[memVar] = s.newValue4A(op, types.TypeMem, args[0].Type, args[1], args[2], args[0], s.mem())
  2145  		return nil
  2146  	}
  2147  }
  2148  
  2149  // findIntrinsic returns a function which builds the SSA equivalent of the
   2150  // function identified by the symbol sym. If sym does not name an intrinsic, it returns nil.
  2151  func findIntrinsic(sym *types.Sym) intrinsicBuilder {
  2152  	if sym == nil || sym.Pkg == nil {
  2153  		return nil
  2154  	}
  2155  	pkg := sym.Pkg.Path
  2156  	if sym.Pkg == ir.Pkgs.Runtime {
  2157  		pkg = "runtime"
  2158  	}
  2159  	if base.Flag.Race && pkg == "sync/atomic" {
  2160  		// The race detector needs to be able to intercept these calls.
  2161  		// We can't intrinsify them.
  2162  		return nil
  2163  	}
   2164  	// Skip intrinsifying math functions (which may contain hard-float
   2165  	// instructions) when compiling in soft-float mode.
  2166  	if Arch.SoftFloat && pkg == "math" {
  2167  		return nil
  2168  	}
  2169  
  2170  	fn := sym.Name
  2171  	if ssa.IntrinsicsDisable {
  2172  		if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr") ||
  2173  			pkg == simdPackage {
   2174  			// These runtime functions don't have definitions, so they must be intrinsics.
  2175  		} else {
  2176  			return nil
  2177  		}
  2178  	}
  2179  	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
  2180  }
  2181  
  2182  func IsIntrinsicCall(n *ir.CallExpr) bool {
  2183  	if n == nil {
  2184  		return false
  2185  	}
  2186  	name, ok := n.Fun.(*ir.Name)
  2187  	if !ok {
  2188  		if n.Fun.Op() == ir.OMETHEXPR {
  2189  			if meth := ir.MethodExprName(n.Fun); meth != nil {
  2190  				if fn := meth.Func; fn != nil {
  2191  					return IsIntrinsicSym(fn.Sym())
  2192  				}
  2193  			}
  2194  		}
  2195  		return false
  2196  	}
  2197  	return IsIntrinsicSym(name.Sym())
  2198  }
  2199  
  2200  func IsIntrinsicSym(sym *types.Sym) bool {
  2201  	return findIntrinsic(sym) != nil
  2202  }
  2203  
  2204  // GenIntrinsicBody generates the function body for a bodyless intrinsic.
  2205  // This is used when the intrinsic is used in a non-call context, e.g.
  2206  // as a function pointer, or (for a method) being referenced from the type
  2207  // descriptor.
  2208  //
  2209  // The compiler already recognizes a call to fn as an intrinsic and can
  2210  // directly generate code for it. So we just fill in the body with a call
  2211  // to fn.
  2212  func GenIntrinsicBody(fn *ir.Func) {
  2213  	if ir.CurFunc != nil {
  2214  		base.FatalfAt(fn.Pos(), "enqueueFunc %v inside %v", fn, ir.CurFunc)
  2215  	}
  2216  
  2217  	if base.Flag.LowerR != 0 {
  2218  		fmt.Println("generate intrinsic for", ir.FuncName(fn))
  2219  	}
  2220  
  2221  	pos := fn.Pos()
  2222  	ft := fn.Type()
  2223  	var ret ir.Node
  2224  
  2225  	// For a method, it usually starts with an ODOTMETH (pre-typecheck) or
  2226  	// OMETHEXPR (post-typecheck) referencing the method symbol without the
  2227  	// receiver type, and Walk rewrites it to a call directly to the
  2228  	// type-qualified method symbol, moving the receiver to an argument.
   2229  	// Here fn already has the type-qualified method symbol, and it is hard
  2230  	// to get the unqualified symbol. So we just generate the post-Walk form
  2231  	// and mark it typechecked and Walked.
  2232  	call := ir.NewCallExpr(pos, ir.OCALLFUNC, fn.Nname, nil)
  2233  	call.Args = ir.RecvParamNames(ft)
  2234  	call.IsDDD = ft.IsVariadic()
  2235  	typecheck.Exprs(call.Args)
  2236  	call.SetTypecheck(1)
  2237  	call.SetWalked(true)
  2238  	ret = call
  2239  	if ft.NumResults() > 0 {
  2240  		if ft.NumResults() == 1 {
  2241  			call.SetType(ft.Result(0).Type)
  2242  		} else {
  2243  			call.SetType(ft.ResultsTuple())
  2244  		}
  2245  		n := ir.NewReturnStmt(base.Pos, nil)
  2246  		n.Results = []ir.Node{call}
  2247  		ret = n
  2248  	}
  2249  	fn.Body.Append(ret)
  2250  
  2251  	if base.Flag.LowerR != 0 {
  2252  		ir.DumpList("generate intrinsic body", fn.Body)
  2253  	}
  2254  
  2255  	ir.CurFunc = fn
  2256  	typecheck.Stmts(fn.Body)
  2257  	ir.CurFunc = nil // we know CurFunc is nil at entry
  2258  }
  2259  