Source file src/cmd/compile/internal/ssagen/intrinsics.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package ssagen
     6  
     7  import (
     8  	"fmt"
     9  	"internal/abi"
    10  	"internal/buildcfg"
    11  
    12  	"cmd/compile/internal/base"
    13  	"cmd/compile/internal/ir"
    14  	"cmd/compile/internal/ssa"
    15  	"cmd/compile/internal/types"
    16  	"cmd/internal/sys"
    17  )
    18  
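        // intrinsics is the package-level registry of intrinsic builders, keyed by
        // (architecture, package, function). It is populated by initIntrinsics.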
    19  var intrinsics intrinsicBuilders
    20  
    21  // An intrinsicBuilder converts a call node n into an ssa value that
    22  // implements that call as an intrinsic. args lists the arguments to the called function.
    23  type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
    24  
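        // An intrinsicKey uniquely identifies an intrinsic: the function fn in
        // package pkg, specialized for a single architecture.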
    25  type intrinsicKey struct {
    26  	arch *sys.Arch
    27  	pkg  string
    28  	fn   string
    29  }
    30  
    31  // intrinsicBuildConfig specifies the config to use for intrinsic building.
    32  type intrinsicBuildConfig struct {
    33  	instrumenting bool
    34  
    35  	go386     string
    36  	goamd64   int
    37  	goarm     buildcfg.GoarmFeatures
    38  	goarm64   buildcfg.Goarm64Features
    39  	gomips    string
    40  	gomips64  string
    41  	goppc64   int
    42  	goriscv64 int
    43  }
    44  
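        // intrinsicBuilders maps intrinsic keys to the builders that implement them.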
    45  type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
    46  
    47  // add adds the intrinsic builder b for pkg.fn for the given architecture.
    48  func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
    49  	if _, found := ib[intrinsicKey{arch, pkg, fn}]; found {
    50  		panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
    51  	}
    52  	ib[intrinsicKey{arch, pkg, fn}] = b
    53  }
    54  
    55  // addForArchs adds the intrinsic builder b for pkg.fn for the given architectures.
    56  func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
    57  	for _, arch := range archs {
    58  		ib.add(arch, pkg, fn, b)
    59  	}
    60  }
    61  
    62  // addForFamilies does the same as addForArchs but operates on architecture families.
    63  func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
    64  	for _, arch := range sys.Archs {
    65  		if arch.InFamily(archFamilies...) {
    66  			ib.add(arch, pkg, fn, b)
    67  		}
    68  	}
    69  }
    70  
    71  // alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
    72  // for which targetPkg.targetFn already exists.
    73  func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
    74  	// TODO(jsing): Consider making this work even if the alias is added
    75  	// before the intrinsic.
    76  	aliased := false
    77  	for _, arch := range archs {
    78  		if b := ib.lookup(arch, targetPkg, targetFn); b != nil {
    79  			ib.add(arch, pkg, fn, b)
    80  			aliased = true
    81  		}
    82  	}
    83  	if !aliased {
    84  		panic(fmt.Sprintf("attempted to alias %s.%s to undefined intrinsic %s.%s", pkg, fn, targetPkg, targetFn))
    85  	}
    86  }
    87  
    88  // lookup looks up the intrinsic for a pkg.fn on the specified architecture.
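        // For example, once initIntrinsics has run, the builder registered for
        // math/bits.OnesCount64 on amd64 can be retrieved with:
        //
        //	b := intrinsics.lookup(sys.ArchAMD64, "math/bits", "OnesCount64")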
    89  func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
    90  	return ib[intrinsicKey{arch, pkg, fn}]
    91  }
    92  
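        // initIntrinsics populates the intrinsics table. If cfg is nil, the
        // configuration is taken from the current build configuration (base.Flag
        // and buildcfg); a non-nil cfg overrides it.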
    93  func initIntrinsics(cfg *intrinsicBuildConfig) {
    94  	if cfg == nil {
    95  		cfg = &intrinsicBuildConfig{
    96  			instrumenting: base.Flag.Cfg.Instrumenting,
    97  			go386:         buildcfg.GO386,
    98  			goamd64:       buildcfg.GOAMD64,
    99  			goarm:         buildcfg.GOARM,
   100  			goarm64:       buildcfg.GOARM64,
   101  			gomips:        buildcfg.GOMIPS,
   102  			gomips64:      buildcfg.GOMIPS64,
   103  			goppc64:       buildcfg.GOPPC64,
   104  			goriscv64:     buildcfg.GORISCV64,
   105  		}
   106  	}
   107  	intrinsics = intrinsicBuilders{}
   108  
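        	// p4 and p8 partition the architectures by pointer size, and lwatomics
        	// collects every architecture other than PPC64 (which registers its own
        	// acquire/release atomics); they feed the alias calls below.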
   109  	var p4 []*sys.Arch
   110  	var p8 []*sys.Arch
   111  	var lwatomics []*sys.Arch
   112  	for _, a := range sys.Archs {
   113  		if a.PtrSize == 4 {
   114  			p4 = append(p4, a)
   115  		} else {
   116  			p8 = append(p8, a)
   117  		}
   118  		if a.Family != sys.PPC64 {
   119  			lwatomics = append(lwatomics, a)
   120  		}
   121  	}
   122  	all := sys.Archs[:]
   123  
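        	// Local helpers: add registers a builder for specific architectures,
        	// addF registers it for whole architecture families, and alias points
        	// one pkg.fn at an already-registered intrinsic.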
   124  	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
   125  		intrinsics.addForArchs(pkg, fn, b, archs...)
   126  	}
   127  	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
   128  		intrinsics.addForFamilies(pkg, fn, b, archFamilies...)
   129  	}
   130  	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
   131  		intrinsics.alias(pkg, fn, pkg2, fn2, archs...)
   132  	}
   133  
   134  	/******** runtime ********/
   135  	if !cfg.instrumenting {
   136  		add("runtime", "slicebytetostringtmp",
   137  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   138  				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
   139  				// for the backend instead of slicebytetostringtmp calls
   140  				// when not instrumenting.
   141  				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
   142  			},
   143  			all...)
   144  	}
   145  	addF("internal/runtime/math", "MulUintptr",
   146  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   147  			if s.config.PtrSize == 4 {
   148  				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   149  			}
   150  			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
   151  		},
   152  		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
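        	// runtime.KeepAlive's argument is an interface value; unwrap its data
        	// word and mark it live up to this point.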
   153  	add("runtime", "KeepAlive",
   154  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   155  			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
   156  			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
   157  			return nil
   158  		},
   159  		all...)
   160  
   161  	addF("runtime", "publicationBarrier",
   162  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   163  			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
   164  			return nil
   165  		},
   166  		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64)
   167  
   168  	/******** internal/runtime/sys ********/
   169  	add("internal/runtime/sys", "GetCallerPC",
   170  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   171  			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
   172  		},
   173  		all...)
   174  
   175  	add("internal/runtime/sys", "GetCallerSP",
   176  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   177  			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
   178  		},
   179  		all...)
   180  
   181  	add("internal/runtime/sys", "GetClosurePtr",
   182  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   183  			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
   184  		},
   185  		all...)
   186  
   187  	brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X}
   188  	if cfg.goppc64 >= 10 {
   189  		// Use this only on Power10, where the new byte-reverse instructions
   190  		// make the intrinsic worthwhile.
   191  		brev_arch = append(brev_arch, sys.PPC64)
   192  	}
   193  	addF("internal/runtime/sys", "Bswap32",
   194  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   195  			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
   196  		},
   197  		brev_arch...)
   198  	addF("internal/runtime/sys", "Bswap64",
   199  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   200  			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
   201  		},
   202  		brev_arch...)
   203  
   204  	/******** Prefetch ********/
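        	// makePrefetchFunc returns a builder that emits the given prefetch op for
        	// the address in args[0]; only the memory state is updated and no value
        	// is returned.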
   205  	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   206  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   207  			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
   208  			return nil
   209  		}
   210  	}
   211  
   212  	// Make Prefetch intrinsics for supported platforms.
   213  	// On unsupported platforms the stub function will be eliminated.
   214  	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
   215  		sys.AMD64, sys.ARM64, sys.PPC64)
   216  	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
   217  		sys.AMD64, sys.ARM64, sys.PPC64)
   218  
   219  	/******** internal/runtime/atomic ********/
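        	// An atomicOpEmitter emits the SSA for one atomic operation op with
        	// element kind typ; when needReturn is set it also binds the result to n
        	// so the surrounding builder can read it back with s.variable.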
   220  	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
   221  
   222  	addF("internal/runtime/atomic", "Load",
   223  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   224  			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   225  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   226  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   227  		},
   228  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   229  	addF("internal/runtime/atomic", "Load8",
   230  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   231  			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
   232  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   233  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   234  		},
   235  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   236  	addF("internal/runtime/atomic", "Load64",
   237  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   238  			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   239  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   240  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   241  		},
   242  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   243  	addF("internal/runtime/atomic", "LoadAcq",
   244  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   245  			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
   246  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   247  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   248  		},
   249  		sys.PPC64)
   250  	addF("internal/runtime/atomic", "LoadAcq64",
   251  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   252  			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
   253  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   254  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   255  		},
   256  		sys.PPC64)
   257  	addF("internal/runtime/atomic", "Loadp",
   258  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   259  			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
   260  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   261  			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
   262  		},
   263  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   264  
   265  	addF("internal/runtime/atomic", "Store",
   266  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   267  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
   268  			return nil
   269  		},
   270  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   271  	addF("internal/runtime/atomic", "Store8",
   272  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   273  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
   274  			return nil
   275  		},
   276  		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   277  	addF("internal/runtime/atomic", "Store64",
   278  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   279  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
   280  			return nil
   281  		},
   282  		sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   283  	addF("internal/runtime/atomic", "StorepNoWB",
   284  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   285  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
   286  			return nil
   287  		},
   288  		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
   289  	addF("internal/runtime/atomic", "StoreRel",
   290  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   291  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
   292  			return nil
   293  		},
   294  		sys.PPC64)
   295  	addF("internal/runtime/atomic", "StoreRel64",
   296  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   297  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
   298  			return nil
   299  		},
   300  		sys.PPC64)
   301  
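        	// makeAtomicStoreGuardedIntrinsicLoong64 builds a store intrinsic that
        	// checks ir.Syms.Loong64HasLAM_BH at run time and emits op1 when the
        	// LAM_BH extension is present, falling back to op0 otherwise.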
   302  	makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   303  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   304  			// The target's atomic feature support is detected dynamically at run time.
   305  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   306  			v := s.load(types.Types[types.TBOOL], addr)
   307  			b := s.endBlock()
   308  			b.Kind = ssa.BlockIf
   309  			b.SetControl(v)
   310  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   311  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   312  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   313  			b.AddEdgeTo(bTrue)
   314  			b.AddEdgeTo(bFalse)
   315  			b.Likely = ssa.BranchLikely
   316  
   317  			// We have the atomic instructions - use them directly.
   318  			s.startBlock(bTrue)
   319  			emit(s, n, args, op1, typ, false)
   320  			s.endBlock().AddEdgeTo(bEnd)
   321  
   322  			// Use original instruction sequence.
   323  			s.startBlock(bFalse)
   324  			emit(s, n, args, op0, typ, false)
   325  			s.endBlock().AddEdgeTo(bEnd)
   326  
   327  			// Merge results.
   328  			s.startBlock(bEnd)
   329  
   330  			return nil
   331  		}
   332  	}
   333  
   334  	atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   335  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   336  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   337  		if needReturn {
   338  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   339  		}
   340  	}
   341  
   342  	addF("internal/runtime/atomic", "Store8",
   343  		makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
   344  		sys.Loong64)
   345  	addF("internal/runtime/atomic", "Store",
   346  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   347  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
   348  			return nil
   349  		},
   350  		sys.Loong64)
   351  	addF("internal/runtime/atomic", "Store64",
   352  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   353  			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
   354  			return nil
   355  		},
   356  		sys.Loong64)
   357  
   358  	addF("internal/runtime/atomic", "Xchg8",
   359  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   360  			v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   361  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   362  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
   363  		},
   364  		sys.AMD64, sys.PPC64)
   365  	addF("internal/runtime/atomic", "Xchg",
   366  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   367  			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   368  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   369  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   370  		},
   371  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   372  	addF("internal/runtime/atomic", "Xchg64",
   373  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   374  			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   375  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   376  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   377  		},
   378  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   379  
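        	// makeAtomicGuardedIntrinsicARM64common builds an intrinsic that uses the
        	// LSE form op1 unconditionally when GOARM64 guarantees LSE, and otherwise
        	// branches on ir.Syms.ARM64HasATOMICS at run time, choosing op1 when LSE
        	// atomics are available and the non-LSE form op0 when they are not.
        	// needReturn controls whether the intrinsic produces a result value.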
   380  	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
   381  
   382  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   383  			if cfg.goarm64.LSE {
   384  				emit(s, n, args, op1, typ, needReturn)
   385  			} else {
   386  				// The target's atomic feature support is detected dynamically at run time.
   387  				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
   388  				v := s.load(types.Types[types.TBOOL], addr)
   389  				b := s.endBlock()
   390  				b.Kind = ssa.BlockIf
   391  				b.SetControl(v)
   392  				bTrue := s.f.NewBlock(ssa.BlockPlain)
   393  				bFalse := s.f.NewBlock(ssa.BlockPlain)
   394  				bEnd := s.f.NewBlock(ssa.BlockPlain)
   395  				b.AddEdgeTo(bTrue)
   396  				b.AddEdgeTo(bFalse)
   397  				b.Likely = ssa.BranchLikely
   398  
   399  				// We have the atomic instructions - use them directly.
   400  				s.startBlock(bTrue)
   401  				emit(s, n, args, op1, typ, needReturn)
   402  				s.endBlock().AddEdgeTo(bEnd)
   403  
   404  				// Use original instruction sequence.
   405  				s.startBlock(bFalse)
   406  				emit(s, n, args, op0, typ, needReturn)
   407  				s.endBlock().AddEdgeTo(bEnd)
   408  
   409  				// Merge results.
   410  				s.startBlock(bEnd)
   411  			}
   412  			if needReturn {
   413  				return s.variable(n, types.Types[typ])
   414  			} else {
   415  				return nil
   416  			}
   417  		}
   418  	}
   419  	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   420  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
   421  	}
   422  	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
   423  		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
   424  	}
   425  
   426  	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   427  		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
   428  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   429  		if needReturn {
   430  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   431  		}
   432  	}
   433  	addF("internal/runtime/atomic", "Xchg8",
   434  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
   435  		sys.ARM64)
   436  	addF("internal/runtime/atomic", "Xchg",
   437  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
   438  		sys.ARM64)
   439  	addF("internal/runtime/atomic", "Xchg64",
   440  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
   441  		sys.ARM64)
   442  
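        	// makeAtomicXchg8GuardedIntrinsicLoong64 builds an Xchg8 intrinsic that
        	// emits op when ir.Syms.Loong64HasLAM_BH reports the amswapdb.b
        	// instruction is available, and otherwise calls the pure Go fallback.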
   443  	makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   444  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   445  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
   446  			v := s.load(types.Types[types.TBOOL], addr)
   447  			b := s.endBlock()
   448  			b.Kind = ssa.BlockIf
   449  			b.SetControl(v)
   450  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   451  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   452  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   453  			b.AddEdgeTo(bTrue)
   454  			b.AddEdgeTo(bFalse)
   455  			b.Likely = ssa.BranchLikely // most loong64 machines support the amswapdb.b instruction
   456  
   457  			// We have the intrinsic - use it directly.
   458  			s.startBlock(bTrue)
   459  			s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
   460  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
   461  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
   462  			s.endBlock().AddEdgeTo(bEnd)
   463  
   464  			// Call the pure Go version.
   465  			s.startBlock(bFalse)
   466  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
   467  			s.endBlock().AddEdgeTo(bEnd)
   468  
   469  			// Merge results.
   470  			s.startBlock(bEnd)
   471  			return s.variable(n, types.Types[types.TUINT8])
   472  		}
   473  	}
   474  	addF("internal/runtime/atomic", "Xchg8",
   475  		makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
   476  		sys.Loong64)
   477  
   478  	addF("internal/runtime/atomic", "Xadd",
   479  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   480  			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   481  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   482  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
   483  		},
   484  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   485  	addF("internal/runtime/atomic", "Xadd64",
   486  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   487  			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   488  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   489  			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
   490  		},
   491  		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   492  
   493  	addF("internal/runtime/atomic", "Xadd",
   494  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
   495  		sys.ARM64)
   496  	addF("internal/runtime/atomic", "Xadd64",
   497  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
   498  		sys.ARM64)
   499  
   500  	addF("internal/runtime/atomic", "Cas",
   501  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   502  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   503  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   504  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   505  		},
   506  		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   507  	addF("internal/runtime/atomic", "Cas64",
   508  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   509  			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   510  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   511  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   512  		},
   513  		sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   514  	addF("internal/runtime/atomic", "CasRel",
   515  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   516  			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   517  			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   518  			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
   519  		},
   520  		sys.PPC64)
   521  
   522  	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   523  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   524  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   525  		if needReturn {
   526  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   527  		}
   528  	}
   529  
   530  	addF("internal/runtime/atomic", "Cas",
   531  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
   532  		sys.ARM64)
   533  	addF("internal/runtime/atomic", "Cas64",
   534  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
   535  		sys.ARM64)
   536  
   537  	atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
   538  		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
   539  		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
   540  		if needReturn {
   541  			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
   542  		}
   543  	}
   544  
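        	// makeAtomicCasGuardedIntrinsicLoong64 builds a CAS intrinsic guarded by
        	// ir.Syms.Loong64HasLAMCAS: op1 is used when the LAMCAS extension is
        	// present, op0 otherwise.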
   545  	makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
   546  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   547  			// The target's atomic feature support is detected dynamically at run time.
   548  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
   549  			v := s.load(types.Types[types.TBOOL], addr)
   550  			b := s.endBlock()
   551  			b.Kind = ssa.BlockIf
   552  			b.SetControl(v)
   553  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   554  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   555  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   556  			b.AddEdgeTo(bTrue)
   557  			b.AddEdgeTo(bFalse)
   558  			b.Likely = ssa.BranchLikely
   559  
   560  			// We have the atomic instructions - use them directly.
   561  			s.startBlock(bTrue)
   562  			emit(s, n, args, op1, types.TBOOL, true)
   563  			s.endBlock().AddEdgeTo(bEnd)
   564  
   565  			// Use original instruction sequence.
   566  			s.startBlock(bFalse)
   567  			emit(s, n, args, op0, types.TBOOL, true)
   568  			s.endBlock().AddEdgeTo(bEnd)
   569  
   570  			// Merge results.
   571  			s.startBlock(bEnd)
   572  
   573  			return s.variable(n, types.Types[types.TBOOL])
   574  		}
   575  	}
   576  
   577  	addF("internal/runtime/atomic", "Cas",
   578  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
   579  		sys.Loong64)
   580  	addF("internal/runtime/atomic", "Cas64",
   581  		makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
   582  		sys.Loong64)
   583  
   584  	// Old-style atomic logical operation API (all supported archs except arm64).
   585  	addF("internal/runtime/atomic", "And8",
   586  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   587  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
   588  			return nil
   589  		},
   590  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   591  	addF("internal/runtime/atomic", "And",
   592  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   593  			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
   594  			return nil
   595  		},
   596  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   597  	addF("internal/runtime/atomic", "Or8",
   598  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   599  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
   600  			return nil
   601  		},
   602  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   603  	addF("internal/runtime/atomic", "Or",
   604  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   605  			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
   606  			return nil
   607  		},
   608  		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
   609  
   610  	// arm64 always uses the new-style atomic logical operations, for both the
   611  	// old and new style API.
   612  	addF("internal/runtime/atomic", "And8",
   613  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
   614  		sys.ARM64)
   615  	addF("internal/runtime/atomic", "Or8",
   616  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
   617  		sys.ARM64)
   618  	addF("internal/runtime/atomic", "And64",
   619  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
   620  		sys.ARM64)
   621  	addF("internal/runtime/atomic", "And32",
   622  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   623  		sys.ARM64)
   624  	addF("internal/runtime/atomic", "And",
   625  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
   626  		sys.ARM64)
   627  	addF("internal/runtime/atomic", "Or64",
   628  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
   629  		sys.ARM64)
   630  	addF("internal/runtime/atomic", "Or32",
   631  		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   632  		sys.ARM64)
   633  	addF("internal/runtime/atomic", "Or",
   634  		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
   635  		sys.ARM64)
   636  
   637  	// New-style atomic logical operations, which return the old memory value.
   638  	addF("internal/runtime/atomic", "And64",
   639  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   640  			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   641  			p0, p1 := s.split(v)
   642  			s.vars[memVar] = p1
   643  			return p0
   644  		},
   645  		sys.AMD64, sys.Loong64)
   646  	addF("internal/runtime/atomic", "And32",
   647  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   648  			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   649  			p0, p1 := s.split(v)
   650  			s.vars[memVar] = p1
   651  			return p0
   652  		},
   653  		sys.AMD64, sys.Loong64)
   654  	addF("internal/runtime/atomic", "Or64",
   655  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   656  			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
   657  			p0, p1 := s.split(v)
   658  			s.vars[memVar] = p1
   659  			return p0
   660  		},
   661  		sys.AMD64, sys.Loong64)
   662  	addF("internal/runtime/atomic", "Or32",
   663  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   664  			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
   665  			p0, p1 := s.split(v)
   666  			s.vars[memVar] = p1
   667  			return p0
   668  		},
   669  		sys.AMD64, sys.Loong64)
   670  
   671  	// Aliases for atomic load operations
   672  	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
   673  	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
   674  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
   675  	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
   676  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
   677  	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
   678  	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
   679  	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
   680  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
   681  	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
   682  	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
   683  	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
   684  
   685  	// Aliases for atomic store operations
   686  	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
   687  	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
   688  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
   689  	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
   690  	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
   691  	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
   692  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
   693  	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
   694  	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
   695  	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
   696  
   697  	// Aliases for atomic swap operations
   698  	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
   699  	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
   700  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
   701  	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
   702  
   703  	// Aliases for atomic add operations
   704  	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
   705  	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
   706  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
   707  	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
   708  
   709  	// Aliases for atomic CAS operations
   710  	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
   711  	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
   712  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
   713  	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
   714  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
   715  	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
   716  	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
   717  
   718  	// Aliases for atomic And/Or operations
   719  	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
   720  	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
   721  
   722  	/******** math ********/
   723  	addF("math", "sqrt",
   724  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   725  			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
   726  		},
   727  		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
   728  	addF("math", "Trunc",
   729  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   730  			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
   731  		},
   732  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   733  	addF("math", "Ceil",
   734  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   735  			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
   736  		},
   737  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   738  	addF("math", "Floor",
   739  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   740  			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
   741  		},
   742  		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
   743  	addF("math", "Round",
   744  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   745  			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
   746  		},
   747  		sys.ARM64, sys.PPC64, sys.S390X)
   748  	addF("math", "RoundToEven",
   749  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   750  			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
   751  		},
   752  		sys.ARM64, sys.S390X, sys.Wasm)
   753  	addF("math", "Abs",
   754  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   755  			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
   756  		},
   757  		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
   758  	addF("math", "Copysign",
   759  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   760  			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
   761  		},
   762  		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
   763  	addF("math", "FMA",
   764  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   765  			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   766  		},
   767  		sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
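        	// On amd64, FMA is emitted directly when GOAMD64 >= v3 guarantees it;
        	// otherwise the builder branches on ir.Syms.X86HasFMA at run time and
        	// falls back to calling the pure Go math.FMA.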
   768  	addF("math", "FMA",
   769  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   770  			if !s.config.UseFMA {
   771  				s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   772  				return s.variable(n, types.Types[types.TFLOAT64])
   773  			}
   774  
   775  			if cfg.goamd64 >= 3 {
   776  				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   777  			}
   778  
   779  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
   780  			b := s.endBlock()
   781  			b.Kind = ssa.BlockIf
   782  			b.SetControl(v)
   783  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   784  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   785  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   786  			b.AddEdgeTo(bTrue)
   787  			b.AddEdgeTo(bFalse)
   788  			b.Likely = ssa.BranchLikely // Haswell and newer CPUs are common
   789  
   790  			// We have the intrinsic - use it directly.
   791  			s.startBlock(bTrue)
   792  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   793  			s.endBlock().AddEdgeTo(bEnd)
   794  
   795  			// Call the pure Go version.
   796  			s.startBlock(bFalse)
   797  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   798  			s.endBlock().AddEdgeTo(bEnd)
   799  
   800  			// Merge results.
   801  			s.startBlock(bEnd)
   802  			return s.variable(n, types.Types[types.TFLOAT64])
   803  		},
   804  		sys.AMD64)
   805  	addF("math", "FMA",
   806  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   807  			if !s.config.UseFMA {
   808  				s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   809  				return s.variable(n, types.Types[types.TFLOAT64])
   810  			}
   811  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
   812  			v := s.load(types.Types[types.TBOOL], addr)
   813  			b := s.endBlock()
   814  			b.Kind = ssa.BlockIf
   815  			b.SetControl(v)
   816  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   817  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   818  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   819  			b.AddEdgeTo(bTrue)
   820  			b.AddEdgeTo(bFalse)
   821  			b.Likely = ssa.BranchLikely
   822  
   823  			// We have the intrinsic - use it directly.
   824  			s.startBlock(bTrue)
   825  			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
   826  			s.endBlock().AddEdgeTo(bEnd)
   827  
   828  			// Call the pure Go version.
   829  			s.startBlock(bFalse)
   830  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   831  			s.endBlock().AddEdgeTo(bEnd)
   832  
   833  			// Merge results.
   834  			s.startBlock(bEnd)
   835  			return s.variable(n, types.Types[types.TFLOAT64])
   836  		},
   837  		sys.ARM)
   838  
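        	// makeRoundAMD64 builds the rounding intrinsics for amd64: with
        	// GOAMD64 >= v2 the SSE4.1 rounding op is emitted unconditionally;
        	// otherwise the builder tests ir.Syms.X86HasSSE41 at run time and falls
        	// back to the pure Go implementation.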
   839  	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   840  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   841  			if cfg.goamd64 >= 2 {
   842  				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   843  			}
   844  
   845  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
   846  			b := s.endBlock()
   847  			b.Kind = ssa.BlockIf
   848  			b.SetControl(v)
   849  			bTrue := s.f.NewBlock(ssa.BlockPlain)
   850  			bFalse := s.f.NewBlock(ssa.BlockPlain)
   851  			bEnd := s.f.NewBlock(ssa.BlockPlain)
   852  			b.AddEdgeTo(bTrue)
   853  			b.AddEdgeTo(bFalse)
   854  			b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
   855  
   856  			// We have the intrinsic - use it directly.
   857  			s.startBlock(bTrue)
   858  			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
   859  			s.endBlock().AddEdgeTo(bEnd)
   860  
   861  			// Call the pure Go version.
   862  			s.startBlock(bFalse)
   863  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
   864  			s.endBlock().AddEdgeTo(bEnd)
   865  
   866  			// Merge results.
   867  			s.startBlock(bEnd)
   868  			return s.variable(n, types.Types[types.TFLOAT64])
   869  		}
   870  	}
   871  	addF("math", "RoundToEven",
   872  		makeRoundAMD64(ssa.OpRoundToEven),
   873  		sys.AMD64)
   874  	addF("math", "Floor",
   875  		makeRoundAMD64(ssa.OpFloor),
   876  		sys.AMD64)
   877  	addF("math", "Ceil",
   878  		makeRoundAMD64(ssa.OpCeil),
   879  		sys.AMD64)
   880  	addF("math", "Trunc",
   881  		makeRoundAMD64(ssa.OpTrunc),
   882  		sys.AMD64)
   883  
   884  	/******** math/bits ********/
   885  	addF("math/bits", "TrailingZeros64",
   886  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   887  			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
   888  		},
   889  		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   890  	addF("math/bits", "TrailingZeros64",
   891  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   892  			lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
   893  			hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
   894  			return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
   895  		},
   896  		sys.I386)
   897  	addF("math/bits", "TrailingZeros32",
   898  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   899  			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
   900  		},
   901  		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   902  	addF("math/bits", "TrailingZeros16",
   903  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   904  			x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
   905  			c := s.constInt32(types.Types[types.TUINT32], 1<<16)
   906  			y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
   907  			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
   908  		},
   909  		sys.MIPS)
   910  	addF("math/bits", "TrailingZeros16",
   911  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   912  			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
   913  		},
   914  		sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
   915  	addF("math/bits", "TrailingZeros16",
   916  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   917  			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
   918  			c := s.constInt64(types.Types[types.TUINT64], 1<<16)
   919  			y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
   920  			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
   921  		},
   922  		sys.Loong64, sys.S390X, sys.PPC64)
   923  	addF("math/bits", "TrailingZeros8",
   924  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   925  			x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
   926  			c := s.constInt32(types.Types[types.TUINT32], 1<<8)
   927  			y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
   928  			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
   929  		},
   930  		sys.MIPS)
   931  	addF("math/bits", "TrailingZeros8",
   932  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   933  			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
   934  		},
   935  		sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
   936  	addF("math/bits", "TrailingZeros8",
   937  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   938  			x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
   939  			c := s.constInt64(types.Types[types.TUINT64], 1<<8)
   940  			y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
   941  			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
   942  		},
   943  		sys.Loong64, sys.S390X)
   944  	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
   945  	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
   946  	addF("math/bits", "ReverseBytes16",
   947  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   948  			return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
   949  		},
   950  		sys.Loong64)
   951  	// ReverseBytes inlines correctly; there is no need to intrinsify it.
   952  	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
   953  	// On Power10 a 16-bit rotate is not available, so use the BRH instruction instead.
   954  	if cfg.goppc64 >= 10 {
   955  		addF("math/bits", "ReverseBytes16",
   956  			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   957  				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
   958  			},
   959  			sys.PPC64)
   960  	}
   961  
   962  	addF("math/bits", "Len64",
   963  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   964  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
   965  		},
   966  		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   967  	addF("math/bits", "Len32",
   968  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   969  			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
   970  		},
   971  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
   972  	addF("math/bits", "Len32",
   973  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   974  			if s.config.PtrSize == 4 {
   975  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
   976  			}
   977  			x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0])
   978  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
   979  		},
   980  		sys.ARM, sys.S390X, sys.MIPS, sys.Wasm)
   981  	addF("math/bits", "Len16",
   982  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   983  			if s.config.PtrSize == 4 {
   984  				x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
   985  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
   986  			}
   987  			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
   988  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
   989  		},
   990  		sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
   991  	addF("math/bits", "Len16",
   992  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   993  			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
   994  		},
   995  		sys.AMD64)
   996  	addF("math/bits", "Len8",
   997  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
   998  			if s.config.PtrSize == 4 {
   999  				x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
  1000  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
  1001  			}
  1002  			x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
  1003  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
  1004  		},
  1005  		sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
  1006  	addF("math/bits", "Len8",
  1007  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1008  			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
  1009  		},
  1010  		sys.AMD64)
  1011  	addF("math/bits", "Len",
  1012  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1013  			if s.config.PtrSize == 4 {
  1014  				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
  1015  			}
  1016  			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
  1017  		},
  1018  		sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
  1019  	// LeadingZeros is handled because it trivially calls Len.
  1020  	addF("math/bits", "Reverse64",
  1021  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1022  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
  1023  		},
  1024  		sys.ARM64, sys.Loong64)
  1025  	addF("math/bits", "Reverse32",
  1026  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1027  			return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
  1028  		},
  1029  		sys.ARM64, sys.Loong64)
  1030  	addF("math/bits", "Reverse16",
  1031  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1032  			return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
  1033  		},
  1034  		sys.ARM64, sys.Loong64)
  1035  	addF("math/bits", "Reverse8",
  1036  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1037  			return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
  1038  		},
  1039  		sys.ARM64, sys.Loong64)
  1040  	addF("math/bits", "Reverse",
  1041  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1042  			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
  1043  		},
  1044  		sys.ARM64, sys.Loong64)
  1045  	addF("math/bits", "RotateLeft8",
  1046  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1047  			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
  1048  		},
  1049  		sys.AMD64, sys.RISCV64)
  1050  	addF("math/bits", "RotateLeft16",
  1051  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1052  			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
  1053  		},
  1054  		sys.AMD64, sys.RISCV64)
  1055  	addF("math/bits", "RotateLeft32",
  1056  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1057  			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
  1058  		},
  1059  		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1060  	addF("math/bits", "RotateLeft64",
  1061  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1062  			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
  1063  		},
  1064  		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
  1065  	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
  1066  
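        	// makeOnesCountAMD64 builds the OnesCount intrinsics for amd64: with
        	// GOAMD64 >= v2 POPCNT is assumed; otherwise ir.Syms.X86HasPOPCNT is
        	// checked at run time with a pure Go fallback.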
  1067  	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1068  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1069  			if cfg.goamd64 >= 2 {
  1070  				return s.newValue1(op, types.Types[types.TINT], args[0])
  1071  			}
  1072  
  1073  			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
  1074  			b := s.endBlock()
  1075  			b.Kind = ssa.BlockIf
  1076  			b.SetControl(v)
  1077  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1078  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1079  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1080  			b.AddEdgeTo(bTrue)
  1081  			b.AddEdgeTo(bFalse)
  1082  			b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
  1083  
  1084  			// We have the intrinsic - use it directly.
  1085  			s.startBlock(bTrue)
  1086  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1087  			s.endBlock().AddEdgeTo(bEnd)
  1088  
  1089  			// Call the pure Go version.
  1090  			s.startBlock(bFalse)
  1091  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1092  			s.endBlock().AddEdgeTo(bEnd)
  1093  
  1094  			// Merge results.
  1095  			s.startBlock(bEnd)
  1096  			return s.variable(n, types.Types[types.TINT])
  1097  		}
  1098  	}
  1099  
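        	// makeOnesCountLoong64 builds the OnesCount intrinsics for loong64,
        	// guarded by a run-time check of ir.Syms.Loong64HasLSX with a pure Go
        	// fallback.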
  1100  	makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1101  		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1102  			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
  1103  			v := s.load(types.Types[types.TBOOL], addr)
  1104  			b := s.endBlock()
  1105  			b.Kind = ssa.BlockIf
  1106  			b.SetControl(v)
  1107  			bTrue := s.f.NewBlock(ssa.BlockPlain)
  1108  			bFalse := s.f.NewBlock(ssa.BlockPlain)
  1109  			bEnd := s.f.NewBlock(ssa.BlockPlain)
  1110  			b.AddEdgeTo(bTrue)
  1111  			b.AddEdgeTo(bFalse)
  1112  			b.Likely = ssa.BranchLikely // most loong64 machines support LSX
  1113  
  1114  			// We have the intrinsic - use it directly.
  1115  			s.startBlock(bTrue)
  1116  			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
  1117  			s.endBlock().AddEdgeTo(bEnd)
  1118  
  1119  			// Call the pure Go version.
  1120  			s.startBlock(bFalse)
  1121  			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
  1122  			s.endBlock().AddEdgeTo(bEnd)
  1123  
  1124  			// Merge results.
  1125  			s.startBlock(bEnd)
  1126  			return s.variable(n, types.Types[types.TINT])
  1127  		}
  1128  	}
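        	// Unlike the AMD64 helper, which queries ir.Syms.X86HasPOPCNT via
        	// OpHasCPUFeature, this Loong64 helper loads the feature flag
        	// (ir.Syms.Loong64HasLSX) from memory before branching; the generated
        	// control flow is otherwise the same if/else dispatch sketched above.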
  1129  
  1130  	addF("math/bits", "OnesCount64",
  1131  		makeOnesCountAMD64(ssa.OpPopCount64),
  1132  		sys.AMD64)
  1133  	addF("math/bits", "OnesCount64",
  1134  		makeOnesCountLoong64(ssa.OpPopCount64),
  1135  		sys.Loong64)
  1136  	addF("math/bits", "OnesCount64",
  1137  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1138  			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
  1139  		},
  1140  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1141  	addF("math/bits", "OnesCount32",
  1142  		makeOnesCountAMD64(ssa.OpPopCount32),
  1143  		sys.AMD64)
  1144  	addF("math/bits", "OnesCount32",
  1145  		makeOnesCountLoong64(ssa.OpPopCount32),
  1146  		sys.Loong64)
  1147  	addF("math/bits", "OnesCount32",
  1148  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1149  			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
  1150  		},
  1151  		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
  1152  	addF("math/bits", "OnesCount16",
  1153  		makeOnesCountAMD64(ssa.OpPopCount16),
  1154  		sys.AMD64)
  1155  	addF("math/bits", "OnesCount16",
  1156  		makeOnesCountLoong64(ssa.OpPopCount16),
  1157  		sys.Loong64)
  1158  	addF("math/bits", "OnesCount16",
  1159  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1160  			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
  1161  		},
  1162  		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
  1163  	addF("math/bits", "OnesCount8",
  1164  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1165  			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
  1166  		},
  1167  		sys.S390X, sys.PPC64, sys.Wasm)
  1168  	addF("math/bits", "OnesCount",
  1169  		makeOnesCountAMD64(ssa.OpPopCount64),
  1170  		sys.AMD64)
  1171  	addF("math/bits", "Mul64",
  1172  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1173  			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
  1174  		},
  1175  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
  1176  	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
  1177  	alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
  1178  	addF("math/bits", "Add64",
  1179  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1180  			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1181  		},
  1182  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1183  	alias("math/bits", "Add", "math/bits", "Add64", p8...)
  1184  	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
  1185  	addF("math/bits", "Sub64",
  1186  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1187  			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1188  		},
  1189  		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
  1190  	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
  1191  	addF("math/bits", "Div64",
  1192  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1193  			// check for divide-by-zero/overflow and panic with appropriate message
  1194  			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
  1195  			s.check(cmpZero, ir.Syms.Panicdivide)
  1196  			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
  1197  			s.check(cmpOverflow, ir.Syms.Panicoverflow)
  1198  			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
  1199  		},
  1200  		sys.AMD64)
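        	// Worked example of the guards above (illustrative only): for
        	// bits.Div64(hi, lo, y), the call panics with a divide error when
        	// y == 0 and with an overflow error when hi >= y, since the quotient
        	// would not fit in 64 bits. E.g. Div64(2, 0, 2) overflows, while
        	// Div64(1, 0, 2) yields quotient 1<<63 and remainder 0.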
  1201  	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
  1202  
  1203  	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
  1204  	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
  1205  	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
  1206  	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
  1207  	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
  1208  	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
  1209  
  1210  	/******** sync/atomic ********/
  1211  
  1212  	// Note: these are disabled by base.Flag.Race in findIntrinsic below.
  1213  	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
  1214  	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
  1215  	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
  1216  	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
  1217  	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
  1218  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
  1219  	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
  1220  
  1221  	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
  1222  	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
  1223  	// Note: not StorePointer, that needs a write barrier.  Same below for {CompareAnd}Swap.
  1224  	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
  1225  	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
  1226  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
  1227  	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
  1228  
  1229  	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
  1230  	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
  1231  	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
  1232  	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
  1233  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
  1234  	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
  1235  
  1236  	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
  1237  	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
  1238  	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
  1239  	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
  1240  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
  1241  	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
  1242  
  1243  	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
  1244  	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
  1245  	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
  1246  	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
  1247  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
  1248  	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
  1249  
  1250  	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1251  	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1252  	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1253  	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1254  	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1255  	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1256  	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1257  	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1258  	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1259  	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
  1260  
  1261  	/******** math/big ********/
  1262  	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
  1263  
  1264  	/******** internal/runtime/maps ********/
  1265  
  1266  	// Important: The intrinsic implementations below return a packed
  1267  	// bitset, while the portable Go implementation uses an unpacked
  1268  	// representation (one bit set in each byte).
  1269  	//
  1270  	// Thus we must replace most bitset methods with implementations that
  1271  	// work with the packed representation.
  1272  	//
  1273  	// TODO(prattmic): The bitset implementations don't use SIMD, so they
  1274  	// could be handled with build tags (though that would break
  1275  	// -d=ssa/intrinsics/off=1).
  1276  
  1277  	// With a packed representation we no longer need to shift the result
  1278  	// of TrailingZeros64.
  1279  	alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
  1280  
  1281  	addF("internal/runtime/maps", "bitsetRemoveBelow",
  1282  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1283  			b := args[0]
  1284  			i := args[1]
  1285  
  1286  			// Clear the lower i bits in b.
  1287  			//
  1288  			// out = b &^ ((1 << i) - 1)
  1289  
  1290  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1291  
  1292  			mask := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT64], one, i)
  1293  			mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
  1294  			mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
  1295  
  1296  			return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
  1297  		},
  1298  		sys.AMD64)
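        	// Worked example of the mask construction above (illustrative only):
        	// with i = 3, mask = (1 << 3) - 1 = 0b0111, so ^mask clears the low
        	// three bits; b = 0b1011_0101 becomes 0b1011_0000.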
  1299  
  1300  	addF("internal/runtime/maps", "bitsetLowestSet",
  1301  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1302  			b := args[0]
  1303  
  1304  			// Test the lowest bit in b.
  1305  			//
  1306  			// out = (b & 1) == 1
  1307  
  1308  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1309  			and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
  1310  			return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
  1311  		},
  1312  		sys.AMD64)
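        	// For example (illustrative only): b = 0b...0110 gives (b & 1) == 0,
        	// so the result is false, while b = 0b...0111 gives true.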
  1313  
  1314  	addF("internal/runtime/maps", "bitsetShiftOutLowest",
  1315  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1316  			b := args[0]
  1317  
  1318  			// Right shift out the lowest bit in b.
  1319  			//
  1320  			// out = b >> 1
  1321  
  1322  			one := s.constInt64(types.Types[types.TUINT64], 1)
  1323  			return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
  1324  		},
  1325  		sys.AMD64)
  1326  
  1327  	addF("internal/runtime/maps", "ctrlGroupMatchH2",
  1328  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1329  			g := args[0]
  1330  			h := args[1]
  1331  
  1332  			// Explicit copies to fp registers. See
  1333  			// https://go.dev/issue/70451.
  1334  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1335  			hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
  1336  
  1337  			// Broadcast h2 into each byte of a word.
  1338  			var broadcast *ssa.Value
  1339  			if buildcfg.GOAMD64 >= 4 {
  1340  				// VPBROADCASTB saves 1 instruction vs PSHUFB
  1341  				// because the input can come from a GP
  1342  				// register, while PSHUFB requires moving into
  1343  				// an FP register first.
  1344  				//
  1345  				// Nominally PSHUFB would require a second
  1346  				// additional instruction to load the control
  1347  				// mask into a FP register. But broadcast uses
  1348  				// a control mask of 0, and the register ABI
  1349  				// already defines X15 as a zero register.
  1350  				broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
  1351  			} else if buildcfg.GOAMD64 >= 2 {
  1352  				// PSHUFB performs a byte broadcast when given
  1353  				// a control input of 0.
  1354  				broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
  1355  			} else {
  1356  				// No direct byte broadcast. First we must
  1357  				// duplicate the lower byte and then do a
  1358  				// 16-bit broadcast.
  1359  
  1360  				// "Unpack" h2 with itself. This duplicates the
  1361  				// input, resulting in h2 in the lower two
  1362  				// bytes.
  1363  				unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
  1364  
  1365  				// Copy the lower 16-bits of unpack into every
  1366  				// 16-bit slot in the lower 64-bits of the
  1367  				// output register. Note that immediate 0
  1368  				// selects the low word as the source for every
  1369  				// destination slot.
  1370  				broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
  1371  
  1372  				// No need to broadcast into the upper 64-bits,
  1373  				// as we don't use those.
  1374  			}
  1375  
  1376  			// Compare each byte of the control word with h2. Each
  1377  			// matching byte has every bit set.
  1378  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
  1379  
  1380  			// Construct a "byte mask": each output bit is equal to
  1381  			// the sign bit of each input byte.
  1382  			//
  1383  			// This results in a packed output (bit N set means
  1384  			// byte N matched).
  1385  			//
  1386  			// NOTE: See comment above on bitsetFirst.
  1387  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1388  
  1389  			// g is only 64-bits so the upper 64-bits of the
  1390  			// 128-bit register will be zero. If h2 is also zero,
  1391  			// then we'll get matches on those bytes. Truncate the
  1392  			// upper bits to ignore such matches.
  1393  			ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1394  
  1395  			return ret
  1396  		},
  1397  		sys.AMD64)
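        	// Worked example of the packed result above (illustrative only): if
        	// bytes 1 and 5 of the group control word equal h2, PCMPEQB sets those
        	// bytes to 0xFF, and PMOVMSKB packs their sign bits into
        	// 0b0010_0010 = 0x22, i.e. bits 1 and 5 of the returned bitset.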
  1398  
  1399  	addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
  1400  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1401  			// An empty slot is   1000 0000
  1402  			// A deleted slot is  1111 1110
  1403  			// A full slot is     0??? ????
  1404  
  1405  			g := args[0]
  1406  
  1407  			// Explicit copy to fp register. See
  1408  			// https://go.dev/issue/70451.
  1409  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1410  
  1411  			if buildcfg.GOAMD64 >= 2 {
  1412  				// "PSIGNB negates each data element of the
  1413  				// destination operand (the first operand) if
  1414  				// the signed integer value of the
  1415  				// corresponding data element in the source
  1416  				// operand (the second operand) is less than
  1417  				// zero. If the signed integer value of a data
  1418  				// element in the source operand is positive,
  1419  				// the corresponding data element in the
  1420  				// destination operand is unchanged. If a data
  1421  				// element in the source operand is zero, the
  1422  				// corresponding data element in the
  1423  				// destination operand is set to zero" - Intel SDM
  1424  				//
  1425  				// If we pass the group control word as both
  1426  				// arguments:
  1427  				// - Full slots are unchanged.
  1428  				// - Deleted slots are negated, becoming
  1429  				//   0000 0010.
  1430  				// - Empty slots are negated, becoming
  1431  				//   1000 0000 (unchanged!).
  1432  				//
  1433  				// The result is that only empty slots have the
  1434  				// sign bit set. We then use PMOVMSKB to
  1435  				// extract the sign bits.
  1436  				sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
  1437  
  1438  				// Construct a "byte mask": each output bit is
  1439  				// equal to the sign bit of each input byte. The
  1440  				// sign bit is only set for empty or deleted
  1441  				// slots.
  1442  				//
  1443  				// This results in a packed output (bit N set
  1444  				// means byte N matched).
  1445  				//
  1446  				// NOTE: See comment above on bitsetFirst.
  1447  				ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)
  1448  
  1449  				// g is only 64-bits so the upper 64-bits of
  1450  				// the 128-bit register will be zero. PSIGNB
  1451  				// will keep all of these bytes zero, so no
  1452  				// need to truncate.
  1453  
  1454  				return ret
  1455  			}
  1456  
  1457  			// No PSIGNB, simply do byte equality with ctrlEmpty.
  1458  
  1459  			// Load ctrlEmpty into each byte of a control word.
  1460  			var ctrlsEmpty uint64 = abi.SwissMapCtrlEmpty
  1461  			e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
  1462  			// Explicit copy to fp register. See
  1463  			// https://go.dev/issue/70451.
  1464  			efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
  1465  
  1466  			// Compare each byte of the control word with ctrlEmpty. Each
  1467  			// matching byte has every bit set.
  1468  			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
  1469  
  1470  			// Construct a "byte mask": each output bit is equal to
  1471  			// the sign bit of each input byte.
  1472  			//
  1473  			// This results in a packed output (bit N set means
  1474  			// byte N matched).
  1475  			//
  1476  			// NOTE: See comment above on bitsetFirst.
  1477  			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
  1478  
  1479  			// g is only 64-bits so the upper 64-bits of the
  1480  			// 128-bit register will be zero. The upper 64-bits of
  1481  			// efp are also zero, so we'll get matches on those
  1482  			// bytes. Truncate the upper bits to ignore such
  1483  			// matches.
  1484  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1485  		},
  1486  		sys.AMD64)
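        	// Why the PSIGNB trick above works (illustrative arithmetic): a deleted
        	// slot 1111 1110 is -2 as a signed byte, and negating it gives +2
        	// (0000 0010), clearing its sign bit; an empty slot 1000 0000 is -128,
        	// whose negation overflows back to -128, so its sign bit survives and
        	// only empty slots are reported by PMOVMSKB.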
  1487  
  1488  	addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
  1489  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1490  			// An empty slot is   1000 0000
  1491  			// A deleted slot is  1111 1110
  1492  			// A full slot is     0??? ????
  1493  			//
  1494  			// A slot is empty or deleted iff bit 7 (sign bit) is
  1495  			// set.
  1496  
  1497  			g := args[0]
  1498  
  1499  			// Explicit copy to fp register. See
  1500  			// https://go.dev/issue/70451.
  1501  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1502  
  1503  			// Construct a "byte mask": each output bit is equal to
  1504  			// the sign bit of each input byte. The sign bit is only
  1505  			// set for empty or deleted slots.
  1506  			//
  1507  			// This results in a packed output (bit N set means
  1508  			// byte N matched).
  1509  			//
  1510  			// NOTE: See comment above on bitsetFirst.
  1511  			ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1512  
  1513  			// g is only 64-bits so the upper 64-bits of the
  1514  			// 128-bit register will be zero. Zero will never match
  1515  			// ctrlEmpty or ctrlDeleted, so no need to truncate.
  1516  
  1517  			return ret
  1518  		},
  1519  		sys.AMD64)
  1520  
  1521  	addF("internal/runtime/maps", "ctrlGroupMatchFull",
  1522  		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
  1523  			// An empty slot is   1000 0000
  1524  			// A deleted slot is  1111 1110
  1525  			// A full slot is     0??? ????
  1526  			//
  1527  			// A slot is full iff bit 7 (sign bit) is unset.
  1528  
  1529  			g := args[0]
  1530  
  1531  			// Explicit copy to fp register. See
  1532  			// https://go.dev/issue/70451.
  1533  			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
  1534  
  1535  			// Construct a "byte mask": each output bit is equal to
  1536  			// the sign bit of each input byte. The sign bit is only
  1537  			// set for empty or deleted slots.
  1538  			//
  1539  			// This results in a packed output (bit N set means
  1540  			// byte N matched).
  1541  			//
  1542  			// NOTE: See comment above on bitsetFirst.
  1543  			mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
  1544  
  1545  			// Invert the mask to set the bits for the full slots.
  1546  			out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)
  1547  
  1548  			// g is only 64-bits so the upper 64-bits of the
  1549  			// 128-bit register will be zero, with bit 7 unset.
  1550  			// Truncate the upper bits to ignore these.
  1551  			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
  1552  		},
  1553  		sys.AMD64)
  1554  }
  1555  
  1556  // findIntrinsic returns a function which builds the SSA equivalent of the
  1557  // function identified by the symbol sym.  If sym is not an intrinsic call, returns nil.
  1558  func findIntrinsic(sym *types.Sym) intrinsicBuilder {
  1559  	if sym == nil || sym.Pkg == nil {
  1560  		return nil
  1561  	}
  1562  	pkg := sym.Pkg.Path
  1563  	if sym.Pkg == ir.Pkgs.Runtime {
  1564  		pkg = "runtime"
  1565  	}
  1566  	if base.Flag.Race && pkg == "sync/atomic" {
  1567  		// The race detector needs to be able to intercept these calls.
  1568  		// We can't intrinsify them.
  1569  		return nil
  1570  	}
  1571  	// Skip intrinsifying math functions (which may contain hard-float
  1572  	// instructions) when soft-float
  1573  	if Arch.SoftFloat && pkg == "math" {
  1574  		return nil
  1575  	}
  1576  
  1577  	fn := sym.Name
  1578  	if ssa.IntrinsicsDisable {
  1579  		if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr") {
  1580  			// These runtime functions don't have definitions, must be intrinsics.
  1581  		} else {
  1582  			return nil
  1583  		}
  1584  	}
  1585  	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
  1586  }
  1587  
  1588  func IsIntrinsicCall(n *ir.CallExpr) bool {
  1589  	if n == nil {
  1590  		return false
  1591  	}
  1592  	name, ok := n.Fun.(*ir.Name)
  1593  	if !ok {
  1594  		return false
  1595  	}
  1596  	return findIntrinsic(name.Sym()) != nil
  1597  }
  1598  
