// Copyright 2024 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package ssagen import ( "fmt" "internal/abi" "internal/buildcfg" "cmd/compile/internal/base" "cmd/compile/internal/ir" "cmd/compile/internal/ssa" "cmd/compile/internal/types" "cmd/internal/sys" ) var intrinsics intrinsicBuilders // An intrinsicBuilder converts a call node n into an ssa value that // implements that call as an intrinsic. args is a list of arguments to the func. type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value type intrinsicKey struct { arch *sys.Arch pkg string fn string } // intrinsicBuildConfig specifies the config to use for intrinsic building. type intrinsicBuildConfig struct { instrumenting bool go386 string goamd64 int goarm buildcfg.GoarmFeatures goarm64 buildcfg.Goarm64Features gomips string gomips64 string goppc64 int goriscv64 int } type intrinsicBuilders map[intrinsicKey]intrinsicBuilder // add adds the intrinsic builder b for pkg.fn for the given architecture. func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) { if _, found := ib[intrinsicKey{arch, pkg, fn}]; found { panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name)) } ib[intrinsicKey{arch, pkg, fn}] = b } // addForArchs adds the intrinsic builder b for pkg.fn for the given architectures. func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { for _, arch := range archs { ib.add(arch, pkg, fn, b) } } // addForFamilies does the same as addForArchs but operates on architecture families. func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { for _, arch := range sys.Archs { if arch.InFamily(archFamilies...) { intrinsics.add(arch, pkg, fn, b) } } } // alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs // for which targetPkg.targetFn already exists. func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) { // TODO(jsing): Consider making this work even if the alias is added // before the intrinsic. aliased := false for _, arch := range archs { if b := intrinsics.lookup(arch, targetPkg, targetFn); b != nil { intrinsics.add(arch, pkg, fn, b) aliased = true } } if !aliased { panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn)) } } // lookup looks up the intrinsic for a pkg.fn on the specified architecture. func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder { return intrinsics[intrinsicKey{arch, pkg, fn}] } func initIntrinsics(cfg *intrinsicBuildConfig) { if cfg == nil { cfg = &intrinsicBuildConfig{ instrumenting: base.Flag.Cfg.Instrumenting, go386: buildcfg.GO386, goamd64: buildcfg.GOAMD64, goarm: buildcfg.GOARM, goarm64: buildcfg.GOARM64, gomips: buildcfg.GOMIPS, gomips64: buildcfg.GOMIPS64, goppc64: buildcfg.GOPPC64, goriscv64: buildcfg.GORISCV64, } } intrinsics = intrinsicBuilders{} var p4 []*sys.Arch var p8 []*sys.Arch var lwatomics []*sys.Arch for _, a := range sys.Archs { if a.PtrSize == 4 { p4 = append(p4, a) } else { p8 = append(p8, a) } if a.Family != sys.PPC64 { lwatomics = append(lwatomics, a) } } all := sys.Archs[:] add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) { intrinsics.addForArchs(pkg, fn, b, archs...) 
} addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) { intrinsics.addForFamilies(pkg, fn, b, archFamilies...) } alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) { intrinsics.alias(pkg, fn, pkg2, fn2, archs...) } /******** runtime ********/ if !cfg.instrumenting { add("runtime", "slicebytetostringtmp", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // Compiler frontend optimizations emit OBYTES2STRTMP nodes // for the backend instead of slicebytetostringtmp calls // when not instrumenting. return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1]) }, all...) } addF("internal/runtime/math", "MulUintptr", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) } return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1]) }, sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64) add("runtime", "KeepAlive", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0]) s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem()) return nil }, all...) addF("runtime", "publicationBarrier", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem()) return nil }, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64) /******** internal/runtime/sys ********/ add("internal/runtime/sys", "GetCallerPC", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr) }, all...) add("internal/runtime/sys", "GetCallerSP", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem()) }, all...) add("internal/runtime/sys", "GetClosurePtr", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr) }, all...) brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X} if cfg.goppc64 >= 10 { // Use only on Power10 as the new byte reverse instructions that Power10 provide // make it worthwhile as an intrinsic brev_arch = append(brev_arch, sys.PPC64) } addF("internal/runtime/sys", "Bswap32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0]) }, brev_arch...) addF("internal/runtime/sys", "Bswap64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0]) }, brev_arch...) 
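	// Illustrative sketch (not a registered intrinsic): a builder receives
	// the call's already-evaluated arguments and returns the SSA value that
	// replaces the call. A hypothetical package "example" whose function is
	// lowered to a single op would be registered as:
	//
	//	addF("example", "Bswap", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	//		return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
	//	}, sys.AMD64)
	//
	// Builders with no result value (stores, barriers) instead update
	// s.vars[memVar] and return nil.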
	/******** Prefetch ********/
	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
			return nil
		}
	}

	// Make Prefetch intrinsics for supported platforms.
	// On unsupported platforms the stub function will be eliminated.
	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
		sys.AMD64, sys.ARM64, sys.PPC64)
	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
		sys.AMD64, sys.ARM64, sys.PPC64)

	/******** internal/runtime/atomic ********/
	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)

	addF("internal/runtime/atomic", "Load",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
		},
		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
	addF("internal/runtime/atomic", "Load8",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
		},
		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
	addF("internal/runtime/atomic", "Load64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
		},
		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
	addF("internal/runtime/atomic", "LoadAcq",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
		},
		sys.PPC64)
	addF("internal/runtime/atomic", "LoadAcq64",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
		},
		sys.PPC64)
	addF("internal/runtime/atomic", "Loadp",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
		},
		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
	addF("internal/runtime/atomic", "Store",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
			return nil
		},
		sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64,
sys.S390X) addF("internal/runtime/atomic", "Store8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Store64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "StorepNoWB", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "StoreRel", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.PPC64) addF("internal/runtime/atomic", "StoreRel64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.PPC64) makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // Target Atomic feature is identified by dynamic detection addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb) v := s.load(types.Types[types.TBOOL], addr) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // We have atomic instructions - use it directly. s.startBlock(bTrue) emit(s, n, args, op1, typ, false) s.endBlock().AddEdgeTo(bEnd) // Use original instruction sequence. s.startBlock(bFalse) emit(s, n, args, op0, typ, false) s.endBlock().AddEdgeTo(bEnd) // Merge results. 
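			// (No result value needs merging here: both branches only
			// update s.vars[memVar], and the SSA builder joins the two
			// memory states at bEnd automatically.)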
s.startBlock(bEnd) return nil } } atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) if needReturn { s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) } } addF("internal/runtime/atomic", "Store8", makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64), sys.Loong64) addF("internal/runtime/atomic", "Store", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.Loong64) addF("internal/runtime/atomic", "Store64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.Loong64) addF("internal/runtime/atomic", "Xchg8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v) }, sys.AMD64, sys.PPC64) addF("internal/runtime/atomic", "Xchg", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) }, sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Xchg64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) }, sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if cfg.goarm64.LSE { emit(s, n, args, op1, typ, needReturn) } else { // Target Atomic feature is identified by dynamic detection addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb) v := s.load(types.Types[types.TBOOL], addr) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // We have atomic instructions - use it directly. s.startBlock(bTrue) emit(s, n, args, op1, typ, needReturn) s.endBlock().AddEdgeTo(bEnd) // Use original instruction sequence. s.startBlock(bFalse) emit(s, n, args, op0, typ, needReturn) s.endBlock().AddEdgeTo(bEnd) // Merge results. 
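			// (Each branch wrote its result to s.vars[n]; when
			// needReturn is set, the s.variable call below reads n
			// back after the merge, which materializes a phi of the
			// two per-branch definitions whenever the dynamic check
			// was emitted.)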
s.startBlock(bEnd) } if needReturn { return s.variable(n, types.Types[typ]) } else { return nil } } } makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true) } makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder { return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false) } atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) if needReturn { s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) } } addF("internal/runtime/atomic", "Xchg8", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Xchg", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Xchg64", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64), sys.ARM64) makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb) v := s.load(types.Types[types.TBOOL], addr) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // most loong64 machines support the amswapdb.b // We have the intrinsic - use it directly. s.startBlock(bTrue) s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n]) s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n]) s.endBlock().AddEdgeTo(bEnd) // Call the pure Go version. s.startBlock(bFalse) s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8] s.endBlock().AddEdgeTo(bEnd) // Merge results. 
s.startBlock(bEnd) return s.variable(n, types.Types[types.TUINT8]) } } addF("internal/runtime/atomic", "Xchg8", makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant), sys.Loong64) addF("internal/runtime/atomic", "Xadd", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v) }, sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Xadd64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v) }, sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Xadd", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Xadd64", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Cas", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) }, sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Cas64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) }, sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "CasRel", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v) }, sys.PPC64) atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) if needReturn { s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) } } addF("internal/runtime/atomic", "Cas", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Cas64", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64), sys.ARM64) atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) { v := s.newValue4(op, 
types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem()) s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v) if needReturn { s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v) } } makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // Target Atomic feature is identified by dynamic detection addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb) v := s.load(types.Types[types.TBOOL], addr) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // We have atomic instructions - use it directly. s.startBlock(bTrue) emit(s, n, args, op1, types.TBOOL, true) s.endBlock().AddEdgeTo(bEnd) // Use original instruction sequence. s.startBlock(bFalse) emit(s, n, args, op0, types.TBOOL, true) s.endBlock().AddEdgeTo(bEnd) // Merge results. s.startBlock(bEnd) return s.variable(n, types.Types[types.TBOOL]) } } addF("internal/runtime/atomic", "Cas", makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64), sys.Loong64) addF("internal/runtime/atomic", "Cas64", makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64), sys.Loong64) // Old-style atomic logical operation API (all supported archs except arm64). addF("internal/runtime/atomic", "And8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "And", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Or8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) addF("internal/runtime/atomic", "Or", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem()) return nil }, sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X) // arm64 always uses the new-style atomic logical operations, for both the // old and new style API. 
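	// Roughly, in terms of the internal/runtime/atomic signatures (shown
	// here for orientation):
	//
	//	func And8(ptr *uint8, val uint8)           // old style: no result
	//	func And32(ptr *uint32, val uint32) uint32 // new style: returns the old value
	//
	// so the ...ARM64old builders below discard the loaded value
	// (needReturn=false), while the new-style ones keep it.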
addF("internal/runtime/atomic", "And8", makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Or8", makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "And64", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "And32", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "And", makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Or64", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Or32", makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), sys.ARM64) addF("internal/runtime/atomic", "Or", makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64), sys.ARM64) // New-style atomic logical operations, which return the old memory value. addF("internal/runtime/atomic", "And64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) p0, p1 := s.split(v) s.vars[memVar] = p1 return p0 }, sys.AMD64, sys.Loong64) addF("internal/runtime/atomic", "And32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) p0, p1 := s.split(v) s.vars[memVar] = p1 return p0 }, sys.AMD64, sys.Loong64) addF("internal/runtime/atomic", "Or64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem()) p0, p1 := s.split(v) s.vars[memVar] = p1 return p0 }, sys.AMD64, sys.Loong64) addF("internal/runtime/atomic", "Or32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem()) p0, p1 := s.split(v) s.vars[memVar] = p1 return p0 }, sys.AMD64, sys.Loong64) // Aliases for atomic load operations alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...) alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...) alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...) alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...) alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...) alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...) alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...) alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...) 
alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed // Aliases for atomic store operations alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...) alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...) alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...) alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...) alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...) alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...) alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed // Aliases for atomic swap operations alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...) alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...) alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...) alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...) // Aliases for atomic add operations alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...) alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...) alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...) alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...) // Aliases for atomic CAS operations alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...) alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...) alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...) alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...) alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...) alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...) alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...) 
// Aliases for atomic And/Or operations alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64) alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64) /******** math ********/ addF("math", "sqrt", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0]) }, sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) addF("math", "Trunc", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0]) }, sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) addF("math", "Ceil", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0]) }, sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) addF("math", "Floor", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0]) }, sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm) addF("math", "Round", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0]) }, sys.ARM64, sys.PPC64, sys.S390X) addF("math", "RoundToEven", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0]) }, sys.ARM64, sys.S390X, sys.Wasm) addF("math", "Abs", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0]) }, sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64) addF("math", "Copysign", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1]) }, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm) addF("math", "FMA", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) }, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X) addF("math", "FMA", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if !s.config.UseFMA { s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] return s.variable(n, types.Types[types.TFLOAT64]) } if cfg.goamd64 >= 3 { return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) } v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // >= haswell cpus are common // We have the intrinsic - use it directly. s.startBlock(bTrue) s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) s.endBlock().AddEdgeTo(bEnd) // Call the pure Go version. s.startBlock(bFalse) s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] s.endBlock().AddEdgeTo(bEnd) // Merge results. 
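			// (The emitted shape is roughly:
			//
			//	if hasFMA { r = FMA(x, y, z) } else { r = math.FMA(x, y, z) }
			//
			// where the two definitions of r are merged below via
			// s.variable, i.e. a phi.)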
s.startBlock(bEnd) return s.variable(n, types.Types[types.TFLOAT64]) }, sys.AMD64) addF("math", "FMA", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if !s.config.UseFMA { s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] return s.variable(n, types.Types[types.TFLOAT64]) } addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb) v := s.load(types.Types[types.TBOOL], addr) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // We have the intrinsic - use it directly. s.startBlock(bTrue) s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) s.endBlock().AddEdgeTo(bEnd) // Call the pure Go version. s.startBlock(bFalse) s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] s.endBlock().AddEdgeTo(bEnd) // Merge results. s.startBlock(bEnd) return s.variable(n, types.Types[types.TFLOAT64]) }, sys.ARM) makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if cfg.goamd64 >= 2 { return s.newValue1(op, types.Types[types.TFLOAT64], args[0]) } v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays // We have the intrinsic - use it directly. s.startBlock(bTrue) s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0]) s.endBlock().AddEdgeTo(bEnd) // Call the pure Go version. s.startBlock(bFalse) s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64] s.endBlock().AddEdgeTo(bEnd) // Merge results. 
s.startBlock(bEnd) return s.variable(n, types.Types[types.TFLOAT64]) } } addF("math", "RoundToEven", makeRoundAMD64(ssa.OpRoundToEven), sys.AMD64) addF("math", "Floor", makeRoundAMD64(ssa.OpFloor), sys.AMD64) addF("math", "Ceil", makeRoundAMD64(ssa.OpCeil), sys.AMD64) addF("math", "Trunc", makeRoundAMD64(ssa.OpTrunc), sys.AMD64) /******** math/bits ********/ addF("math/bits", "TrailingZeros64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) addF("math/bits", "TrailingZeros64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0]) hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0]) return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi) }, sys.I386) addF("math/bits", "TrailingZeros32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) addF("math/bits", "TrailingZeros16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) c := s.constInt32(types.Types[types.TUINT32], 1<<16) y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) }, sys.MIPS) addF("math/bits", "TrailingZeros16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) addF("math/bits", "TrailingZeros16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) c := s.constInt64(types.Types[types.TUINT64], 1<<16) y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) }, sys.Loong64, sys.S390X, sys.PPC64) addF("math/bits", "TrailingZeros8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) c := s.constInt32(types.Types[types.TUINT32], 1<<8) y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c) return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y) }, sys.MIPS) addF("math/bits", "TrailingZeros8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm) addF("math/bits", "TrailingZeros8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) c := s.constInt64(types.Types[types.TUINT64], 1<<8) y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c) return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y) }, sys.Loong64, sys.S390X) alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...) alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...) addF("math/bits", "ReverseBytes16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0]) }, sys.Loong64) // ReverseBytes inlines correctly, no need to intrinsify it. 
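	// (math/bits.ReverseBytes itself just calls ReverseBytes32 or
	// ReverseBytes64 depending on the word size, so the aliases above cover
	// it once it is inlined.)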
// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate // On Power10, 16-bit rotate is not available so use BRH instruction if cfg.goppc64 >= 10 { addF("math/bits", "ReverseBytes16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0]) }, sys.PPC64) } addF("math/bits", "Len64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) addF("math/bits", "Len32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64) addF("math/bits", "Len32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) } x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0]) return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) }, sys.ARM, sys.S390X, sys.MIPS, sys.Wasm) addF("math/bits", "Len16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0]) return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) } x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0]) return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) }, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) addF("math/bits", "Len16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0]) }, sys.AMD64) addF("math/bits", "Len8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0]) return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x) } x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0]) return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x) }, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) addF("math/bits", "Len8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0]) }, sys.AMD64) addF("math/bits", "Len", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if s.config.PtrSize == 4 { return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0]) } return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0]) }, sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm) // LeadingZeros is handled because it trivially calls Len. 
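	// For reference, math/bits defines the LeadingZeros functions in terms
	// of Len, e.g.:
	//
	//	func LeadingZeros64(x uint64) int { return 64 - Len64(x) }
	//
	// so after inlining they reduce to the Len intrinsics above.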
addF("math/bits", "Reverse64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) }, sys.ARM64, sys.Loong64) addF("math/bits", "Reverse32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0]) }, sys.ARM64, sys.Loong64) addF("math/bits", "Reverse16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0]) }, sys.ARM64, sys.Loong64) addF("math/bits", "Reverse8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0]) }, sys.ARM64, sys.Loong64) addF("math/bits", "Reverse", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0]) }, sys.ARM64, sys.Loong64) addF("math/bits", "RotateLeft8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1]) }, sys.AMD64, sys.RISCV64) addF("math/bits", "RotateLeft16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1]) }, sys.AMD64, sys.RISCV64) addF("math/bits", "RotateLeft32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1]) }, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) addF("math/bits", "RotateLeft64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1]) }, sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm) alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...) makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if cfg.goamd64 >= 2 { return s.newValue1(op, types.Types[types.TINT], args[0]) } v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // most machines have popcnt nowadays // We have the intrinsic - use it directly. s.startBlock(bTrue) s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0]) s.endBlock().AddEdgeTo(bEnd) // Call the pure Go version. s.startBlock(bFalse) s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT] s.endBlock().AddEdgeTo(bEnd) // Merge results. 
s.startBlock(bEnd) return s.variable(n, types.Types[types.TINT]) } } makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb) v := s.load(types.Types[types.TBOOL], addr) b := s.endBlock() b.Kind = ssa.BlockIf b.SetControl(v) bTrue := s.f.NewBlock(ssa.BlockPlain) bFalse := s.f.NewBlock(ssa.BlockPlain) bEnd := s.f.NewBlock(ssa.BlockPlain) b.AddEdgeTo(bTrue) b.AddEdgeTo(bFalse) b.Likely = ssa.BranchLikely // most loong64 machines support the LSX // We have the intrinsic - use it directly. s.startBlock(bTrue) s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0]) s.endBlock().AddEdgeTo(bEnd) // Call the pure Go version. s.startBlock(bFalse) s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT] s.endBlock().AddEdgeTo(bEnd) // Merge results. s.startBlock(bEnd) return s.variable(n, types.Types[types.TINT]) } } addF("math/bits", "OnesCount64", makeOnesCountAMD64(ssa.OpPopCount64), sys.AMD64) addF("math/bits", "OnesCount64", makeOnesCountLoong64(ssa.OpPopCount64), sys.Loong64) addF("math/bits", "OnesCount64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0]) }, sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) addF("math/bits", "OnesCount32", makeOnesCountAMD64(ssa.OpPopCount32), sys.AMD64) addF("math/bits", "OnesCount32", makeOnesCountLoong64(ssa.OpPopCount32), sys.Loong64) addF("math/bits", "OnesCount32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0]) }, sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm) addF("math/bits", "OnesCount16", makeOnesCountAMD64(ssa.OpPopCount16), sys.AMD64) addF("math/bits", "OnesCount16", makeOnesCountLoong64(ssa.OpPopCount16), sys.Loong64) addF("math/bits", "OnesCount16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0]) }, sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm) addF("math/bits", "OnesCount8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0]) }, sys.S390X, sys.PPC64, sys.Wasm) addF("math/bits", "OnesCount", makeOnesCountAMD64(ssa.OpPopCount64), sys.AMD64) addF("math/bits", "Mul64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1]) }, sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64) alias("math/bits", "Mul", "math/bits", "Mul64", p8...) alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...) addF("math/bits", "Add64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) }, sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64) alias("math/bits", "Add", "math/bits", "Add64", p8...) alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...) 
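	// For illustration: after this registration, a call like
	//
	//	sum, carry := bits.Add64(x, y, ci)
	//
	// compiles to a single OpAdd64carry whose two tuple fields are selected
	// as the results.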
addF("math/bits", "Sub64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) }, sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64) alias("math/bits", "Sub", "math/bits", "Sub64", p8...) addF("math/bits", "Div64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // check for divide-by-zero/overflow and panic with appropriate message cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64])) s.check(cmpZero, ir.Syms.Panicdivide) cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2]) s.check(cmpOverflow, ir.Syms.Panicoverflow) return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) }, sys.AMD64) alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64) alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...) alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...) alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...) alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...) alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...) alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...) /******** sync/atomic ********/ // Note: these are disabled by flag_race in findIntrinsic below. alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...) alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...) alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...) alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...) alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...) alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...) alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...) alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...) alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...) // Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap. alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...) alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...) alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...) alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...) alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...) alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...) alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...) alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...) alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...) alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...) alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...) alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...) alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...) 
alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...) alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...) alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...) alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...) alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...) alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...) alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...) alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...) alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...) alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64) /******** math/big ********/ alias("math/big", "mulWW", "math/bits", "Mul64", p8...) /******** internal/runtime/maps ********/ // Important: The intrinsic implementations below return a packed // bitset, while the portable Go implementation uses an unpacked // representation (one bit set in each byte). // // Thus we must replace most bitset methods with implementations that // work with the packed representation. // // TODO(prattmic): The bitset implementations don't use SIMD, so they // could be handled with build tags (though that would break // -d=ssa/intrinsics/off=1). // With a packed representation we no longer need to shift the result // of TrailingZeros64. alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64) addF("internal/runtime/maps", "bitsetRemoveBelow", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] i := args[1] // Clear the lower i bits in b. // // out = b &^ ((1 << i) - 1) one := s.constInt64(types.Types[types.TUINT64], 1) mask := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT64], one, i) mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one) mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask) return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask) }, sys.AMD64) addF("internal/runtime/maps", "bitsetLowestSet", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] // Test the lowest bit in b. 
// // out = (b & 1) == 1 one := s.constInt64(types.Types[types.TUINT64], 1) and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one) return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one) }, sys.AMD64) addF("internal/runtime/maps", "bitsetShiftOutLowest", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] // Right shift out the lowest bit in b. // // out = b >> 1 one := s.constInt64(types.Types[types.TUINT64], 1) return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one) }, sys.AMD64) addF("internal/runtime/maps", "ctrlGroupMatchH2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { g := args[0] h := args[1] // Explicit copies to fp registers. See // https://go.dev/issue/70451. gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g) hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h) // Broadcast h2 into each byte of a word. var broadcast *ssa.Value if buildcfg.GOAMD64 >= 4 { // VPBROADCASTB saves 1 instruction vs PSHUFB // because the input can come from a GP // register, while PSHUFB requires moving into // an FP register first. // // Nominally PSHUFB would require a second // additional instruction to load the control // mask into a FP register. But broadcast uses // a control mask of 0, and the register ABI // already defines X15 as a zero register. broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h } else if buildcfg.GOAMD64 >= 2 { // PSHUFB performs a byte broadcast when given // a control input of 0. broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp) } else { // No direct byte broadcast. First we must // duplicate the lower byte and then do a // 16-bit broadcast. // "Unpack" h2 with itself. This duplicates the // input, resulting in h2 in the lower two // bytes. unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp) // Copy the lower 16-bits of unpack into every // 16-bit slot in the lower 64-bits of the // output register. Note that immediate 0 // selects the low word as the source for every // destination slot. broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack) // No need to broadcast into the upper 64-bits, // as we don't use those. } // Compare each byte of the control word with h2. Each // matching byte has every bit set. eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp) // Construct a "byte mask": each output bit is equal to // the sign bit each input byte. // // This results in a packed output (bit N set means // byte N matched). // // NOTE: See comment above on bitsetFirst. out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq) // g is only 64-bits so the upper 64-bits of the // 128-bit register will be zero. If h2 is also zero, // then we'll get matches on those bytes. Truncate the // upper bits to ignore such matches. ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out) return ret }, sys.AMD64) addF("internal/runtime/maps", "ctrlGroupMatchEmpty", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // An empty slot is 1000 0000 // A deleted slot is 1111 1110 // A full slot is 0??? ???? g := args[0] // Explicit copy to fp register. See // https://go.dev/issue/70451. 

	addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			// An empty slot is   1000 0000
			// A deleted slot is  1111 1110
			// A full slot is     0??? ????
			g := args[0]

			// Explicit copy to fp register. See
			// https://go.dev/issue/70451.
			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)

			if buildcfg.GOAMD64 >= 2 {
				// "PSIGNB negates each data element of the
				// destination operand (the first operand) if
				// the signed integer value of the
				// corresponding data element in the source
				// operand (the second operand) is less than
				// zero. If the signed integer value of a data
				// element in the source operand is positive,
				// the corresponding data element in the
				// destination operand is unchanged. If a data
				// element in the source operand is zero, the
				// corresponding data element in the
				// destination operand is set to zero" - Intel SDM
				//
				// If we pass the group control word as both
				// arguments:
				//
				// - Full slots are unchanged.
				// - Deleted slots are negated, becoming
				//   0000 0010.
				// - Empty slots are negated, becoming
				//   1000 0000 (unchanged!).
				//
				// The result is that only empty slots have the
				// sign bit set. We then use PMOVMSKB to
				// extract the sign bits.
				sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)

				// Construct a "byte mask": each output bit is
				// equal to the sign bit of each input byte.
				// The sign bit is only set for empty slots.
				//
				// This results in a packed output (bit N set
				// means byte N matched).
				//
				// NOTE: See comment above on bitsetFirst.
				ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)

				// g is only 64 bits, so the upper 64 bits of
				// the 128-bit register will be zero. PSIGNB
				// will keep all of these bytes zero, so no
				// need to truncate.
				return ret
			}

			// No PSIGNB, simply do byte equality with ctrlEmpty.

			// Load ctrlEmpty into each byte of a control word.
			var ctrlsEmpty uint64 = abi.SwissMapCtrlEmpty
			e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))

			// Explicit copy to fp register. See
			// https://go.dev/issue/70451.
			efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)

			// Compare each byte of the control word with ctrlEmpty.
			// Each matching byte has every bit set.
			eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)

			// Construct a "byte mask": each output bit is equal to
			// the sign bit of each input byte.
			//
			// This results in a packed output (bit N set means
			// byte N matched).
			//
			// NOTE: See comment above on bitsetFirst.
			out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)

			// g is only 64 bits, so the upper 64 bits of the
			// 128-bit register will be zero. The upper 64 bits of
			// efp are also zero, so we'll get matches on those
			// bytes. Truncate the upper bits to ignore such
			// matches.
			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
		},
		sys.AMD64)
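
	// Worked example for the PSIGNB path above, with an illustrative
	// group word (not taken from this code): if the low control bytes
	// of g are [0x80, 0xFE, 0x23], then PSIGNB(g, g) produces
	// [0x80, 0x02, 0x23], and PMOVMSKB packs the sign bits into
	// 0b...001: only the empty slot (0x80) sets its bit in the output.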

	addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			// An empty slot is   1000 0000
			// A deleted slot is  1111 1110
			// A full slot is     0??? ????
			//
			// A slot is empty or deleted iff bit 7 (the sign bit)
			// is set.
			g := args[0]

			// Explicit copy to fp register. See
			// https://go.dev/issue/70451.
			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)

			// Construct a "byte mask": each output bit is equal to
			// the sign bit of each input byte. The sign bit is
			// only set for empty or deleted slots.
			//
			// This results in a packed output (bit N set means
			// byte N matched).
			//
			// NOTE: See comment above on bitsetFirst.
			ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)

			// g is only 64 bits, so the upper 64 bits of the
			// 128-bit register will be zero. Zero will never match
			// ctrlEmpty or ctrlDeleted, so no need to truncate.
			return ret
		},
		sys.AMD64)

	addF("internal/runtime/maps", "ctrlGroupMatchFull",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			// An empty slot is   1000 0000
			// A deleted slot is  1111 1110
			// A full slot is     0??? ????
			//
			// A slot is full iff bit 7 (the sign bit) is unset.
			g := args[0]

			// Explicit copy to fp register. See
			// https://go.dev/issue/70451.
			gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)

			// Construct a "byte mask": each output bit is equal to
			// the sign bit of each input byte. The sign bit is
			// only set for empty or deleted slots.
			//
			// This results in a packed output (bit N set means
			// byte N matched).
			//
			// NOTE: See comment above on bitsetFirst.
			mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)

			// Invert the mask to set the bits for the full slots.
			out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)

			// g is only 64 bits, so the upper 64 bits of the
			// 128-bit register will be zero, with bit 7 unset.
			// Truncate the upper bits to ignore these.
			return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
		},
		sys.AMD64)
}

// findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, it
// returns nil.
func findIntrinsic(sym *types.Sym) intrinsicBuilder {
	if sym == nil || sym.Pkg == nil {
		return nil
	}
	pkg := sym.Pkg.Path
	if sym.Pkg == ir.Pkgs.Runtime {
		pkg = "runtime"
	}
	if base.Flag.Race && pkg == "sync/atomic" {
		// The race detector needs to be able to intercept these calls.
		// We can't intrinsify them.
		return nil
	}
	// Skip intrinsifying math functions (which may contain hard-float
	// instructions) when soft-float is in use.
	if Arch.SoftFloat && pkg == "math" {
		return nil
	}

	fn := sym.Name
	if ssa.IntrinsicsDisable {
		if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr") {
			// These runtime functions don't have definitions, so
			// they must be intrinsics.
		} else {
			return nil
		}
	}
	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
}

func IsIntrinsicCall(n *ir.CallExpr) bool {
	if n == nil {
		return false
	}
	name, ok := n.Fun.(*ir.Name)
	if !ok {
		return false
	}
	return findIntrinsic(name.Sym()) != nil
}
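
// Usage sketch (illustrative; the real call sites live elsewhere in this
// package): the SSA builder consults IsIntrinsicCall when lowering a call
// expression, roughly:
//
//	if IsIntrinsicCall(n) {
//		// Build the result directly via the registered
//		// intrinsicBuilder instead of emitting a call.
//	}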