// Copyright 2025 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package unify import ( "fmt" "iter" "reflect" "strings" ) // An envSet is an immutable set of environments, where each environment is a // mapping from [ident]s to [Value]s. // // To keep this compact, we use an algebraic representation similar to // relational algebra. The atoms are zero, unit, or a singular binding: // // - A singular binding {x: v} is an environment set consisting of a single // environment that binds a single ident x to a single value v. // // - Zero (0) is the empty set. // // - Unit (1) is an environment set consisting of a single, empty environment // (no bindings). // // From these, we build up more complex sets of environments using sums and // cross products: // // - A sum, E + F, is simply the union of the two environment sets: E ∪ F // // - A cross product, E ⨯ F, is the Cartesian product of the two environment // sets, followed by joining each pair of environments: {e ⊕ f | (e, f) ∊ E ⨯ F} // // The join of two environments, e ⊕ f, is an environment that contains all of // the bindings in either e or f. To detect bugs, it is an error if an // identifier is bound in both e and f (however, see below for what we could do // differently). // // Environment sets form a commutative semiring and thus obey the usual // commutative semiring rules: // // e + 0 = e // e ⨯ 0 = 0 // e ⨯ 1 = e // e + f = f + e // e ⨯ f = f ⨯ e // // Furthermore, environments sets are additively and multiplicatively idempotent // because + and ⨯ are themselves defined in terms of sets: // // e + e = e // e ⨯ e = e // // # Examples // // To represent {{x: 1, y: 1}, {x: 2, y: 2}}, we build the two environments and // sum them: // // ({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2}) // // If we add a third variable z that can be 1 or 2, independent of x and y, we // get four logical environments: // // {x: 1, y: 1, z: 1} // {x: 2, y: 2, z: 1} // {x: 1, y: 1, z: 2} // {x: 2, y: 2, z: 2} // // This could be represented as a sum of all four environments, but because z is // independent, we can use a more compact representation: // // (({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})) ⨯ ({z: 1} + {z: 2}) // // # Generalized cross product // // While cross-product is currently restricted to disjoint environments, we // could generalize the definition of joining two environments to: // // {xₖ: vₖ} ⊕ {xₖ: wₖ} = {xₖ: vₖ ∩ wₖ} (where unbound idents are bound to the [Top] value, ⟙) // // where v ∩ w is the unification of v and w. This itself could be coarsened to // // v ∩ w = v if w = ⟙ // = w if v = ⟙ // = v if v = w // = 0 otherwise // // We could use this rule to implement substitution. For example, E ⨯ {x: 1} // narrows environment set E to only environments in which x is bound to 1. But // we currently don't do this. type envSet struct { root *envExpr } type envExpr struct { // TODO: A tree-based data structure for this may not be ideal, since it // involves a lot of walking to find things and we often have to do deep // rewrites anyway for partitioning. Would some flattened array-style // representation be better, possibly combined with an index of ident uses? // We could even combine that with an immutable array abstraction (ala // Clojure) that could enable more efficient construction operations. kind envExprKind // For envBinding id *ident val *Value // For sum or product. Len must be >= 2 and none of the elements can have // the same kind as this node. operands []*envExpr } type envExprKind byte const ( envZero envExprKind = iota envUnit envProduct envSum envBinding ) var ( // topEnv is the unit value (multiplicative identity) of a [envSet]. topEnv = envSet{envExprUnit} // bottomEnv is the zero value (additive identity) of a [envSet]. bottomEnv = envSet{envExprZero} envExprZero = &envExpr{kind: envZero} envExprUnit = &envExpr{kind: envUnit} ) // bind binds id to each of vals in e. // // Its panics if id is already bound in e. // // Environments are typically initially constructed by starting with [topEnv] // and calling bind one or more times. func (e envSet) bind(id *ident, vals ...*Value) envSet { if e.isEmpty() { return bottomEnv } // TODO: If any of vals are _, should we just drop that val? We're kind of // inconsistent about whether an id missing from e means id is invalid or // means id is _. // Check that id isn't present in e. for range e.root.bindings(id) { panic("id " + id.name + " already present in environment") } // Create a sum of all the values. bindings := make([]*envExpr, 0, 1) for _, val := range vals { bindings = append(bindings, &envExpr{kind: envBinding, id: id, val: val}) } // Multiply it in. return envSet{newEnvExprProduct(e.root, newEnvExprSum(bindings...))} } func (e envSet) isEmpty() bool { return e.root.kind == envZero } // bindings yields all [envBinding] nodes in e with the given id. If id is nil, // it yields all binding nodes. func (e *envExpr) bindings(id *ident) iter.Seq[*envExpr] { // This is just a pre-order walk and it happens this is the only thing we // need a pre-order walk for. return func(yield func(*envExpr) bool) { var rec func(e *envExpr) bool rec = func(e *envExpr) bool { if e.kind == envBinding && (id == nil || e.id == id) { if !yield(e) { return false } } for _, o := range e.operands { if !rec(o) { return false } } return true } rec(e) } } // newEnvExprProduct constructs a product node from exprs, performing // simplifications. It does NOT check that bindings are disjoint. func newEnvExprProduct(exprs ...*envExpr) *envExpr { factors := make([]*envExpr, 0, 2) for _, expr := range exprs { switch expr.kind { case envZero: return envExprZero case envUnit: // No effect on product case envProduct: factors = append(factors, expr.operands...) default: factors = append(factors, expr) } } if len(factors) == 0 { return envExprUnit } else if len(factors) == 1 { return factors[0] } return &envExpr{kind: envProduct, operands: factors} } // newEnvExprSum constructs a sum node from exprs, performing simplifications. func newEnvExprSum(exprs ...*envExpr) *envExpr { // TODO: If all of envs are products (or bindings), factor any common terms. // E.g., x * y + x * z ==> x * (y + z). This is easy to do for binding // terms, but harder to do for more general terms. var have smallSet[*envExpr] terms := make([]*envExpr, 0, 2) for _, expr := range exprs { switch expr.kind { case envZero: // No effect on sum case envSum: for _, expr1 := range expr.operands { if have.Add(expr1) { terms = append(terms, expr1) } } default: if have.Add(expr) { terms = append(terms, expr) } } } if len(terms) == 0 { return envExprZero } else if len(terms) == 1 { return terms[0] } return &envExpr{kind: envSum, operands: terms} } func crossEnvs(env1, env2 envSet) envSet { // Confirm that envs have disjoint idents. var ids1 smallSet[*ident] for e := range env1.root.bindings(nil) { ids1.Add(e.id) } for e := range env2.root.bindings(nil) { if ids1.Has(e.id) { panic(fmt.Sprintf("%s bound on both sides of cross-product", e.id.name)) } } return envSet{newEnvExprProduct(env1.root, env2.root)} } func unionEnvs(envs ...envSet) envSet { exprs := make([]*envExpr, len(envs)) for i := range envs { exprs[i] = envs[i].root } return envSet{newEnvExprSum(exprs...)} } // envPartition is a subset of an env where id is bound to value in all // deterministic environments. type envPartition struct { id *ident value *Value env envSet } // partitionBy splits e by distinct bindings of id and removes id from each // partition. // // If there are environments in e where id is not bound, they will not be // reflected in any partition. // // It panics if e is bottom, since attempting to partition an empty environment // set almost certainly indicates a bug. func (e envSet) partitionBy(id *ident) []envPartition { if e.isEmpty() { // We could return zero partitions, but getting here at all almost // certainly indicates a bug. panic("cannot partition empty environment set") } // Emit a partition for each value of id. var seen smallSet[*Value] var parts []envPartition for n := range e.root.bindings(id) { if !seen.Add(n.val) { // Already emitted a partition for this value. continue } parts = append(parts, envPartition{ id: id, value: n.val, env: envSet{e.root.substitute(id, n.val)}, }) } return parts } // substitute replaces bindings of id to val with 1 and bindings of id to any // other value with 0 and simplifies the result. func (e *envExpr) substitute(id *ident, val *Value) *envExpr { switch e.kind { default: panic("bad kind") case envZero, envUnit: return e case envBinding: if e.id != id { return e } else if e.val != val { return envExprZero } else { return envExprUnit } case envProduct, envSum: // Substitute each operand. Sometimes, this won't change anything, so we // build the new operands list lazily. var nOperands []*envExpr for i, op := range e.operands { nOp := op.substitute(id, val) if nOperands == nil && op != nOp { // Operand diverged; initialize nOperands. nOperands = make([]*envExpr, 0, len(e.operands)) nOperands = append(nOperands, e.operands[:i]...) } if nOperands != nil { nOperands = append(nOperands, nOp) } } if nOperands == nil { // Nothing changed. return e } if e.kind == envProduct { return newEnvExprProduct(nOperands...) } else { return newEnvExprSum(nOperands...) } } } // A smallSet is a set optimized for stack allocation when small. type smallSet[T comparable] struct { array [32]T n int m map[T]struct{} } // Has returns whether val is in set. func (s *smallSet[T]) Has(val T) bool { arr := s.array[:s.n] for i := range arr { if arr[i] == val { return true } } _, ok := s.m[val] return ok } // Add adds val to the set and returns true if it was added (not already // present). func (s *smallSet[T]) Add(val T) bool { // Test for presence. if s.Has(val) { return false } // Add it if s.n < len(s.array) { s.array[s.n] = val s.n++ } else { if s.m == nil { s.m = make(map[T]struct{}) } s.m[val] = struct{}{} } return true } type ident struct { _ [0]func() // Not comparable (only compare *ident) name string } type Var struct { id *ident } func (d Var) Exact() bool { // These can't appear in concrete Values. panic("Exact called on non-concrete Value") } func (d Var) WhyNotExact() string { // These can't appear in concrete Values. return "WhyNotExact called on non-concrete Value" } func (d Var) decode(rv reflect.Value) error { return &inexactError{"var", rv.Type().String()} } func (d Var) unify(w *Value, e envSet, swap bool, uf *unifier) (Domain, envSet, error) { // TODO: Vars from !sums in the input can have a huge number of values. // Unifying these could be way more efficient with some indexes over any // exact values we can pull out, like Def fields that are exact Strings. // Maybe we try to produce an array of yes/no/maybe matches and then we only // have to do deeper evaluation of the maybes. We could probably cache this // on an envTerm. It may also help to special-case Var/Var unification to // pick which one to index versus enumerate. if vd, ok := w.Domain.(Var); ok && d.id == vd.id { // Unifying $x with $x results in $x. If we descend into this we'll have // problems because we strip $x out of the environment to keep ourselves // honest and then can't find it on the other side. // // TODO: I'm not positive this is the right fix. return vd, e, nil } // We need to unify w with the value of d in each possible environment. We // can save some work by grouping environments by the value of d, since // there will be a lot of redundancy here. var nEnvs []envSet envParts := e.partitionBy(d.id) for i, envPart := range envParts { exit := uf.enterVar(d.id, i) // Each branch logically gets its own copy of the initial environment // (narrowed down to just this binding of the variable), and each branch // may result in different changes to that starting environment. res, e2, err := w.unify(envPart.value, envPart.env, swap, uf) exit.exit() if err != nil { return nil, envSet{}, err } if res.Domain == nil { // This branch entirely failed to unify, so it's gone. continue } nEnv := e2.bind(d.id, res) nEnvs = append(nEnvs, nEnv) } if len(nEnvs) == 0 { // All branches failed return nil, bottomEnv, nil } // The effect of this is entirely captured in the environment. We can return // back the same Bind node. return d, unionEnvs(nEnvs...), nil } // An identPrinter maps [ident]s to unique string names. type identPrinter struct { ids map[*ident]string idGen map[string]int } func (p *identPrinter) unique(id *ident) string { if p.ids == nil { p.ids = make(map[*ident]string) p.idGen = make(map[string]int) } name, ok := p.ids[id] if !ok { gen := p.idGen[id.name] p.idGen[id.name]++ if gen == 0 { name = id.name } else { name = fmt.Sprintf("%s#%d", id.name, gen) } p.ids[id] = name } return name } func (p *identPrinter) slice(ids []*ident) string { var strs []string for _, id := range ids { strs = append(strs, p.unique(id)) } return fmt.Sprintf("[%s]", strings.Join(strs, ", ")) }