// Copyright 2026 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package maphash_test // This file demonstrates a Bloom filter. import ( "fmt" "hash/maphash" "math" "math/bits" ) // BloomFilter is a Bloom filter, a probabilistic space-efficient // representation of a set of values of type V based on hashing. // // It provides two operations: Insert inserts an element and Contains // queries whether a value is a member of the set. // // However, unlike a typical set, the size is independent of the // number of elements. The catch: Contains is permitted to report // true even for some elements that are not present. The trade-off // between space and accuracy is determined by parameters at // construction. type BloomFilter[V any] struct { hasher maphash.Hasher[V] seeds []maphash.Seed // each seed determines a hash function bytes []byte // bit vector } // NewBloomFilterComparable returns a new BloomFilter for the // specified elements using their natural comparison. func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] { return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb) } // NewBloomFilter constructs a new BloomFilter capable of holding n // elements with the specified probability of false positive results, // assuming a well dispersed hash function. func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] { nbytes, nseeds := calibrate(n, fpProb) seeds := make([]maphash.Seed, nseeds) for i := range nseeds { seeds[i] = maphash.MakeSeed() } return &BloomFilter[V]{ hasher: hasher, bytes: make([]byte, nbytes), seeds: seeds, } } func (f *BloomFilter[V]) Contains(v V) bool { for _, seed := range f.seeds { index, bit := f.locate(seed, v) if f.bytes[index]&bit == 0 { return false } } return true } func (f *BloomFilter[V]) Insert(v V) { for _, seed := range f.seeds { index, bit := f.locate(seed, v) f.bytes[index] |= bit } } func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) { // Optimization note: the dynamic call to hasher.Hash causes h // to escape. You can use a sync.Pool can help mitigate the // allocation cost. // // Alternatively, you could copy and specialize the filter logic // for a specific implementation of maphash.Hasher, allowing // the compiler's escape analysis to determine that the // hasher.Hash call does not in fact cause h to escape. var h maphash.Hash h.SetSeed(seed) f.hasher.Hash(&h, v) hash := h.Sum64() index := reduce(hash, uint64(len(f.bytes))) mask := byte(1 << (hash % 8)) return index, mask } // reduce maps hash to a value in the interval [0, n). // See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction func reduce(hash, n uint64) uint64 { if n > 1<<32-1 { hi, _ := bits.Mul64(hash, n) return hi } return hash >> 32 * n >> 32 } func calibrate(n int, fpProb float64) (bytes, seeds int) { // Following https://en.wikipedia.org/wiki/Bloom_filter: // - k is the number of hash functions, // - m is the size of the bit field; // - n is the number of set bits. if n == 0 { return 1, 1 } logFpProb := math.Log(fpProb) m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2) // Round up to a byte. // TODO(adonovan): opt: round up to the next allocation size // class (see bytes.growSlice) and then compute the possibly // smaller number of needed seeds based on this higher number. bytes = int(m) / 8 if float64(bytes*8) < m { bytes++ } k := -logFpProb / math.Ln2 seeds = max(int(math.Round(k)), 1) return bytes, seeds } func Example_bloomFilter() { // Create a Bloom filter optimized for 2 elements with // a one-in-a-billion false-positive rate. // // (This low rate demands a lot of space: 88 bits and // 30 hash functions. More typical rates are 1-5%; // at 5%, we need only 16 bits and 4 hash functions.) f := NewComparableBloomFilter[string](2, 1e-9) // Insert two elements. f.Insert("apple") f.Insert("banana") // Check whether elements are present. // // "cherry" was not inserted, but Contains is probabilistic, so // this test will spuriously report Contains("cherry") = true // about once every billion runs. for _, fruit := range []string{"apple", "banana", "cherry"} { fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit)) } // Output: // // Contains("apple") = true // Contains("banana") = true // Contains("cherry") = false }