Source file src/hash/maphash/example_bloom_test.go
1 // Copyright 2026 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package maphash_test 6 7 // This file demonstrates a Bloom filter. 8 9 import ( 10 "fmt" 11 "hash/maphash" 12 "math" 13 "math/bits" 14 ) 15 16 // BloomFilter is a Bloom filter, a probabilistic space-efficient 17 // representation of a set of values of type V based on hashing. 18 // 19 // It provides two operations: Insert inserts an element and Contains 20 // queries whether a value is a member of the set. 21 // 22 // However, unlike a typical set, the size is independent of the 23 // number of elements. The catch: Contains is permitted to report 24 // true even for some elements that are not present. The trade-off 25 // between space and accuracy is determined by parameters at 26 // construction. 27 type BloomFilter[V any] struct { 28 hasher maphash.Hasher[V] 29 seeds []maphash.Seed // each seed determines a hash function 30 bytes []byte // bit vector 31 } 32 33 // NewBloomFilterComparable returns a new BloomFilter for the 34 // specified elements using their natural comparison. 35 func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] { 36 return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb) 37 } 38 39 // NewBloomFilter constructs a new BloomFilter capable of holding n 40 // elements with the specified probability of false positive results, 41 // assuming a well dispersed hash function. 42 func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] { 43 nbytes, nseeds := calibrate(n, fpProb) 44 45 seeds := make([]maphash.Seed, nseeds) 46 for i := range nseeds { 47 seeds[i] = maphash.MakeSeed() 48 } 49 50 return &BloomFilter[V]{ 51 hasher: hasher, 52 bytes: make([]byte, nbytes), 53 seeds: seeds, 54 } 55 } 56 57 func (f *BloomFilter[V]) Contains(v V) bool { 58 for _, seed := range f.seeds { 59 index, bit := f.locate(seed, v) 60 if f.bytes[index]&bit == 0 { 61 return false 62 } 63 } 64 return true 65 } 66 67 func (f *BloomFilter[V]) Insert(v V) { 68 for _, seed := range f.seeds { 69 index, bit := f.locate(seed, v) 70 f.bytes[index] |= bit 71 } 72 } 73 74 func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) { 75 // Optimization note: the dynamic call to hasher.Hash causes h 76 // to escape. You can use a sync.Pool can help mitigate the 77 // allocation cost. 78 // 79 // Alternatively, you could copy and specialize the filter logic 80 // for a specific implementation of maphash.Hasher, allowing 81 // the compiler's escape analysis to determine that the 82 // hasher.Hash call does not in fact cause h to escape. 83 var h maphash.Hash 84 h.SetSeed(seed) 85 f.hasher.Hash(&h, v) 86 hash := h.Sum64() 87 88 index := reduce(hash, uint64(len(f.bytes))) 89 mask := byte(1 << (hash % 8)) 90 return index, mask 91 } 92 93 // reduce maps hash to a value in the interval [0, n). 94 // See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction 95 func reduce(hash, n uint64) uint64 { 96 if n > 1<<32-1 { 97 hi, _ := bits.Mul64(hash, n) 98 return hi 99 } 100 return hash >> 32 * n >> 32 101 } 102 103 func calibrate(n int, fpProb float64) (bytes, seeds int) { 104 // Following https://en.wikipedia.org/wiki/Bloom_filter: 105 // - k is the number of hash functions, 106 // - m is the size of the bit field; 107 // - n is the number of set bits. 108 109 if n == 0 { 110 return 1, 1 111 } 112 113 logFpProb := math.Log(fpProb) 114 m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2) 115 116 // Round up to a byte. 117 // TODO(adonovan): opt: round up to the next allocation size 118 // class (see bytes.growSlice) and then compute the possibly 119 // smaller number of needed seeds based on this higher number. 120 bytes = int(m) / 8 121 if float64(bytes*8) < m { 122 bytes++ 123 } 124 125 k := -logFpProb / math.Ln2 126 seeds = max(int(math.Round(k)), 1) 127 return bytes, seeds 128 } 129 130 func Example_bloomFilter() { 131 // Create a Bloom filter optimized for 2 elements with 132 // a one-in-a-billion false-positive rate. 133 // 134 // (This low rate demands a lot of space: 88 bits and 135 // 30 hash functions. More typical rates are 1-5%; 136 // at 5%, we need only 16 bits and 4 hash functions.) 137 f := NewComparableBloomFilter[string](2, 1e-9) 138 139 // Insert two elements. 140 f.Insert("apple") 141 f.Insert("banana") 142 143 // Check whether elements are present. 144 // 145 // "cherry" was not inserted, but Contains is probabilistic, so 146 // this test will spuriously report Contains("cherry") = true 147 // about once every billion runs. 148 for _, fruit := range []string{"apple", "banana", "cherry"} { 149 fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit)) 150 } 151 152 // Output: 153 // 154 // Contains("apple") = true 155 // Contains("banana") = true 156 // Contains("cherry") = false 157 } 158