example_bloom_test.go

     1  // Copyright 2026 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package maphash_test
     6  
     7  // This file demonstrates a Bloom filter.
     8  
     9  import (
    10  	"fmt"
    11  	"hash/maphash"
    12  	"math"
    13  	"math/bits"
    14  )
    15  
    16  // BloomFilter is a Bloom filter, a probabilistic space-efficient
    17  // representation of a set of values of type V based on hashing.
    18  //
    19  // It provides two operations: Insert inserts an element and Contains
    20  // queries whether a value is a member of the set.
    21  //
    22  // However, unlike a typical set, the size is independent of the
    23  // number of elements. The catch: Contains is permitted to report
    24  // true even for some elements that are not present. The trade-off
    25  // between space and accuracy is determined by parameters at
    26  // construction.
    27  type BloomFilter[V any] struct {
    28  	hasher maphash.Hasher[V]
    29  	seeds  []maphash.Seed // each seed determines a hash function
    30  	bytes  []byte         // bit vector
    31  }
    32  
    33  // NewBloomFilterComparable returns a new BloomFilter for the
    34  // specified elements using their natural comparison.
    35  func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] {
    36  	return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb)
    37  }
    38  
    39  // NewBloomFilter constructs a new BloomFilter capable of holding n
    40  // elements with the specified probability of false positive results,
    41  // assuming a well dispersed hash function.
    42  func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] {
    43  	nbytes, nseeds := calibrate(n, fpProb)
    44  
    45  	seeds := make([]maphash.Seed, nseeds)
    46  	for i := range nseeds {
    47  		seeds[i] = maphash.MakeSeed()
    48  	}
    49  
    50  	return &BloomFilter[V]{
    51  		hasher: hasher,
    52  		bytes:  make([]byte, nbytes),
    53  		seeds:  seeds,
    54  	}
    55  }
    56  
    57  func (f *BloomFilter[V]) Contains(v V) bool {
    58  	for _, seed := range f.seeds {
    59  		index, bit := f.locate(seed, v)
    60  		if f.bytes[index]&bit == 0 {
    61  			return false
    62  		}
    63  	}
    64  	return true
    65  }
    66  
    67  func (f *BloomFilter[V]) Insert(v V) {
    68  	for _, seed := range f.seeds {
    69  		index, bit := f.locate(seed, v)
    70  		f.bytes[index] |= bit
    71  	}
    72  }
    73  
    74  func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) {
    75  	// Optimization note: the dynamic call to hasher.Hash causes h
    76  	// to escape. You can use a sync.Pool can help mitigate the
    77  	// allocation cost.
    78  	//
    79  	// Alternatively, you could copy and specialize the filter logic
    80  	// for a specific implementation of maphash.Hasher, allowing
    81  	// the compiler's escape analysis to determine that the
    82  	// hasher.Hash call does not in fact cause h to escape.
    83  	var h maphash.Hash
    84  	h.SetSeed(seed)
    85  	f.hasher.Hash(&h, v)
    86  	hash := h.Sum64()
    87  
    88  	index := reduce(hash, uint64(len(f.bytes)))
    89  	mask := byte(1 << (hash % 8))
    90  	return index, mask
    91  }
    92  
    93  // reduce maps hash to a value in the interval [0, n).
    94  // See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction
    95  func reduce(hash, n uint64) uint64 {
    96  	if n > 1<<32-1 {
    97  		hi, _ := bits.Mul64(hash, n)
    98  		return hi
    99  	}
   100  	return hash >> 32 * n >> 32
   101  }
   102  
   103  func calibrate(n int, fpProb float64) (bytes, seeds int) {
   104  	// Following https://en.wikipedia.org/wiki/Bloom_filter:
   105  	// - k is the number of hash functions,
   106  	// - m is the size of the bit field;
   107  	// - n is the number of set bits.
   108  
   109  	if n == 0 {
   110  		return 1, 1
   111  	}
   112  
   113  	logFpProb := math.Log(fpProb)
   114  	m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2)
   115  
   116  	// Round up to a byte.
   117  	// TODO(adonovan): opt: round up to the next allocation size
   118  	// class (see bytes.growSlice) and then compute the possibly
   119  	// smaller number of needed seeds based on this higher number.
   120  	bytes = int(m) / 8
   121  	if float64(bytes*8) < m {
   122  		bytes++
   123  	}
   124  
   125  	k := -logFpProb / math.Ln2
   126  	seeds = max(int(math.Round(k)), 1)
   127  	return bytes, seeds
   128  }
   129  
   130  func Example_bloomFilter() {
   131  	// Create a Bloom filter optimized for 2 elements with
   132  	// a one-in-a-billion false-positive rate.
   133  	//
   134  	// (This low rate demands a lot of space: 88 bits and
   135  	// 30 hash functions. More typical rates are 1-5%;
   136  	// at 5%, we need only 16 bits and 4 hash functions.)
   137  	f := NewComparableBloomFilter[string](2, 1e-9)
   138  
   139  	// Insert two elements.
   140  	f.Insert("apple")
   141  	f.Insert("banana")
   142  
   143  	// Check whether elements are present.
   144  	//
   145  	// "cherry" was not inserted, but Contains is probabilistic, so
   146  	// this test will spuriously report Contains("cherry") = true
   147  	// about once every billion runs.
   148  	for _, fruit := range []string{"apple", "banana", "cherry"} {
   149  		fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit))
   150  	}
   151  
   152  	// Output:
   153  	//
   154  	// Contains("apple") = true
   155  	// Contains("banana") = true
   156  	// Contains("cherry") = false
   157  }
   158
View as plain text