// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package maphash_test

// This file demonstrates a Bloom filter.

import (
	"fmt"
	"hash/maphash"
	"math"
	"math/bits"
)

// BloomFilter is a Bloom filter, a probabilistic space-efficient
// representation of a set of values of type V based on hashing.
//
// It provides two operations: Insert inserts an element and Contains
// queries whether a value is a member of the set.
//
// However, unlike a typical set, the size is independent of the
// number of elements. The catch: Contains is permitted to report
// true even for some elements that are not present. The trade-off
// between space and accuracy is determined by parameters at
// construction.
type BloomFilter[V any] struct {
	hasher maphash.Hasher[V]
	seeds  []maphash.Seed // each seed determines a hash function
	bytes  []byte         // bit vector
}

// NewBloomFilterComparable returns a new BloomFilter for the
// specified elements using their natural comparison.
func NewComparableBloomFilter[V comparable](n int, fpProb float64) *BloomFilter[V] {
	return NewBloomFilter(maphash.ComparableHasher[V]{}, n, fpProb)
}

// NewBloomFilter constructs a new BloomFilter capable of holding n
// elements with the specified probability of false positive results,
// assuming a well dispersed hash function.
func NewBloomFilter[V any](hasher maphash.Hasher[V], n int, fpProb float64) *BloomFilter[V] {
	nbytes, nseeds := calibrate(n, fpProb)

	seeds := make([]maphash.Seed, nseeds)
	for i := range nseeds {
		seeds[i] = maphash.MakeSeed()
	}

	return &BloomFilter[V]{
		hasher: hasher,
		bytes:  make([]byte, nbytes),
		seeds:  seeds,
	}
}

func (f *BloomFilter[V]) Contains(v V) bool {
	for _, seed := range f.seeds {
		index, bit := f.locate(seed, v)
		if f.bytes[index]&bit == 0 {
			return false
		}
	}
	return true
}

func (f *BloomFilter[V]) Insert(v V) {
	for _, seed := range f.seeds {
		index, bit := f.locate(seed, v)
		f.bytes[index] |= bit
	}
}

func (f *BloomFilter[V]) locate(seed maphash.Seed, v V) (uint64, byte) {
	// Optimization note: the dynamic call to hasher.Hash causes h
	// to escape. You can use a sync.Pool can help mitigate the
	// allocation cost.
	//
	// Alternatively, you could copy and specialize the filter logic
	// for a specific implementation of maphash.Hasher, allowing
	// the compiler's escape analysis to determine that the
	// hasher.Hash call does not in fact cause h to escape.
	var h maphash.Hash
	h.SetSeed(seed)
	f.hasher.Hash(&h, v)
	hash := h.Sum64()

	index := reduce(hash, uint64(len(f.bytes)))
	mask := byte(1 << (hash % 8))
	return index, mask
}

// reduce maps hash to a value in the interval [0, n).
// See https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction
func reduce(hash, n uint64) uint64 {
	if n > 1<<32-1 {
		hi, _ := bits.Mul64(hash, n)
		return hi
	}
	return hash >> 32 * n >> 32
}

func calibrate(n int, fpProb float64) (bytes, seeds int) {
	// Following https://en.wikipedia.org/wiki/Bloom_filter:
	// - k is the number of hash functions,
	// - m is the size of the bit field;
	// - n is the number of set bits.

	if n == 0 {
		return 1, 1
	}

	logFpProb := math.Log(fpProb)
	m := -(float64(n) * logFpProb) / (math.Ln2 * math.Ln2)

	// Round up to a byte.
	// TODO(adonovan): opt: round up to the next allocation size
	// class (see bytes.growSlice) and then compute the possibly
	// smaller number of needed seeds based on this higher number.
	bytes = int(m) / 8
	if float64(bytes*8) < m {
		bytes++
	}

	k := -logFpProb / math.Ln2
	seeds = max(int(math.Round(k)), 1)
	return bytes, seeds
}

func Example_bloomFilter() {
	// Create a Bloom filter optimized for 2 elements with
	// a one-in-a-billion false-positive rate.
	//
	// (This low rate demands a lot of space: 88 bits and
	// 30 hash functions. More typical rates are 1-5%;
	// at 5%, we need only 16 bits and 4 hash functions.)
	f := NewComparableBloomFilter[string](2, 1e-9)

	// Insert two elements.
	f.Insert("apple")
	f.Insert("banana")

	// Check whether elements are present.
	//
	// "cherry" was not inserted, but Contains is probabilistic, so
	// this test will spuriously report Contains("cherry") = true
	// about once every billion runs.
	for _, fruit := range []string{"apple", "banana", "cherry"} {
		fmt.Printf("Contains(%q) = %v\n", fruit, f.Contains(fruit))
	}

	// Output:
	//
	// Contains("apple") = true
	// Contains("banana") = true
	// Contains("cherry") = false
}