openMSX
MinimalPerfectHash.hh
Go to the documentation of this file.
1#ifndef MINIMAL_PERFECT_HASH_HH
2#define MINIMAL_PERFECT_HASH_HH
3
4#include "cstd.hh"
5#include "Math.hh"
6#include "ranges.hh"
7#include "static_vector.hh"
8#include "stl.hh"
9#include "xrange.hh"
10#include <array>
11#include <bit>
12#include <cassert>
13#include <cstdint>
14
15// This calculates an "order preserving minimal perfect hash function" (mph).
16//
17// That is for N given input keys (typically these keys will be strings) we
18// produce a function with the following properties:
19// - The function takes a key as input and produces a number.
20// - For the i-th input key (i in [0..N)) the functions returns the value 'i'.
21// * Distinct inputs produce a distinct output (it's a 'perfect' hash).
22// * There are no gaps in the output range (the function is 'minimal').
23// * The order between input and output is preserved.
24// - For a new input that was not part of the original set, this function still
25// produces a value in the range [0..n), but it doesn't give any guarantee
26// about which specific value it returns. So typically this output will be
27// used as an index in an array to lookup the original key and then do an
28// equality comparison on that (single) key.
29// - The function runs in constant time (O(1)). (Comparing all keys one by one
30// would be O(N), using a std::map would be O(log(N))). More in detail, the
31// input key is hashed (with given non-perfect hash function) only once,
32// followed by one or sometimes two table-lookups.
33// - The function can be calculated at compile-time (constexpr).
34// - The required storage is about 2 bytes per input key.
35//
36// This implementation is based on
37// Frozen - An header-only, constexpr alternative to gperf for C++14 users
38// https://github.com/serge-sans-paille/frozen
39// Which is based on Steve Hanov's blog post:
40// http://stevehanov.ca/blog/index.php?id=119
41// Which is based on this paper:
42// Hash, displace, and compress
43// Djamal Belazzougui, Fabiano C. Botelho and Martin Dietzfelbinger
44// http://cmph.sourceforge.net/papers/esa09.pdf
45//
46// Compared to the original implementation in 'Frozen' this version is more
47// stripped down. There is a chance that the calculation of the mph fails. That
48// chance is a bit bigger in our version(*1). On the other hand our produced
49// functions runs a bit faster (only runs the non-perfect hash function once
50// instead of once or twice). And the resulting tables are ~10x smaller in
51// memory (for the specific use case in openMSX with only about ~100 input
52// keys).
53// (*1) If the calculation fails, it fails at compile time. And for the current
54// use case in openMSX (mapper type names) it works fine. If in the future we
55// add many (+20) more mapper types we may need to tweak this code a little.
56
58
59template<size_t M, typename Hash>
60struct Result
61{
62 static_assert(std::has_single_bit(M));
63 static constexpr auto M2 = M;
64
65 std::array<uint8_t, M > tab1;
66 std::array<uint8_t, M2> tab2; // Note: half this size is not enough, tried that before
67 [[no_unique_address]] Hash hash;
68
69 [[nodiscard]] constexpr uint8_t lookupIndex(const auto& key) const {
70 const uint32_t h = hash(key);
71 const uint8_t d = tab1[h % M];
72 if ((d & 0x80) == 0) {
73 return d;
74 } else {
75 return tab2[(h >> (d & 31)) % M2];
76 }
77 }
78};
79
80template<size_t N, typename Hash, typename GetKey>
81[[nodiscard]] constexpr auto create(const Hash& hash, const GetKey& getKey)
82{
83 static_assert(N < 128);
84 constexpr size_t M = std::bit_ceil(N);
85 constexpr auto bucket_max = size_t(2 * cstd::sqrt(M));
86
88 r.hash = hash;
89
90 // Step 1: Place all of the keys into buckets
91 std::array<static_vector<uint8_t, bucket_max>, r.tab1.size()> buckets;
92 for (auto i : xrange(N)) {
93 buckets[hash(getKey(i)) % r.tab1.size()].push_back(uint8_t(i));
94 }
95
96 // Step 2: Sort the buckets to process the ones with the most items first.
97 ranges::sort(buckets, [](const auto& x, const auto& y) {
98 // sort largest first
99 if (x.size() != y.size()) {
100 return x.size() > y.size();
101 }
102 // same size, sort lexicographical
103 return std::lexicographical_compare(x.begin(), x.end(),
104 y.begin(), y.end());
105 });
106
107 // Step 3: Map the items in buckets into hash tables.
108 constexpr auto UNUSED = uint8_t(-1);
109 ranges::fill(r.tab2, UNUSED);
110 for (const auto& bucket : buckets) {
111 auto const bSize = bucket.size();
112 if (bSize == 0) break; // done
113 auto hash1 = hash(getKey(bucket[0])) % r.tab1.size();
114 if (bSize == 1) {
115 // Store index to the (single) item in tab1
116 r.tab1[hash1] = bucket[0];
117 } else {
118 // Repeatedly try different shift-factors until we can
119 // place all items in the bucket into free slots.
120 uint8_t shift = 1;
122
123 while (bucket_slots.size() < bSize) {
124 auto hash2 = hash(getKey(bucket[bucket_slots.size()]));
125 assert((hash2 % r.tab1.size()) == hash1);
126 auto slot = (hash2 >> shift) % r.tab2.size();
127 if (r.tab2[slot] != UNUSED || contains(bucket_slots, slot)) {
128 ++shift; // try next
129 assert(shift < 32);
130 bucket_slots.clear();
131 continue;
132 }
133 bucket_slots.push_back(uint8_t(slot));
134 }
135
136 // Put successful shift-factor in tab1, and put indices to items in their slots
137 r.tab1[hash1] = shift | 0x80;
138 for (auto i : xrange(bSize)) {
139 r.tab2[bucket_slots[i]] = bucket[i];
140 }
141 }
142 }
143
144 // Change unused entries to zero because it must be valid indices (< N).
145 ranges::replace(r.tab2, UNUSED, uint8_t(0));
146 return r;
147}
148
149} // namespace PerfectMinimalHash
150
151#endif
constexpr size_t size() const noexcept
constexpr void clear() noexcept
constexpr void push_back(const T &a)
constexpr auto create(const Hash &hash, const GetKey &getKey)
constexpr double sqrt(double x)
Definition cstd.hh:261
constexpr void fill(ForwardRange &&range, const T &value)
Definition ranges.hh:315
constexpr void replace(ForwardRange &&range, const T &old_value, const T &new_value)
Definition ranges.hh:303
constexpr void sort(RandomAccessRange &&range)
Definition ranges.hh:51
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition stl.hh:32
std::array< uint8_t, M > tab1
std::array< uint8_t, M2 > tab2
constexpr uint8_t lookupIndex(const auto &key) const
constexpr auto xrange(T e)
Definition xrange.hh:132