openMSX
MinimalPerfectHash.hh
Go to the documentation of this file.
1 #ifndef MINIMAL_PERFECT_HASH_HH
2 #define MINIMAL_PERFECT_HASH_HH
3 
4 #include "cstd.hh"
5 #include "Math.hh"
6 #include "static_vector.hh"
7 #include "stl.hh"
8 #include "xrange.hh"
9 #include <array>
10 #include <cassert>
11 #include <cstdint>
12 
13 // This calculates an "order preserving minimal perfect hash function" (mph).
14 //
15 // That is for N given input keys (typically these keys will be strings) we
16 // produce a function with the following properties:
17 // - The function takes a key as input and produces a number.
18 // - For the i-th input key (i in [0..N)) the functions returns the value 'i'.
19 // * Distinct inputs produce a distinct output (it's a 'perfect' hash).
20 // * There are no gaps in the output range (the function is 'minimal').
21 // * The order between input and output is preserved.
22 // - For a new input that was not part of the original set, this function still
23 // produces a value in the range [0..n), but it doesn't give any guarantee
24 // about which specific value it returns. So typically this output will be
25 // used as an index in an array to lookup the original key and then do an
26 // equality comparison on that (single) key.
27 // - The function runs in constant time (O(1)). (Comparing all keys one by one
28 // would be O(N), using a std::map would be O(log(N))). More in detail, the
29 // input key is hashed (with given non-perfect hash function) only once,
30 // followed by one or sometimes two table-lookups.
31 // - The function can be calculated at compile-time (constexpr).
32 // - The required storage is about 2 bytes per input key.
33 //
34 // This implementation is based on
35 // Frozen - An header-only, constexpr alternative to gperf for C++14 users
36 // https://github.com/serge-sans-paille/frozen
37 // Which is based on Steve Hanov's blog post:
38 // http://stevehanov.ca/blog/index.php?id=119
39 // Which is based on this paper:
40 // Hash, displace, and compress
41 // Djamal Belazzougui, Fabiano C. Botelho and Martin Dietzfelbinger
42 // http://cmph.sourceforge.net/papers/esa09.pdf
43 //
44 // Compared to the original implementation in 'Frozen' this version is more
45 // stripped down. There is a chance that the calculation of the mph fails. That
46 // chance is a bit bigger in our version(*1). On the other hand our produced
47 // functions runs a bit faster (only runs the non-perfect hash function once
48 // instead of once or twice). And the resulting tables are ~10x smaller in
49 // memory (for the specific use case in openMSX with only about ~100 input
50 // keys).
51 // (*1) If the calculation fails, it fails at compile time. And for the current
52 // use case in openMSX (mapper type names) it works fine. If in the future we
53 // add many (+20) more mapper types we may need to tweak this code a little.
54 
55 namespace PerfectMinimalHash {
56 
57 template<size_t M, typename Hash>
58 struct Result
59 {
60  static_assert(Math::ispow2(M));
61  static constexpr auto M2 = M / 2;
62 
63  std::array<uint8_t, M > tab1;
64  std::array<uint8_t, M2> tab2; // half size (space optimization)
65  Hash hash;
66 
67  template<typename Key>
68  [[nodiscard]] constexpr uint8_t lookupIndex(const Key& key) const {
69  const uint32_t h = hash(key);
70  const uint8_t d = tab1[h % M];
71  if ((d & 0x80) == 0) {
72  return d;
73  } else {
74  return tab2[(h >> (d & 31)) % M2];
75  }
76  }
77 };
78 
79 template<size_t N, typename Hash, typename GetKey>
80 [[nodiscard]] constexpr auto create(const Hash& hash, const GetKey& getKey)
81 {
82  static_assert(N < 128);
83  constexpr size_t M = Math::ceil2(N);
84  constexpr auto bucket_max = size_t(2 * cstd::sqrt(M));
85 
86  Result<M, Hash> r{};
87  r.hash = hash;
88 
89  // Step 1: Place all of the keys into buckets
90  std::array<static_vector<uint8_t, bucket_max>, r.tab1.size()> buckets;
91  for (auto i : xrange(N)) {
92  buckets[hash(getKey(i)) % r.tab1.size()].push_back(uint8_t(i));
93  }
94 
95  // Step 2: Sort the buckets to process the ones with the most items first.
96  cstd::sort(buckets, [](const auto& x, const auto& y) {
97  return x.size() > y.size(); // sort largest first
98  });
99 
100  // Step 3: Map the items in buckets into hash tables.
101  constexpr auto UNUSED = uint8_t(-1);
102  cstd::fill(r.tab2, UNUSED);
103  for (const auto& bucket : buckets) {
104  auto const bSize = bucket.size();
105  if (bSize == 0) break; // done
106  auto hash1 = hash(getKey(bucket[0])) % r.tab1.size();
107  if (bSize == 1) {
108  // Store index to the (single) item in tab1
109  r.tab1[hash1] = bucket[0];
110  } else {
111  // Repeatedly try different shift-factors until we can
112  // place all items in the bucket into free slots.
113  uint8_t shift = 1;
115 
116  while (bucket_slots.size() < bSize) {
117  auto hash2 = hash(getKey(bucket[bucket_slots.size()]));
118  assert((hash2 % r.tab1.size()) == hash1);
119  auto slot = (hash2 >> shift) % r.tab2.size();
120  if (r.tab2[slot] != UNUSED || contains(bucket_slots, slot)) {
121  ++shift; // try next
122  assert(shift < 32);
123  bucket_slots.clear();
124  continue;
125  }
126  bucket_slots.push_back(uint8_t(slot));
127  }
128 
129  // Put successful shift-factor in tab1, and put indices to items in their slots
130  r.tab1[hash1] = shift | 0x80;
131  for (auto i : xrange(bSize)) {
132  r.tab2[bucket_slots[i]] = bucket[i];
133  }
134  }
135  }
136 
137  // Change unused entries to zero because it must be valid indices (< N).
138  cstd::replace(r.tab2, UNUSED, uint8_t(0));
139  return r;
140 }
141 
142 } // namespace PerfectMinimalHash
143 
144 #endif
constexpr void clear()
constexpr size_t size() const
constexpr void push_back(const T &a)
constexpr T ceil2(T x) noexcept
Returns the smallest number that is both >=a and a power of two.
Definition: Math.hh:85
constexpr bool ispow2(T x) noexcept
Is the given number an integral power of two? That is, does it have exactly one 1-bit in binary repre...
Definition: Math.hh:57
constexpr auto create(const Hash &hash, const GetKey &getKey)
constexpr void sort(RAIt first, RAIt last, Compare cmp=Compare{})
Definition: cstd.hh:64
constexpr void fill(ForwardIt first, ForwardIt last, const T &value)
Definition: cstd.hh:111
constexpr double sqrt(double x)
Definition: cstd.hh:367
constexpr void replace(ForwardIt first, ForwardIt last, const T &old_value, const T &new_value)
Definition: cstd.hh:95
constexpr unsigned N
Definition: ResampleHQ.cc:229
constexpr byte UNUSED
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:124
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition: stl.hh:92
std::array< uint8_t, M > tab1
std::array< uint8_t, M2 > tab2
constexpr uint8_t lookupIndex(const Key &key) const
static constexpr auto M2
constexpr auto xrange(T e)
Definition: xrange.hh:155