openMSX
MinimalPerfectHash.hh
Go to the documentation of this file.
1 #ifndef MINIMAL_PERFECT_HASH_HH
2 #define MINIMAL_PERFECT_HASH_HH
3 
4 #include "cstd.hh"
5 #include "Math.hh"
6 #include "ranges.hh"
7 #include "static_vector.hh"
8 #include "stl.hh"
9 #include "xrange.hh"
10 #include <array>
11 #include <bit>
12 #include <cassert>
13 #include <cstdint>
14 
15 // This calculates an "order preserving minimal perfect hash function" (mph).
16 //
17 // That is for N given input keys (typically these keys will be strings) we
18 // produce a function with the following properties:
19 // - The function takes a key as input and produces a number.
20 // - For the i-th input key (i in [0..N)) the functions returns the value 'i'.
21 // * Distinct inputs produce a distinct output (it's a 'perfect' hash).
22 // * There are no gaps in the output range (the function is 'minimal').
23 // * The order between input and output is preserved.
24 // - For a new input that was not part of the original set, this function still
25 // produces a value in the range [0..n), but it doesn't give any guarantee
26 // about which specific value it returns. So typically this output will be
27 // used as an index in an array to lookup the original key and then do an
28 // equality comparison on that (single) key.
29 // - The function runs in constant time (O(1)). (Comparing all keys one by one
30 // would be O(N), using a std::map would be O(log(N))). More in detail, the
31 // input key is hashed (with given non-perfect hash function) only once,
32 // followed by one or sometimes two table-lookups.
33 // - The function can be calculated at compile-time (constexpr).
34 // - The required storage is about 2 bytes per input key.
35 //
36 // This implementation is based on
37 // Frozen - An header-only, constexpr alternative to gperf for C++14 users
38 // https://github.com/serge-sans-paille/frozen
39 // Which is based on Steve Hanov's blog post:
40 // http://stevehanov.ca/blog/index.php?id=119
41 // Which is based on this paper:
42 // Hash, displace, and compress
43 // Djamal Belazzougui, Fabiano C. Botelho and Martin Dietzfelbinger
44 // http://cmph.sourceforge.net/papers/esa09.pdf
45 //
46 // Compared to the original implementation in 'Frozen' this version is more
47 // stripped down. There is a chance that the calculation of the mph fails. That
48 // chance is a bit bigger in our version(*1). On the other hand our produced
49 // functions runs a bit faster (only runs the non-perfect hash function once
50 // instead of once or twice). And the resulting tables are ~10x smaller in
51 // memory (for the specific use case in openMSX with only about ~100 input
52 // keys).
53 // (*1) If the calculation fails, it fails at compile time. And for the current
54 // use case in openMSX (mapper type names) it works fine. If in the future we
55 // add many (+20) more mapper types we may need to tweak this code a little.
56 
57 namespace PerfectMinimalHash {
58 
59 template<size_t M, typename Hash>
60 struct Result
61 {
62  static_assert(std::has_single_bit(M));
63  static constexpr auto M2 = M / 2;
64 
65  std::array<uint8_t, M > tab1;
66  std::array<uint8_t, M2> tab2; // half size (space optimization)
67  [[no_unique_address]] Hash hash;
68 
69  [[nodiscard]] constexpr uint8_t lookupIndex(const auto& key) const {
70  const uint32_t h = hash(key);
71  const uint8_t d = tab1[h % M];
72  if ((d & 0x80) == 0) {
73  return d;
74  } else {
75  return tab2[(h >> (d & 31)) % M2];
76  }
77  }
78 };
79 
80 template<size_t N, typename Hash, typename GetKey>
81 [[nodiscard]] constexpr auto create(const Hash& hash, const GetKey& getKey)
82 {
83  static_assert(N < 128);
84  constexpr size_t M = std::bit_ceil(N);
85  constexpr auto bucket_max = size_t(2 * cstd::sqrt(M));
86 
87  Result<M, Hash> r{};
88  r.hash = hash;
89 
90  // Step 1: Place all of the keys into buckets
91  std::array<static_vector<uint8_t, bucket_max>, r.tab1.size()> buckets;
92  for (auto i : xrange(N)) {
93  buckets[hash(getKey(i)) % r.tab1.size()].push_back(uint8_t(i));
94  }
95 
96  // Step 2: Sort the buckets to process the ones with the most items first.
97  ranges::sort(buckets, [](const auto& x, const auto& y) {
98  // sort largest first
99  if (x.size() != y.size()) {
100  return x.size() > y.size();
101  }
102  // same size, sort lexicographical
103  return std::lexicographical_compare(x.begin(), x.end(),
104  y.begin(), y.end());
105  });
106 
107  // Step 3: Map the items in buckets into hash tables.
108  constexpr auto UNUSED = uint8_t(-1);
109  ranges::fill(r.tab2, UNUSED);
110  for (const auto& bucket : buckets) {
111  auto const bSize = bucket.size();
112  if (bSize == 0) break; // done
113  auto hash1 = hash(getKey(bucket[0])) % r.tab1.size();
114  if (bSize == 1) {
115  // Store index to the (single) item in tab1
116  r.tab1[hash1] = bucket[0];
117  } else {
118  // Repeatedly try different shift-factors until we can
119  // place all items in the bucket into free slots.
120  uint8_t shift = 1;
122 
123  while (bucket_slots.size() < bSize) {
124  auto hash2 = hash(getKey(bucket[bucket_slots.size()]));
125  assert((hash2 % r.tab1.size()) == hash1);
126  auto slot = (hash2 >> shift) % r.tab2.size();
127  if (r.tab2[slot] != UNUSED || contains(bucket_slots, slot)) {
128  ++shift; // try next
129  assert(shift < 32);
130  bucket_slots.clear();
131  continue;
132  }
133  bucket_slots.push_back(uint8_t(slot));
134  }
135 
136  // Put successful shift-factor in tab1, and put indices to items in their slots
137  r.tab1[hash1] = shift | 0x80;
138  for (auto i : xrange(bSize)) {
139  r.tab2[bucket_slots[i]] = bucket[i];
140  }
141  }
142  }
143 
144  // Change unused entries to zero because it must be valid indices (< N).
145  ranges::replace(r.tab2, UNUSED, uint8_t(0));
146  return r;
147 }
148 
149 } // namespace PerfectMinimalHash
150 
151 #endif
constexpr void clear()
constexpr size_t size() const
constexpr void push_back(const T &a)
constexpr auto create(const Hash &hash, const GetKey &getKey)
constexpr double sqrt(double x)
Definition: cstd.hh:258
constexpr unsigned N
Definition: ResampleHQ.cc:226
constexpr byte UNUSED
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:127
constexpr void fill(ForwardRange &&range, const T &value)
Definition: ranges.hh:256
constexpr void replace(ForwardRange &&range, const T &old_value, const T &new_value)
Definition: ranges.hh:244
constexpr void sort(RandomAccessRange &&range)
Definition: ranges.hh:33
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition: stl.hh:23
std::array< uint8_t, M > tab1
std::array< uint8_t, M2 > tab2
static constexpr auto M2
constexpr uint8_t lookupIndex(const auto &key) const
constexpr auto xrange(T e)
Definition: xrange.hh:133