openMSX
DeltaBlock.cc
#include "DeltaBlock.hh"
#include "likely.hh"
#include "ranges.hh"
#include "lz4.hh"
#include <cassert>
#include <cstring>
#include <tuple>
#include <utility>
#if STATISTICS
#include <iostream>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

// --- Compressed integers ---

// See https://en.wikipedia.org/wiki/LEB128 for a description of the
// 'unsigned LEB' format.
static void storeUleb(std::vector<uint8_t>& result, size_t value)
{
	do {
		uint8_t b = value & 0x7F;
		value >>= 7;
		if (value) b |= 0x80;
		result.push_back(b);
	} while (value);
}

[[nodiscard]] static constexpr size_t loadUleb(const uint8_t*& data)
{
	size_t result = 0;
	int shift = 0;
	while (true) {
		uint8_t b = *data++;
		result |= size_t(b & 0x7F) << shift;
		if ((b & 0x80) == 0) return result;
		shift += 7;
	}
}
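// Worked example: storeUleb() encodes the value 300 (0x12C) as the two bytes
// 0xAC, 0x02. The low 7 bits (0x2C) get the continuation flag 0x80 because
// more bits follow; the remaining bits (0x02) go into the next byte without
// the flag. loadUleb() reverses this: 0x2C | (0x02 << 7) == 300.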


// --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---

template<int N> bool comp(const uint8_t* p, const uint8_t* q);

template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
{
	return *reinterpret_cast<const uint32_t*>(p) ==
	       *reinterpret_cast<const uint32_t*>(q);
}

template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
{
	return *reinterpret_cast<const uint64_t*>(p) ==
	       *reinterpret_cast<const uint64_t*>(q);
}

#ifdef __SSE2__
template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
{
	// Tests show that (on my machine) using one 128-bit load is faster
	// than two 64-bit loads, even though the actual comparison is
	// slightly more complicated with SSE instructions.
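	// _mm_cmpeq_epi8() sets each byte of 'd' to 0xff where the two loaded
	// words have equal bytes and to 0x00 where they differ;
	// _mm_movemask_epi8() then packs the top bit of each of the 16 bytes
	// into a 16-bit mask, so the mask equals 0xffff exactly when all 16
	// bytes match.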
	__m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
	__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
	__m128i d = _mm_cmpeq_epi8(a, b);
	return _mm_movemask_epi8(d) == 0xffff;
}
#endif


// --- Optimized mismatch function ---

// This is much like the function std::mismatch(). You pass in two buffers,
// the corresponding elements of both buffers are compared, and the first
// position where the elements no longer match is returned.
//
// Compared to std::mismatch(), this implementation is faster because:
// - We make use of sentinels. This requires temporarily changing the content
//   of the buffer, so it won't work with read-only memory.
// - We compare a word at a time instead of a byte at a time.
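// For example, with p = {1,2,3,4,5} and q = {1,2,9,4,5} it returns pointers
// to offset 2 in both buffers (the '3' and the '9'); when the buffers are
// completely equal it returns {p_end, q_end}.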
static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
	const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
	assert((p_end - p) == (q_end - q));

	// When SSE is available, work with 16-byte words, otherwise 4 or 8
	// bytes. This routine would also benefit from AVX2 instructions, but
	// not all x86_64 CPUs have AVX2 (they all do have SSE2), so using it
	// would require extra run-time checks and that's not worth it at this
	// point.
	constexpr int WORD_SIZE =
#ifdef __SSE2__
		sizeof(__m128i);
#else
		sizeof(void*);
#endif

	// Region too small, or the two buffers are aligned differently.
	if (unlikely((p_end - p) < (2 * WORD_SIZE)) ||
	    unlikely((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
	             (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) {
		goto end;
	}

	// Align to a WORD_SIZE boundary. No need for end-of-buffer checks.
	if (unlikely(reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1))) {
		do {
			if (*p != *q) return {p, q};
			p += 1; q += 1;
		} while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
	}

	// Fast path. Compare a word at a time.
	{
		// Place a sentinel in the last full word. This ensures we'll
		// find a mismatch within the buffer, and so we can omit the
		// end-of-buffer checks.
		auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
		auto save = *sentinel;
		*sentinel = ~q_end[-WORD_SIZE];
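		// Writing the complement of the corresponding byte in 'q'
		// guarantees that this byte differs from q's byte (a value is
		// never equal to its own complement), so the word-compare loop
		// below is certain to stop at or before the last full word.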

		while (comp<WORD_SIZE>(p, q)) {
			p += WORD_SIZE; q += WORD_SIZE;
		}

		// Restore sentinel.
		*sentinel = save;
	}

	// Slow path. This handles:
	// - Small or differently aligned buffers.
	// - The bytes at and after the (restored) sentinel.
end:	return std::mismatch(p, p_end, q);
}


// --- Optimized scan_match function ---

// Like scan_mismatch() above, but searches the two buffers for the first
// corresponding equal (instead of not-equal) bytes.
//
// Like scan_mismatch(), this temporarily places a sentinel in the buffer, so
// the buffer cannot be read-only memory.
//
// Unlike scan_mismatch(), it's less obvious how to perform this search a word
// at a time (it is possible with some bit tricks). Luckily this function is
// also less performance-critical.
[[nodiscard]] static constexpr std::pair<const uint8_t*, const uint8_t*> scan_match(
	const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
	assert((p_end - p) == (q_end - q));

	// Code below is functionally equivalent to:
	//   while ((p != p_end) && (*p != *q)) { ++p; ++q; }
	//   return {p, q};

	if (p == p_end) return {p, q};

	auto* p_last = const_cast<uint8_t*>(p_end - 1);
	auto save = *p_last;
	*p_last = q_end[-1]; // make p_end[-1] == q_end[-1]

	while (*p != *q) { ++p; ++q; }

	*p_last = save;
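	// If the loop stopped at the very last byte, the 'match' may have
	// been produced by the sentinel rather than by the real data: re-test
	// with the restored byte, and if it was not a real match step past it
	// so that the "no match found" case returns {p_end, q_end}.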
	if ((p == p_last) && (*p != *q)) { ++p; ++q; }
	return {p, q};
}


// --- delta (de)compression routines ---

// Calculate a 'delta' between two binary buffers of equal size.
// The result is a stream of:
//   n1: number of bytes that are equal
//   n2: number of bytes that are different, followed by those n2 bytes
//   n3: number of bytes that are equal
//   ...
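// For example, for the 12-byte buffers
//   old: "AAAABBBBAAAA"
//   new: "AAAACCCCAAAA"
// the delta stream is: 4, 4, 'C','C','C','C', 4
// (4 equal bytes, 4 different bytes plus their new values, 4 equal bytes).
// All counts are stored in the unsigned-LEB format described above.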
[[nodiscard]] static std::vector<uint8_t> calcDelta(
	const uint8_t* oldBuf, const uint8_t* newBuf, size_t size)
{
	std::vector<uint8_t> result;

	const auto* p = oldBuf;
	const auto* q = newBuf;
	const auto* p_end = p + size;
	const auto* q_end = q + size;

	// scan equal bytes (possibly zero)
	const auto* q1 = q;
	std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
	auto n1 = q - q1;
	storeUleb(result, n1);

	while (q != q_end) {
		assert(*p != *q);

		const auto* q2 = q;
	different:
		std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
		auto n2 = q - q2;

		const auto* q3 = q;
		std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
		auto n3 = q - q3;
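		// Heuristic: a run of at most 2 equal bytes is not worth
		// encoding as a separate "equal" chunk (the extra length
		// fields would typically cost as much as the bytes saved), so
		// absorb it into the current "different" chunk and keep
		// scanning.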
		if ((q != q_end) && (n3 <= 2)) goto different;

		storeUleb(result, n2);
		result.insert(result.end(), q2, q3);

		if (n3 != 0) storeUleb(result, n3);
	}

	result.shrink_to_fit();
	return result;
}

// Apply a previously calculated 'delta' to 'oldBuf' to get 'newBuf'.
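// Note: this happens in place. On entry 'buf' holds the old content; the
// "equal" runs are simply skipped (those bytes are already correct) and only
// the "different" runs are overwritten with the bytes stored in the delta.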
static void applyDeltaInPlace(uint8_t* buf, size_t size, const uint8_t* delta)
{
	auto* end = buf + size;

	while (buf != end) {
		auto n1 = loadUleb(delta);
		buf += n1;
		if (buf == end) break;

		auto n2 = loadUleb(delta);
		memcpy(buf, delta, n2);
		buf += n2;
		delta += n2;
	}
}

#if STATISTICS

// class DeltaBlock

DeltaBlock::~DeltaBlock()
{
	globalAllocSize -= allocSize;
	std::cout << "stat: ~DeltaBlock " << globalAllocSize
	          << " (-" << allocSize << ")\n";
}

#endif

// class DeltaBlockCopy

DeltaBlockCopy::DeltaBlockCopy(const uint8_t* data, size_t size)
	: block(size)
	, compressedSize(0)
{
#ifdef DEBUG
	sha1 = SHA1::calc({data, size});
#endif
	memcpy(block.data(), data, size);
	assert(!compressed());
#if STATISTICS
	allocSize = size;
	globalAllocSize += allocSize;
	std::cout << "stat: DeltaBlockCopy " << globalAllocSize
	          << " (+" << allocSize << ")\n";
#endif
}

void DeltaBlockCopy::apply(uint8_t* dst, size_t size) const
{
	if (compressed()) {
		LZ4::decompress(block.data(), dst, int(compressedSize), int(size));
	} else {
		memcpy(dst, block.data(), size);
	}
#ifdef DEBUG
	assert(SHA1::calc({dst, size}) == sha1);
#endif
}

void DeltaBlockCopy::compress(size_t size)
{
	if (compressed()) return;

	size_t dstLen = LZ4::compressBound(int(size));
	MemBuffer<uint8_t> buf2(dstLen);
	dstLen = LZ4::compress(block.data(), buf2.data(), int(size));

	if (dstLen >= size) {
		// compression isn't beneficial
		return;
	}
	compressedSize = dstLen;
	block.swap(buf2);
	block.resize(compressedSize); // shrink to fit
	assert(compressed());
#ifdef DEBUG
	MemBuffer<uint8_t> buf3(size);
	apply(buf3.data(), size);
	assert(memcmp(buf3.data(), buf2.data(), size) == 0);
#endif
#if STATISTICS
	int delta = compressedSize - allocSize;
	allocSize = compressedSize;
	globalAllocSize += delta;
	std::cout << "stat: compress " << globalAllocSize
	          << " (" << delta << ")\n";
#endif
}

const uint8_t* DeltaBlockCopy::getData()
{
	assert(!compressed());
	return block.data();
}


// class DeltaBlockDiff

DeltaBlockDiff::DeltaBlockDiff(
	std::shared_ptr<DeltaBlockCopy> prev_,
	const uint8_t* data, size_t size)
	: prev(std::move(prev_))
	, delta(calcDelta(prev->getData(), data, size))
{
#ifdef DEBUG
	sha1 = SHA1::calc({data, size});

	MemBuffer<uint8_t> buf(size);
	apply(buf.data(), size);
	assert(memcmp(buf.data(), data, size) == 0);
#endif
#if STATISTICS
	allocSize = delta.size();
	globalAllocSize += allocSize;
	std::cout << "stat: DeltaBlockDiff " << globalAllocSize
	          << " (+" << allocSize << ")\n";
#endif
}

void DeltaBlockDiff::apply(uint8_t* dst, size_t size) const
{
	prev->apply(dst, size);
	applyDeltaInPlace(dst, size, delta.data());
#ifdef DEBUG
	assert(SHA1::calc({dst, size}) == sha1);
#endif
}

size_t DeltaBlockDiff::getDeltaSize() const
{
	return delta.size();
}


// class LastDeltaBlocks

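// Strategy: for each (id, size) combination keep one full snapshot
// (a DeltaBlockCopy) as the reference block, and store later snapshots as
// diffs (DeltaBlockDiff) against that reference. Once the accumulated size
// of those diffs reaches the size of the block itself, the old reference is
// compressed and a fresh full copy becomes the new reference.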
std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
	const void* id, const uint8_t* data, size_t size)
{
	auto it = ranges::lower_bound(infos, std::tuple(id, size), {},
		[](const Info& info) { return std::tuple(info.id, info.size); });
	if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
		// no previous info yet
		it = infos.emplace(it, id, size);
	}
	assert(it->id == id);
	assert(it->size == size);

	auto ref = it->ref.lock();
	if (it->accSize >= size || !ref) {
		if (ref) {
			// We will switch to a new DeltaBlockCopy object. So
			// now is a good time to compress the old one.
			ref->compress(size);
		}
		// Heuristic: create a new block when too many small
		// differences have accumulated.
		auto b = std::make_shared<DeltaBlockCopy>(data, size);
		it->ref = b;
		it->last = b;
		it->accSize = 0;
		return b;
	} else {
		// Create diff based on earlier reference block.
		// Reference remains unchanged.
		auto b = std::make_shared<DeltaBlockDiff>(ref, data, size);
		it->last = b;
		it->accSize += b->getDeltaSize();
		return b;
	}
}
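
// createNullDiff() is for the case where the caller knows the data for this
// 'id' has not changed since the previous snapshot: it hands out the previous
// block again (the DEBUG assert below double-checks that assumption) and only
// creates a fresh full copy when there is no previous block yet.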
std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
	const void* id, const uint8_t* data, size_t size)
{
	auto it = ranges::lower_bound(infos, id, {}, &Info::id);
	if ((it == end(infos)) || (it->id != id)) {
		// no previous block yet
		it = infos.emplace(it, id, size);
	}
	assert(it->size == size);

	auto last = it->last.lock();
	if (!last) {
		auto b = std::make_shared<DeltaBlockCopy>(data, size);
		it->ref = b;
		it->last = b;
		it->accSize = 0;
		return b;
	} else {
#ifdef DEBUG
		assert(SHA1::calc({data, size}) == last->sha1);
#endif
		return last;
	}
}

void LastDeltaBlocks::clear()
{
	for (const Info& info : infos) {
		if (auto ref = info.ref.lock()) {
			ref->compress(info.size);
		}
	}
	infos.clear();
}

} // namespace openmsx