openMSX
DeltaBlock.cc
Go to the documentation of this file.
1 #include "DeltaBlock.hh"
2 #include "likely.hh"
3 #include "ranges.hh"
4 #include "snappy.hh"
5 #include <cassert>
6 #include <cstring>
7 #include <tuple>
8 #include <utility>
9 #if STATISTICS
10 #include <iostream>
11 #endif
12 #ifdef __SSE2__
13 #include <emmintrin.h>
14 #endif
15 
16 namespace openmsx {
17 
18 using std::vector;
19 
20 // --- Compressed integers ---
21 
22 // See https://en.wikipedia.org/wiki/LEB128 for a description of the
23 // 'unsigned LEB' format.
24 static void storeUleb(vector<uint8_t>& result, size_t value)
25 {
26  do {
27  uint8_t b = value & 0x7F;
28  value >>= 7;
29  if (value) b |= 0x80;
30  result.push_back(b);
31  } while (value);
32 }
33 
34 static size_t loadUleb(const uint8_t*& data)
35 {
36  size_t result = 0;
37  int shift = 0;
38  while (true) {
39  uint8_t b = *data++;
40  result |= size_t(b & 0x7F) << shift;
41  if ((b & 0x80) == 0) return result;
42  shift += 7;
43  }
44 }
45 
46 
47 // --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---
48 
49 template<int N> bool comp(const uint8_t* p, const uint8_t* q);
50 
51 template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
52 {
53  return *reinterpret_cast<const uint32_t*>(p) ==
54  *reinterpret_cast<const uint32_t*>(q);
55 }
56 
57 template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
58 {
59  return *reinterpret_cast<const uint64_t*>(p) ==
60  *reinterpret_cast<const uint64_t*>(q);
61 }
62 
63 #ifdef __SSE2__
64 template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
65 {
66  // Tests show that (on my machine) using 1 128-bit load is faster than
67  // 2 64-bit loads. Even though the actual comparison is slightly more
68  // complicated with SSE instructions.
69  __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
70  __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
71  __m128i d = _mm_cmpeq_epi8(a, b);
72  return _mm_movemask_epi8(d) == 0xffff;
73 }
74 #endif
75 
76 
77 // --- Optimized mismatch function ---
78 
79 // This is much like the function std::mismatch(). You pass in two buffers,
80 // the corresponding elements of both buffers are compared and the first
81 // position where the elements no longer match is returned.
82 //
83 // Compared to the std::mismatch() this implementation is faster because:
84 // - We make use of sentinels. This requires to temporarily change the content
85 // of the buffer. So it won't work with read-only-memory.
86 // - We compare words-at-a-time instead of byte-at-a-time.
87 static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
88  const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
89 {
90  assert((p_end - p) == (q_end - q));
91 
92  // When SSE is available, work with 16-byte words, otherwise 4 or 8
93  // bytes. This routine also benefits from AVX2 instructions. Though not
94  // all x86_64 CPUs have AVX2 (all have SSE2), so using them requires
95  // extra run-time checks and that's not worth it at this point.
96  static const int WORD_SIZE =
97 #ifdef __SSE2__
98  sizeof(__m128i);
99 #else
100  sizeof(void*);
101 #endif
102 
103  // Region too small or
104  // both buffers are differently aligned.
105  if (unlikely((p_end - p) < (2 * WORD_SIZE)) ||
106  unlikely((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
107  (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) {
108  goto end;
109  }
110 
111  // Align to WORD_SIZE boundary. No need for end-of-buffer checks.
112  if (unlikely(reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1))) {
113  do {
114  if (*p != *q) return {p, q};
115  p += 1; q += 1;
116  } while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
117  }
118 
119  // Fast path. Compare words-at-a-time.
120  {
121  // Place a sentinel in the last full word. This ensures we'll
122  // find a mismatch within the buffer, and so we can omit the
123  // end-of-buffer checks.
124  auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
125  auto save = *sentinel;
126  *sentinel = ~q_end[-WORD_SIZE];
127 
128  while (comp<WORD_SIZE>(p, q)) {
129  p += WORD_SIZE; q += WORD_SIZE;
130  }
131 
132  // Restore sentinel.
133  *sentinel = save;
134  }
135 
136  // Slow path. This handles:
137  // - Small or differently aligned buffers.
138  // - The bytes at and after the (restored) sentinel.
139 end: return std::mismatch(p, p_end, q);
140 }
141 
142 
143 // --- Optimized scan_match function ---
144 
145 // Like scan_mismatch() above, but searches two buffers for the first
146 // corresponding equal (instead of not-equal) bytes.
147 //
148 // Like scan_mismatch(), this places a temporary sentinel in the buffer, so the
149 // buffer cannot be read-only memory.
150 //
151 // Unlike scan_mismatch() it's less obvious how to perform this function
152 // word-at-a-time (it's possible with some bit hacks). Though luckily this
153 // function is also less performance critical.
154 static std::pair<const uint8_t*, const uint8_t*> scan_match(
155  const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
156 {
157  assert((p_end - p) == (q_end - q));
158 
159  // Code below is functionally equivalent to:
160  // while ((p != p_end) && (*p != *q)) { ++p; ++q; }
161  // return {p, q};
162 
163  if (p == p_end) return {p, q};
164 
165  auto* p_last = const_cast<uint8_t*>(p_end - 1);
166  auto save = *p_last;
167  *p_last = q_end[-1]; // make p_end[-1] == q_end[-1]
168 
169  while (*p != *q) { ++p; ++q; }
170 
171  *p_last = save;
172  if ((p == p_last) && (*p != *q)) { ++p; ++q; }
173  return {p, q};
174 }
175 
176 
177 // --- delta (de)compression routines ---
178 
179 // Calculate a 'delta' between two binary buffers of equal size.
180 // The result is a stream of:
181 // n1 number of bytes are equal
182 // n2 number of bytes are different, and here are the bytes
183 // n3 number of bytes are equal
184 // ...
185 static vector<uint8_t> calcDelta(const uint8_t* oldBuf, const uint8_t* newBuf, size_t size)
186 {
187  vector<uint8_t> result;
188 
189  auto* p = oldBuf;
190  auto* q = newBuf;
191  auto* p_end = p + size;
192  auto* q_end = q + size;
193 
194  // scan equal bytes (possibly zero)
195  auto* q1 = q;
196  std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
197  auto n1 = q - q1;
198  storeUleb(result, n1);
199 
200  while (q != q_end) {
201  assert(*p != *q);
202 
203  auto* q2 = q;
204  different:
205  std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
206  auto n2 = q - q2;
207 
208  auto* q3 = q;
209  std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
210  auto n3 = q - q3;
211  if ((q != q_end) && (n3 <= 2)) goto different;
212 
213  storeUleb(result, n2);
214  result.insert(result.end(), q2, q3);
215 
216  if (n3 != 0) storeUleb(result, n3);
217  }
218 
219  result.shrink_to_fit();
220  return result;
221 }
222 
223 // Apply a previously calculated 'delta' to 'oldBuf' to get 'newbuf'.
224 static void applyDeltaInPlace(uint8_t* buf, size_t size, const uint8_t* delta)
225 {
226  auto* end = buf + size;
227 
228  while (buf != end) {
229  auto n1 = loadUleb(delta);
230  buf += n1;
231  if (buf == end) break;
232 
233  auto n2 = loadUleb(delta);
234  memcpy(buf, delta, n2);
235  buf += n2;
236  delta += n2;
237  }
238 }
239 
240 #if STATISTICS
241 
242 // class DeltaBlock
243 
244 size_t DeltaBlock::globalAllocSize = 0;
245 
247 {
248  globalAllocSize -= allocSize;
249  std::cout << "stat: ~DeltaBlock " << globalAllocSize
250  << " (-" << allocSize << ")\n";
251 }
252 
253 #endif
254 
255 // class DeltaBlockCopy
256 
257 DeltaBlockCopy::DeltaBlockCopy(const uint8_t* data, size_t size)
258  : block(size)
259  , compressedSize(0)
260 {
261 #ifdef DEBUG
262  sha1 = SHA1::calc(data, size);
263 #endif
264  memcpy(block.data(), data, size);
265  assert(!compressed());
266 #if STATISTICS
267  allocSize = size;
268  globalAllocSize += allocSize;
269  std::cout << "stat: DeltaBlockCopy " << globalAllocSize
270  << " (+" << allocSize << ")\n";
271 #endif
272 }
273 
274 void DeltaBlockCopy::apply(uint8_t* dst, size_t size) const
275 {
276  if (compressed()) {
278  reinterpret_cast<const char*>(block.data()), compressedSize,
279  reinterpret_cast<char*>(dst), size);
280  } else {
281  memcpy(dst, block.data(), size);
282  }
283 #ifdef DEBUG
284  assert(SHA1::calc(dst, size) == sha1);
285 #endif
286 }
287 
288 void DeltaBlockCopy::compress(size_t size)
289 {
290  if (compressed()) return;
291 
292  size_t dstLen = snappy::maxCompressedLength(size);
293  MemBuffer<uint8_t> buf2(dstLen);
294  snappy::compress(reinterpret_cast<const char*>(block.data()), size,
295  reinterpret_cast<char*>(buf2.data()), dstLen);
296  if (dstLen >= size) {
297  // compression isn't beneficial
298  return;
299  }
300  compressedSize = dstLen;
301  block.swap(buf2);
302  block.resize(compressedSize); // shrink to fit
303  assert(compressed());
304 #ifdef DEBUG
305  MemBuffer<uint8_t> buf3(size);
306  apply(buf3.data(), size);
307  assert(memcmp(buf3.data(), buf2.data(), size) == 0);
308 #endif
309 #if STATISTICS
310  int delta = compressedSize - allocSize;
311  allocSize = compressedSize;
312  globalAllocSize += delta;
313  std::cout << "stat: compress " << globalAllocSize
314  << " (" << delta << ")\n";
315 #endif
316 }
317 
318 const uint8_t* DeltaBlockCopy::getData()
319 {
320  assert(!compressed());
321  return block.data();
322 }
323 
324 
325 // class DeltaBlockDiff
326 
328  std::shared_ptr<DeltaBlockCopy> prev_,
329  const uint8_t* data, size_t size)
330  : prev(std::move(prev_))
331  , delta(calcDelta(prev->getData(), data, size))
332 {
333 #ifdef DEBUG
334  sha1 = SHA1::calc(data, size);
335 
336  MemBuffer<uint8_t> buf(size);
337  apply(buf.data(), size);
338  assert(memcmp(buf.data(), data, size) == 0);
339 #endif
340 #if STATISTICS
341  allocSize = delta.size();
342  globalAllocSize += allocSize;
343  std::cout << "stat: DeltaBlockDiff " << globalAllocSize
344  << " (+" << allocSize << ")\n";
345 #endif
346 }
347 
348 void DeltaBlockDiff::apply(uint8_t* dst, size_t size) const
349 {
350  prev->apply(dst, size);
351  applyDeltaInPlace(dst, size, delta.data());
352 #ifdef DEBUG
353  assert(SHA1::calc(dst, size) == sha1);
354 #endif
355 }
356 
358 {
359  return delta.size();
360 }
361 
362 
363 // class LastDeltaBlocks
364 
365 std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
366  const void* id, const uint8_t* data, size_t size)
367 {
368  auto it = ranges::lower_bound(infos, std::make_tuple(id, size),
369  [](const Info& info, const std::tuple<const void*, size_t>& info2) {
370  return std::make_tuple(info.id, info.size) < info2; });
371  if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
372  // no previous info yet
373  it = infos.emplace(it, id, size);
374  }
375  assert(it->id == id);
376  assert(it->size == size);
377 
378  auto ref = it->ref.lock();
379  if (it->accSize >= size || !ref) {
380  if (ref) {
381  // We will switch to a new DeltaBlockCopy object. So
382  // now is a good time to compress the old one.
383  ref->compress(size);
384  }
385  // Heuristic: create a new block when too many small
386  // differences have accumulated.
387  auto b = std::make_shared<DeltaBlockCopy>(data, size);
388  it->ref = b;
389  it->last = b;
390  it->accSize = 0;
391  return b;
392  } else {
393  // Create diff based on earlier reference block.
394  // Reference remains unchanged.
395  auto b = std::make_shared<DeltaBlockDiff>(ref, data, size);
396  it->last = b;
397  it->accSize += b->getDeltaSize();
398  return b;
399  }
400 }
401 
402 std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
403  const void* id, const uint8_t* data, size_t size)
404 {
405  auto it = ranges::lower_bound(infos, id,
406  [](const Info& info, const void* id2) {
407  return info.id < id2; });
408  if ((it == end(infos)) || (it->id != id)) {
409  // no previous block yet
410  it = infos.emplace(it, id, size);
411  }
412  assert(it->size == size);
413 
414  auto last = it->last.lock();
415  if (!last) {
416  auto b = std::make_shared<DeltaBlockCopy>(data, size);
417  it->ref = b;
418  it->last = b;
419  it->accSize = 0;
420  return b;
421  } else {
422 #ifdef DEBUG
423  assert(SHA1::calc(data, size) == last->sha1);
424 #endif
425  return last;
426  }
427 }
428 
430 {
431  for (const Info& info : infos) {
432  if (auto ref = info.ref.lock()) {
433  ref->compress(info.size);
434  }
435  }
436  infos.clear();
437 }
438 
439 } // namespace openmsx
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:348
size_t maxCompressedLength(size_t inLen)
Definition: snappy.cc:616
size_t getDeltaSize() const
Definition: DeltaBlock.cc:357
#define unlikely(x)
Definition: likely.hh:15
bool comp< 8 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:57
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
DeltaBlockCopy(const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:257
STL namespace.
static Sha1Sum calc(const uint8_t *data, size_t len)
Easier to use interface, if you can pass all data in one go.
Definition: utils/sha1.cc:361
void swap(MemBuffer &other) noexcept
Swap the managed memory block of two MemBuffers.
Definition: MemBuffer.hh:145
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:274
void compress(const char *input, size_t inLen, char *output, size_t &outLen)
Definition: snappy.cc:603
constexpr auto data(C &c) -> decltype(c.data())
Definition: span.hh:69
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:120
bool comp(const uint8_t *p, const uint8_t *q)
void uncompress(const char *input, size_t inLen, char *output, size_t outLen)
Definition: snappy.cc:166
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:90
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
void compress(size_t size)
Definition: DeltaBlock.cc:288
std::shared_ptr< DeltaBlock > createNullDiff(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:402
auto lower_bound(ForwardRange &&range, const T &value)
Definition: ranges.hh:71
bool comp< 4 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:51
constexpr auto size(const C &c) -> decltype(c.size())
Definition: span.hh:62
DeltaBlockDiff(std::shared_ptr< DeltaBlockCopy > prev_, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:327
std::shared_ptr< DeltaBlock > createNew(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:365
auto end(const string_view &x)
Definition: string_view.hh:152
virtual ~DeltaBlock()=default
const uint8_t * getData()
Definition: DeltaBlock.cc:318