openMSX
DeltaBlock.cc
Go to the documentation of this file.
1 #include "DeltaBlock.hh"
2 #include "likely.hh"
3 #include "ranges.hh"
4 #include "lz4.hh"
5 #include <cassert>
6 #include <cstring>
7 #include <tuple>
8 #include <utility>
9 #if STATISTICS
10 #include <iostream>
11 #endif
12 #ifdef __SSE2__
13 #include <emmintrin.h>
14 #endif
15 
16 namespace openmsx {
17 
18 using std::vector;
19 
20 // --- Compressed integers ---
21 
22 // See https://en.wikipedia.org/wiki/LEB128 for a description of the
23 // 'unsigned LEB' format.
24 static void storeUleb(vector<uint8_t>& result, size_t value)
25 {
26  do {
27  uint8_t b = value & 0x7F;
28  value >>= 7;
29  if (value) b |= 0x80;
30  result.push_back(b);
31  } while (value);
32 }
33 
34 static size_t loadUleb(const uint8_t*& data)
35 {
36  size_t result = 0;
37  int shift = 0;
38  while (true) {
39  uint8_t b = *data++;
40  result |= size_t(b & 0x7F) << shift;
41  if ((b & 0x80) == 0) return result;
42  shift += 7;
43  }
44 }
45 
46 
47 // --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---
48 
49 template<int N> bool comp(const uint8_t* p, const uint8_t* q);
50 
51 template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
52 {
53  return *reinterpret_cast<const uint32_t*>(p) ==
54  *reinterpret_cast<const uint32_t*>(q);
55 }
56 
57 template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
58 {
59  return *reinterpret_cast<const uint64_t*>(p) ==
60  *reinterpret_cast<const uint64_t*>(q);
61 }
62 
63 #ifdef __SSE2__
64 template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
65 {
66  // Tests show that (on my machine) using 1 128-bit load is faster than
67  // 2 64-bit loads. Even though the actual comparison is slightly more
68  // complicated with SSE instructions.
69  __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
70  __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
71  __m128i d = _mm_cmpeq_epi8(a, b);
72  return _mm_movemask_epi8(d) == 0xffff;
73 }
74 #endif
75 
76 
77 // --- Optimized mismatch function ---
78 
79 // This is much like the function std::mismatch(). You pass in two buffers,
80 // the corresponding elements of both buffers are compared and the first
81 // position where the elements no longer match is returned.
82 //
83 // Compared to the std::mismatch() this implementation is faster because:
84 // - We make use of sentinels. This requires to temporarily change the content
85 // of the buffer. So it won't work with read-only-memory.
86 // - We compare words-at-a-time instead of byte-at-a-time.
87 static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
88  const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
89 {
90  assert((p_end - p) == (q_end - q));
91 
92  // When SSE is available, work with 16-byte words, otherwise 4 or 8
93  // bytes. This routine also benefits from AVX2 instructions. Though not
94  // all x86_64 CPUs have AVX2 (all have SSE2), so using them requires
95  // extra run-time checks and that's not worth it at this point.
96  constexpr int WORD_SIZE =
97 #ifdef __SSE2__
98  sizeof(__m128i);
99 #else
100  sizeof(void*);
101 #endif
102 
103  // Region too small or
104  // both buffers are differently aligned.
105  if (unlikely((p_end - p) < (2 * WORD_SIZE)) ||
106  unlikely((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
107  (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) {
108  goto end;
109  }
110 
111  // Align to WORD_SIZE boundary. No need for end-of-buffer checks.
112  if (unlikely(reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1))) {
113  do {
114  if (*p != *q) return {p, q};
115  p += 1; q += 1;
116  } while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
117  }
118 
119  // Fast path. Compare words-at-a-time.
120  {
121  // Place a sentinel in the last full word. This ensures we'll
122  // find a mismatch within the buffer, and so we can omit the
123  // end-of-buffer checks.
124  auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
125  auto save = *sentinel;
126  *sentinel = ~q_end[-WORD_SIZE];
127 
128  while (comp<WORD_SIZE>(p, q)) {
129  p += WORD_SIZE; q += WORD_SIZE;
130  }
131 
132  // Restore sentinel.
133  *sentinel = save;
134  }
135 
136  // Slow path. This handles:
137  // - Small or differently aligned buffers.
138  // - The bytes at and after the (restored) sentinel.
139 end: return std::mismatch(p, p_end, q);
140 }
141 
142 
143 // --- Optimized scan_match function ---
144 
145 // Like scan_mismatch() above, but searches two buffers for the first
146 // corresponding equal (instead of not-equal) bytes.
147 //
148 // Like scan_mismatch(), this places a temporary sentinel in the buffer, so the
149 // buffer cannot be read-only memory.
150 //
151 // Unlike scan_mismatch() it's less obvious how to perform this function
152 // word-at-a-time (it's possible with some bit hacks). Though luckily this
153 // function is also less performance critical.
154 static std::pair<const uint8_t*, const uint8_t*> scan_match(
155  const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
156 {
157  assert((p_end - p) == (q_end - q));
158 
159  // Code below is functionally equivalent to:
160  // while ((p != p_end) && (*p != *q)) { ++p; ++q; }
161  // return {p, q};
162 
163  if (p == p_end) return {p, q};
164 
165  auto* p_last = const_cast<uint8_t*>(p_end - 1);
166  auto save = *p_last;
167  *p_last = q_end[-1]; // make p_end[-1] == q_end[-1]
168 
169  while (*p != *q) { ++p; ++q; }
170 
171  *p_last = save;
172  if ((p == p_last) && (*p != *q)) { ++p; ++q; }
173  return {p, q};
174 }
175 
176 
177 // --- delta (de)compression routines ---
178 
179 // Calculate a 'delta' between two binary buffers of equal size.
180 // The result is a stream of:
181 // n1 number of bytes are equal
182 // n2 number of bytes are different, and here are the bytes
183 // n3 number of bytes are equal
184 // ...
185 static vector<uint8_t> calcDelta(const uint8_t* oldBuf, const uint8_t* newBuf, size_t size)
186 {
187  vector<uint8_t> result;
188 
189  auto* p = oldBuf;
190  auto* q = newBuf;
191  auto* p_end = p + size;
192  auto* q_end = q + size;
193 
194  // scan equal bytes (possibly zero)
195  auto* q1 = q;
196  std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
197  auto n1 = q - q1;
198  storeUleb(result, n1);
199 
200  while (q != q_end) {
201  assert(*p != *q);
202 
203  auto* q2 = q;
204  different:
205  std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
206  auto n2 = q - q2;
207 
208  auto* q3 = q;
209  std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
210  auto n3 = q - q3;
211  if ((q != q_end) && (n3 <= 2)) goto different;
212 
213  storeUleb(result, n2);
214  result.insert(result.end(), q2, q3);
215 
216  if (n3 != 0) storeUleb(result, n3);
217  }
218 
219  result.shrink_to_fit();
220  return result;
221 }
222 
223 // Apply a previously calculated 'delta' to 'oldBuf' to get 'newbuf'.
224 static void applyDeltaInPlace(uint8_t* buf, size_t size, const uint8_t* delta)
225 {
226  auto* end = buf + size;
227 
228  while (buf != end) {
229  auto n1 = loadUleb(delta);
230  buf += n1;
231  if (buf == end) break;
232 
233  auto n2 = loadUleb(delta);
234  memcpy(buf, delta, n2);
235  buf += n2;
236  delta += n2;
237  }
238 }
239 
240 #if STATISTICS
241 
242 // class DeltaBlock
243 
245 {
246  globalAllocSize -= allocSize;
247  std::cout << "stat: ~DeltaBlock " << globalAllocSize
248  << " (-" << allocSize << ")\n";
249 }
250 
251 #endif
252 
253 // class DeltaBlockCopy
254 
255 DeltaBlockCopy::DeltaBlockCopy(const uint8_t* data, size_t size)
256  : block(size)
257  , compressedSize(0)
258 {
259 #ifdef DEBUG
260  sha1 = SHA1::calc(data, size);
261 #endif
262  memcpy(block.data(), data, size);
263  assert(!compressed());
264 #if STATISTICS
265  allocSize = size;
266  globalAllocSize += allocSize;
267  std::cout << "stat: DeltaBlockCopy " << globalAllocSize
268  << " (+" << allocSize << ")\n";
269 #endif
270 }
271 
272 void DeltaBlockCopy::apply(uint8_t* dst, size_t size) const
273 {
274  if (compressed()) {
275  LZ4::decompress(block.data(), dst, int(compressedSize), int(size));
276  } else {
277  memcpy(dst, block.data(), size);
278  }
279 #ifdef DEBUG
280  assert(SHA1::calc(dst, size) == sha1);
281 #endif
282 }
283 
285 {
286  if (compressed()) return;
287 
288  size_t dstLen = LZ4::compressBound(size);
289  MemBuffer<uint8_t> buf2(dstLen);
290  dstLen = LZ4::compress(block.data(), buf2.data(), int(size));
291 
292  if (dstLen >= size) {
293  // compression isn't beneficial
294  return;
295  }
296  compressedSize = dstLen;
297  block.swap(buf2);
298  block.resize(compressedSize); // shrink to fit
299  assert(compressed());
300 #ifdef DEBUG
301  MemBuffer<uint8_t> buf3(size);
302  apply(buf3.data(), size);
303  assert(memcmp(buf3.data(), buf2.data(), size) == 0);
304 #endif
305 #if STATISTICS
306  int delta = compressedSize - allocSize;
307  allocSize = compressedSize;
308  globalAllocSize += delta;
309  std::cout << "stat: compress " << globalAllocSize
310  << " (" << delta << ")\n";
311 #endif
312 }
313 
314 const uint8_t* DeltaBlockCopy::getData()
315 {
316  assert(!compressed());
317  return block.data();
318 }
319 
320 
321 // class DeltaBlockDiff
322 
324  std::shared_ptr<DeltaBlockCopy> prev_,
325  const uint8_t* data, size_t size)
326  : prev(std::move(prev_))
327  , delta(calcDelta(prev->getData(), data, size))
328 {
329 #ifdef DEBUG
330  sha1 = SHA1::calc(data, size);
331 
333  apply(buf.data(), size);
334  assert(memcmp(buf.data(), data, size) == 0);
335 #endif
336 #if STATISTICS
337  allocSize = delta.size();
338  globalAllocSize += allocSize;
339  std::cout << "stat: DeltaBlockDiff " << globalAllocSize
340  << " (+" << allocSize << ")\n";
341 #endif
342 }
343 
344 void DeltaBlockDiff::apply(uint8_t* dst, size_t size) const
345 {
346  prev->apply(dst, size);
347  applyDeltaInPlace(dst, size, delta.data());
348 #ifdef DEBUG
349  assert(SHA1::calc(dst, size) == sha1);
350 #endif
351 }
352 
354 {
355  return delta.size();
356 }
357 
358 
359 // class LastDeltaBlocks
360 
361 std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
362  const void* id, const uint8_t* data, size_t size)
363 {
364  auto it = ranges::lower_bound(infos, std::tuple(id, size),
365  [](const Info& info, const std::tuple<const void*, size_t>& info2) {
366  return std::tuple(info.id, info.size) < info2; });
367  if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
368  // no previous info yet
369  it = infos.emplace(it, id, size);
370  }
371  assert(it->id == id);
372  assert(it->size == size);
373 
374  auto ref = it->ref.lock();
375  if (it->accSize >= size || !ref) {
376  if (ref) {
377  // We will switch to a new DeltaBlockCopy object. So
378  // now is a good time to compress the old one.
379  ref->compress(size);
380  }
381  // Heuristic: create a new block when too many small
382  // differences have accumulated.
383  auto b = std::make_shared<DeltaBlockCopy>(data, size);
384  it->ref = b;
385  it->last = b;
386  it->accSize = 0;
387  return b;
388  } else {
389  // Create diff based on earlier reference block.
390  // Reference remains unchanged.
391  auto b = std::make_shared<DeltaBlockDiff>(ref, data, size);
392  it->last = b;
393  it->accSize += b->getDeltaSize();
394  return b;
395  }
396 }
397 
398 std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
399  const void* id, const uint8_t* data, size_t size)
400 {
401  auto it = ranges::lower_bound(infos, id,
402  [](const Info& info, const void* id2) {
403  return info.id < id2; });
404  if ((it == end(infos)) || (it->id != id)) {
405  // no previous block yet
406  it = infos.emplace(it, id, size);
407  }
408  assert(it->size == size);
409 
410  auto last = it->last.lock();
411  if (!last) {
412  auto b = std::make_shared<DeltaBlockCopy>(data, size);
413  it->ref = b;
414  it->last = b;
415  it->accSize = 0;
416  return b;
417  } else {
418 #ifdef DEBUG
419  assert(SHA1::calc(data, size) == last->sha1);
420 #endif
421  return last;
422  }
423 }
424 
426 {
427  for (const Info& info : infos) {
428  if (auto ref = info.ref.lock()) {
429  ref->compress(info.size);
430  }
431  }
432  infos.clear();
433 }
434 
435 } // namespace openmsx
openmsx::comp< 8 >
bool comp< 8 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:57
n3
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
openmsx::MemBuffer::swap
void swap(MemBuffer &other) noexcept
Swap the managed memory block of two MemBuffers.
Definition: MemBuffer.hh:145
openmsx::PNG::save
void save(unsigned width, unsigned height, const void **rowPointers, const PixelFormat &format, const std::string &filename)
Definition: PNG.cc:381
unlikely
#define unlikely(x)
Definition: likely.hh:15
openmsx::LastDeltaBlocks::clear
void clear()
Definition: DeltaBlock.cc:425
openmsx::DeltaBlockDiff::DeltaBlockDiff
DeltaBlockDiff(std::shared_ptr< DeltaBlockCopy > prev_, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:323
utf8::unchecked::size
size_t size(std::string_view utf8)
Definition: utf8_unchecked.hh:227
openmsx::DeltaBlockCopy::DeltaBlockCopy
DeltaBlockCopy(const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:255
openmsx::DeltaBlockCopy::compress
void compress(size_t size)
Definition: DeltaBlock.cc:284
ranges::lower_bound
auto lower_bound(ForwardRange &&range, const T &value)
Definition: ranges.hh:71
openmsx::DeltaBlock::~DeltaBlock
virtual ~DeltaBlock()=default
openmsx::DeltaBlockDiff::getDeltaSize
size_t getDeltaSize() const
Definition: DeltaBlock.cc:353
ranges.hh
likely.hh
DeltaBlock.hh
openmsx::DeltaBlockCopy::getData
const uint8_t * getData()
Definition: DeltaBlock.cc:314
openmsx::comp
bool comp(const uint8_t *p, const uint8_t *q)
openmsx::MemBuffer::resize
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:120
openmsx::MemBuffer< uint8_t >
openmsx::DeltaBlockDiff::apply
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:344
lz4.hh
LZ4::compressBound
int compressBound(int isize)
Definition: lz4.hh:51
openmsx::SHA1::calc
static Sha1Sum calc(const uint8_t *data, size_t len)
Easier to use interface, if you can pass all data in one go.
Definition: utils/sha1.cc:361
openmsx::MemBuffer::data
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:90
openmsx::LastDeltaBlocks::createNullDiff
std::shared_ptr< DeltaBlock > createNullDiff(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:398
LZ4::decompress
int decompress(const uint8_t *src, uint8_t *dst, int compressedSize, int dstCapacity)
Definition: lz4.cc:540
openmsx::DeltaBlockCopy::apply
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:272
openmsx
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
openmsx::LastDeltaBlocks::createNew
std::shared_ptr< DeltaBlock > createNew(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:361
LZ4::compress
int compress(const uint8_t *src, uint8_t *dst, int srcSize)
Definition: lz4.cc:516
openmsx::comp< 4 >
bool comp< 4 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:51