openMSX
DeltaBlock.cc
Go to the documentation of this file.
1 #include "DeltaBlock.hh"
2 #include "ranges.hh"
3 #include "lz4.hh"
4 #include <cassert>
5 #include <cstring>
6 #include <tuple>
7 #include <utility>
8 #if STATISTICS
9 #include <iostream>
10 #endif
11 #ifdef __SSE2__
12 #include <emmintrin.h>
13 #endif
14 
15 namespace openmsx {
16 
17 // --- Compressed integers ---
18 
19 // See https://en.wikipedia.org/wiki/LEB128 for a description of the
20 // 'unsigned LEB' format.
21 static void storeUleb(std::vector<uint8_t>& result, size_t value)
22 {
23  do {
24  uint8_t b = value & 0x7F;
25  value >>= 7;
26  if (value) b |= 0x80;
27  result.push_back(b);
28  } while (value);
29 }
30 
31 [[nodiscard]] static constexpr size_t loadUleb(const uint8_t*& data)
32 {
33  size_t result = 0;
34  int shift = 0;
35  while (true) {
36  uint8_t b = *data++;
37  result |= size_t(b & 0x7F) << shift;
38  if ((b & 0x80) == 0) return result;
39  shift += 7;
40  }
41 }
42 
43 
44 // --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---
45 
46 template<int N> bool comp(const uint8_t* p, const uint8_t* q);
47 
48 template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
49 {
50  return *reinterpret_cast<const uint32_t*>(p) ==
51  *reinterpret_cast<const uint32_t*>(q);
52 }
53 
54 template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
55 {
56  return *reinterpret_cast<const uint64_t*>(p) ==
57  *reinterpret_cast<const uint64_t*>(q);
58 }
59 
60 #ifdef __SSE2__
61 template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
62 {
63  // Tests show that (on my machine) using 1 128-bit load is faster than
64  // 2 64-bit loads. Even though the actual comparison is slightly more
65  // complicated with SSE instructions.
66  __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
67  __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
68  __m128i d = _mm_cmpeq_epi8(a, b);
69  return _mm_movemask_epi8(d) == 0xffff;
70 }
71 #endif
72 
73 
74 // --- Optimized mismatch function ---
75 
76 // This is much like the function std::mismatch(). You pass in two buffers,
77 // the corresponding elements of both buffers are compared and the first
78 // position where the elements no longer match is returned.
79 //
80 // Compared to the std::mismatch() this implementation is faster because:
81 // - We make use of sentinels. This requires to temporarily change the content
82 // of the buffer. So it won't work with read-only-memory.
83 // - We compare words-at-a-time instead of byte-at-a-time.
84 static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
85  const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
86 {
87  assert((p_end - p) == (q_end - q));
88 
89  // When SSE is available, work with 16-byte words, otherwise 4 or 8
90  // bytes. This routine also benefits from AVX2 instructions. Though not
91  // all x86_64 CPUs have AVX2 (all have SSE2), so using them requires
92  // extra run-time checks and that's not worth it at this point.
93  constexpr int WORD_SIZE =
94 #ifdef __SSE2__
95  sizeof(__m128i);
96 #else
97  sizeof(void*);
98 #endif
99 
100  // Region too small or
101  // both buffers are differently aligned.
102  if (((p_end - p) < (2 * WORD_SIZE)) ||
103  ((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
104  (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) [[unlikely]] {
105  goto end;
106  }
107 
108  // Align to WORD_SIZE boundary. No need for end-of-buffer checks.
109  if (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) [[unlikely]] {
110  do {
111  if (*p != *q) return {p, q};
112  p += 1; q += 1;
113  } while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
114  }
115 
116  // Fast path. Compare words-at-a-time.
117  {
118  // Place a sentinel in the last full word. This ensures we'll
119  // find a mismatch within the buffer, and so we can omit the
120  // end-of-buffer checks.
121  auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
122  auto save = *sentinel;
123  *sentinel = ~q_end[-WORD_SIZE];
124 
125  while (comp<WORD_SIZE>(p, q)) {
126  p += WORD_SIZE; q += WORD_SIZE;
127  }
128 
129  // Restore sentinel.
130  *sentinel = save;
131  }
132 
133  // Slow path. This handles:
134  // - Small or differently aligned buffers.
135  // - The bytes at and after the (restored) sentinel.
136 end: return std::mismatch(p, p_end, q);
137 }
138 
139 
140 // --- Optimized scan_match function ---
141 
142 // Like scan_mismatch() above, but searches two buffers for the first
143 // corresponding equal (instead of not-equal) bytes.
144 //
145 // Like scan_mismatch(), this places a temporary sentinel in the buffer, so the
146 // buffer cannot be read-only memory.
147 //
148 // Unlike scan_mismatch() it's less obvious how to perform this function
149 // word-at-a-time (it's possible with some bit hacks). Though luckily this
150 // function is also less performance critical.
151 [[nodiscard]] static constexpr std::pair<const uint8_t*, const uint8_t*> scan_match(
152  const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
153 {
154  assert((p_end - p) == (q_end - q));
155 
156  // Code below is functionally equivalent to:
157  // while ((p != p_end) && (*p != *q)) { ++p; ++q; }
158  // return {p, q};
159 
160  if (p == p_end) return {p, q};
161 
162  auto* p_last = const_cast<uint8_t*>(p_end - 1);
163  auto save = *p_last;
164  *p_last = q_end[-1]; // make p_end[-1] == q_end[-1]
165 
166  while (*p != *q) { ++p; ++q; }
167 
168  *p_last = save;
169  if ((p == p_last) && (*p != *q)) { ++p; ++q; }
170  return {p, q};
171 }
172 
173 
174 // --- delta (de)compression routines ---
175 
176 // Calculate a 'delta' between two binary buffers of equal size.
177 // The result is a stream of:
178 // n1 number of bytes are equal
179 // n2 number of bytes are different, and here are the bytes
180 // n3 number of bytes are equal
181 // ...
182 [[nodiscard]] static std::vector<uint8_t> calcDelta(
183  const uint8_t* oldBuf, const uint8_t* newBuf, size_t size)
184 {
185  std::vector<uint8_t> result;
186 
187  const auto* p = oldBuf;
188  const auto* q = newBuf;
189  const auto* p_end = p + size;
190  const auto* q_end = q + size;
191 
192  // scan equal bytes (possibly zero)
193  const auto* q1 = q;
194  std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
195  auto n1 = q - q1;
196  storeUleb(result, n1);
197 
198  while (q != q_end) {
199  assert(*p != *q);
200 
201  const auto* q2 = q;
202  different:
203  std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
204  auto n2 = q - q2;
205 
206  const auto* q3 = q;
207  std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
208  auto n3 = q - q3;
209  if ((q != q_end) && (n3 <= 2)) goto different;
210 
211  storeUleb(result, n2);
212  result.insert(result.end(), q2, q3);
213 
214  if (n3 != 0) storeUleb(result, n3);
215  }
216 
217  result.shrink_to_fit();
218  return result;
219 }
220 
221 // Apply a previously calculated 'delta' to 'oldBuf' to get 'newbuf'.
222 static void applyDeltaInPlace(uint8_t* buf, size_t size, const uint8_t* delta)
223 {
224  auto* end = buf + size;
225 
226  while (buf != end) {
227  auto n1 = loadUleb(delta);
228  buf += n1;
229  if (buf == end) break;
230 
231  auto n2 = loadUleb(delta);
232  memcpy(buf, delta, n2);
233  buf += n2;
234  delta += n2;
235  }
236 }
237 
238 #if STATISTICS
239 
240 // class DeltaBlock
241 
243 {
244  globalAllocSize -= allocSize;
245  std::cout << "stat: ~DeltaBlock " << globalAllocSize
246  << " (-" << allocSize << ")\n";
247 }
248 
249 #endif
250 
251 // class DeltaBlockCopy
252 
253 DeltaBlockCopy::DeltaBlockCopy(const uint8_t* data, size_t size)
254  : block(size)
255  , compressedSize(0)
256 {
257 #ifdef DEBUG
258  sha1 = SHA1::calc({data, size});
259 #endif
260  memcpy(block.data(), data, size);
261  assert(!compressed());
262 #if STATISTICS
263  allocSize = size;
264  globalAllocSize += allocSize;
265  std::cout << "stat: DeltaBlockCopy " << globalAllocSize
266  << " (+" << allocSize << ")\n";
267 #endif
268 }
269 
270 void DeltaBlockCopy::apply(uint8_t* dst, size_t size) const
271 {
272  if (compressed()) {
273  LZ4::decompress(block.data(), dst, int(compressedSize), int(size));
274  } else {
275  memcpy(dst, block.data(), size);
276  }
277 #ifdef DEBUG
278  assert(SHA1::calc({dst, size}) == sha1);
279 #endif
280 }
281 
283 {
284  if (compressed()) return;
285 
286  size_t dstLen = LZ4::compressBound(int(size));
287  MemBuffer<uint8_t> buf2(dstLen);
288  dstLen = LZ4::compress(block.data(), buf2.data(), int(size));
289 
290  if (dstLen >= size) {
291  // compression isn't beneficial
292  return;
293  }
294  compressedSize = dstLen;
295  block.swap(buf2);
296  block.resize(compressedSize); // shrink to fit
297  assert(compressed());
298 #ifdef DEBUG
299  MemBuffer<uint8_t> buf3(size);
300  apply(buf3.data(), size);
301  assert(memcmp(buf3.data(), buf2.data(), size) == 0);
302 #endif
303 #if STATISTICS
304  int delta = compressedSize - allocSize;
305  allocSize = compressedSize;
306  globalAllocSize += delta;
307  std::cout << "stat: compress " << globalAllocSize
308  << " (" << delta << ")\n";
309 #endif
310 }
311 
312 const uint8_t* DeltaBlockCopy::getData()
313 {
314  assert(!compressed());
315  return block.data();
316 }
317 
318 
319 // class DeltaBlockDiff
320 
322  std::shared_ptr<DeltaBlockCopy> prev_,
323  const uint8_t* data, size_t size)
324  : prev(std::move(prev_))
325  , delta(calcDelta(prev->getData(), data, size))
326 {
327 #ifdef DEBUG
328  sha1 = SHA1::calc({data, size});
329 
331  apply(buf.data(), size);
332  assert(memcmp(buf.data(), data, size) == 0);
333 #endif
334 #if STATISTICS
335  allocSize = delta.size();
336  globalAllocSize += allocSize;
337  std::cout << "stat: DeltaBlockDiff " << globalAllocSize
338  << " (+" << allocSize << ")\n";
339 #endif
340 }
341 
342 void DeltaBlockDiff::apply(uint8_t* dst, size_t size) const
343 {
344  prev->apply(dst, size);
345  applyDeltaInPlace(dst, size, delta.data());
346 #ifdef DEBUG
347  assert(SHA1::calc({dst, size}) == sha1);
348 #endif
349 }
350 
352 {
353  return delta.size();
354 }
355 
356 
357 // class LastDeltaBlocks
358 
359 std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
360  const void* id, const uint8_t* data, size_t size)
361 {
362  auto it = ranges::lower_bound(infos, std::tuple(id, size), {},
363  [](const Info& info) { return std::tuple(info.id, info.size); });
364  if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
365  // no previous info yet
366  it = infos.emplace(it, id, size);
367  }
368  assert(it->id == id);
369  assert(it->size == size);
370 
371  auto ref = it->ref.lock();
372  if (it->accSize >= size || !ref) {
373  if (ref) {
374  // We will switch to a new DeltaBlockCopy object. So
375  // now is a good time to compress the old one.
376  ref->compress(size);
377  }
378  // Heuristic: create a new block when too many small
379  // differences have accumulated.
380  auto b = std::make_shared<DeltaBlockCopy>(data, size);
381  it->ref = b;
382  it->last = b;
383  it->accSize = 0;
384  return b;
385  } else {
386  // Create diff based on earlier reference block.
387  // Reference remains unchanged.
388  auto b = std::make_shared<DeltaBlockDiff>(ref, data, size);
389  it->last = b;
390  it->accSize += b->getDeltaSize();
391  return b;
392  }
393 }
394 
395 std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
396  const void* id, const uint8_t* data, size_t size)
397 {
398  auto it = ranges::lower_bound(infos, id, {}, &Info::id);
399  if ((it == end(infos)) || (it->id != id)) {
400  // no previous block yet
401  it = infos.emplace(it, id, size);
402  }
403  assert(it->size == size);
404 
405  auto last = it->last.lock();
406  if (!last) {
407  auto b = std::make_shared<DeltaBlockCopy>(data, size);
408  it->ref = b;
409  it->last = b;
410  it->accSize = 0;
411  return b;
412  } else {
413 #ifdef DEBUG
414  assert(SHA1::calc({data, size}) == last->sha1);
415 #endif
416  return last;
417  }
418 }
419 
421 {
422  for (const Info& info : infos) {
423  if (auto ref = info.ref.lock()) {
424  ref->compress(info.size);
425  }
426  }
427  infos.clear();
428 }
429 
430 } // namespace openmsx
uintptr_t id
Definition: Interpreter.cc:26
const uint8_t * getData()
Definition: DeltaBlock.cc:312
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:270
DeltaBlockCopy(const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:253
void compress(size_t size)
Definition: DeltaBlock.cc:282
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:342
DeltaBlockDiff(std::shared_ptr< DeltaBlockCopy > prev_, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:321
size_t getDeltaSize() const
Definition: DeltaBlock.cc:351
virtual ~DeltaBlock()=default
std::shared_ptr< DeltaBlock > createNullDiff(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:395
std::shared_ptr< DeltaBlock > createNew(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:359
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:81
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:111
void swap(MemBuffer &other) noexcept
Swap the managed memory block of two MemBuffers.
Definition: MemBuffer.hh:136
static Sha1Sum calc(std::span< const uint8_t > data)
Easier to use interface, if you can pass all data in one go.
Definition: utils/sha1.cc:363
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
int compressBound(int iSize)
Definition: lz4.hh:51
int decompress(const uint8_t *src, uint8_t *dst, int compressedSize, int dstCapacity)
Definition: lz4.cc:479
int compress(const uint8_t *src, uint8_t *dst, int srcSize)
Definition: lz4.cc:455
void save(unsigned width, unsigned height, const void **rowPointers, const PixelFormat &format, const std::string &filename)
Definition: PNG.cc:371
This file implemented 3 utility functions:
Definition: Autofire.cc:9
bool comp< 4 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:48
bool comp< 8 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:54
AmdFlash::SectorInfo Info
Definition: RomManbow2.cc:17
bool comp(const uint8_t *p, const uint8_t *q)
auto lower_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition: ranges.hh:99
size_t size(std::string_view utf8)
constexpr auto end(const zstring_view &x)