openMSX
DeltaBlock.cc
#include "DeltaBlock.hh"
#include "likely.hh"
#include "ranges.hh"
#include "lz4.hh"
#include <cassert>
#include <cstring>
#include <tuple>
#include <utility>
#if STATISTICS
#include <iostream>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

using std::vector;

// --- Compressed integers ---

// See https://en.wikipedia.org/wiki/LEB128 for a description of the
// 'unsigned LEB' format.
static void storeUleb(vector<uint8_t>& result, size_t value)
{
    do {
        uint8_t b = value & 0x7F;
        value >>= 7;
        if (value) b |= 0x80;
        result.push_back(b);
    } while (value);
}

[[nodiscard]] static constexpr size_t loadUleb(const uint8_t*& data)
{
    size_t result = 0;
    int shift = 0;
    while (true) {
        uint8_t b = *data++;
        result |= size_t(b & 0x7F) << shift;
        if ((b & 0x80) == 0) return result;
        shift += 7;
    }
}
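
// Illustrative example (not part of the original source): the value 300
// (binary 1'0010'1100) is split into 7-bit groups, least significant group
// first, so storeUleb() appends the two bytes 0xAC, 0x02 and loadUleb()
// reads them back as 300:
//
//     std::vector<uint8_t> v;
//     storeUleb(v, 300);           // v == {0xAC, 0x02}
//     const uint8_t* p = v.data();
//     assert(loadUleb(p) == 300);  // p now points past the encoded value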


// --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---

template<int N> bool comp(const uint8_t* p, const uint8_t* q);

template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
{
    return *reinterpret_cast<const uint32_t*>(p) ==
           *reinterpret_cast<const uint32_t*>(q);
}

template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
{
    return *reinterpret_cast<const uint64_t*>(p) ==
           *reinterpret_cast<const uint64_t*>(q);
}

#ifdef __SSE2__
template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
{
    // Tests show that (on my machine) using one 128-bit load is faster than
    // two 64-bit loads, even though the actual comparison is slightly more
    // involved with SSE instructions.
    __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
    __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
    __m128i d = _mm_cmpeq_epi8(a, b);
    return _mm_movemask_epi8(d) == 0xffff;
}
#endif


// --- Optimized mismatch function ---

// This is much like std::mismatch(). You pass in two buffers; the
// corresponding elements of both buffers are compared, and the first
// position where the elements no longer match is returned.
//
// Compared to std::mismatch(), this implementation is faster because:
// - We make use of sentinels. This requires temporarily changing the
//   contents of the buffer, so it won't work on read-only memory.
// - We compare a word at a time instead of a byte at a time.
static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
    const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
    assert((p_end - p) == (q_end - q));

    // When SSE is available, work with 16-byte words, otherwise 4 or 8
    // bytes. This routine would also benefit from AVX2 instructions, but
    // not all x86_64 CPUs have AVX2 (all have SSE2), so using it would
    // require extra run-time checks and that's not worth it at this point.
    constexpr int WORD_SIZE =
#ifdef __SSE2__
        sizeof(__m128i);
#else
        sizeof(void*);
#endif

    // Region too small, or the two buffers are aligned differently.
    if (unlikely((p_end - p) < (2 * WORD_SIZE)) ||
        unlikely((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
                 (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) {
        goto end;
    }

    // Align to a WORD_SIZE boundary. No need for end-of-buffer checks.
    if (unlikely(reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1))) {
        do {
            if (*p != *q) return {p, q};
            p += 1; q += 1;
        } while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
    }

    // Fast path. Compare a word at a time.
    {
        // Place a sentinel in the last full word. This ensures we'll
        // find a mismatch within the buffer, and so we can omit the
        // end-of-buffer checks.
        auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
        auto save = *sentinel;
        *sentinel = ~q_end[-WORD_SIZE];

        while (comp<WORD_SIZE>(p, q)) {
            p += WORD_SIZE; q += WORD_SIZE;
        }

        // Restore the sentinel.
        *sentinel = save;
    }

    // Slow path. This handles:
    // - Small or differently aligned buffers.
    // - The bytes at and after the (restored) sentinel.
end:    return std::mismatch(p, p_end, q);
}
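
// Illustrative example (not part of the original source): given two
// equal-sized, writable buffers
//
//     uint8_t a[32] = {0}; a[20] = 7;
//     uint8_t b[32] = {0};
//     auto [pa, pb] = scan_mismatch(a, a + 32, b, b + 32);
//     // pa == a + 20, pb == b + 20
//
// If the buffers are completely equal, the past-the-end pointers are
// returned. The buffers cannot live in read-only memory because a sentinel
// byte is temporarily written and then restored.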


// --- Optimized scan_match function ---

// Like scan_mismatch() above, but searches the two buffers for the first
// corresponding equal (instead of not-equal) bytes.
//
// Like scan_mismatch(), this places a temporary sentinel in the buffer, so
// the buffer cannot be in read-only memory.
//
// Unlike scan_mismatch(), it's less obvious how to perform this search a
// word at a time (it is possible with some bit hacks). Luckily this
// function is also less performance-critical.
[[nodiscard]] static constexpr std::pair<const uint8_t*, const uint8_t*> scan_match(
    const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
    assert((p_end - p) == (q_end - q));

    // The code below is functionally equivalent to:
    //     while ((p != p_end) && (*p != *q)) { ++p; ++q; }
    //     return {p, q};

    if (p == p_end) return {p, q};

    auto* p_last = const_cast<uint8_t*>(p_end - 1);
    auto save = *p_last;
    *p_last = q_end[-1]; // make p_end[-1] == q_end[-1]

    while (*p != *q) { ++p; ++q; }

    *p_last = save;
    if ((p == p_last) && (*p != *q)) { ++p; ++q; }
    return {p, q};
}
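
// Illustrative example (not part of the original source):
//
//     uint8_t a[4] = {1, 2, 3, 4};
//     uint8_t b[4] = {9, 2, 8, 7};
//     auto [pa, pb] = scan_match(a, a + 4, b, b + 4);
//     // pa == a + 1, pb == b + 1   (first position where *pa == *pb)
//
// If no corresponding bytes are equal, the past-the-end pointers are returned.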


// --- delta (de)compression routines ---

// Calculate a 'delta' between two binary buffers of equal size.
// The result is a stream of:
//   n1  number of bytes that are equal
//   n2  number of bytes that are different, followed by those bytes
//   n3  number of bytes that are equal
//   ...
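//
// Illustrative example (not part of the original source): for
//     old = {1,1,1,1, 7,8, 1,1}
//     new = {1,1,1,1, 5,6, 1,1}
// the resulting delta stream is
//     4, 2, 5, 6, 2
// i.e. 4 equal bytes, then 2 differing bytes together with their new values
// (5, 6), then 2 equal bytes. All counts are stored in unsigned-LEB128 form,
// so small counts occupy a single byte.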
[[nodiscard]] static vector<uint8_t> calcDelta(
    const uint8_t* oldBuf, const uint8_t* newBuf, size_t size)
{
    vector<uint8_t> result;

    const auto* p = oldBuf;
    const auto* q = newBuf;
    const auto* p_end = p + size;
    const auto* q_end = q + size;

    // scan equal bytes (possibly zero)
    const auto* q1 = q;
    std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
    auto n1 = q - q1;
    storeUleb(result, n1);

    while (q != q_end) {
        assert(*p != *q);

        const auto* q2 = q;
    different:
        std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
        auto n2 = q - q2;

        const auto* q3 = q;
        std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
        auto n3 = q - q3;
        if ((q != q_end) && (n3 <= 2)) goto different;

        storeUleb(result, n2);
        result.insert(result.end(), q2, q3);

        if (n3 != 0) storeUleb(result, n3);
    }

    result.shrink_to_fit();
    return result;
}

// Apply a previously calculated 'delta' to 'oldBuf' (in place) to get 'newBuf'.
static void applyDeltaInPlace(uint8_t* buf, size_t size, const uint8_t* delta)
{
    auto* end = buf + size;

    while (buf != end) {
        auto n1 = loadUleb(delta);
        buf += n1;
        if (buf == end) break;

        auto n2 = loadUleb(delta);
        memcpy(buf, delta, n2);
        buf += n2;
        delta += n2;
    }
}
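
// Illustrative round-trip (not part of the original source): for some size N,
// calcDelta() and applyDeltaInPlace() are each other's inverse:
//
//     uint8_t oldBuf[N] = ...;  // temporarily modified (sentinels), then restored
//     uint8_t newBuf[N] = ...;
//     auto delta = calcDelta(oldBuf, newBuf, N);
//     // ... later, starting again from the old content:
//     uint8_t buf[N];
//     memcpy(buf, oldBuf, N);
//     applyDeltaInPlace(buf, N, delta.data());
//     // now memcmp(buf, newBuf, N) == 0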

#if STATISTICS

// class DeltaBlock

DeltaBlock::~DeltaBlock()
{
    globalAllocSize -= allocSize;
    std::cout << "stat: ~DeltaBlock " << globalAllocSize
              << " (-" << allocSize << ")\n";
}

#endif

// class DeltaBlockCopy

DeltaBlockCopy::DeltaBlockCopy(const uint8_t* data, size_t size)
    : block(size)
    , compressedSize(0)
{
#ifdef DEBUG
    sha1 = SHA1::calc({data, size});
#endif
    memcpy(block.data(), data, size);
    assert(!compressed());
#if STATISTICS
    allocSize = size;
    globalAllocSize += allocSize;
    std::cout << "stat: DeltaBlockCopy " << globalAllocSize
              << " (+" << allocSize << ")\n";
#endif
}

void DeltaBlockCopy::apply(uint8_t* dst, size_t size) const
{
    if (compressed()) {
        LZ4::decompress(block.data(), dst, int(compressedSize), int(size));
    } else {
        memcpy(dst, block.data(), size);
    }
#ifdef DEBUG
    assert(SHA1::calc({dst, size}) == sha1);
#endif
}

void DeltaBlockCopy::compress(size_t size)
{
    if (compressed()) return;

    size_t dstLen = LZ4::compressBound(int(size));
    MemBuffer<uint8_t> buf2(dstLen);
    dstLen = LZ4::compress(block.data(), buf2.data(), int(size));

    if (dstLen >= size) {
        // compression isn't beneficial
        return;
    }
    compressedSize = dstLen;
    block.swap(buf2);
    block.resize(compressedSize); // shrink to fit
    assert(compressed());
#ifdef DEBUG
    MemBuffer<uint8_t> buf3(size);
    apply(buf3.data(), size);
    assert(memcmp(buf3.data(), buf2.data(), size) == 0);
#endif
#if STATISTICS
    int delta = compressedSize - allocSize;
    allocSize = compressedSize;
    globalAllocSize += delta;
    std::cout << "stat: compress " << globalAllocSize
              << " (" << delta << ")\n";
#endif
}

const uint8_t* DeltaBlockCopy::getData()
{
    assert(!compressed());
    return block.data();
}


// class DeltaBlockDiff

DeltaBlockDiff::DeltaBlockDiff(
        std::shared_ptr<DeltaBlockCopy> prev_,
        const uint8_t* data, size_t size)
    : prev(std::move(prev_))
    , delta(calcDelta(prev->getData(), data, size))
{
#ifdef DEBUG
    sha1 = SHA1::calc({data, size});

    MemBuffer<uint8_t> buf(size);
    apply(buf.data(), size);
    assert(memcmp(buf.data(), data, size) == 0);
#endif
#if STATISTICS
    allocSize = delta.size();
    globalAllocSize += allocSize;
    std::cout << "stat: DeltaBlockDiff " << globalAllocSize
              << " (+" << allocSize << ")\n";
#endif
}

void DeltaBlockDiff::apply(uint8_t* dst, size_t size) const
{
    prev->apply(dst, size);
    applyDeltaInPlace(dst, size, delta.data());
#ifdef DEBUG
    assert(SHA1::calc({dst, size}) == sha1);
#endif
}

size_t DeltaBlockDiff::getDeltaSize() const
{
    return delta.size();
}


// class LastDeltaBlocks

std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
    const void* id, const uint8_t* data, size_t size)
{
    auto it = ranges::lower_bound(infos, std::tuple(id, size),
        [](const Info& info, const std::tuple<const void*, size_t>& info2) {
            return std::tuple(info.id, info.size) < info2; });
    if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
        // no previous info yet
        it = infos.emplace(it, id, size);
    }
    assert(it->id == id);
    assert(it->size == size);

    auto ref = it->ref.lock();
    if (it->accSize >= size || !ref) {
        if (ref) {
            // We will switch to a new DeltaBlockCopy object. So
            // now is a good time to compress the old one.
            ref->compress(size);
        }
        // Heuristic: create a new full block when too many small
        // differences have accumulated.
        auto b = std::make_shared<DeltaBlockCopy>(data, size);
        it->ref = b;
        it->last = b;
        it->accSize = 0;
        return b;
    } else {
        // Create a diff based on the earlier reference block.
        // The reference itself remains unchanged.
        auto b = std::make_shared<DeltaBlockDiff>(ref, data, size);
        it->last = b;
        it->accSize += b->getDeltaSize();
        return b;
    }
}

std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
    const void* id, const uint8_t* data, size_t size)
{
    auto it = ranges::lower_bound(infos, id,
        [](const Info& info, const void* id2) {
            return info.id < id2; });
    if ((it == end(infos)) || (it->id != id)) {
        // no previous block yet
        it = infos.emplace(it, id, size);
    }
    assert(it->size == size);

    auto last = it->last.lock();
    if (!last) {
        auto b = std::make_shared<DeltaBlockCopy>(data, size);
        it->ref = b;
        it->last = b;
        it->accSize = 0;
        return b;
    } else {
#ifdef DEBUG
        assert(SHA1::calc({data, size}) == last->sha1);
#endif
        return last;
    }
}

void LastDeltaBlocks::clear()
{
    for (const Info& info : infos) {
        if (auto ref = info.ref.lock()) {
            ref->compress(info.size);
        }
    }
    infos.clear();
}
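
// Illustrative usage sketch (not part of the original source): a client that
// periodically snapshots the same memory region passes a stable 'id' (for
// example the address of the region) so consecutive snapshots can be stored
// as deltas against a shared reference copy. 'ram' and 'ramSize' below are
// hypothetical names for the caller's buffer:
//
//     LastDeltaBlocks blocks;
//     std::shared_ptr<DeltaBlock> s1 = blocks.createNew(ram, ram, ramSize);
//     // ... ram changes ...
//     std::shared_ptr<DeltaBlock> s2 = blocks.createNew(ram, ram, ramSize);
//     // restoring the first snapshot:
//     s1->apply(ram, ramSize);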

} // namespace openmsx