openMSX
DeltaBlock.cc
Go to the documentation of this file.
1#include "DeltaBlock.hh"
2#include "ranges.hh"
3#include "lz4.hh"
4#include <cassert>
5#include <cstring>
6#include <tuple>
7#include <utility>
8#if STATISTICS
9#include <iostream>
10#endif
11#ifdef __SSE2__
12#include <emmintrin.h>
13#endif
14
15namespace openmsx {
16
17// --- Compressed integers ---
18
19// See https://en.wikipedia.org/wiki/LEB128 for a description of the
20// 'unsigned LEB' format.
21static void storeUleb(std::vector<uint8_t>& result, size_t value)
22{
23 do {
24 uint8_t b = value & 0x7F;
25 value >>= 7;
26 if (value) b |= 0x80;
27 result.push_back(b);
28 } while (value);
29}
30
// Decode one unsigned-LEB128 value from 'data'; advances 'data' past the
// bytes that were consumed. Inverse of storeUleb().
[[nodiscard]] static constexpr size_t loadUleb(const uint8_t*& data)
{
	size_t value = 0;
	for (int shift = 0; /**/; shift += 7) {
		uint8_t byte = *data++;
		value |= size_t(byte & 0x7F) << shift;
		if ((byte & 0x80) == 0) break; // high bit clear -> last byte
	}
	return value;
}
42
43
44// --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---
45
// Compare N bytes at word-aligned addresses 'p' and 'q' for equality.
template<int N> bool comp(const uint8_t* p, const uint8_t* q);

template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
{
	// Use memcpy instead of dereferencing a reinterpret_cast'ed pointer:
	// reading a uint8_t buffer through uint32_t* violates strict aliasing
	// (UB). memcpy is the sanctioned way to type-pun and compiles to the
	// same single 32-bit load on all mainstream compilers.
	uint32_t a, b;
	memcpy(&a, p, sizeof(a));
	memcpy(&b, q, sizeof(b));
	return a == b;
}

template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
{
	// Same memcpy-based type punning as comp<4>, for a 64-bit word.
	uint64_t a, b;
	memcpy(&a, p, sizeof(a));
	memcpy(&b, q, sizeof(b));
	return a == b;
}

#ifdef __SSE2__
template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
{
	// Tests show that (on my machine) using 1 128-bit load is faster than
	// 2 64-bit loads. Even though the actual comparison is slightly more
	// complicated with SSE instructions.
	// (The reinterpret_cast here is fine: it's the standard calling
	// convention for the SSE load intrinsics, which require 16-byte
	// aligned input for _mm_load_si128.)
	__m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
	__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
	__m128i d = _mm_cmpeq_epi8(a, b);
	// movemask gathers one bit per byte-lane; all 16 equal -> 0xffff.
	return _mm_movemask_epi8(d) == 0xffff;
}
#endif
72
73
74// --- Optimized mismatch function ---
75
76// This is much like the function std::mismatch(). You pass in two buffers,
77// the corresponding elements of both buffers are compared and the first
78// position where the elements no longer match is returned.
79//
80// Compared to the std::mismatch() this implementation is faster because:
// - We make use of sentinels. This requires temporarily changing the content
//   of the buffer, so it won't work on read-only memory.
83// - We compare words-at-a-time instead of byte-at-a-time.
// Returns iterators to the first position where the two buffers differ
// (or {p_end, q_end} when they are fully equal).
// NOTE: temporarily writes into the p-buffer (sentinel trick), so the
// p-buffer must not be read-only memory.
static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
	const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
	// Both buffers must have the same length.
	assert((p_end - p) == (q_end - q));

	// When SSE is available, work with 16-byte words, otherwise 4 or 8
	// bytes. This routine also benefits from AVX2 instructions. Though not
	// all x86_64 CPUs have AVX2 (all have SSE2), so using them requires
	// extra run-time checks and that's not worth it at this point.
	constexpr int WORD_SIZE =
#ifdef __SSE2__
		sizeof(__m128i);
#else
		sizeof(void*);
#endif

	// Region too small or
	// both buffers are differently aligned.
	// (If alignments differ, the word-at-a-time loop below could never
	// have both pointers word-aligned simultaneously.)
	if (((p_end - p) < (2 * WORD_SIZE)) ||
	    ((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
	     (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) [[unlikely]] {
		goto end;
	}

	// Align to WORD_SIZE boundary. No need for end-of-buffer checks.
	// (The size >= 2*WORD_SIZE test above guarantees at least one full
	// word remains after this alignment loop.)
	if (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) [[unlikely]] {
		do {
			if (*p != *q) return {p, q};
			p += 1; q += 1;
		} while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
	}

	// Fast path. Compare words-at-a-time.
	{
		// Place a sentinel in the last full word. This ensures we'll
		// find a mismatch within the buffer, and so we can omit the
		// end-of-buffer checks.
		auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
		auto save = *sentinel;
		*sentinel = ~q_end[-WORD_SIZE]; // bitwise-NOT guarantees a mismatch

		while (comp<WORD_SIZE>(p, q)) {
			p += WORD_SIZE; q += WORD_SIZE;
		}

		// Restore sentinel.
		*sentinel = save;
	}

	// Slow path. This handles:
	// - Small or differently aligned buffers.
	// - The bytes at and after the (restored) sentinel.
end:	return std::mismatch(p, p_end, q);
}
138
139
140// --- Optimized scan_match function ---
141
142// Like scan_mismatch() above, but searches two buffers for the first
143// corresponding equal (instead of not-equal) bytes.
144//
145// Like scan_mismatch(), this places a temporary sentinel in the buffer, so the
146// buffer cannot be read-only memory.
147//
148// Unlike scan_mismatch() it's less obvious how to perform this function
149// word-at-a-time (it's possible with some bit hacks). Though luckily this
150// function is also less performance critical.
[[nodiscard]] static constexpr std::pair<const uint8_t*, const uint8_t*> scan_match(
	const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
	// Both buffers must have the same length.
	assert((p_end - p) == (q_end - q));

	// Code below is functionally equivalent to:
	//   while ((p != p_end) && (*p != *q)) { ++p; ++q; }
	//   return {p, q};

	if (p == p_end) return {p, q};

	// Temporarily make the last bytes of both buffers equal. That
	// guarantees the loop below finds a match before running off the end,
	// so the 'p != p_end' bounds check can be dropped from the loop.
	auto* p_last = const_cast<uint8_t*>(p_end - 1);
	auto save = *p_last;
	*p_last = q_end[-1]; // make p_end[-1] == q_end[-1]

	while (*p != *q) { ++p; ++q; }

	*p_last = save;
	// If we stopped exactly at the sentinel position, the 'match' may have
	// been artificial: re-test with the original byte restored, and step
	// past it when it was in fact a mismatch.
	if ((p == p_last) && (*p != *q)) { ++p; ++q; }
	return {p, q};
}
172
173
174// --- delta (de)compression routines ---
175
176// Calculate a 'delta' between two binary buffers of equal size.
177// The result is a stream of:
178// n1 number of bytes are equal
179// n2 number of bytes are different, and here are the bytes
180// n3 number of bytes are equal
181// ...
[[nodiscard]] static std::vector<uint8_t> calcDelta(
	const uint8_t* oldBuf, const uint8_t* newBuf, size_t size)
{
	std::vector<uint8_t> result;

	const auto* p = oldBuf;
	const auto* q = newBuf;
	const auto* p_end = p + size;
	const auto* q_end = q + size;

	// scan equal bytes (possibly zero)
	const auto* q1 = q;
	std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
	auto n1 = q - q1;
	storeUleb(result, n1);

	// Invariant at the top of each iteration: *p != *q (we stand on the
	// first byte of a 'different' run).
	while (q != q_end) {
		assert(*p != *q);

		const auto* q2 = q; // start of the 'different' run
	different:
		// Current byte is known to differ, so start matching at the
		// next one.
		std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
		auto n2 = q - q2;

		const auto* q3 = q; // start of the following 'equal' run
		std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
		auto n3 = q - q3;
		// A very short equal run (<= 2 bytes) costs more to encode as
		// its own length prefix than to fold into the surrounding
		// 'different' run, so keep extending that run instead.
		if ((q != q_end) && (n3 <= 2)) goto different;

		// emit: length of different run, the differing bytes themselves
		storeUleb(result, n2);
		result.insert(result.end(), q2, q3);

		// emit: length of the equal run (omitted at end-of-buffer)
		if (n3 != 0) storeUleb(result, n3);
	}

	result.shrink_to_fit();
	return result;
}
220
// Apply a previously calculated 'delta' to 'oldBuf' to get 'newBuf'.
222static void applyDeltaInPlace(uint8_t* buf, size_t size, const uint8_t* delta)
223{
224 auto* end = buf + size;
225
226 while (buf != end) {
227 auto n1 = loadUleb(delta);
228 buf += n1;
229 if (buf == end) break;
230
231 auto n2 = loadUleb(delta);
232 memcpy(buf, delta, n2);
233 buf += n2;
234 delta += n2;
235 }
236}
237
238#if STATISTICS
239
240// class DeltaBlock
241
243{
244 globalAllocSize -= allocSize;
245 std::cout << "stat: ~DeltaBlock " << globalAllocSize
246 << " (-" << allocSize << ")\n";
247}
248
249#endif
250
251// class DeltaBlockCopy
252
// Store a full (verbatim) copy of the given buffer.
DeltaBlockCopy::DeltaBlockCopy(const uint8_t* data, size_t size)
	: block(size)
	, compressedSize(0) // 0 == not (yet) compressed, see assert below
{
#ifdef DEBUG
	// Remember a checksum of the original content so apply() can verify
	// reconstruction.
	sha1 = SHA1::calc({data, size});
#endif
	memcpy(block.data(), data, size);
	assert(!compressed());
#if STATISTICS
	allocSize = size;
	globalAllocSize += allocSize;
	std::cout << "stat: DeltaBlockCopy " << globalAllocSize
	          << " (+" << allocSize << ")\n";
#endif
}
269
270void DeltaBlockCopy::apply(uint8_t* dst, size_t size) const
271{
272 if (compressed()) {
273 LZ4::decompress(block.data(), dst, int(compressedSize), int(size));
274 } else {
275 memcpy(dst, block.data(), size);
276 }
277#ifdef DEBUG
278 assert(SHA1::calc({dst, size}) == sha1);
279#endif
280}
281
283{
284 if (compressed()) return;
285
286 size_t dstLen = LZ4::compressBound(int(size));
287 MemBuffer<uint8_t> buf2(dstLen);
288 dstLen = LZ4::compress(block.data(), buf2.data(), int(size));
289
290 if (dstLen >= size) {
291 // compression isn't beneficial
292 return;
293 }
294 compressedSize = dstLen;
295 block.swap(buf2);
296 block.resize(compressedSize); // shrink to fit
297 assert(compressed());
298#ifdef DEBUG
300 apply(buf3.data(), size);
301 assert(memcmp(buf3.data(), buf2.data(), size) == 0);
302#endif
303#if STATISTICS
304 int delta = compressedSize - allocSize;
305 allocSize = compressedSize;
306 globalAllocSize += delta;
307 std::cout << "stat: compress " << globalAllocSize
308 << " (" << delta << ")\n";
309#endif
310}
311
313{
314 assert(!compressed());
315 return block.data();
316}
317
318
319// class DeltaBlockDiff
320
322 std::shared_ptr<DeltaBlockCopy> prev_,
323 const uint8_t* data, size_t size)
324 : prev(std::move(prev_))
325 , delta(calcDelta(prev->getData(), data, size))
326{
327#ifdef DEBUG
328 sha1 = SHA1::calc({data, size});
329
331 apply(buf.data(), size);
332 assert(memcmp(buf.data(), data, size) == 0);
333#endif
334#if STATISTICS
335 allocSize = delta.size();
336 globalAllocSize += allocSize;
337 std::cout << "stat: DeltaBlockDiff " << globalAllocSize
338 << " (+" << allocSize << ")\n";
339#endif
340}
341
// Reconstruct content: first materialize the reference block, then patch it
// with this block's delta.
void DeltaBlockDiff::apply(uint8_t* dst, size_t size) const
{
	prev->apply(dst, size);
	applyDeltaInPlace(dst, size, delta.data());
#ifdef DEBUG
	assert(SHA1::calc({dst, size}) == sha1);
#endif
}
350
352{
353 return delta.size();
354}
355
356
357// class LastDeltaBlocks
358
359std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
360 const void* id, const uint8_t* data, size_t size)
361{
362 auto it = ranges::lower_bound(infos, std::tuple(id, size), {},
363 [](const Info& info) { return std::tuple(info.id, info.size); });
364 if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
365 // no previous info yet
366 it = infos.emplace(it, id, size);
367 }
368 assert(it->id == id);
369 assert(it->size == size);
370
371 auto ref = it->ref.lock();
372 if (it->accSize >= size || !ref) {
373 if (ref) {
374 // We will switch to a new DeltaBlockCopy object. So
375 // now is a good time to compress the old one.
376 ref->compress(size);
377 }
378 // Heuristic: create a new block when too many small
379 // differences have accumulated.
380 auto b = std::make_shared<DeltaBlockCopy>(data, size);
381 it->ref = b;
382 it->last = b;
383 it->accSize = 0;
384 return b;
385 } else {
386 // Create diff based on earlier reference block.
387 // Reference remains unchanged.
388 auto b = std::make_shared<DeltaBlockDiff>(ref, data, size);
389 it->last = b;
390 it->accSize += b->getDeltaSize();
391 return b;
392 }
393}
394
395std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
396 const void* id, const uint8_t* data, size_t size)
397{
398 auto it = ranges::lower_bound(infos, id, {}, &Info::id);
399 if ((it == end(infos)) || (it->id != id)) {
400 // no previous block yet
401 it = infos.emplace(it, id, size);
402 }
403 assert(it->size == size);
404
405 auto last = it->last.lock();
406 if (!last) {
407 auto b = std::make_shared<DeltaBlockCopy>(data, size);
408 it->ref = b;
409 it->last = b;
410 it->accSize = 0;
411 return b;
412 } else {
413#ifdef DEBUG
414 assert(SHA1::calc({data, size}) == last->sha1);
415#endif
416 return last;
417 }
418}
419
421{
422 for (const Info& info : infos) {
423 if (auto ref = info.ref.lock()) {
424 ref->compress(info.size);
425 }
426 }
427 infos.clear();
428}
429
430} // namespace openmsx
uintptr_t id
Definition: Interpreter.cc:26
const uint8_t * getData()
Definition: DeltaBlock.cc:312
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:270
DeltaBlockCopy(const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:253
void compress(size_t size)
Definition: DeltaBlock.cc:282
void apply(uint8_t *dst, size_t size) const override
Definition: DeltaBlock.cc:342
DeltaBlockDiff(std::shared_ptr< DeltaBlockCopy > prev_, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:321
size_t getDeltaSize() const
Definition: DeltaBlock.cc:351
virtual ~DeltaBlock()=default
std::shared_ptr< DeltaBlock > createNullDiff(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:395
std::shared_ptr< DeltaBlock > createNew(const void *id, const uint8_t *data, size_t size)
Definition: DeltaBlock.cc:359
void resize(size_t size)
Grow or shrink the memory block.
Definition: MemBuffer.hh:111
void swap(MemBuffer &other) noexcept
Swap the managed memory block of two MemBuffers.
Definition: MemBuffer.hh:136
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:81
static Sha1Sum calc(std::span< const uint8_t > data)
Easier to use interface, if you can pass all data in one go.
Definition: utils/sha1.cc:363
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
int compressBound(int iSize)
Definition: lz4.hh:51
int decompress(const uint8_t *src, uint8_t *dst, int compressedSize, int dstCapacity)
Definition: lz4.cc:479
int compress(const uint8_t *src, uint8_t *dst, int srcSize)
Definition: lz4.cc:455
void save(unsigned width, unsigned height, const void **rowPointers, const PixelFormat &format, const std::string &filename)
Definition: PNG.cc:371
This file implemented 3 utility functions:
Definition: Autofire.cc:9
bool comp< 4 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:48
bool comp< 8 >(const uint8_t *p, const uint8_t *q)
Definition: DeltaBlock.cc:54
AmdFlash::SectorInfo Info
Definition: RomManbow2.cc:17
bool comp(const uint8_t *p, const uint8_t *q)
auto lower_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition: ranges.hh:99
STL namespace.
size_t size(std::string_view utf8)
constexpr auto end(const zstring_view &x)