openMSX
DeltaBlock.cc
Go to the documentation of this file.
1#include "DeltaBlock.hh"
2
3#include "ranges.hh"
4#include "lz4.hh"
5
6#include <bit>
7#include <cassert>
8#include <tuple>
9#include <utility>
10#if STATISTICS
11#include <iostream>
12#endif
13#ifdef __SSE2__
14#include <emmintrin.h>
15#endif
16
17namespace openmsx {
18
19// --- Compressed integers ---
20
21// See https://en.wikipedia.org/wiki/LEB128 for a description of the
22// 'unsigned LEB' format.
23static void storeUleb(std::vector<uint8_t>& result, size_t value)
24{
25 do {
26 uint8_t b = value & 0x7F;
27 value >>= 7;
28 if (value) b |= 0x80;
29 result.push_back(b);
30 } while (value);
31}
32
33[[nodiscard]] static constexpr size_t loadUleb(std::span<const uint8_t>& data)
34{
35 size_t result = 0;
36 int shift = 0;
37 while (true) {
38 assert(!data.empty());
39 uint8_t b = data.front();
40 data = data.subspan(1);
41 result |= size_t(b & 0x7F) << shift;
42 if ((b & 0x80) == 0) return result;
43 shift += 7;
44 }
45}
46
47
48// --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---
49
50template<int N> bool comp(const uint8_t* p, const uint8_t* q);
51
52template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
53{
54 return *std::bit_cast<const uint32_t*>(p) ==
55 *std::bit_cast<const uint32_t*>(q);
56}
57
58template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
59{
60 return *std::bit_cast<const uint64_t*>(p) ==
61 *std::bit_cast<const uint64_t*>(q);
62}
63
64#ifdef __SSE2__
65template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
66{
67 // Tests show that (on my machine) using 1 128-bit load is faster than
68 // 2 64-bit loads. Even though the actual comparison is slightly more
69 // complicated with SSE instructions.
70 __m128i a = _mm_load_si128(std::bit_cast<const __m128i*>(p));
71 __m128i b = _mm_load_si128(std::bit_cast<const __m128i*>(q));
72 __m128i d = _mm_cmpeq_epi8(a, b);
73 return _mm_movemask_epi8(d) == 0xffff;
74}
75#endif
76
77
78// --- Optimized mismatch function ---
79
80// This is much like the function std::mismatch(). You pass in two buffers,
81// the corresponding elements of both buffers are compared and the first
82// position where the elements no longer match is returned.
83//
84// Compared to the std::mismatch() this implementation is faster because:
85// - We make use of sentinels. This requires to temporarily change the content
86// of the buffer. So it won't work with read-only-memory.
87// - We compare words-at-a-time instead of byte-at-a-time.
88static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
89 const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
90{
91 assert((p_end - p) == (q_end - q));
92
93 // When SSE is available, work with 16-byte words, otherwise 4 or 8
94 // bytes. This routine also benefits from AVX2 instructions. Though not
95 // all x86_64 CPUs have AVX2 (all have SSE2), so using them requires
96 // extra run-time checks and that's not worth it at this point.
97 constexpr int WORD_SIZE =
98#ifdef __SSE2__
99 sizeof(__m128i);
100#else
101 sizeof(void*);
102#endif
103
104 // Region too small or
105 // both buffers are differently aligned.
106 if (((p_end - p) < (2 * WORD_SIZE)) ||
107 ((std::bit_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
108 (std::bit_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) [[unlikely]] {
109 goto end;
110 }
111
112 // Align to WORD_SIZE boundary. No need for end-of-buffer checks.
113 if (std::bit_cast<uintptr_t>(p) & (WORD_SIZE - 1)) [[unlikely]] {
114 do {
115 if (*p != *q) return {p, q};
116 p += 1; q += 1;
117 } while (std::bit_cast<uintptr_t>(p) & (WORD_SIZE - 1));
118 }
119
120 // Fast path. Compare words-at-a-time.
121 {
122 // Place a sentinel in the last full word. This ensures we'll
123 // find a mismatch within the buffer, and so we can omit the
124 // end-of-buffer checks.
125 auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
126 auto save = *sentinel;
127 *sentinel = ~q_end[-WORD_SIZE];
128
129 while (comp<WORD_SIZE>(p, q)) {
130 p += WORD_SIZE; q += WORD_SIZE;
131 }
132
133 // Restore sentinel.
134 *sentinel = save;
135 }
136
137 // Slow path. This handles:
138 // - Small or differently aligned buffers.
139 // - The bytes at and after the (restored) sentinel.
140end: return std::mismatch(p, p_end, q);
141}
142
143
144// --- Optimized scan_match function ---
145
146// Like scan_mismatch() above, but searches two buffers for the first
147// corresponding equal (instead of not-equal) bytes.
148//
149// Like scan_mismatch(), this places a temporary sentinel in the buffer, so the
150// buffer cannot be read-only memory.
151//
152// Unlike scan_mismatch() it's less obvious how to perform this function
153// word-at-a-time (it's possible with some bit hacks). Though luckily this
154// function is also less performance critical.
155[[nodiscard]] static constexpr std::pair<const uint8_t*, const uint8_t*> scan_match(
156 const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
157{
158 assert((p_end - p) == (q_end - q));
159
160 // Code below is functionally equivalent to:
161 // while ((p != p_end) && (*p != *q)) { ++p; ++q; }
162 // return {p, q};
163
164 if (p == p_end) return {p, q};
165
166 auto* p_last = const_cast<uint8_t*>(p_end - 1);
167 auto save = *p_last;
168 *p_last = q_end[-1]; // make p_end[-1] == q_end[-1]
169
170 while (*p != *q) { ++p; ++q; }
171
172 *p_last = save;
173 if ((p == p_last) && (*p != *q)) { ++p; ++q; }
174 return {p, q};
175}
176
177
178// --- delta (de)compression routines ---
179
180// Calculate a 'delta' between two binary buffers of equal size.
181// The result is a stream of:
182// n1 number of bytes are equal
183// n2 number of bytes are different, and here are the bytes
184// n3 number of bytes are equal
185// ...
186[[nodiscard]] static std::vector<uint8_t> calcDelta(
187 const uint8_t* oldBuf, std::span<const uint8_t> newBuf)
188{
189 std::vector<uint8_t> result;
190
191 const auto* p = oldBuf;
192 const auto* q = newBuf.data();
193 auto size = newBuf.size();
194 const auto* p_end = p + size;
195 const auto* q_end = q + size;
196
197 // scan equal bytes (possibly zero)
198 const auto* q1 = q;
199 std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
200 auto n1 = q - q1;
201 storeUleb(result, n1);
202
203 while (q != q_end) {
204 assert(*p != *q);
205
206 const auto* q2 = q;
207 different:
208 std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
209 auto n2 = q - q2;
210
211 const auto* q3 = q;
212 std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
213 auto n3 = q - q3;
214 if ((q != q_end) && (n3 <= 2)) goto different;
215
216 storeUleb(result, n2);
217 result.insert(result.end(), q2, q3);
218
219 if (n3 != 0) storeUleb(result, n3);
220 }
221
222 result.shrink_to_fit();
223 return result;
224}
225
226// Apply a previously calculated 'delta' to 'oldBuf' to get 'newbuf'.
227static void applyDeltaInPlace(std::span<uint8_t> buf, std::span<const uint8_t> delta)
228{
229 while (!buf.empty()) {
230 auto n1 = loadUleb(delta);
231 buf = buf.subspan(n1);
232 if (buf.empty()) break;
233
234 auto n2 = loadUleb(delta);
235 ranges::copy(delta.subspan(0, n2), buf);
236 buf = buf .subspan(n2);
237 delta = delta.subspan(n2);
238 }
239}
240
241#if STATISTICS
242
243// class DeltaBlock
244
246{
247 globalAllocSize -= allocSize;
248 std::cout << "stat: ~DeltaBlock " << globalAllocSize
249 << " (-" << allocSize << ")\n";
250}
251
252#endif
253
254// class DeltaBlockCopy
255
256DeltaBlockCopy::DeltaBlockCopy(std::span<const uint8_t> data)
257 : block(data.size())
258{
259#ifdef DEBUG
260 sha1 = SHA1::calc(data);
261#endif
262 ranges::copy(data, block.data());
263 assert(!compressed());
264#if STATISTICS
265 allocSize = size;
266 globalAllocSize += allocSize;
267 std::cout << "stat: DeltaBlockCopy " << globalAllocSize
268 << " (+" << allocSize << ")\n";
269#endif
270}
271
272void DeltaBlockCopy::apply(std::span<uint8_t> dst) const
273{
274 if (compressed()) {
275 LZ4::decompress(block.data(), dst.data(), int(compressedSize), int(dst.size()));
276 } else {
277 ranges::copy(std::span{block.data(), dst.size()}, dst);
278 }
279#ifdef DEBUG
280 assert(SHA1::calc(dst) == sha1);
281#endif
282}
283
285{
286 if (compressed()) return;
287
288 size_t dstLen = LZ4::compressBound(int(size));
289 MemBuffer<uint8_t> buf2(dstLen);
290 dstLen = LZ4::compress(block.data(), buf2.data(), int(size));
291
292 if (dstLen >= size) {
293 // compression isn't beneficial
294 return;
295 }
296 compressedSize = dstLen;
297 block.swap(buf2);
298 block.resize(compressedSize); // shrink to fit
299 assert(compressed());
300#ifdef DEBUG
301 MemBuffer<uint8_t> buf3(size);
302 apply({buf3.data(), size});
303 assert(ranges::equal(std::span{buf3.data(), size}, std::span{buf2.data(), size}));
304#endif
305#if STATISTICS
306 int delta = compressedSize - allocSize;
307 allocSize = compressedSize;
308 globalAllocSize += delta;
309 std::cout << "stat: compress " << globalAllocSize
310 << " (" << delta << ")\n";
311#endif
312}
313
315{
316 assert(!compressed());
317 return block.data();
318}
319
320
321// class DeltaBlockDiff
322
324 std::shared_ptr<DeltaBlockCopy> prev_,
325 std::span<const uint8_t> data)
326 : prev(std::move(prev_))
327 , delta(calcDelta(prev->getData(), data))
328{
329#ifdef DEBUG
330 sha1 = SHA1::calc(data);
331
332 MemBuffer<uint8_t> buf(data.size());
333 apply({buf.data(), data.size()});
334 assert(ranges::equal(std::span{buf.data(), data.size()}, data));
335#endif
336#if STATISTICS
337 allocSize = delta.size();
338 globalAllocSize += allocSize;
339 std::cout << "stat: DeltaBlockDiff " << globalAllocSize
340 << " (+" << allocSize << ")\n";
341#endif
342}
343
344void DeltaBlockDiff::apply(std::span<uint8_t> dst) const
345{
346 prev->apply(dst);
347 applyDeltaInPlace(dst, delta);
348#ifdef DEBUG
349 assert(SHA1::calc(dst) == sha1);
350#endif
351}
352
354{
355 return delta.size();
356}
357
358
359// class LastDeltaBlocks
360
361std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
362 const void* id, std::span<const uint8_t> data)
363{
364 auto size = data.size();
365 auto it = ranges::lower_bound(infos, std::tuple(id, size), {},
366 [](const Info& info) { return std::tuple(info.id, info.size); });
367 if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
368 // no previous info yet
369 it = infos.emplace(it, id, size);
370 }
371 assert(it->id == id);
372 assert(it->size == size);
373
374 auto ref = it->ref.lock();
375 if (it->accSize >= size || !ref) {
376 if (ref) {
377 // We will switch to a new DeltaBlockCopy object. So
378 // now is a good time to compress the old one.
379 ref->compress(size);
380 }
381 // Heuristic: create a new block when too many small
382 // differences have accumulated.
383 auto b = std::make_shared<DeltaBlockCopy>(data);
384 it->ref = b;
385 it->last = b;
386 it->accSize = 0;
387 return b;
388 } else {
389 // Create diff based on earlier reference block.
390 // Reference remains unchanged.
391 auto b = std::make_shared<DeltaBlockDiff>(ref, data);
392 it->last = b;
393 it->accSize += b->getDeltaSize();
394 return b;
395 }
396}
397
398std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
399 const void* id, std::span<const uint8_t> data)
400{
401 auto size = data.size();
402 auto it = ranges::lower_bound(infos, id, {}, &Info::id);
403 if ((it == end(infos)) || (it->id != id)) {
404 // no previous block yet
405 it = infos.emplace(it, id, size);
406 }
407 assert(it->size == size);
408
409 auto last = it->last.lock();
410 if (!last) {
411 auto b = std::make_shared<DeltaBlockCopy>(data);
412 it->ref = b;
413 it->last = b;
414 it->accSize = 0;
415 return b;
416 } else {
417#ifdef DEBUG
418 assert(SHA1::calc(data) == last->sha1);
419#endif
420 return last;
421 }
422}
423
425{
426 for (const Info& info : infos) {
427 if (auto ref = info.ref.lock()) {
428 ref->compress(info.size);
429 }
430 }
431 infos.clear();
432}
433
434} // namespace openmsx
uintptr_t id
DeltaBlockCopy(std::span< const uint8_t > data)
const uint8_t * getData()
void apply(std::span< uint8_t > dst) const override
void compress(size_t size)
DeltaBlockDiff(std::shared_ptr< DeltaBlockCopy > prev_, std::span< const uint8_t > data)
void apply(std::span< uint8_t > dst) const override
size_t getDeltaSize() const
virtual ~DeltaBlock()=default
std::shared_ptr< DeltaBlock > createNew(const void *id, std::span< const uint8_t > data)
std::shared_ptr< DeltaBlock > createNullDiff(const void *id, std::span< const uint8_t > data)
This class manages the lifetime of a block of memory.
Definition MemBuffer.hh:29
void resize(size_t size)
Grow or shrink the memory block.
Definition MemBuffer.hh:111
void swap(MemBuffer &other) noexcept
Swap the managed memory block of two MemBuffers.
Definition MemBuffer.hh:136
const T * data() const
Returns pointer to the start of the memory buffer.
Definition MemBuffer.hh:81
static Sha1Sum calc(std::span< const uint8_t > data)
Easier to use interface, if you can pass all data in one go.
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
int compressBound(int iSize)
Definition lz4.hh:51
int decompress(const uint8_t *src, uint8_t *dst, int compressedSize, int dstCapacity)
Definition lz4.cc:479
int compress(const uint8_t *src, uint8_t *dst, int srcSize)
Definition lz4.cc:455
This file implemented 3 utility functions:
Definition Autofire.cc:9
bool comp< 4 >(const uint8_t *p, const uint8_t *q)
Definition DeltaBlock.cc:52
bool comp< 8 >(const uint8_t *p, const uint8_t *q)
Definition DeltaBlock.cc:58
AmdFlash::SectorInfo Info
Definition RomManbow2.cc:18
bool comp(const uint8_t *p, const uint8_t *q)
auto copy(InputRange &&range, OutputIter out)
Definition ranges.hh:250
bool equal(InputRange1 &&range1, InputRange2 &&range2, Pred pred={}, Proj1 proj1={}, Proj2 proj2={})
Definition ranges.hh:368
auto lower_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition ranges.hh:115
STL namespace.
size_t size(std::string_view utf8)
constexpr auto end(const zstring_view &x)