openMSX
DeltaBlock.cc
Go to the documentation of this file.
#include "DeltaBlock.hh"
#include "lz4.hh"
#include "ranges.hh"
#include <cassert>
#include <cstring>
#include <tuple>
#include <utility>
#if STATISTICS
#include <iostream>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
13
14namespace openmsx {
15
16// --- Compressed integers ---
17
18// See https://en.wikipedia.org/wiki/LEB128 for a description of the
19// 'unsigned LEB' format.
// Append 'value' to 'result' in unsigned-LEB128 encoding: 7 data bits per
// byte, least-significant group first; the high bit is set on every byte
// except the last. A value of 0 is encoded as the single byte 0x00.
static void storeUleb(std::vector<uint8_t>& result, size_t value)
{
	while (true) {
		auto chunk = uint8_t(value & 0x7F);
		value >>= 7;
		if (value == 0) {
			result.push_back(chunk); // final byte: high bit clear
			return;
		}
		result.push_back(chunk | 0x80); // more bytes follow
	}
}
29
30[[nodiscard]] static constexpr size_t loadUleb(std::span<const uint8_t>& data)
31{
32 size_t result = 0;
33 int shift = 0;
34 while (true) {
35 assert(!data.empty());
36 uint8_t b = data.front();
37 data = data.subspan(1);
38 result |= size_t(b & 0x7F) << shift;
39 if ((b & 0x80) == 0) return result;
40 shift += 7;
41 }
42}
43
44
45// --- Helper functions to compare {4,8,16} bytes at aligned memory locations ---
46
// Compare N bytes at 'p' and 'q'; both pointers are N-byte aligned
// (guaranteed by the caller, scan_mismatch()).
template<int N> bool comp(const uint8_t* p, const uint8_t* q);

// Use memcmp() instead of dereferencing reinterpret_cast'ed integer
// pointers: the old casts violated strict aliasing (the storage holds
// uint8_t objects, not uint32_t/uint64_t). For a constant size, compilers
// lower memcmp to the same single word-compare instruction, so this is
// equally fast and well-defined.
template<> bool comp<4>(const uint8_t* p, const uint8_t* q)
{
	return std::memcmp(p, q, 4) == 0;
}

template<> bool comp<8>(const uint8_t* p, const uint8_t* q)
{
	return std::memcmp(p, q, 8) == 0;
}

#ifdef __SSE2__
template<> bool comp<16>(const uint8_t* p, const uint8_t* q)
{
	// Tests show that (on my machine) using 1 128-bit load is faster than
	// 2 64-bit loads. Even though the actual comparison is slightly more
	// complicated with SSE instructions.
	__m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(p));
	__m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(q));
	__m128i d = _mm_cmpeq_epi8(a, b);
	return _mm_movemask_epi8(d) == 0xffff;
}
#endif
73
74
75// --- Optimized mismatch function ---
76
77// This is much like the function std::mismatch(). You pass in two buffers,
78// the corresponding elements of both buffers are compared and the first
79// position where the elements no longer match is returned.
80//
81// Compared to the std::mismatch() this implementation is faster because:
82// - We make use of sentinels. This requires to temporarily change the content
83// of the buffer. So it won't work with read-only-memory.
84// - We compare words-at-a-time instead of byte-at-a-time.
static std::pair<const uint8_t*, const uint8_t*> scan_mismatch(
	const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
	// Precondition: both buffers have the same length.
	assert((p_end - p) == (q_end - q));

	// When SSE is available, work with 16-byte words, otherwise 4 or 8
	// bytes. This routine also benefits from AVX2 instructions. Though not
	// all x86_64 CPUs have AVX2 (all have SSE2), so using them requires
	// extra run-time checks and that's not worth it at this point.
	constexpr int WORD_SIZE =
#ifdef __SSE2__
		sizeof(__m128i);
#else
		sizeof(void*);
#endif

	// Region too small or
	// both buffers are differently aligned: fall back to the byte-wise
	// std::mismatch at the end.
	if (((p_end - p) < (2 * WORD_SIZE)) ||
	    ((reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) !=
	     (reinterpret_cast<uintptr_t>(q) & (WORD_SIZE - 1)))) [[unlikely]] {
		goto end;
	}

	// Align 'p' (and thus 'q', same misalignment) to a WORD_SIZE boundary.
	// No end-of-buffer checks needed: the region is at least 2 words long,
	// so at least one full aligned word remains for the fast path below.
	if (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1)) [[unlikely]] {
		do {
			if (*p != *q) return {p, q};
			p += 1; q += 1;
		} while (reinterpret_cast<uintptr_t>(p) & (WORD_SIZE - 1));
	}

	// Fast path. Compare words-at-a-time.
	{
		// Place a sentinel in the last full word. This ensures we'll
		// find a mismatch within the buffer, and so we can omit the
		// end-of-buffer checks. (The last full aligned word always
		// covers the byte at p_end[-WORD_SIZE], so the word loop is
		// guaranteed to stop at or before the sentinel.)
		// Requires the 'p' buffer to be writable — see comment above.
		auto* sentinel = &const_cast<uint8_t*>(p_end)[-WORD_SIZE];
		auto save = *sentinel;
		*sentinel = ~q_end[-WORD_SIZE]; // guaranteed != corresponding q byte

		while (comp<WORD_SIZE>(p, q)) {
			p += WORD_SIZE; q += WORD_SIZE;
		}

		// Restore sentinel.
		*sentinel = save;
	}

	// Slow path. This handles:
	// - Small or differently aligned buffers.
	// - The bytes at and after the (restored) sentinel.
end:	return std::mismatch(p, p_end, q);
}
139
140
141// --- Optimized scan_match function ---
142
143// Like scan_mismatch() above, but searches two buffers for the first
144// corresponding equal (instead of not-equal) bytes.
145//
146// Like scan_mismatch(), this places a temporary sentinel in the buffer, so the
147// buffer cannot be read-only memory.
148//
149// Unlike scan_mismatch() it's less obvious how to perform this function
150// word-at-a-time (it's possible with some bit hacks). Though luckily this
151// function is also less performance critical.
[[nodiscard]] static constexpr std::pair<const uint8_t*, const uint8_t*> scan_match(
	const uint8_t* p, const uint8_t* p_end, const uint8_t* q, const uint8_t* q_end)
{
	// Precondition: both buffers have the same length.
	assert((p_end - p) == (q_end - q));

	// Code below is functionally equivalent to:
	//   while ((p != p_end) && (*p != *q)) { ++p; ++q; }
	//   return {p, q};

	if (p == p_end) return {p, q};

	// Sentinel: temporarily force the last byte of 'p' to equal the last
	// byte of 'q'. The loop below is then guaranteed to find a "match"
	// before running past the end, so it needs no bounds check.
	// (Requires the 'p' buffer to be writable.)
	auto* p_last = const_cast<uint8_t*>(p_end - 1);
	auto save = *p_last;
	*p_last = q_end[-1]; // make p_end[-1] == q_end[-1]

	while (*p != *q) { ++p; ++q; }

	*p_last = save;
	// If we stopped exactly at the sentinel position, re-check with the
	// real (restored) byte: a genuine mismatch there means no match was
	// found anywhere, so step to the end.
	if ((p == p_last) && (*p != *q)) { ++p; ++q; }
	return {p, q};
}
173
174
175// --- delta (de)compression routines ---
176
177// Calculate a 'delta' between two binary buffers of equal size.
178// The result is a stream of:
179// n1 number of bytes are equal
180// n2 number of bytes are different, and here are the bytes
181// n3 number of bytes are equal
182// ...
[[nodiscard]] static std::vector<uint8_t> calcDelta(
	const uint8_t* oldBuf, std::span<const uint8_t> newBuf)
{
	std::vector<uint8_t> result;

	const auto* p = oldBuf;        // old data
	const auto* q = newBuf.data(); // new data
	auto size = newBuf.size();
	const auto* p_end = p + size;
	const auto* q_end = q + size;

	// scan equal bytes (possibly zero)
	const auto* q1 = q;
	std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
	auto n1 = q - q1;
	storeUleb(result, n1);

	while (q != q_end) {
		// Loop invariant: we're at the start of a "different" run.
		assert(*p != *q);

		const auto* q2 = q; // start of the "different" chunk
	different:
		// Skip the current (mismatching) byte, then scan to the next
		// position where old and new data agree again.
		std::tie(p, q) = scan_match(p + 1, p_end, q + 1, q_end);
		auto n2 = q - q2;

		// Scan the following "equal" run.
		const auto* q3 = q;
		std::tie(p, q) = scan_mismatch(p, p_end, q, q_end);
		auto n3 = q - q3;
		// An interior equal-run of <= 2 bytes isn't worth encoding
		// separately (the extra length field costs as much): merge it
		// into the current "different" chunk and keep scanning.
		if ((q != q_end) && (n3 <= 2)) goto different;

		storeUleb(result, n2);               // length of the different chunk
		result.insert(result.end(), q2, q3); // ... followed by its bytes

		// n3 == 0 only happens at the very end of the buffer; the
		// trailing zero-length "equal" run is simply omitted.
		if (n3 != 0) storeUleb(result, n3);
	}

	result.shrink_to_fit();
	return result;
}
222
223// Apply a previously calculated 'delta' to 'oldBuf' to get 'newbuf'.
224static void applyDeltaInPlace(std::span<uint8_t> buf, std::span<const uint8_t> delta)
225{
226 while (!buf.empty()) {
227 auto n1 = loadUleb(delta);
228 buf = buf.subspan(n1);
229 if (buf.empty()) break;
230
231 auto n2 = loadUleb(delta);
232 ranges::copy(delta.subspan(0, n2), buf);
233 buf = buf .subspan(n2);
234 delta = delta.subspan(n2);
235 }
236}
237
#if STATISTICS

// class DeltaBlock

// Statistics-only destructor: removes this block's contribution from the
// global allocation counter and logs the new total.
// NOTE(review): the signature line was lost in extraction; reconstructed as
// the DeltaBlock destructor based on the surrounding statistics code and
// the "// class DeltaBlock" section header — confirm against the repo.
DeltaBlock::~DeltaBlock()
{
	globalAllocSize -= allocSize;
	std::cout << "stat: ~DeltaBlock " << globalAllocSize
	          << " (-" << allocSize << ")\n";
}

#endif
250
251// class DeltaBlockCopy
252
253DeltaBlockCopy::DeltaBlockCopy(std::span<const uint8_t> data)
254 : block(data.size())
255{
256#ifdef DEBUG
257 sha1 = SHA1::calc(data);
258#endif
259 ranges::copy(data, block.data());
260 assert(!compressed());
261#if STATISTICS
262 allocSize = size;
263 globalAllocSize += allocSize;
264 std::cout << "stat: DeltaBlockCopy " << globalAllocSize
265 << " (+" << allocSize << ")\n";
266#endif
267}
268
269void DeltaBlockCopy::apply(std::span<uint8_t> dst) const
270{
271 if (compressed()) {
272 LZ4::decompress(block.data(), dst.data(), int(compressedSize), int(dst.size()));
273 } else {
274 ranges::copy(std::span{block.data(), dst.size()}, dst);
275 }
276#ifdef DEBUG
277 assert(SHA1::calc(dst) == sha1);
278#endif
279}
280
// Compress the (so far uncompressed) snapshot in place with LZ4.
// 'size' is the uncompressed size of the block. No-op when the block is
// already compressed or when compression would not shrink it.
// NOTE(review): the signature line was lost in extraction; reconstructed
// from the class member list: void DeltaBlockCopy::compress(size_t size).
void DeltaBlockCopy::compress(size_t size)
{
	if (compressed()) return;

	size_t dstLen = LZ4::compressBound(int(size));
	MemBuffer<uint8_t> buf2(dstLen);
	dstLen = LZ4::compress(block.data(), buf2.data(), int(size));

	if (dstLen >= size) {
		// compression isn't beneficial
		return;
	}
	compressedSize = dstLen;
	block.swap(buf2); // 'block' now holds the compressed data, 'buf2' the original
	block.resize(compressedSize); // shrink to fit
	assert(compressed());
#ifdef DEBUG
	// Round-trip check: decompressing must reproduce the original bytes
	// (which, after the swap above, live in 'buf2').
	MemBuffer<uint8_t> buf3(size);
	apply({buf3.data(), size});
	assert(ranges::equal(std::span{buf3.data(), size}, std::span{buf2.data(), size}));
#endif
#if STATISTICS
	// 'delta' is computed before allocSize is overwritten on purpose.
	int delta = compressedSize - allocSize;
	allocSize = compressedSize;
	globalAllocSize += delta;
	std::cout << "stat: compress " << globalAllocSize
	          << " (" << delta << ")\n";
#endif
}
310
// Direct read-only access to the snapshot bytes. Only valid while the block
// has not (yet) been compressed — used by DeltaBlockDiff as the reference
// buffer for delta calculation.
// NOTE(review): signature line lost in extraction; reconstructed from the
// class member list: const uint8_t* DeltaBlockCopy::getData().
const uint8_t* DeltaBlockCopy::getData()
{
	assert(!compressed());
	return block.data();
}
316
317
318// class DeltaBlockDiff
319
// Store 'data' as a delta relative to the (still uncompressed) reference
// block 'prev_'. Only the delta stream is kept; apply() replays it on top
// of the reference block.
// NOTE(review): the first signature line was lost in extraction;
// reconstructed from the class member list.
DeltaBlockDiff::DeltaBlockDiff(
	std::shared_ptr<DeltaBlockCopy> prev_,
	std::span<const uint8_t> data)
	: prev(std::move(prev_))
	, delta(calcDelta(prev->getData(), data)) // 'prev' (the member) is already initialized here
{
#ifdef DEBUG
	sha1 = SHA1::calc(data);

	// Self-check: applying the freshly computed delta must reproduce 'data'.
	MemBuffer<uint8_t> buf(data.size());
	apply({buf.data(), data.size()});
	assert(ranges::equal(std::span{buf.data(), data.size()}, data));
#endif
#if STATISTICS
	allocSize = delta.size();
	globalAllocSize += allocSize;
	std::cout << "stat: DeltaBlockDiff " << globalAllocSize
	          << " (+" << allocSize << ")\n";
#endif
}
340
// Reconstruct the snapshot: first materialize the reference block into
// 'dst', then patch it in place with the stored delta.
void DeltaBlockDiff::apply(std::span<uint8_t> dst) const
{
	prev->apply(dst);
	applyDeltaInPlace(dst, delta);
#ifdef DEBUG
	// The reconstructed bytes must match the checksum taken at creation.
	assert(SHA1::calc(dst) == sha1);
#endif
}
349
// Size in bytes of the stored delta stream; LastDeltaBlocks uses this to
// decide when accumulated diffs outweigh a fresh full copy.
// NOTE(review): signature line lost in extraction; reconstructed from the
// class member list: size_t DeltaBlockDiff::getDeltaSize() const.
size_t DeltaBlockDiff::getDeltaSize() const
{
	return delta.size();
}
354
355
356// class LastDeltaBlocks
357
358std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNew(
359 const void* id, std::span<const uint8_t> data)
360{
361 auto size = data.size();
362 auto it = ranges::lower_bound(infos, std::tuple(id, size), {},
363 [](const Info& info) { return std::tuple(info.id, info.size); });
364 if ((it == end(infos)) || (it->id != id) || (it->size != size)) {
365 // no previous info yet
366 it = infos.emplace(it, id, size);
367 }
368 assert(it->id == id);
369 assert(it->size == size);
370
371 auto ref = it->ref.lock();
372 if (it->accSize >= size || !ref) {
373 if (ref) {
374 // We will switch to a new DeltaBlockCopy object. So
375 // now is a good time to compress the old one.
376 ref->compress(size);
377 }
378 // Heuristic: create a new block when too many small
379 // differences have accumulated.
380 auto b = std::make_shared<DeltaBlockCopy>(data);
381 it->ref = b;
382 it->last = b;
383 it->accSize = 0;
384 return b;
385 } else {
386 // Create diff based on earlier reference block.
387 // Reference remains unchanged.
388 auto b = std::make_shared<DeltaBlockDiff>(ref, data);
389 it->last = b;
390 it->accSize += b->getDeltaSize();
391 return b;
392 }
393}
394
395std::shared_ptr<DeltaBlock> LastDeltaBlocks::createNullDiff(
396 const void* id, std::span<const uint8_t> data)
397{
398 auto size = data.size();
399 auto it = ranges::lower_bound(infos, id, {}, &Info::id);
400 if ((it == end(infos)) || (it->id != id)) {
401 // no previous block yet
402 it = infos.emplace(it, id, size);
403 }
404 assert(it->size == size);
405
406 auto last = it->last.lock();
407 if (!last) {
408 auto b = std::make_shared<DeltaBlockCopy>(data);
409 it->ref = b;
410 it->last = b;
411 it->accSize = 0;
412 return b;
413 } else {
414#ifdef DEBUG
415 assert(SHA1::calc(data) == last->sha1);
416#endif
417 return last;
418 }
419}
420
// Drop all bookkeeping. Blocks still owned elsewhere (via shared_ptr) stay
// alive; any still-referenced reference block is compressed first since it
// will receive no further diffs.
// NOTE(review): signature line lost in extraction; reconstructed as
// void LastDeltaBlocks::clear() based on the member usage — confirm
// against the header.
void LastDeltaBlocks::clear()
{
	for (const Info& info : infos) {
		if (auto ref = info.ref.lock()) {
			ref->compress(info.size);
		}
	}
	infos.clear();
}
430
431} // namespace openmsx
uintptr_t id
DeltaBlockCopy(std::span< const uint8_t > data)
const uint8_t * getData()
void apply(std::span< uint8_t > dst) const override
void compress(size_t size)
DeltaBlockDiff(std::shared_ptr< DeltaBlockCopy > prev_, std::span< const uint8_t > data)
void apply(std::span< uint8_t > dst) const override
size_t getDeltaSize() const
virtual ~DeltaBlock()=default
std::shared_ptr< DeltaBlock > createNew(const void *id, std::span< const uint8_t > data)
std::shared_ptr< DeltaBlock > createNullDiff(const void *id, std::span< const uint8_t > data)
This class manages the lifetime of a block of memory.
Definition MemBuffer.hh:29
void resize(size_t size)
Grow or shrink the memory block.
Definition MemBuffer.hh:111
void swap(MemBuffer &other) noexcept
Swap the managed memory block of two MemBuffers.
Definition MemBuffer.hh:136
const T * data() const
Returns pointer to the start of the memory buffer.
Definition MemBuffer.hh:81
static Sha1Sum calc(std::span< const uint8_t > data)
Easier to use interface, if you can pass all data in one go.
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
int compressBound(int iSize)
Definition lz4.hh:51
int decompress(const uint8_t *src, uint8_t *dst, int compressedSize, int dstCapacity)
Definition lz4.cc:480
int compress(const uint8_t *src, uint8_t *dst, int srcSize)
Definition lz4.cc:456
This file implemented 3 utility functions:
Definition Autofire.cc:9
bool comp< 4 >(const uint8_t *p, const uint8_t *q)
Definition DeltaBlock.cc:49
bool comp< 8 >(const uint8_t *p, const uint8_t *q)
Definition DeltaBlock.cc:55
AmdFlash::SectorInfo Info
Definition RomManbow2.cc:18
bool comp(const uint8_t *p, const uint8_t *q)
auto copy(InputRange &&range, OutputIter out)
Definition ranges.hh:250
bool equal(InputRange1 &&range1, InputRange2 &&range2, Pred pred={}, Proj1 proj1={}, Proj2 proj2={})
Definition ranges.hh:368
auto lower_bound(ForwardRange &&range, const T &value, Compare comp={}, Proj proj={})
Definition ranges.hh:115
STL namespace.
size_t size(std::string_view utf8)
constexpr auto end(const zstring_view &x)