openMSX
snappy.cc
1 #include "snappy.hh"
2 #include "aligned.hh"
3 #include "likely.hh"
4 #include "endian.hh"
5 #include "build-info.hh"
6 #include <algorithm>
7 #include <cassert>
8 #include <cstring>
9 #include <cstdint>
10 
11 using namespace openmsx;
12 
13 namespace snappy {
14 
15 enum {
16  LITERAL = 0,
17  COPY_1_BYTE_OFFSET = 1, // 3 bit length + 3 bits of offset in opcode
18  COPY_2_BYTE_OFFSET = 2,
19  COPY_4_BYTE_OFFSET = 3
20 };
21 
22 static inline void unalignedCopy64(const char* src, char* dst)
23 {
24  if (sizeof(void*) == 8) {
25  unalignedStore64(dst + 0, unalignedLoad64(src + 0));
26  } else {
27  // This can be more efficient than unalignedLoad64 +
28  // unalignedStore64 on some platforms, in particular ARM.
29  unalignedStore32(dst + 0, unalignedLoad32(src + 0));
30  unalignedStore32(dst + 4, unalignedLoad32(src + 4));
31  }
32 }
33 static inline void unalignedCopy128(const char* src, char* dst)
34 {
35  unalignedCopy64(src + 0, dst + 0);
36  unalignedCopy64(src + 8, dst + 8);
37 }
38 
40 
41 // Copy "len" bytes from "src" to "op", one byte at a time. Used for
42 // handling COPY operations where the input and output regions may
43 // overlap. For example, suppose:
44 // src == "ab"
45 // op == src + 2
46 // len == 20
47 // After incrementalCopy(src, op, len), the result will have
48 // eleven copies of "ab"
49 // ababababababababababab
50 // Note that this does not match the semantics of either memcpy()
51 // or memmove().
52 static inline void incrementalCopy(const char* src, char* op, ptrdiff_t len)
53 {
54  assert(len > 0);
55  do {
56  *op++ = *src++;
57  } while (--len);
58 }
59 
60 // Equivalent to incrementalCopy() except that it can write up to ten extra
61 // bytes after the end of the copy, and that it is faster.
62 //
63 // The main part of this loop is a simple copy of eight bytes at a time until
64 // we've copied (at least) the requested number of bytes. However, if op and
65 // src are less than eight bytes apart (indicating a repeating pattern of
66 // length < 8), we first need to expand the pattern in order to get the correct
67 // results. For instance, if the buffer looks like this, with the eight-byte
68 // <src> and <op> patterns marked as intervals:
69 //
70 // abxxxxxxxxxxxx
71 // [------]   src
72 //   [------] op
73 //
74 // a single eight-byte copy from <src> to <op> will repeat the pattern once,
75 // after which we can move <op> two bytes without moving <src>:
76 //
77 // ababxxxxxxxxxx
78 // [------]     src
79 //     [------] op
80 //
81 // and repeat the exercise until the two no longer overlap.
82 //
83 // This allows us to do very well in the special case of one single byte
84 // repeated many times, without taking a big hit for more general cases.
85 //
86 // The worst case of extra writing past the end of the match occurs when
87 // op - src == 1 and len == 1; the last copy will read from byte positions
88 // [0..7] and write to [4..11], whereas it was only supposed to write to
89 // position 1. Thus, ten excess bytes.
90 
91 static const int MAX_INCR_COPY_OVERFLOW = 10;
92 
93 static inline void incrementalCopyFast(const char* src, char* op, ptrdiff_t len)
94 {
95  while (op - src < 8) {
96  unalignedCopy64(src, op);
97  len -= op - src;
98  op += op - src;
99  }
100  while (len > 0) {
101  unalignedCopy64(src, op);
102  src += 8;
103  op += 8;
104  len -= 8;
105  }
106 }
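// Example sketch (illustration, not from the original sources): the pattern
// expansion described above, for op - src == 2. The caller must leave
// MAX_INCR_COPY_OVERFLOW (10) spare bytes after the destination, because the
// 8-byte copies can run past the requested end.
static inline void incrementalCopyFastExample()
{
	char buf[2 + 20 + MAX_INCR_COPY_OVERFLOW] = "ab"; // rest is zero-filled slack
	incrementalCopyFast(buf, buf + 2, 20);
	// The 20 copied bytes extend the 2-byte seed to eleven copies of "ab".
	assert(memcmp(buf, "ab" "ab" "ab" "ab" "ab" "ab" "ab" "ab" "ab" "ab" "ab", 22) == 0);
}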
107 
108 static inline uint32_t loadNBytes(const void* p, unsigned n)
109 {
110  // Mapping from i in range [0,4] to a mask to extract the bottom i bytes
111  static const uint32_t wordmask[] = {
112  0, 0xff, 0xffff, 0xffffff, 0xffffffff
113  };
114  return Endian::read_UA_L32(p) & wordmask[n];
115 }
116 
117 // Data stored per entry in lookup table:
118 // Range   Bits-used   Description
119 // ------------------------------------
120 // 1..64   0..7        Literal/copy length encoded in opcode byte
121 // 0..7    8..10       Copy offset encoded in opcode byte / 256
122 // 0..4    11..13      Extra bytes after opcode
123 //
124 // We use eight bits for the length even though 7 would have sufficed,
125 // for efficiency reasons:
126 // (1) Extracting a byte is faster than extracting a bit-field
127 // (2) It properly aligns copy offset so we do not need a <<8
128 // See original code for a generator for this table.
129 static const uint16_t charTable[256] = {
130  0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
131  0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
132  0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
133  0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
134  0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
135  0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
136  0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
137  0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
138  0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
139  0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
140  0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
141  0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
142  0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
143  0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
144  0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
145  0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
146  0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
147  0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
148  0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
149  0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
150  0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
151  0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
152  0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
153  0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
154  0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
155  0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
156  0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
157  0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
158  0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
159  0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
160  0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
161  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
162 };
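// Example sketch (illustration, not from the original sources): how one entry
// of the table above decodes. Tag byte 0x11 is a COPY_1_BYTE_OFFSET opcode
// with length ((0x11 >> 2) & 0x7) + 4 = 8, no offset bits in the tag itself,
// and one extra (offset) byte following the opcode; its entry 0x0808 packs
// exactly that per the bit layout documented above.
static inline void charTableExample()
{
	uint16_t entry = charTable[0x11];
	assert((entry & 0xff)       == 8); // literal/copy length
	assert(((entry >> 8) & 0x7) == 0); // copy offset / 256
	assert((entry >> 11)        == 1); // extra bytes after opcode
}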
163 
164 static const size_t SCRATCH_SIZE = 16;
165 
166 void uncompress(const char* input, size_t inLen,
167  char* output, size_t outLen)
168 {
169  const char* ip = input;
170  const char* ipLimit = input + inLen - SCRATCH_SIZE;
171  char* op = output;
172  char* opLimit = output + outLen;
173 
174  while (ip != ipLimit) {
175  unsigned char c = *ip++;
176  if ((c & 0x3) == LITERAL) {
177  size_t literalLen = (c >> 2) + 1;
178  size_t outLeft = opLimit - op;
179  if (literalLen <= 16 && outLeft >= 16) {
180  // Fast path, used for the majority (about 95%)
181  // of invocations.
182  unalignedCopy128(ip, op);
183  op += literalLen;
184  ip += literalLen;
185  continue;
186  }
187  if (unlikely(literalLen >= 61)) {
188  // Long literal.
189  size_t literalLenLen = literalLen - 60;
190  literalLen = loadNBytes(ip, unsigned(literalLenLen)) + 1;
191  ip += literalLenLen;
192  }
193  memcpy(op, ip, literalLen);
194  op += literalLen;
195  ip += literalLen;
196  } else {
197  uint32_t entry = charTable[c];
198  uint32_t trailer = loadNBytes(ip, entry >> 11);
199  uint32_t length = entry & 0xff;
200  ip += entry >> 11;
201 
202  // offset/256 is encoded in bits 8..10. By just
203  // fetching those bits, we get copyOffset (since the
204  // bit-field starts at bit 8).
205  size_t offset = (entry & 0x700) + trailer;
206  size_t outLeft = opLimit - op;
207  const char* src = op - offset;
208  if (length <= 16 && offset >= 8 && outLeft >= 16) {
209  // Fast path, used for the majority (70-80%) of
210  // dynamic invocations.
211  unalignedCopy128(src, op);
212  } else {
213  if (outLeft >= length + MAX_INCR_COPY_OVERFLOW) {
214  incrementalCopyFast(src, op, length);
215  } else {
216  incrementalCopy(src, op, length);
217  }
218  }
219  op += length;
220  }
221  }
222 }
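// Example sketch (illustration, not from the original sources): a
// hand-assembled stream decoded by uncompress() above. It holds a 2-byte
// literal "ab" (tag 0x04) followed by a copy of length 8 at offset 2
// (tag 0x11 plus one offset byte); the 16 trailing zero bytes are the
// SCRATCH_SIZE padding that compress() also counts in its reported length.
static inline void uncompressExample()
{
	static const char compressed[5 + SCRATCH_SIZE] = {
		0x04, 'a', 'b', // literal, length 2
		0x11, 0x02,     // copy, length 8, offset 2
		// remaining bytes: scratch area, never interpreted
	};
	char out[10];
	uncompress(compressed, sizeof(compressed), out, sizeof(out));
	assert(memcmp(out, "ababababab", 10) == 0);
}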
223 
224 
226 
227 // Any hash function will produce a valid compressed bitstream, but a good hash
228 // function reduces the number of collisions and thus yields better compression
229 // for compressible input, and more speed for incompressible input. Of course,
230 // it doesn't hurt if the hash function is reasonably fast either, as it gets
231 // called a lot.
232 template<int SHIFT> static inline uint32_t hashBytes(uint32_t bytes)
233 {
234  return (bytes * 0x1e35a7bd) >> SHIFT;
235 }
236 template<int SHIFT> static inline uint32_t hash(const char* p)
237 {
238  return hashBytes<SHIFT>(unalignedLoad32(p));
239 }
240 
241 // For 0 <= offset <= 4, getUint32AtOffset(getEightBytesAt(p), offset) will
242 // equal unalignedLoad32(p + offset). Motivation: On x86-64 hardware we have
243 // empirically found that overlapping loads such as
244 // unalignedLoad32(p) ... unalignedLoad32(p+1) ... unalignedLoad32(p+2)
245 // are slower than unalignedLoad64(p) followed by shifts and casts to uint32_t.
246 //
247 // We have different versions for 64- and 32-bit; ideally we would avoid the
248 // two functions and just inline the unalignedLoad64 call into
249 // getUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
250 // enough to avoid loading the value multiple times then. For 64-bit, the load
251 // is done when getEightBytesAt() is called, whereas for 32-bit, the load is
252 // done at getUint32AtOffset() time.
253 
254 template<unsigned> class EightBytesReference;
255 
256 template<> class EightBytesReference<8>
257 {
258 public:
259  void setAddress(const char* ptr)
260  {
261  data = unalignedLoad64(ptr);
262  }
263  template<int OFFSET> uint32_t getUint32AtOffset() const
264  {
265  static_assert((OFFSET >= 0) && (OFFSET <= 4), "must be in [0..4]");
266  int shift = OPENMSX_BIGENDIAN ? (32 - 8 * OFFSET)
267  : ( 8 * OFFSET);
268  return data >> shift;
269  }
270 private:
271  uint64_t data;
272 };
273 
274 template<> class EightBytesReference<4>
275 {
276 public:
277  void setAddress(const char* ptr_)
278  {
279  ptr = ptr_;
280  }
281  template<int OFFSET> uint32_t getUint32AtOffset() const
282  {
283  static_assert((OFFSET >= 0) && (OFFSET <= 4), "must be in [0..4]");
284  return unalignedLoad32(ptr + OFFSET);
285  }
286 private:
287  const char* ptr;
288 };
289 
290 
291 
292 // Count the number of trailing zero bytes (=trailing zero bits divided by 8).
293 // Returns an undefined value if n == 0.
294 template<typename T> static inline unsigned ctzDiv8(T n)
295 {
296  static const unsigned DIV = 8;
297 #if (defined(__i386__) || defined(__x86_64__)) && \
298  ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))
299  // Gcc-3.4 and above provide the following builtins, though we only want
300  // to use them if they are very fast. E.g. on x86 they compile to a single
301  // 'bsf' instruction. If they have to be emulated in software then the
302  // fallback code below is likely faster (because it skips part of the
303  // full ctz calculation).
304  // TODO on vc++ we could use
305  // _BitScanForward and _BitScanForward64
306  // and possibly other compilers have something similar.
307  if (sizeof(T) <= 4) {
308  return __builtin_ctz(n) / DIV;
309  } else {
310  return __builtin_ctzll(n) / DIV;
311  }
312 #else
313  // Classical ctz routine, but skip the last 3 (for DIV=8) iterations.
314  unsigned bits = 8 * sizeof(T);
315  unsigned r = (bits - 1) / DIV;
316  for (unsigned shift = bits / 2; shift >= DIV; shift /= 2) {
317  if (T x = n << shift) {
318  n = x;
319  r -= shift / DIV;
320  }
321  }
322  return r;
323 #endif
324 
325 }
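// Example sketch (illustration, not from the original sources): what
// ctzDiv8() computes. 0x00120000 has its lowest set bit at bit 17, i.e. two
// whole trailing zero bytes; a 64-bit value with only bit 40 set has five.
static inline void ctzDiv8Example()
{
	assert(ctzDiv8(uint32_t(0x00120000)) == 2);
	assert(ctzDiv8(uint64_t(1) << 40) == 5);
}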
326 
327 // Return the largest n such that
328 //
329 // s1[0,n-1] == s2[0,n-1]
330 // and n <= (s2Limit - s2).
331 //
332 // Does not read *s2Limit or beyond.
333 // Does not read *(s1 + (s2Limit - s2)) or beyond.
334 // Requires that s2Limit >= s2.
335 //
336 // This implementation is tuned for 64-bit, little-endian machines. But it
337 // also works fine on 32-bit and/or big-endian machines.
338 
339 // Search either 4 or 8 bytes at a time.
340 template<int> struct FindMatchUnit;
341 template<> struct FindMatchUnit<4> { using type = uint32_t; };
342 template<> struct FindMatchUnit<8> { using type = uint64_t; };
343 
344 static inline int findMatchLength(const char* s1,
345  const char* s2, const char* s2Limit)
346 {
347  assert(s2Limit >= s2);
348  int matched = 0;
349 
350  // Find out how long the match is. We loop over the data N bits at a
351  // time until we find an N-bit block that doesn't match. Then (only on
352  // little endian machines) we find the first non-matching bit and use
353  // that to calculate the total length of the match.
354  using T = FindMatchUnit<sizeof(void*)>::type;
355  while (likely(s2 <= s2Limit - sizeof(T))) {
356  T l2 = unalignedLoad<T>(s2);
357  T l1 = unalignedLoad<T>(s1 + matched);
358  if (unlikely(l2 == l1)) {
359  s2 += sizeof(T);
360  matched += sizeof(T);
361  } else {
362  if (OPENMSX_BIGENDIAN) break;
363  // On current (mid-2008) Opteron models there is a 3%
364  // more efficient code sequence to find the first
365  // non-matching byte. However, what follows is ~10%
366  // better on Intel Core 2 and newer, and we expect
367  // AMD's bsf instruction to improve.
368  return matched + ctzDiv8(l2 ^ l1);
369  }
370  }
371  while ((s2 < s2Limit) && (s1[matched] == *s2)) {
372  ++s2;
373  ++matched;
374  }
375  return matched;
376 }
377 
378 
379 template<bool ALLOW_FAST_PATH>
380 static inline char* emitLiteral(char* op, const char* literal, int len)
381 {
382  int n = len - 1; // Zero-length literals are disallowed
383  if (n < 60) {
384  // Fits in tag byte
385  *op++ = LITERAL | (n << 2);
386 
387  // The vast majority of copies are below 16 bytes, for which a
388  // call to memcpy is overkill. This fast path can sometimes
389  // copy up to 15 bytes too much, but that is okay in the
390  // main loop, since we have a bit to go on for both sides:
391  //
392  // - The input will always have INPUT_MARGIN_BYTES = 15 extra
393  // available bytes, as long as we're in the main loop, and
394  // if not, ALLOW_FAST_PATH = false.
395  // - The output will always have 32 spare bytes (see
396  // maxCompressedLength()).
397  if (ALLOW_FAST_PATH && len <= 16) {
398  unalignedCopy128(literal, op);
399  return op + len;
400  }
401  } else {
402  // Encode in upcoming bytes
403  char* base = op;
404  op++;
405  int count = 0;
406  do {
407  *op++ = n & 0xff;
408  n >>= 8;
409  count++;
410  } while (n > 0);
411  assert(count >= 1);
412  assert(count <= 4);
413  *base = LITERAL | ((59 + count) << 2);
414  }
415  memcpy(op, literal, len);
416  return op + len;
417 }
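// Example sketch (illustration, not from the original sources): the two
// literal encodings produced above. A length-5 literal fits in the tag byte
// (0x10); a length-200 literal uses the "59 + count" form: tag 0xf0 followed
// by a single length byte holding 199, matching the long-literal path in
// uncompress().
static inline void emitLiteralExample()
{
	char data[200] = {}; // dummy literal bytes
	char buf[256];

	char* end = emitLiteral<false>(buf, data, 5);
	assert(buf[0] == char(LITERAL | ((5 - 1) << 2))); // 0x10
	assert(end == buf + 1 + 5);

	end = emitLiteral<false>(buf, data, 200);
	assert(buf[0] == char(LITERAL | ((59 + 1) << 2))); // 0xf0
	assert((unsigned char)buf[1] == 199);
	assert(end == buf + 2 + 200);
}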
418 
419 static inline char* emitCopyLessThan64(char* op, size_t offset, int len)
420 {
421  assert(len <= 64);
422  assert(len >= 4);
423  assert(offset < 65536);
424 
425  if ((len < 12) && (offset < 2048)) {
426  size_t lenMinus4 = len - 4;
427  assert(lenMinus4 < 8); // Must fit in 3 bits
428  *op++ = char(COPY_1_BYTE_OFFSET + ((lenMinus4) << 2) + ((offset >> 8) << 5));
429  *op++ = offset & 0xff;
430  } else {
431  *op++ = COPY_2_BYTE_OFFSET + ((len - 1) << 2);
432  Endian::write_UA_L16(op, uint16_t(offset));
433  op += 2;
434  }
435  return op;
436 }
437 
438 static inline char* emitCopy(char* op, size_t offset, int len)
439 {
440  // Emit 64 byte copies but make sure to keep at least four bytes reserved
441  while (len >= 68) {
442  op = emitCopyLessThan64(op, offset, 64);
443  len -= 64;
444  }
445 
446  // Emit an extra 60 byte copy if we have too much data to fit in one copy
447  if (len > 64) {
448  op = emitCopyLessThan64(op, offset, 60);
449  len -= 60;
450  }
451 
452  // Emit remainder (at least 4 bytes)
453  op = emitCopyLessThan64(op, offset, len);
454  return op;
455 }
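// Worked example (illustration, not from the original sources) of why the
// 68/64 split above always leaves a final chunk of 4..64 bytes for
// emitCopyLessThan64(), which asserts len >= 4:
//   len = 70 -> 64 + 6   (one 64-byte copy, remainder 6)
//   len = 68 -> 64 + 4   (one 64-byte copy, remainder exactly 4)
//   len = 67 -> 60 + 7   (the extra 60-byte copy; without it, a 64-byte copy
//                         would have left a 3-byte remainder, which cannot
//                         be encoded)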
456 
457 // The size of a compression block. Note that many parts of the compression
458 // code assume that BLOCK_SIZE <= 65536; in particular, the hash table
459 // can only store 16-bit offsets, and emitCopy() also assumes the offset
460 // is 65535 bytes or less.
461 static const size_t BLOCK_SIZE = 1 << 16;
462 
463 // Compresses "input" string to the "*op" buffer.
464 //
465 // REQUIRES: "input" is at most "BLOCK_SIZE" bytes long.
466 // REQUIRES: "op" points to an array of memory that is at least
467 // "maxCompressedLength(input.size())" in size.
468 //
469 // Returns an "end" pointer into "op" buffer.
470 // "end - op" is the compressed size of "input".
471 static char* compressFragment(const char* input, size_t inputSize, char* op)
472 {
473  static const int HASH_TABLE_BITS = 14;
474  static const int SHIFT = 32 - HASH_TABLE_BITS;
475  uint16_t table[1 << HASH_TABLE_BITS]; // 32KB
476  memset(table, 0, sizeof(table));
477 
478  // "ip" is the input pointer, and "op" is the output pointer.
479  const char* ip = input;
480  const char* ipEnd = input + inputSize;
481  assert(inputSize <= BLOCK_SIZE);
482  // Bytes in [nextEmit, ip) will be emitted as literal bytes. Or
483  // [nextEmit, ipEnd) after the main loop.
484  const char* nextEmit = ip;
485 
486  static const size_t INPUT_MARGIN_BYTES = 15;
487  if (likely(inputSize >= INPUT_MARGIN_BYTES)) {
488  const char* ipLimit = ipEnd - INPUT_MARGIN_BYTES;
489  uint32_t nextHash = hash<SHIFT>(++ip);
490  while (true) {
491  assert(nextEmit < ip);
492  // The body of this loop calls emitLiteral() once and
493  // then emitCopy one or more times. (The exception is
494  // that when we're close to exhausting the input we
495  // goto emitRemainder.)
496  //
497  // In the first iteration of this loop we're just
498  // starting, so there's nothing to copy, so calling
499  // emitLiteral() once is necessary. And we only start
500  // a new iteration when the current iteration has
501  // determined that a call to emitLiteral() will precede
502  // the next call to emitCopy (if any).
503  //
504  // Step 1: Scan forward in the input looking for a
505  // 4-byte-long match. If we get close to exhausting the
506  // input then goto emitRemainder.
507  //
508  // Heuristic match skipping: If 32 bytes are scanned
509  // with no matches found, start looking only at every
510  // other byte. If 32 more bytes are scanned, look at
511  // every third byte, etc. When a match is found,
512  // immediately go back to looking at every byte. This
513  // is a small loss (~5% performance, ~0.1% density) for
514  // compressible data due to more bookkeeping, but for
515  // non-compressible data (such as JPEG) it's a huge win
516  // since the compressor quickly "realizes" the data is
517  // incompressible and doesn't bother looking for
518  // matches everywhere.
519  //
520  // The "skip" variable keeps track of how many bytes
521  // there are since the last match; dividing it by 32
522  // (ie. right-shifting by five) gives the number of
523  // bytes to move ahead for each iteration.
524  uint32_t skip = 32;
525  const char* nextIp = ip;
526  const char* candidate;
527  do {
528  ip = nextIp;
529  uint32_t h = nextHash;
530  assert(h == hash<SHIFT>(ip));
531  uint32_t bytesBetweenHashLookups = skip++ >> 5;
532  nextIp = ip + bytesBetweenHashLookups;
533  if (unlikely(nextIp > ipLimit)) {
534  goto emitRemainder;
535  }
536  nextHash = hash<SHIFT>(nextIp);
537  candidate = input + table[h];
538  assert(candidate >= input);
539  assert(candidate < ip);
540 
541  table[h] = ip - input;
542  } while (likely(unalignedLoad32(ip) !=
543  unalignedLoad32(candidate)));
544 
545  // Step 2: A 4-byte match has been found. We'll later
546  // see if more than 4 bytes match. But, prior to the
547  // match, input bytes [nextEmit, ip) are unmatched.
548  // Emit them as "literal bytes."
549  assert(nextEmit + 16 <= ipEnd);
550  op = emitLiteral<true>(op, nextEmit, ip - nextEmit);
551 
552  // Step 3: Call emitCopy, and then see if another
553  // emitCopy could be our next move. Repeat until we
554  // find no match for the input immediately after what
555  // was consumed by the last emitCopy call.
556  //
557  // If we exit this loop normally then we need to call
558  // emitLiteral() next, though we don't yet know how big
559  // the literal will be. We handle that by proceeding
560  // to the next iteration of the main loop. We also can
561  // exit this loop via goto if we get close to
562  // exhausting the input.
563  EightBytesReference<sizeof(void*)> inputBytes;
564  uint32_t candidateBytes = 0;
565  do {
566  // We have a 4-byte match at ip, and no need to
567  // emit any "literal bytes" prior to ip.
568  int matched = 4 + findMatchLength(candidate + 4, ip + 4, ipEnd);
569  size_t offset = ip - candidate;
570  assert(0 == memcmp(ip, candidate, matched));
571  ip += matched;
572  op = emitCopy(op, offset, matched);
573  // We could immediately start working at ip
574  // now, but to improve compression we first
575  // update table[hash(ip - 1, ...)].
576  const char* insertTail = ip - 1;
577  nextEmit = ip;
578  if (unlikely(ip >= ipLimit)) {
579  goto emitRemainder;
580  }
581  inputBytes.setAddress(insertTail);
582  uint32_t prevHash = hashBytes<SHIFT>(inputBytes.getUint32AtOffset<0>());
583  table[prevHash] = ip - input - 1;
584  uint32_t curHash = hashBytes<SHIFT>(inputBytes.getUint32AtOffset<1>());
585  candidate = input + table[curHash];
586  candidateBytes = unalignedLoad32(candidate);
587  table[curHash] = ip - input;
588  } while (inputBytes.getUint32AtOffset<1>() == candidateBytes);
589 
590  nextHash = hashBytes<SHIFT>(inputBytes.getUint32AtOffset<2>());
591  ++ip;
592  }
593  }
594 
595 emitRemainder:
596  // Emit the remaining bytes as a literal
597  if (nextEmit < ipEnd) {
598  op = emitLiteral<false>(op, nextEmit, ipEnd - nextEmit);
599  }
600  return op;
601 }
602 
603 void compress(const char* input, size_t inLen,
604  char* output, size_t& outLen)
605 {
606  char* out = output;
607  while (inLen > 0) {
608  size_t numToRead = std::min(inLen, BLOCK_SIZE);
609  out = compressFragment(input, numToRead, out);
610  inLen -= numToRead;
611  input += numToRead;
612  }
613  outLen = out - output + SCRATCH_SIZE;
614 }
615 
616 size_t maxCompressedLength(size_t inLen)
617 {
618  return 32 + SCRATCH_SIZE + inLen + inLen / 6;
619 }
620 
621 }
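A minimal round-trip sketch (illustration only, assuming snappy.hh declares the
compress(), uncompress() and maxCompressedLength() entry points defined above):
the caller sizes the destination with maxCompressedLength(), compress() reports
a length that already includes the SCRATCH_SIZE slack, and uncompress() must be
given the exact uncompressed size.

#include "snappy.hh"
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

int main()
{
	std::string in(100000, 'x'); // highly compressible test data

	// Compress into a buffer of the documented worst-case size.
	std::vector<char> compressed(snappy::maxCompressedLength(in.size()));
	size_t compressedLen;
	snappy::compress(in.data(), in.size(), compressed.data(), compressedLen);
	assert(compressedLen <= compressed.size());

	// Decompress; the caller must already know the uncompressed size.
	std::string out(in.size(), '\0');
	snappy::uncompress(compressed.data(), compressedLen, &out[0], out.size());
	assert(out == in);
}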