5 #include "build-info.hh" 22 static inline void unalignedCopy64(
const char* src,
char* dst)
24 if (
sizeof(
void*) == 8) {
25 unalignedStore64(dst + 0, unalignedLoad64(src + 0));
29 unalignedStore32(dst + 0, unalignedLoad32(src + 0));
30 unalignedStore32(dst + 4, unalignedLoad32(src + 4));
33 static inline void unalignedCopy128(
const char* src,
char* dst)
35 unalignedCopy64(src + 0, dst + 0);
36 unalignedCopy64(src + 8, dst + 8);
// Byte-wise copy that supports the LZ77 back-reference case where 'src'
// lies fewer than 'len' bytes behind 'op': copying one byte at a time
// replicates the pattern between src and op.
// NOTE(review): the extraction elided this body entirely; reconstructed
// from the upstream snappy IncrementalCopy — verify against the project.
static inline void incrementalCopy(const char* src, char* op, ptrdiff_t len)
{
	assert(len > 0);
	do {
		*op++ = *src++;
	} while (--len > 0);
}

// incrementalCopyFast() may write up to this many bytes past the
// requested end of the copy; callers must reserve that much slack.
static const int MAX_INCR_COPY_OVERFLOW = 10;
93 static inline void incrementalCopyFast(
const char* src,
char* op, ptrdiff_t len)
95 while (op - src < 8) {
96 unalignedCopy64(src, op);
101 unalignedCopy64(src, op);
108 static inline uint32_t loadNBytes(
const void* p,
unsigned n)
111 static const uint32_t wordmask[] = {
112 0, 0xff, 0xffff, 0xffffff, 0xffffffff
114 return Endian::read_UA_L32(p) & wordmask[n];
// Decode table indexed by a snappy tag byte. Layout of each 16-bit entry,
// as used by the decompressor: bits 0-7 = 'length', bits 8-10 = high bits
// of the copy offset, bits 11-15 = number of trailer bytes to load.
// NOTE(review): field meanings inferred from the uses 'entry & 0xff',
// 'entry & 0x700' and 'entry >> 11' in uncompress() — confirm against the
// snappy format description. Values transcribed verbatim from the
// (mangled) extraction; array syntax restored.
static const uint16_t charTable[256] = {
	0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
	0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
	0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
	0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
	0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
	0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
	0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
	0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
	0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
	0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
	0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
	0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
	0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
	0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
	0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
	0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
	0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
	0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
	0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
	0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
	0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
	0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
	0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
	0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
	0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
	0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
	0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
	0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
	0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
	0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
	0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
	0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
};
// --- uncompress (fragment) ---------------------------------------------
// NOTE(review): mangled extraction — the numbers fused into the lines
// below are original source line numbers, and many intermediate lines
// (function header, braces, else-branches, pointer advances) were elided.
// Code left byte-identical; comments only.
// Decoder needs this many spare bytes so the 16-byte blind copies below
// cannot run past the real end of the buffers.
164 static const size_t SCRATCH_SIZE = 16;
167 char* output,
size_t outLen)
169 const char* ip = input;
// ipLimit leaves SCRATCH_SIZE bytes of input slack for the fast paths.
170 const char* ipLimit = input + inLen - SCRATCH_SIZE;
172 char* opLimit = output + outLen;
// Main decode loop: one tag byte per element.
174 while (ip != ipLimit) {
175 unsigned char c = *ip++;
// Literal element: short form encodes length as (c >> 2) + 1.
177 size_t literalLen = (c >> 2) + 1;
178 size_t outLeft = opLimit - op;
// Fast path: short literal with room for a blind 16-byte copy.
179 if (literalLen <= 16 && outLeft >= 16) {
182 unalignedCopy128(ip, op);
// Long literal: (literalLen - 60) trailing bytes hold the real length.
189 size_t literalLenLen = literalLen - 60;
190 literalLen = loadNBytes(ip,
unsigned(literalLenLen)) + 1;
193 memcpy(op, ip, literalLen);
// Copy element: decode length/offset fields via charTable.
197 uint32_t entry = charTable[c];
// Top 5 bits of the entry = number of trailer bytes to read.
198 uint32_t trailer = loadNBytes(ip, entry >> 11);
199 uint32_t
length = entry & 0xff;
// Bits 8-10 of the entry are the high bits of the back-reference offset.
205 size_t offset = (entry & 0x700) + trailer;
206 size_t outLeft = opLimit - op;
207 const char* src = op - offset;
// Fast path: blind 16-byte copy is safe when offset >= 8 (no pattern
// overlap within the first 8 bytes) and output slack is available.
208 if (length <= 16 && offset >= 8 && outLeft >= 16) {
211 unalignedCopy128(src, op);
// Otherwise fall back to the incremental (possibly overlapping) copies.
213 if (outLeft >= length + MAX_INCR_COPY_OVERFLOW) {
214 incrementalCopyFast(src, op, length);
216 incrementalCopy(src, op, length);
// Multiplicative hash of 4 input bytes; the result keeps the top
// (32 - SHIFT) bits of the product, i.e. SHIFT selects the hash-table
// index width.
// NOTE(review): reconstructed from a mangled extraction (braces elided).
template<int SHIFT>
static inline uint32_t hashBytes(uint32_t bytes)
{
	return (bytes * 0x1e35a7bd) >> SHIFT;
}
236 template<
int SHIFT>
static inline uint32_t hash(
const char* p)
238 return hashBytes<SHIFT>(unalignedLoad32(p));
// --- byte-window helper-class method fragments (orig. lines 261-284) ---
// NOTE(review): these lines belong to methods of a helper class whose
// declaration lies outside this extraction (see setAddress /
// getUint32AtOffset). Left byte-identical; comments only.
// Cache 8 bytes starting at 'ptr' into the member 'data'.
261 data = unalignedLoad64(ptr);
// getUint32AtOffset<OFFSET>(): return the 4 bytes at OFFSET within the
// cached 8-byte window, without re-reading memory.
265 static_assert((OFFSET >= 0) && (OFFSET <= 4),
"must be in [0..4]");
// Shift amount depends on host endianness so the same expression
// extracts the bytes in memory order (else-branch elided by extraction).
266 int shift = OPENMSX_BIGENDIAN ? (32 - 8 * OFFSET)
268 return data >> shift;
// Alternative implementation of the same accessor: re-read directly
// from memory at ptr + OFFSET.
283 static_assert((OFFSET >= 0) && (OFFSET <= 4),
"must be in [0..4]");
284 return unalignedLoad32(ptr + OFFSET);
// ctzDiv8(n): count-trailing-zero-bits of n divided by 8, i.e. the number
// of trailing zero BYTES. Used by findMatchLength to turn an XOR
// difference into a matched-byte count. Precondition (from the builtins
// used): n != 0.
// NOTE(review): mangled extraction — the #if/#else/#endif structure and
// the tail of the portable fallback loop were elided. Byte-identical.
294 template<
typename T>
static inline unsigned ctzDiv8(T n)
296 static const unsigned DIV = 8;
// GCC on x86/x86-64: use the ctz builtins.
// NOTE(review): '__GNUC_MINOR' looks like a typo for '__GNUC_MINOR__';
// the undefined macro evaluates to 0 in #if, so the GCC 3.x check is
// effectively always false — verify against the project source.
297 #if (defined(__i386__) || defined(__x86_64__)) && \ 298 ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR >= 3))) 307 if (
sizeof(T) <= 4) {
308 return __builtin_ctz(n) / DIV;
310 return __builtin_ctzll(n) / DIV;
// Portable fallback: binary search for the lowest set bit.
314 unsigned bits = 8 *
sizeof(T);
315 unsigned r = (bits - 1) / DIV;
316 for (
unsigned shift = bits / 2; shift >= DIV; shift /= 2) {
317 if (T x = n << shift) {
// findMatchLength(s1, s2, s2Limit): number of leading bytes on which the
// sequences at s1 and s2 agree, comparing a machine word at a time, with
// s2 bounded by s2Limit. (Presumably s1 has at least as many readable
// bytes as s2 — snappy-style precondition; confirm against callers.)
// NOTE(review): mangled extraction — the declarations of 'matched' and
// the word type 'T', plus several braces, were elided. Byte-identical.
344 static inline int findMatchLength(
const char* s1,
345 const char* s2,
const char* s2Limit)
347 assert(s2Limit >= s2);
// Word-at-a-time comparison loop.
355 while (
likely(s2 <= s2Limit -
sizeof(T))) {
356 T l2 = unalignedLoad<T>(s2);
357 T l1 = unalignedLoad<T>(s1 + matched);
360 matched +=
sizeof(T);
// The trailing-zero trick below assumes little-endian byte order; on
// big-endian hosts fall through to the byte loop instead.
362 if (OPENMSX_BIGENDIAN)
break;
// Little-endian: trailing zero bytes of (l2 ^ l1) == count of equal
// leading bytes in the differing word.
368 return matched + ctzDiv8(l2 ^ l1);
// Byte-by-byte tail (also the big-endian path).
371 while ((s2 < s2Limit) && (s1[matched] == *s2)) {
// emitLiteral: write one literal element (tag byte(s) followed by the raw
// bytes) to 'op' and return the new output position. When ALLOW_FAST_PATH
// is set, short literals are copied with a blind 16-byte copy, so the
// caller must guarantee at least 16 bytes of output slack.
// NOTE(review): mangled extraction — the tag/length-encoding lines and
// braces were elided. Byte-identical; comments only.
379 template<
bool ALLOW_FAST_PATH>
380 static inline char* emitLiteral(
char* op,
const char* literal,
int len)
// Fast path: one 16-byte unaligned copy covers any len <= 16.
397 if (ALLOW_FAST_PATH && len <= 16) {
398 unalignedCopy128(literal, op);
// Slow path: exact-length memcpy.
415 memcpy(op, literal, len);
// emitCopyLessThan64: emit one copy element with len < 64 and
// offset < 65536, returning the new output position.
// NOTE(review): mangled extraction — the tag-byte construction lines and
// the else-branch structure were elided. Byte-identical; comments only.
419 static inline char* emitCopyLessThan64(
char* op,
size_t offset,
int len)
423 assert(offset < 65536);
// Compact 2-byte encoding is possible when 4 <= len < 12 and the
// offset fits in 11 bits.
425 if ((len < 12) && (offset < 2048)) {
426 size_t lenMinus4 = len - 4;
// len - 4 must fit in 3 bits for this encoding.
427 assert(lenMinus4 < 8);
429 *op++ = offset & 0xff;
// General encoding: 16-bit little-endian offset follows the tag byte.
432 Endian::write_UA_L16(op, uint16_t(offset));
// emitCopy: emit a copy of arbitrary length as a sequence of <64-byte
// copy elements: 64-byte chunks first, then (presumably) a 60/remainder
// split so the final piece stays >= 4 bytes — the loop structure was
// elided by the extraction; confirm against the project source.
// NOTE(review): byte-identical code, comments only.
438 static inline char* emitCopy(
char* op,
size_t offset,
int len)
442 op = emitCopyLessThan64(op, offset, 64);
448 op = emitCopyLessThan64(op, offset, 60);
453 op = emitCopyLessThan64(op, offset, len);
// Compression processes the input in blocks of at most 64 kB, so copy
// offsets always fit in 16 bits (see emitCopyLessThan64's assert).
461 static const size_t BLOCK_SIZE = 1 << 16;
// compressFragment: compress 'inputSize' bytes (<= BLOCK_SIZE) from
// 'input', writing the stream to 'op' and returning the new output end.
// NOTE(review): mangled extraction — many lines (outer-loop braces,
// ip/skip bookkeeping, match-loop structure) were elided. Code left
// byte-identical; comments only.
471 static char* compressFragment(
const char* input,
size_t inputSize,
char* op)
// Hash table of 2^14 16-bit offsets into the current block; zeroed so
// every slot initially points at input[0].
473 static const int HASH_TABLE_BITS = 14;
474 static const int SHIFT = 32 - HASH_TABLE_BITS;
475 uint16_t table[1 << HASH_TABLE_BITS];
476 memset(table, 0,
sizeof(table));
479 const char* ip = input;
480 const char* ipEnd = input + inputSize;
481 assert(inputSize <= BLOCK_SIZE);
// Bytes in [nextEmit, ip) are pending literal output.
484 const char* nextEmit = ip;
// The fast emit paths require this many spare bytes at the input end.
486 static const size_t INPUT_MARGIN_BYTES = 15;
487 if (
likely(inputSize >= INPUT_MARGIN_BYTES)) {
488 const char* ipLimit = ipEnd - INPUT_MARGIN_BYTES;
489 uint32_t nextHash = hash<SHIFT>(++ip);
491 assert(nextEmit < ip);
525 const char* nextIp = ip;
526 const char* candidate;
// Match-search loop: probe the hash table for a previous occurrence of
// the 4 bytes at ip.
529 uint32_t h = nextHash;
530 assert(h == hash<SHIFT>(ip));
// 'skip >> 5' grows by one every 32 misses, so the search steps further
// ahead the longer it goes without finding a match (skip heuristic).
531 uint32_t bytesBetweenHashLookups = skip++ >> 5;
532 nextIp = ip + bytesBetweenHashLookups;
536 nextHash = hash<SHIFT>(nextIp);
537 candidate = input + table[h];
538 assert(candidate >= input);
539 assert(candidate < ip);
// Record the current position for future lookups.
541 table[h] = ip - input;
// Loop until the 4 bytes at ip and at the candidate actually match.
542 }
while (
likely(unalignedLoad32(ip) !=
543 unalignedLoad32(candidate)));
// Match found: first flush the pending literal (fast path is safe
// because the input margin guarantees 16 readable bytes).
549 assert(nextEmit + 16 <= ipEnd);
550 op = emitLiteral<true>(op, nextEmit, ip - nextEmit);
564 uint32_t candidateBytes = 0;
// Extend the guaranteed 4-byte match as far as possible and emit it.
568 int matched = 4 + findMatchLength(candidate + 4, ip + 4, ipEnd);
569 size_t offset = ip - candidate;
570 assert(0 == memcmp(ip, candidate, matched));
572 op = emitCopy(op, offset, matched);
// Re-prime the hash table around the end of the match so back-to-back
// matches are found; 'inputBytes' caches 8 bytes at insertTail.
576 const char* insertTail = ip - 1;
581 inputBytes.setAddress(insertTail);
582 uint32_t prevHash = hashBytes<SHIFT>(inputBytes.getUint32AtOffset<0>());
583 table[prevHash] = ip - input - 1;
584 uint32_t curHash = hashBytes<SHIFT>(inputBytes.getUint32AtOffset<1>());
585 candidate = input + table[curHash];
586 candidateBytes = unalignedLoad32(candidate);
587 table[curHash] = ip - input;
// Keep emitting copies while the next position immediately matches.
588 }
while (inputBytes.getUint32AtOffset<1>() == candidateBytes);
590 nextHash = hashBytes<SHIFT>(inputBytes.getUint32AtOffset<2>());
// Emit whatever trailing literal remains after the main loop.
597 if (nextEmit < ipEnd) {
598 op = emitLiteral<false>(op, nextEmit, ipEnd - nextEmit);
// --- compress() driver + maxCompressedLength() body (fragments) --------
// NOTE(review): mangled extraction — the function headers and the
// per-block loop braces were elided. Byte-identical; comments only.
// compress(): feeds the input to compressFragment in chunks of at most
// BLOCK_SIZE bytes and reports the produced size in 'outLen'.
604 char* output,
size_t& outLen)
608 size_t numToRead =
std::min(inLen, BLOCK_SIZE);
609 out = compressFragment(input, numToRead, out);
// Reported length includes SCRATCH_SIZE bytes of slack (see the
// decompressor's blind 16-byte copies).
613 outLen = out - output + SCRATCH_SIZE;
// maxCompressedLength(inLen): worst-case output size bound.
618 return 32 + SCRATCH_SIZE + inLen + inLen / 6;
T length(const vecN< N, T > &x)
size_t maxCompressedLength(size_t inLen)
uint32_t getUint32AtOffset() const
vecN< N, T > min(const vecN< N, T > &x, const vecN< N, T > &y)
void compress(const char *input, size_t inLen, char *output, size_t &outLen)
constexpr auto data(C &c) -> decltype(c.data())
void uncompress(const char *input, size_t inLen, char *output, size_t outLen)
Thanks to enen for testing this on a real cartridge:
void setAddress(const char *ptr)
auto count(InputRange &&range, const T &value)
void setAddress(const char *ptr_)
uint32_t getUint32AtOffset() const