openMSX
MemoryOps.cc
Go to the documentation of this file.
1#include "MemoryOps.hh"
2#include "build-info.hh"
3#include "systemfuncs.hh"
4#include "endian.hh"
5#include "narrow.hh"
6#include "stl.hh"
7#include "unreachable.hh"
8#include <bit>
9#include <cassert>
10#include <cstdlib>
11#include <cstdint>
12#include <new> // for std::bad_alloc
13#if ASM_X86 && defined _MSC_VER
14#include <intrin.h> // for __stosd intrinsic
15#endif
16#ifdef __SSE2__
17#include <emmintrin.h>
18#endif
19
21
22#ifdef __SSE2__
23#if ASM_X86_32 && defined _MSC_VER
24// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
25// only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
26// alternative would be to always use this routine, but this generates worse
27// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
// Replacement for the _mm_set1_epi64x() intrinsic: returns a 128-bit value
// that holds 'val' in both of its 64-bit halves.
[[nodiscard]] static inline __m128i _mm_set1_epi64x(uint64_t val)
{
    // _mm_set_epi32() takes its arguments from the most significant 32-bit
    // lane down to the least significant one, so every 64-bit half is
    // passed as (upper 32 bits, lower 32 bits).
    const uint32_t upper = static_cast<uint32_t>(val >> 32);
    const uint32_t lower = static_cast<uint32_t>(val);
    return _mm_set_epi32(upper, lower, upper, lower);
}
34#endif
35
36static inline void memset_64_SSE(
37 uint64_t* out, size_t num64, uint64_t val64)
38{
39 if (num64 == 0) [[unlikely]] return;
40
41 // Align at 16-byte boundary.
42 if (size_t(out) & 8) [[unlikely]] {
43 out[0] = val64;
44 ++out; --num64;
45 }
46
47 __m128i val128 = _mm_set1_epi64x(narrow_cast<int64_t>(val64));
48 uint64_t* e = out + num64 - 3;
49 for (; out < e; out += 4) {
50 _mm_store_si128(reinterpret_cast<__m128i*>(out + 0), val128);
51 _mm_store_si128(reinterpret_cast<__m128i*>(out + 2), val128);
52 }
53 if (num64 & 2) [[unlikely]] {
54 _mm_store_si128(reinterpret_cast<__m128i*>(out), val128);
55 out += 2;
56 }
57 if (num64 & 1) [[unlikely]] {
58 out[0] = val64;
59 }
60}
61#endif
62
63static inline void memset_64(
64 uint64_t* out, size_t num64, uint64_t val64)
65{
66 assert((size_t(out) % 8) == 0); // must be 8-byte aligned
67
68#ifdef __SSE2__
69 memset_64_SSE(out, num64, val64);
70 return;
71#endif
72 uint64_t* e = out + num64 - 3;
73 for (; out < e; out += 4) {
74 out[0] = val64;
75 out[1] = val64;
76 out[2] = val64;
77 out[3] = val64;
78 }
79 if (num64 & 2) [[unlikely]] {
80 out[0] = val64;
81 out[1] = val64;
82 out += 2;
83 }
84 if (num64 & 1) [[unlikely]] {
85 out[0] = val64;
86 }
87}
88
89static inline void memset_32_2(
90 uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
91{
92 assert((size_t(out) % 4) == 0); // must be 4-byte aligned
93 if (num32 == 0) [[unlikely]] return;
94
95 // Align at 8-byte boundary.
96 if (size_t(out) & 4) [[unlikely]] {
97 out[0] = val1; // start at odd pixel
98 ++out; --num32;
99 }
100
101 uint64_t val64 = Endian::BIG ? (uint64_t(val0) << 32) | val1
102 : val0 | (uint64_t(val1) << 32);
103 memset_64(reinterpret_cast<uint64_t*>(out), num32 / 2, val64);
104
105 if (num32 & 1) [[unlikely]] {
106 out[num32 - 1] = val0;
107 }
108}
109
// Fill 'num32' 32-bit words starting at 'out' with 'val32'.
// 'out' must be 4-byte aligned (checked with an assert).
// On x86 this uses the __stosd intrinsic (Visual C++) or memset_32_2()
// (other compilers); elsewhere it uses a plain C++ loop unrolled 8 times.
static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
{
    assert((size_t(out) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
    // VC++'s __stosd intrinsic results in emulator benchmarks
    // running about 7% faster than with memset_32_2, streaming or not,
    // and about 3% faster than the C code below.
    __stosd(reinterpret_cast<unsigned long*>(out), val32, num32);
#else
    memset_32_2(out, num32, val32, val32);
#endif
#else
    // Main loop: 8 words per iteration.
    // The number of iterations is computed up front instead of comparing
    // against an end pointer 'out + num32 - 7': for num32 < 7 that
    // expression forms a pointer in front of the buffer, which is undefined
    // behavior.
    for (size_t blocks = num32 / 8; blocks != 0; --blocks) {
        out[0] = val32;
        out[1] = val32;
        out[2] = val32;
        out[3] = val32;
        out[4] = val32;
        out[5] = val32;
        out[6] = val32;
        out[7] = val32;
        out += 8;
    }
    // Tail: the three low bits of the count tell how many words (0..7)
    // remain.
    if (num32 & 4) [[unlikely]] {
        out[0] = val32;
        out[1] = val32;
        out[2] = val32;
        out[3] = val32;
        out += 4;
    }
    if (num32 & 2) [[unlikely]] {
        out[0] = val32;
        out[1] = val32;
        out += 2;
    }
    if (num32 & 1) [[unlikely]] {
        out[0] = val32;
    }
#endif
}
152
153static inline void memset_16_2(
154 uint16_t* out, size_t num16, uint16_t val0, uint16_t val1)
155{
156 if (num16 == 0) [[unlikely]] return;
157
158 // Align at 4-byte boundary.
159 if (size_t(out) & 2) [[unlikely]] {
160 out[0] = val1; // start at odd pixel
161 ++out; --num16;
162 }
163
164 uint32_t val32 = Endian::BIG ? (uint32_t(val0) << 16) | val1
165 : val0 | (uint32_t(val1) << 16);
166 memset_32(reinterpret_cast<uint32_t*>(out), num16 / 2, val32);
167
168 if (num16 & 1) [[unlikely]] {
169 out[num16 - 1] = val0;
170 }
171}
172
173static inline void memset_16(uint16_t* out, size_t num16, uint16_t val16)
174{
175 memset_16_2(out, num16, val16, val16);
176}
177
178template<typename Pixel> void MemSet<Pixel>::operator()(
179 std::span<Pixel> out, Pixel val) const
180{
181 if constexpr (sizeof(Pixel) == 2) {
182 memset_16(reinterpret_cast<uint16_t*>(out.data()), out.size(), val);
183 } else if constexpr (sizeof(Pixel) == 4) {
184 memset_32(reinterpret_cast<uint32_t*>(out.data()), out.size(), val);
185 } else {
187 }
188}
189
190template<typename Pixel> void MemSet2<Pixel>::operator()(
191 std::span<Pixel> out, Pixel val0, Pixel val1) const
192{
193 if constexpr (sizeof(Pixel) == 2) {
194 memset_16_2(reinterpret_cast<uint16_t*>(out.data()), out.size(), val0, val1);
195 } else if constexpr (sizeof(Pixel) == 4) {
196 memset_32_2(reinterpret_cast<uint32_t*>(out.data()), out.size(), val0, val1);
197 } else {
199 }
200}
201
202// Force template instantiation
203template struct MemSet <uint16_t>;
204template struct MemSet <uint32_t>;
205template struct MemSet2<uint16_t>;
206template struct MemSet2<uint32_t>;
207
208
209
213// Helper class to keep track of aligned/unaligned pointer pairs
215{
216public:
217 AllocMap(const AllocMap&) = delete;
218 AllocMap& operator=(const AllocMap&) = delete;
219
220 static AllocMap& instance() {
221 static AllocMap oneInstance;
222 return oneInstance;
223 }
224
225 void insert(void* aligned, void* unaligned) {
226 if (!aligned) return;
227 assert(!contains(allocMap, aligned, &Entry::aligned));
228 allocMap.emplace_back(Entry{aligned, unaligned});
229 }
230
231 void* remove(void* aligned) {
232 if (!aligned) return nullptr;
233 // LIFO order is more likely than FIFO -> search backwards
234 auto it = rfind_unguarded(allocMap, aligned, &Entry::aligned);
235 // return the associated unaligned value
236 void* unaligned = it->unaligned;
237 move_pop_back(allocMap, it);
238 return unaligned;
239 }
240
241private:
242 AllocMap() = default;
243 ~AllocMap() {
244 assert(allocMap.empty());
245 }
246
247 // typically contains 5-10 items, so (unsorted) vector is fine
248 struct Entry {
249 void* aligned;
250 void* unaligned;
251 };
252 std::vector<Entry> allocMap;
253};
254
255void* mallocAligned(size_t alignment, size_t size)
256{
257 assert("must be a power of 2" && std::has_single_bit(alignment));
258 assert(alignment >= sizeof(void*));
259#if HAVE_POSIX_MEMALIGN
260 void* aligned = nullptr;
261 if (posix_memalign(&aligned, alignment, size)) {
262 throw std::bad_alloc();
263 }
264 #if defined DEBUG
265 AllocMap::instance().insert(aligned, aligned);
266 #endif
267 return aligned;
268#elif defined _MSC_VER
269 void* result = _aligned_malloc(size, alignment);
270 if (!result && size) throw std::bad_alloc();
271 return result;
272#else
273 auto t = alignment - 1;
274 void* unaligned = malloc(size + t);
275 if (!unaligned) {
276 throw std::bad_alloc();
277 }
278 auto aligned = reinterpret_cast<void*>(
279 (reinterpret_cast<size_t>(unaligned) + t) & ~t);
280 AllocMap::instance().insert(aligned, unaligned);
281 return aligned;
282#endif
283}
284
285void freeAligned(void* aligned)
286{
287#if HAVE_POSIX_MEMALIGN
288 #if defined DEBUG
289 AllocMap::instance().remove(aligned);
290 #endif
291 free(aligned);
292#elif defined _MSC_VER
293 return _aligned_free(aligned);
294#else
295 void* unaligned = AllocMap::instance().remove(aligned);
296 free(unaligned);
297#endif
298}
299
300} // namespace openmsx::MemoryOps
TclObject t
Aligned memory (de)allocation.
Definition: MemoryOps.cc:215
AllocMap(const AllocMap &)=delete
static AllocMap & instance()
Definition: MemoryOps.cc:220
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:225
AllocMap & operator=(const AllocMap &)=delete
constexpr bool BIG
Definition: endian.hh:15
constexpr double e
Definition: Math.hh:21
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:255
void freeAligned(void *)
Definition: MemoryOps.cc:285
uint32_t Pixel
size_t size(std::string_view utf8)
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:125
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
Definition: stl.hh:100
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition: stl.hh:23
void operator()(std::span< Pixel > out, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:190
void operator()(std::span< Pixel > out, Pixel val) const
Definition: MemoryOps.cc:178
#define UNREACHABLE
Definition: unreachable.hh:38