openMSX
MemoryOps.cc
Go to the documentation of this file.
1#include "MemoryOps.hh"
2
3#include "build-info.hh"
4#include "systemfuncs.hh"
5
6#include "endian.hh"
7#include "narrow.hh"
8#include "stl.hh"
9#include "unreachable.hh"
10
11#include <bit>
12#include <cassert>
13#include <cstdlib>
14#include <cstdint>
15#include <new> // for std::bad_alloc
16#if ASM_X86 && defined _MSC_VER
17#include <intrin.h> // for __stosd intrinsic
18#endif
19#ifdef __SSE2__
20#include <emmintrin.h>
21#endif
22
24
25#ifdef __SSE2__
26#if ASM_X86_32 && defined _MSC_VER
27// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
28// only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
29// alternative would be to always use this routine, but this generates worse
30// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
31[[nodiscard]] static inline __m128i _mm_set1_epi64x(uint64_t val)
32{
33 uint32_t low = val >> 32;
34 uint32_t high = val >> 0;
35 return _mm_set_epi32(low, high, low, high);
36}
37#endif
38
39static inline void memset_64_SSE(
40 uint64_t* out, size_t num64, uint64_t val64)
41{
42 if (num64 == 0) [[unlikely]] return;
43
44 // Align at 16-byte boundary.
45 if (size_t(out) & 8) [[unlikely]] {
46 out[0] = val64;
47 ++out; --num64;
48 }
49
50 __m128i val128 = _mm_set1_epi64x(narrow_cast<int64_t>(val64));
51 const uint64_t* e = out + num64 - 3;
52 for (; out < e; out += 4) {
53 _mm_store_si128(std::bit_cast<__m128i*>(out + 0), val128);
54 _mm_store_si128(std::bit_cast<__m128i*>(out + 2), val128);
55 }
56 if (num64 & 2) [[unlikely]] {
57 _mm_store_si128(std::bit_cast<__m128i*>(out), val128);
58 out += 2;
59 }
60 if (num64 & 1) [[unlikely]] {
61 out[0] = val64;
62 }
63}
64#endif
65
66static inline void memset_64(
67 uint64_t* out, size_t num64, uint64_t val64)
68{
69 assert((size_t(out) % 8) == 0); // must be 8-byte aligned
70
71#ifdef __SSE2__
72 memset_64_SSE(out, num64, val64);
73 return;
74#endif
75 const uint64_t* e = out + num64 - 3;
76 for (; out < e; out += 4) {
77 out[0] = val64;
78 out[1] = val64;
79 out[2] = val64;
80 out[3] = val64;
81 }
82 if (num64 & 2) [[unlikely]] {
83 out[0] = val64;
84 out[1] = val64;
85 out += 2;
86 }
87 if (num64 & 1) [[unlikely]] {
88 out[0] = val64;
89 }
90}
91
92static inline void memset_32_2(
93 uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
94{
95 assert((size_t(out) % 4) == 0); // must be 4-byte aligned
96 if (num32 == 0) [[unlikely]] return;
97
98 // Align at 8-byte boundary.
99 if (size_t(out) & 4) [[unlikely]] {
100 out[0] = val1; // start at odd pixel
101 ++out; --num32;
102 }
103
104 uint64_t val64 = Endian::BIG ? (uint64_t(val0) << 32) | val1
105 : val0 | (uint64_t(val1) << 32);
106 memset_64(std::bit_cast<uint64_t*>(out), num32 / 2, val64);
107
108 if (num32 & 1) [[unlikely]] {
109 out[num32 - 1] = val0;
110 }
111}
112
// Fill 'num32' consecutive 32-bit words, starting at 'out', with 'val32'.
//   out   : destination, must be 4-byte aligned (asserted)
//   num32 : number of 32-bit words to write (0 is allowed)
//   val32 : value stored in every word
static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(std::bit_cast<unsigned long*>(out), val32, num32);
#else
	memset_32_2(out, num32, val32, val32);
#endif
#else
	// Portable version: 8 words per iteration. The loop is driven by a
	// counter instead of an end pointer, because computing
	// 'out + num32 - 7' would form a pointer before the start of the
	// buffer when 'num32' is smaller than 7.
	for (size_t blocks = num32 / 8; blocks != 0; --blocks, out += 8) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out[4] = val32;
		out[5] = val32;
		out[6] = val32;
		out[7] = val32;
	}
	// Tail: the low three bits of the count tell which stores remain.
	if (num32 & 4) [[unlikely]] {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out += 4;
	}
	if (num32 & 2) [[unlikely]] {
		out[0] = val32;
		out[1] = val32;
		out += 2;
	}
	if (num32 & 1) [[unlikely]] {
		out[0] = val32;
	}
#endif
}
155
156template<typename Pixel> void MemSet<Pixel>::operator()(
157 std::span<Pixel> out, Pixel val) const
158{
159 if constexpr (sizeof(Pixel) == 4) {
160 memset_32(std::bit_cast<uint32_t*>(out.data()), out.size(), val);
161 } else {
163 }
164}
165
166template<typename Pixel> void MemSet2<Pixel>::operator()(
167 std::span<Pixel> out, Pixel val0, Pixel val1) const
168{
169 if constexpr (sizeof(Pixel) == 4) {
170 memset_32_2(std::bit_cast<uint32_t*>(out.data()), out.size(), val0, val1);
171 } else {
173 }
174}
175
176// Force template instantiation
177template struct MemSet <uint32_t>;
178template struct MemSet2<uint32_t>;
179
180
181
185// Helper class to keep track of aligned/unaligned pointer pairs
187{
188public:
189 AllocMap(const AllocMap&) = delete;
190 AllocMap(AllocMap&&) = delete;
191 AllocMap& operator=(const AllocMap&) = delete;
193
194 static AllocMap& instance() {
195 static AllocMap oneInstance;
196 return oneInstance;
197 }
198
199 void insert(void* aligned, void* unaligned) {
200 if (!aligned) return;
201 assert(!contains(allocMap, aligned, &Entry::aligned));
202 allocMap.emplace_back(Entry{aligned, unaligned});
203 }
204
205 void* remove(void* aligned) {
206 if (!aligned) return nullptr;
207 // LIFO order is more likely than FIFO -> search backwards
208 auto it = rfind_unguarded(allocMap, aligned, &Entry::aligned);
209 // return the associated unaligned value
210 void* unaligned = it->unaligned;
211 move_pop_back(allocMap, it);
212 return unaligned;
213 }
214
215private:
216 AllocMap() = default;
217 ~AllocMap() {
218 assert(allocMap.empty());
219 }
220
221 // typically contains 5-10 items, so (unsorted) vector is fine
222 struct Entry {
223 void* aligned;
224 void* unaligned;
225 };
226 std::vector<Entry> allocMap;
227};
228
229void* mallocAligned(size_t alignment, size_t size)
230{
231 assert("must be a power of 2" && std::has_single_bit(alignment));
232 assert(alignment >= sizeof(void*));
233#if HAVE_POSIX_MEMALIGN
234 void* aligned = nullptr;
235 if (posix_memalign(&aligned, alignment, size)) {
236 throw std::bad_alloc();
237 }
238 #if defined DEBUG
239 AllocMap::instance().insert(aligned, aligned);
240 #endif
241 return aligned;
242#elif defined _MSC_VER
243 void* result = _aligned_malloc(size, alignment);
244 if (!result && size) throw std::bad_alloc();
245 return result;
246#else
247 auto t = alignment - 1;
248 void* unaligned = malloc(size + t);
249 if (!unaligned) {
250 throw std::bad_alloc();
251 }
252 auto aligned = std::bit_cast<void*>(
253 (std::bit_cast<uintptr_t>(unaligned) + t) & ~t);
254 AllocMap::instance().insert(aligned, unaligned);
255 return aligned;
256#endif
257}
258
259void freeAligned(void* aligned)
260{
261#if HAVE_POSIX_MEMALIGN
262 #if defined DEBUG
263 AllocMap::instance().remove(aligned);
264 #endif
265 free(aligned);
266#elif defined _MSC_VER
267 return _aligned_free(aligned);
268#else
269 void* unaligned = AllocMap::instance().remove(aligned);
270 free(unaligned);
271#endif
272}
273
274} // namespace openmsx::MemoryOps
TclObject t
Aligned memory (de)allocation.
Definition MemoryOps.cc:187
AllocMap(const AllocMap &)=delete
static AllocMap & instance()
Definition MemoryOps.cc:194
void insert(void *, void *unaligned)
Definition MemoryOps.cc:199
AllocMap & operator=(AllocMap &&)=delete
AllocMap & operator=(const AllocMap &)=delete
AllocMap(AllocMap &&)=delete
constexpr bool BIG
Definition endian.hh:16
void * mallocAligned(size_t alignment, size_t size)
Definition MemoryOps.cc:229
void freeAligned(void *)
Definition MemoryOps.cc:259
CharacterConverter::Pixel Pixel
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition stl.hh:134
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
Definition stl.hh:109
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition stl.hh:32
void operator()(std::span< Pixel > out, Pixel val0, Pixel val1) const
Definition MemoryOps.cc:166
void operator()(std::span< Pixel > out, Pixel val) const
Definition MemoryOps.cc:156
#define UNREACHABLE