openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "build-info.hh"
3 #include "systemfuncs.hh"
4 #include "endian.hh"
5 #include "stl.hh"
6 #include "unreachable.hh"
7 #include <bit>
8 #include <cassert>
9 #include <cstdlib>
10 #include <cstdint>
11 #include <new> // for std::bad_alloc
12 #if ASM_X86 && defined _MSC_VER
13 #include <intrin.h> // for __stosd intrinsic
14 #endif
15 #ifdef __SSE2__
16 #include <emmintrin.h>
17 #endif
18 
19 namespace openmsx::MemoryOps {
20 
21 #ifdef __SSE2__
22 #if ASM_X86_32 && defined _MSC_VER
23 // Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
24 // only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
25 // alternative would be to always use this routine, but this generates worse
26 // code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
[[nodiscard]] static inline __m128i _mm_set1_epi64x(uint64_t val)
{
	// Broadcast 'val' into both 64-bit lanes of a 128-bit register.
	// _mm_set_epi32() takes its arguments from most- to least-significant
	// 32-bit element, so each 64-bit lane is assembled as (hi << 32) | lo.
	// NOTE: the original code computed the same result but had the local
	// names 'low'/'high' swapped ('low' held val >> 32); only the names
	// are corrected here, the generated value is unchanged.
	uint32_t hi = uint32_t(val >> 32); // upper 32 bits of val
	uint32_t lo = uint32_t(val >>  0); // lower 32 bits of val
	return _mm_set_epi32(hi, lo, hi, lo);
}
33 #endif
34 
// Fill 'num64' consecutive uint64_t words at 'out' with 'val64' using
// aligned 128-bit SSE2 stores. 'out' must be at least 8-byte aligned.
static inline void memset_64_SSE(
	uint64_t* out, size_t num64, uint64_t val64)
{
	if (num64 == 0) [[unlikely]] return;

	// If only 8-byte aligned, emit one scalar store so that all SSE
	// stores below land on a 16-byte boundary.
	if (size_t(out) & 8) [[unlikely]] {
		*out++ = val64;
		--num64;
	}

	const __m128i pattern = _mm_set1_epi64x(val64);
	// Main loop: 4 words (two 128-bit stores) per iteration.
	uint64_t* last = out + num64 - 3;
	while (out < last) {
		auto* dst = reinterpret_cast<__m128i*>(out);
		_mm_store_si128(dst,     pattern);
		_mm_store_si128(dst + 1, pattern);
		out += 4;
	}
	// Tail of 0-3 remaining words, driven by the low bits of num64.
	if (num64 & 2) [[unlikely]] {
		_mm_store_si128(reinterpret_cast<__m128i*>(out), pattern);
		out += 2;
	}
	if (num64 & 1) [[unlikely]] {
		*out = val64;
	}
}
60 #endif
61 
62 static inline void memset_64(
63  uint64_t* out, size_t num64, uint64_t val64)
64 {
65  assert((size_t(out) % 8) == 0); // must be 8-byte aligned
66 
67 #ifdef __SSE2__
68  memset_64_SSE(out, num64, val64);
69  return;
70 #endif
71  uint64_t* e = out + num64 - 3;
72  for (; out < e; out += 4) {
73  out[0] = val64;
74  out[1] = val64;
75  out[2] = val64;
76  out[3] = val64;
77  }
78  if (num64 & 2) [[unlikely]] {
79  out[0] = val64;
80  out[1] = val64;
81  out += 2;
82  }
83  if (num64 & 1) [[unlikely]] {
84  out[0] = val64;
85  }
86 }
87 
88 static inline void memset_32_2(
89  uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
90 {
91  assert((size_t(out) % 4) == 0); // must be 4-byte aligned
92  if (num32 == 0) [[unlikely]] return;
93 
94  // Align at 8-byte boundary.
95  if (size_t(out) & 4) [[unlikely]] {
96  out[0] = val1; // start at odd pixel
97  ++out; --num32;
98  }
99 
100  uint64_t val64 = Endian::BIG ? (uint64_t(val0) << 32) | val1
101  : val0 | (uint64_t(val1) << 32);
102  memset_64(reinterpret_cast<uint64_t*>(out), num32 / 2, val64);
103 
104  if (num32 & 1) [[unlikely]] {
105  out[num32 - 1] = val0;
106  }
107 }
108 
// Fill 'num32' uint32_t words at 'out' (4-byte aligned) with 'val32'.
static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(reinterpret_cast<unsigned long*>(out), val32, num32);
#else
	memset_32_2(out, num32, val32, val32);
#endif
#else
	// Portable fallback: 8 words per iteration, then a 0-7 word tail
	// selected by the low bits of num32.
	uint32_t* stop = out + num32 - 7;
	while (out < stop) {
		out[0] = val32; out[1] = val32;
		out[2] = val32; out[3] = val32;
		out[4] = val32; out[5] = val32;
		out[6] = val32; out[7] = val32;
		out += 8;
	}
	if (num32 & 4) [[unlikely]] {
		out[0] = val32; out[1] = val32;
		out[2] = val32; out[3] = val32;
		out += 4;
	}
	if (num32 & 2) [[unlikely]] {
		out[0] = val32;
		out[1] = val32;
		out += 2;
	}
	if (num32 & 1) [[unlikely]] {
		out[0] = val32;
	}
#endif
}
151 
152 static inline void memset_16_2(
153  uint16_t* out, size_t num16, uint16_t val0, uint16_t val1)
154 {
155  if (num16 == 0) [[unlikely]] return;
156 
157  // Align at 4-byte boundary.
158  if (size_t(out) & 2) [[unlikely]] {
159  out[0] = val1; // start at odd pixel
160  ++out; --num16;
161  }
162 
163  uint32_t val32 = Endian::BIG ? (uint32_t(val0) << 16) | val1
164  : val0 | (uint32_t(val1) << 16);
165  memset_32(reinterpret_cast<uint32_t*>(out), num16 / 2, val32);
166 
167  if (num16 & 1) [[unlikely]] {
168  out[num16 - 1] = val0;
169  }
170 }
171 
// Fill 'num16' uint16_t words with 'val16'; delegates to the two-value
// variant with both pattern values equal.
static inline void memset_16(uint16_t* out, size_t num16, uint16_t val16)
{
	memset_16_2(out, num16, val16, val16);
}
176 
177 template<typename Pixel> void MemSet<Pixel>::operator()(
178  Pixel* out, size_t num, Pixel val) const
179 {
180  if constexpr (sizeof(Pixel) == 2) {
181  memset_16(reinterpret_cast<uint16_t*>(out), num, val);
182  } else if constexpr (sizeof(Pixel) == 4) {
183  memset_32(reinterpret_cast<uint32_t*>(out), num, val);
184  } else {
185  UNREACHABLE;
186  }
187 }
188 
189 template<typename Pixel> void MemSet2<Pixel>::operator()(
190  Pixel* out, size_t num, Pixel val0, Pixel val1) const
191 {
192  if constexpr (sizeof(Pixel) == 2) {
193  memset_16_2(reinterpret_cast<uint16_t*>(out), num, val0, val1);
194  } else if constexpr (sizeof(Pixel) == 4) {
195  memset_32_2(reinterpret_cast<uint32_t*>(out), num, val0, val1);
196  } else {
197  UNREACHABLE;
198  }
199 }
200 
// Force template instantiation for the two supported pixel sizes
// (16bpp and 32bpp), so the definitions above can stay in this .cc file.
template struct MemSet <uint16_t>;
template struct MemSet <uint32_t>;
template struct MemSet2<uint16_t>;
template struct MemSet2<uint32_t>;
206 
207 
208 
// Helper class to keep track of aligned/unaligned pointer pairs.
// The generic mallocAligned() fallback over-allocates with plain malloc()
// and hands out a rounded-up pointer; freeAligned() then needs this map to
// translate the aligned pointer back to the one malloc() returned.
// No internal locking; assumes callers serialize access.
class AllocMap
{
public:
	AllocMap(const AllocMap&) = delete;
	AllocMap& operator=(const AllocMap&) = delete;

	// Meyers singleton: constructed on first use, destructor checks at
	// shutdown that every allocation was freed.
	static AllocMap& instance() {
		static AllocMap oneInstance;
		return oneInstance;
	}

	// Register an (aligned, unaligned) pair. A nullptr 'aligned' is
	// ignored; registering the same aligned pointer twice is a bug.
	void insert(void* aligned, void* unaligned) {
		if (!aligned) return;
		assert(!contains(allocMap, aligned, &Entry::aligned));
		allocMap.emplace_back(Entry{aligned, unaligned});
	}

	// Remove the entry for 'aligned' and return the associated
	// unaligned pointer (nullptr in -> nullptr out). The entry must
	// exist: the 'unguarded' search has no not-found path.
	void* remove(void* aligned) {
		if (!aligned) return nullptr;
		// LIFO order is more likely than FIFO -> search backwards
		auto it = rfind_unguarded(allocMap, aligned, &Entry::aligned);
		// return the associated unaligned value
		void* unaligned = it->unaligned;
		move_pop_back(allocMap, it);
		return unaligned;
	}

private:
	AllocMap() = default;
	~AllocMap() {
		// Every mallocAligned() must be paired with a freeAligned().
		assert(allocMap.empty());
	}

	// typically contains 5-10 items, so (unsorted) vector is fine
	struct Entry {
		void* aligned;   // pointer handed out to the user
		void* unaligned; // pointer actually returned by malloc()
	};
	std::vector<Entry> allocMap;
};
253 
254 void* mallocAligned(size_t alignment, size_t size)
255 {
256  assert("must be a power of 2" && std::has_single_bit(alignment));
257  assert(alignment >= sizeof(void*));
258 #if HAVE_POSIX_MEMALIGN
259  void* aligned = nullptr;
260  if (posix_memalign(&aligned, alignment, size)) {
261  throw std::bad_alloc();
262  }
263  #if defined DEBUG
264  AllocMap::instance().insert(aligned, aligned);
265  #endif
266  return aligned;
267 #elif defined _MSC_VER
268  void* result = _aligned_malloc(size, alignment);
269  if (!result && size) throw std::bad_alloc();
270  return result;
271 #else
272  auto t = alignment - 1;
273  void* unaligned = malloc(size + t);
274  if (!unaligned) {
275  throw std::bad_alloc();
276  }
277  auto aligned = reinterpret_cast<void*>(
278  (reinterpret_cast<size_t>(unaligned) + t) & ~t);
279  AllocMap::instance().insert(aligned, unaligned);
280  return aligned;
281 #endif
282 }
283 
284 void freeAligned(void* aligned)
285 {
286 #if HAVE_POSIX_MEMALIGN
287  #if defined DEBUG
288  AllocMap::instance().remove(aligned);
289  #endif
290  free(aligned);
291 #elif defined _MSC_VER
292  return _aligned_free(aligned);
293 #else
294  void* unaligned = AllocMap::instance().remove(aligned);
295  free(unaligned);
296 #endif
297 }
298 
299 } // namespace openmsx::MemoryOps
TclObject t
Aligned memory (de)allocation.
Definition: MemoryOps.cc:214
static AllocMap & instance()
Definition: MemoryOps.cc:219
AllocMap(const AllocMap &)=delete
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:224
AllocMap & operator=(const AllocMap &)=delete
constexpr bool BIG
Definition: endian.hh:13
constexpr double e
Definition: Math.hh:18
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:254
void freeAligned(void *)
Definition: MemoryOps.cc:284
uint32_t Pixel
size_t size(std::string_view utf8)
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:125
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
Definition: stl.hh:100
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition: stl.hh:23
void operator()(Pixel *out, size_t num, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:189
void operator()(Pixel *out, size_t num, Pixel val) const
Definition: MemoryOps.cc:177
#define UNREACHABLE
Definition: unreachable.hh:38