openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "likely.hh"
3 #include "build-info.hh"
4 #include "systemfuncs.hh"
5 #include "Math.hh"
6 #include "ranges.hh"
7 #include "stl.hh"
8 #include "unreachable.hh"
9 #include <utility>
10 #include <vector>
11 #include <cassert>
12 #include <cstdlib>
13 #include <cstdint>
14 #include <new> // for std::bad_alloc
15 #if ASM_X86 && defined _MSC_VER
16 #include <intrin.h> // for __stosd intrinsic
17 #endif
18 #ifdef __SSE2__
19 #include <emmintrin.h>
20 #endif
21 
22 namespace openmsx {
23 namespace MemoryOps {
24 
25 #ifdef __SSE2__
26 #if ASM_X86_32 && defined _MSC_VER
// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
// only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
// alternative would be to always use this routine, but this generates worse
// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
static inline __m128i _mm_set1_epi64x(uint64_t val)
{
	// Fix: the original named these the wrong way around ('low' held the
	// upper half). Behavior was correct, but the names were misleading.
	uint32_t high = val >> 32; // upper 32 bits of 'val'
	uint32_t low  = val >>  0; // lower 32 bits of 'val'
	// _mm_set_epi32() takes arguments from most- to least-significant,
	// so each 64-bit lane of the result becomes (high:low) == val.
	return _mm_set_epi32(high, low, high, low);
}
37 #endif
38 
// Fill 'num64' 64-bit words at 'out' with 'val64', using 16-byte SSE2 stores.
// Requires: 'out' is at least 8-byte aligned.
static inline void memset_64_SSE(
	uint64_t* out, size_t num64, uint64_t val64)
{
	if (num64 == 0) return; // rare

	// Align at 16-byte boundary (caller only guarantees 8-byte alignment).
	if (size_t(out) & 8) {
		out[0] = val64;
		++out; --num64;
	}

	__m128i val128 = _mm_set1_epi64x(val64);
	// Main loop: 4 words (32 bytes) per iteration. Fix: compute the end
	// pointer as out + floor(num64/4)*4; the old 'out + num64 - 3'
	// formed an out-of-range pointer (UB) when num64 < 3.
	uint64_t* e = out + (num64 & ~size_t(3));
	for (; out != e; out += 4) {
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 0), val128);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2), val128);
	}
	// Leftover pair (one aligned 16-byte store).
	if (num64 & 2) {
		_mm_store_si128(reinterpret_cast<__m128i*>(out), val128);
		out += 2;
	}
	// Final single word.
	if (num64 & 1) {
		out[0] = val64;
	}
}
64 #endif
65 
66 static inline void memset_64(
67  uint64_t* out, size_t num64, uint64_t val64)
68 {
69  assert((size_t(out) % 8) == 0); // must be 8-byte aligned
70 
71 #ifdef __SSE2__
72  memset_64_SSE(out, num64, val64);
73  return;
74 #endif
75  uint64_t* e = out + num64 - 3;
76  for (; out < e; out += 4) {
77  out[0] = val64;
78  out[1] = val64;
79  out[2] = val64;
80  out[3] = val64;
81  }
82  if (unlikely(num64 & 2)) {
83  out[0] = val64;
84  out[1] = val64;
85  out += 2;
86  }
87  if (unlikely(num64 & 1)) {
88  out[0] = val64;
89  }
90 }
91 
// Fill 'num32' 32-bit words at 'out' with the alternating pattern
// val0, val1, val0, val1, ... Parity is tied to the ADDRESS, not to the
// index: the word at an 8-byte-aligned address receives val0, its partner
// receives val1. (So a caller starting mid-line still gets a consistent
// checkered pattern.)
// Requires: 'out' is 4-byte aligned.
static inline void memset_32_2(
	uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned
	if (unlikely(num32 == 0)) return;

	// Align at 8-byte boundary.
	if (unlikely(size_t(out) & 4)) {
		out[0] = val1; // start at odd pixel
		++out; --num32;
	}

	// Pack (val0,val1) into one 64-bit word such that, in memory, val0
	// precedes val1 on both little- and big-endian hosts.
	uint64_t val64 = OPENMSX_BIGENDIAN ? (uint64_t(val0) << 32) | val1
	                                   : val0 | (uint64_t(val1) << 32);
	memset_64(reinterpret_cast<uint64_t*>(out), num32 / 2, val64);

	// Odd leftover word: it sits at an even (val0) position.
	if (unlikely(num32 & 1)) {
		out[num32 - 1] = val0;
	}
}
112 
// Fill 'num32' 32-bit words at 'out' with 'val32'.
// Requires: 'out' is 4-byte aligned.
static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(reinterpret_cast<unsigned long*>(out), val32, num32);
#else
	memset_32_2(out, num32, val32, val32);
#endif
#else
	// Generic fallback, 8 words per iteration. Fix: compute the end
	// pointer as out + floor(num32/8)*8; the old 'out + num32 - 7'
	// formed an out-of-range pointer (UB) when num32 < 7.
	uint32_t* e = out + (num32 & ~size_t(7));
	for (; out != e; out += 8) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out[4] = val32;
		out[5] = val32;
		out[6] = val32;
		out[7] = val32;
	}
	// Tail: handle the remaining 0-7 words in decreasing power-of-2 chunks.
	if (num32 & 4) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out += 4;
	}
	if (num32 & 2) {
		out[0] = val32;
		out[1] = val32;
		out += 2;
	}
	if (num32 & 1) {
		out[0] = val32;
	}
#endif
}
155 
// Fill 'num16' 16-bit words at 'out' with the alternating pattern
// val0, val1, val0, val1, ... Parity is tied to the ADDRESS: the word at a
// 4-byte-aligned address receives val0, its partner receives val1 (mirrors
// memset_32_2 above).
static inline void memset_16_2(
	uint16_t* out, size_t num16, uint16_t val0, uint16_t val1)
{
	if (unlikely(num16 == 0)) return;

	// Align at 4-byte boundary.
	if (unlikely(size_t(out) & 2)) {
		out[0] = val1; // start at odd pixel
		++out; --num16;
	}

	// Pack (val0,val1) into one 32-bit word such that, in memory, val0
	// precedes val1 on both little- and big-endian hosts.
	uint32_t val32 = OPENMSX_BIGENDIAN ? (uint32_t(val0) << 16) | val1
	                                   : val0 | (uint32_t(val1) << 16);
	memset_32(reinterpret_cast<uint32_t*>(out), num16 / 2, val32);

	// Odd leftover word: it sits at an even (val0) position.
	if (unlikely(num16 & 1)) {
		out[num16 - 1] = val0;
	}
}
175 
// Fill 'num16' 16-bit words at 'out' with 'val16'.
// Convenience wrapper: an alternating fill where both values are equal.
static inline void memset_16(uint16_t* out, size_t num16, uint16_t val16)
{
	memset_16_2(out, num16, val16, val16);
}
180 
181 template<typename Pixel> void MemSet<Pixel>::operator()(
182  Pixel* out, size_t num, Pixel val) const
183 {
184  if (sizeof(Pixel) == 2) {
185  memset_16(reinterpret_cast<uint16_t*>(out), num, val);
186  } else if (sizeof(Pixel) == 4) {
187  memset_32(reinterpret_cast<uint32_t*>(out), num, val);
188  } else {
189  UNREACHABLE;
190  }
191 }
192 
193 template<typename Pixel> void MemSet2<Pixel>::operator()(
194  Pixel* out, size_t num, Pixel val0, Pixel val1) const
195 {
196  if (sizeof(Pixel) == 2) {
197  memset_16_2(reinterpret_cast<uint16_t*>(out), num, val0, val1);
198  } else if (sizeof(Pixel) == 4) {
199  memset_32_2(reinterpret_cast<uint32_t*>(out), num, val0, val1);
200  } else {
201  UNREACHABLE;
202  }
203 }
204 
// Force template instantiation for the two supported pixel sizes, so the
// definitions above can stay in this .cc file.
template struct MemSet <uint16_t>;
template struct MemSet <uint32_t>;
template struct MemSet2<uint16_t>;
template struct MemSet2<uint32_t>;
210 
211 
212 
// Helper class to keep track of aligned/unaligned pointer pairs
//
// The generic mallocAligned() fallback over-allocates and returns a pointer
// somewhere inside the malloc'd block; this singleton remembers the original
// (unaligned) pointer so freeAligned() can hand it back to free(). In the
// posix_memalign build it is only used (under DEBUG) as bookkeeping to catch
// mismatched allocate/free pairs.
class AllocMap
{
public:
	// Meyers singleton: one process-wide map.
	static AllocMap& instance() {
		static AllocMap oneInstance;
		return oneInstance;
	}

	// Record the aligned -> unaligned association.
	// A nullptr 'aligned' value is silently ignored. Inserting the same
	// aligned pointer twice is a bug (checked in debug builds only).
	void insert(void* aligned, void* unaligned) {
		if (!aligned) return;
		assert(ranges::none_of(allocMap, EqualTupleValue<0>(aligned)));
		allocMap.emplace_back(aligned, unaligned);
	}

	// Remove the entry for 'aligned' and return the associated unaligned
	// pointer (nullptr in, nullptr out). The entry must exist
	// ('unguarded' search: no end-check).
	void* remove(void* aligned) {
		if (!aligned) return nullptr;
		// LIFO order is more likely than FIFO -> search backwards
		auto it = rfind_if_unguarded(allocMap,
		                             EqualTupleValue<0>(aligned));
		// return the associated unaligned value
		void* unaligned = it->second;
		move_pop_back(allocMap, it);
		return unaligned;
	}

private:
	AllocMap() = default;
	~AllocMap() {
		// Every aligned allocation must have been freed by now.
		assert(allocMap.empty());
	}

	// typically contains 5-10 items, so (unsorted) vector is fine
	std::vector<std::pair<void*, void*>> allocMap;
};
251 
// Allocate 'size' bytes whose address is a multiple of 'alignment'.
// 'alignment' must be a power of two and at least sizeof(void*).
// Throws std::bad_alloc on allocation failure. Free the result with
// freeAligned() only (the strategy below is selected at compile time and
// must match on both sides).
void* mallocAligned(size_t alignment, size_t size)
{
	assert("must be a power of 2" && Math::isPowerOfTwo(alignment));
	assert(alignment >= sizeof(void*));
#if HAVE_POSIX_MEMALIGN
	void* aligned;
	if (posix_memalign(&aligned, alignment, size)) {
		throw std::bad_alloc();
	}
	#if defined DEBUG
	// Debug-only bookkeeping: aligned == unaligned here; the entry just
	// lets freeAligned()/~AllocMap() detect mismatched frees.
	AllocMap::instance().insert(aligned, aligned);
	#endif
	return aligned;
#elif defined _MSC_VER
	// NOTE(review): this path throws only when size != 0 — presumably
	// _aligned_malloc(0, ...) may legitimately return nullptr; confirm.
	void* result = _aligned_malloc(size, alignment);
	if (!result && size) throw std::bad_alloc();
	return result;
#else
	// Generic fallback: over-allocate by (alignment - 1) bytes, round the
	// malloc'd pointer up to the requested alignment, and remember the
	// original pointer so freeAligned() can pass it to free().
	auto t = alignment - 1;
	void* unaligned = malloc(size + t);
	if (!unaligned) {
		throw std::bad_alloc();
	}
	auto aligned = reinterpret_cast<void*>(
		(reinterpret_cast<size_t>(unaligned) + t) & ~t);
	AllocMap::instance().insert(aligned, unaligned);
	return aligned;
#endif
}
281 
// Release memory obtained from mallocAligned(). Uses the same compile-time
// selected strategy as the allocation side; passing a pointer from any other
// allocator is undefined.
void freeAligned(void* aligned)
{
#if HAVE_POSIX_MEMALIGN
	#if defined DEBUG
	AllocMap::instance().remove(aligned); // debug-only bookkeeping
	#endif
	free(aligned);
#elif defined _MSC_VER
	return _aligned_free(aligned);
#else
	// Look up (and forget) the original unaligned pointer; free that one,
	// not the aligned pointer handed to the caller.
	void* unaligned = AllocMap::instance().remove(aligned);
	free(unaligned);
#endif
}
296 
297 } // namespace MemoryOps
298 } // namespace openmsx
#define unlikely(x)
Definition: likely.hh:15
void freeAligned(void *)
Definition: MemoryOps.cc:282
uint32_t Pixel
bool none_of(InputRange &&range, UnaryPredicate pred)
Definition: ranges.hh:131
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:191
static AllocMap & instance()
Definition: MemoryOps.cc:220
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:252
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:225
void operator()(Pixel *out, size_t num, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:193
constexpr bool isPowerOfTwo(unsigned a)
Is the given number an integer power of 2? Not correct for zero (according to this test 0 is a power ...
Definition: Math.hh:35
Aligned memory (de)allocation.
Definition: MemoryOps.cc:217
constexpr auto size(const C &c) -> decltype(c.size())
Definition: span.hh:62
auto rfind_if_unguarded(RANGE &range, PRED pred)
Definition: stl.hh:174
void operator()(Pixel *out, size_t num, Pixel val) const
Definition: MemoryOps.cc:181
#define UNREACHABLE
Definition: unreachable.hh:38