openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "likely.hh"
3 #include "build-info.hh"
4 #include "systemfuncs.hh"
5 #include "Math.hh"
6 #include "stl.hh"
7 #include "unreachable.hh"
8 #include <vector>
9 #include <cassert>
10 #include <cstdlib>
11 #include <cstdint>
12 #include <new> // for std::bad_alloc
13 #if ASM_X86 && defined _MSC_VER
14 #include <intrin.h> // for __stosd intrinsic
15 #endif
16 #ifdef __SSE2__
17 #include <emmintrin.h>
18 #endif
19 
20 namespace openmsx::MemoryOps {
21 
22 #ifdef __SSE2__
#if ASM_X86_32 && defined _MSC_VER
// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
// only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
// alternative would be to always use this routine, but this generates worse
// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
//
// Broadcast the 64-bit value 'val' into both halves of a 128-bit register.
[[nodiscard]] static inline __m128i _mm_set1_epi64x(uint64_t val)
{
	// Fix: the previous version had the names 'low' and 'high' swapped
	// ('low' held the upper 32 bits). The produced value was correct,
	// but the names were misleading. Behavior is unchanged.
	uint32_t high = uint32_t(val >> 32); // upper 32 bits of val
	uint32_t low  = uint32_t(val >>  0); // lower 32 bits of val
	// _mm_set_epi32() takes its arguments from most- to least-significant
	// 32-bit lane, so each 64-bit half of the result is (high:low) == val.
	return _mm_set_epi32(high, low, high, low);
}
#endif
35 
// Fill 'num64' consecutive uint64_t's starting at 'out' with 'val64',
// using aligned 16-byte SSE2 stores for the bulk of the work.
// Precondition: 'out' is at least 8-byte aligned (asserted by the caller,
// memset_64()).
static inline void memset_64_SSE(
	uint64_t* out, size_t num64, uint64_t val64)
{
	if (unlikely(num64 == 0)) return;

	// Align at 16-byte boundary.
	if (unlikely(size_t(out) & 8)) {
		out[0] = val64;
		++out; --num64;
	}

	__m128i val128 = _mm_set1_epi64x(val64);
	// Main loop: 4 uint64_t's (32 bytes) per iteration. 'e' is positioned
	// so the loop stops while fewer than 4 elements remain; when num64 < 4
	// 'e' lands at or before 'out' and the loop is skipped entirely.
	uint64_t* e = out + num64 - 3;
	for (; out < e; out += 4) {
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 0), val128);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2), val128);
	}
	// Tail: the remaining (num64 % 4) elements. 'num64' here is the value
	// after the alignment adjustment above, so the parity tests are exact.
	if (unlikely(num64 & 2)) {
		_mm_store_si128(reinterpret_cast<__m128i*>(out), val128);
		out += 2;
	}
	if (unlikely(num64 & 1)) {
		out[0] = val64;
	}
}
61 #endif
62 
63 static inline void memset_64(
64  uint64_t* out, size_t num64, uint64_t val64)
65 {
66  assert((size_t(out) % 8) == 0); // must be 8-byte aligned
67 
68 #ifdef __SSE2__
69  memset_64_SSE(out, num64, val64);
70  return;
71 #endif
72  uint64_t* e = out + num64 - 3;
73  for (; out < e; out += 4) {
74  out[0] = val64;
75  out[1] = val64;
76  out[2] = val64;
77  out[3] = val64;
78  }
79  if (unlikely(num64 & 2)) {
80  out[0] = val64;
81  out[1] = val64;
82  out += 2;
83  }
84  if (unlikely(num64 & 1)) {
85  out[0] = val64;
86  }
87 }
88 
// Fill 'num32' uint32_t's at 'out' with the alternating pattern
// val0, val1, val0, val1, ...  "Even" positions get val0; parity is
// derived from the 8-byte alignment of 'out' (an 8-byte aligned pointer
// is taken to start at an even pixel).
static inline void memset_32_2(
	uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned
	if (unlikely(num32 == 0)) return;

	// Align at 8-byte boundary.
	if (unlikely(size_t(out) & 4)) {
		out[0] = val1; // start at odd pixel
		++out; --num32;
	}

	// Pack one (val0,val1) pair into a 64-bit value such that, once
	// stored, val0 ends up at the lower address on either endianness.
	uint64_t val64 = OPENMSX_BIGENDIAN ? (uint64_t(val0) << 32) | val1
	                                   : val0 | (uint64_t(val1) << 32);
	memset_64(reinterpret_cast<uint64_t*>(out), num32 / 2, val64);

	// Odd element count: the last element sits at an even position -> val0.
	if (unlikely(num32 & 1)) {
		out[num32 - 1] = val0;
	}
}
109 
// Fill 'num32' uint32_t's at 'out' with 'val32'.
// Three implementations, selected at compile time:
//  - x86 + MSVC : the __stosd intrinsic ('rep stosd')
//  - x86 + other: delegate to memset_32_2() with val0 == val1
//  - non-x86    : portable 8x-unrolled scalar loop
static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(reinterpret_cast<unsigned long*>(out), val32, num32);
#else
	memset_32_2(out, num32, val32, val32);
#endif
#else
	// 'e' is positioned so the loop stops while fewer than 8 elements
	// remain; for num32 < 8 the loop is skipped and only the tail runs.
	uint32_t* e = out + num32 - 7;
	for (; out < e; out += 8) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out[4] = val32;
		out[5] = val32;
		out[6] = val32;
		out[7] = val32;
	}
	// Tail: the remaining (num32 % 8) elements.
	if (unlikely(num32 & 4)) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out += 4;
	}
	if (unlikely(num32 & 2)) {
		out[0] = val32;
		out[1] = val32;
		out += 2;
	}
	if (unlikely(num32 & 1)) {
		out[0] = val32;
	}
#endif
}
152 
153 static inline void memset_16_2(
154  uint16_t* out, size_t num16, uint16_t val0, uint16_t val1)
155 {
156  if (unlikely(num16 == 0)) return;
157 
158  // Align at 4-byte boundary.
159  if (unlikely(size_t(out) & 2)) {
160  out[0] = val1; // start at odd pixel
161  ++out; --num16;
162  }
163 
164  uint32_t val32 = OPENMSX_BIGENDIAN ? (uint32_t(val0) << 16) | val1
165  : val0 | (uint32_t(val1) << 16);
166  memset_32(reinterpret_cast<uint32_t*>(out), num16 / 2, val32);
167 
168  if (unlikely(num16 & 1)) {
169  out[num16 - 1] = val0;
170  }
171 }
172 
// Fill 'num16' uint16_t's at 'out' with 'val16'
// (single-value convenience wrapper around memset_16_2()).
static inline void memset_16(uint16_t* out, size_t num16, uint16_t val16)
{
	memset_16_2(out, num16, val16, val16);
}
177 
// Fill 'num' pixels at 'out' with 'val'.
// Only 2-byte and 4-byte pixel types are supported; any other size is a
// programming error (UNREACHABLE).
template<typename Pixel> void MemSet<Pixel>::operator()(
	Pixel* out, size_t num, Pixel val) const
{
	if constexpr (sizeof(Pixel) == 2) {
		memset_16(reinterpret_cast<uint16_t*>(out), num, val);
	} else if constexpr (sizeof(Pixel) == 4) {
		memset_32(reinterpret_cast<uint32_t*>(out), num, val);
	} else {
		UNREACHABLE;
	}
}
189 
// Fill 'num' pixels at 'out' with the alternating pattern val0, val1, ...
// Only 2-byte and 4-byte pixel types are supported; any other size is a
// programming error (UNREACHABLE).
template<typename Pixel> void MemSet2<Pixel>::operator()(
	Pixel* out, size_t num, Pixel val0, Pixel val1) const
{
	if constexpr (sizeof(Pixel) == 2) {
		memset_16_2(reinterpret_cast<uint16_t*>(out), num, val0, val1);
	} else if constexpr (sizeof(Pixel) == 4) {
		memset_32_2(reinterpret_cast<uint32_t*>(out), num, val0, val1);
	} else {
		UNREACHABLE;
	}
}
201 
// Force template instantiation for the pixel sizes actually used
// (16bpp and 32bpp).
template struct MemSet <uint16_t>;
template struct MemSet <uint32_t>;
template struct MemSet2<uint16_t>;
template struct MemSet2<uint32_t>;
207 
208 
209 
// Helper class to keep track of aligned/unaligned pointer pairs.
// Used by the fallback path of mallocAligned()/freeAligned(): there we
// over-allocate and hand out an aligned pointer *inside* the malloc'ed
// block, so on free we must map the aligned pointer back to the pointer
// malloc() actually returned. (On the posix_memalign path it is only used
// in DEBUG builds, as pure bookkeeping.)
// Singleton. NOTE(review): no synchronization around 'allocMap' -- appears
// to assume single-threaded (de)allocation; confirm against callers.
class AllocMap
{
public:
	AllocMap(const AllocMap&) = delete;
	AllocMap& operator=(const AllocMap&) = delete;

	// Meyers singleton.
	static AllocMap& instance() {
		static AllocMap oneInstance;
		return oneInstance;
	}

	// Remember the unaligned pointer belonging to 'aligned'.
	// Inserting nullptr is a no-op; inserting a pointer twice is a bug.
	void insert(void* aligned, void* unaligned) {
		if (!aligned) return;
		assert(!contains(allocMap, aligned, &Entry::aligned));
		allocMap.emplace_back(Entry{aligned, unaligned});
	}

	// Forget 'aligned' and return its associated unaligned pointer.
	// nullptr maps to nullptr; any other pointer must be present
	// (rfind_unguarded asserts this).
	void* remove(void* aligned) {
		if (!aligned) return nullptr;
		// LIFO order is more likely than FIFO -> search backwards
		auto it = rfind_unguarded(allocMap, aligned, &Entry::aligned);
		// return the associated unaligned value
		void* unaligned = it->unaligned;
		move_pop_back(allocMap, it);
		return unaligned;
	}

private:
	AllocMap() = default;
	~AllocMap() {
		// At shutdown every aligned allocation must have been freed.
		assert(allocMap.empty());
	}

	// typically contains 5-10 items, so (unsorted) vector is fine
	struct Entry {
		void* aligned;
		void* unaligned;
	};
	std::vector<Entry> allocMap;
};
254 
// Allocate 'size' bytes whose address is a multiple of 'alignment'.
// 'alignment' must be a power of two and at least sizeof(void*).
// Throws std::bad_alloc on allocation failure. Release with freeAligned().
void* mallocAligned(size_t alignment, size_t size)
{
	assert("must be a power of 2" && Math::ispow2(alignment));
	assert(alignment >= sizeof(void*)); // required by posix_memalign()
#if HAVE_POSIX_MEMALIGN
	void* aligned = nullptr;
	if (posix_memalign(&aligned, alignment, size)) {
		throw std::bad_alloc();
	}
	#if defined DEBUG
	// Pure bookkeeping (aligned == unaligned here): lets the AllocMap
	// assert that every pointer given to freeAligned() came from us.
	AllocMap::instance().insert(aligned, aligned);
	#endif
	return aligned;
#elif defined _MSC_VER
	void* result = _aligned_malloc(size, alignment);
	// A nullptr result is only an error for a non-zero size request.
	if (!result && size) throw std::bad_alloc();
	return result;
#else
	// Generic fallback: over-allocate by (alignment - 1) bytes, round the
	// pointer up to the requested alignment, and remember the original
	// (unaligned) pointer so freeAligned() can pass it to free().
	auto t = alignment - 1;
	void* unaligned = malloc(size + t);
	if (!unaligned) {
		throw std::bad_alloc();
	}
	auto aligned = reinterpret_cast<void*>(
		(reinterpret_cast<size_t>(unaligned) + t) & ~t);
	AllocMap::instance().insert(aligned, unaligned);
	return aligned;
#endif
}
284 
285 void freeAligned(void* aligned)
286 {
287 #if HAVE_POSIX_MEMALIGN
288  #if defined DEBUG
289  AllocMap::instance().remove(aligned);
290  #endif
291  free(aligned);
292 #elif defined _MSC_VER
293  return _aligned_free(aligned);
294 #else
295  void* unaligned = AllocMap::instance().remove(aligned);
296  free(unaligned);
297 #endif
298 }
299 
300 } // namespace openmsx::MemoryOps
TclObject t
Aligned memory (de)allocation.
Definition: MemoryOps.cc:215
static AllocMap & instance()
Definition: MemoryOps.cc:220
AllocMap(const AllocMap &)=delete
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:225
AllocMap & operator=(const AllocMap &)=delete
#define unlikely(x)
Definition: likely.hh:15
constexpr bool ispow2(T x) noexcept
Is the given number an integral power of two? That is, does it have exactly one 1-bit in binary repre...
Definition: Math.hh:57
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:255
void freeAligned(void *)
Definition: MemoryOps.cc:285
uint32_t Pixel
size_t size(std::string_view utf8)
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:133
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
Definition: stl.hh:108
constexpr bool contains(ITER first, ITER last, const VAL &val)
Check if a range contains a given value, using linear search.
Definition: stl.hh:31
void operator()(Pixel *out, size_t num, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:190
void operator()(Pixel *out, size_t num, Pixel val) const
Definition: MemoryOps.cc:178
#define UNREACHABLE
Definition: unreachable.hh:38