openMSX
MemoryOps.cc
#include "MemoryOps.hh"
#include "likely.hh"
#include "build-info.hh"
#include "systemfuncs.hh"
#include "Math.hh"
#include "ranges.hh"
#include "stl.hh"
#include "unreachable.hh"
#include <utility>
#include <vector>
#include <cassert>
#include <cstdlib>
#include <cstdint>
#include <new> // for std::bad_alloc
#if ASM_X86 && defined _MSC_VER
#include <intrin.h> // for __stosd intrinsic
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx::MemoryOps {

#ifdef __SSE2__
#if ASM_X86_32 && defined _MSC_VER
// Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual Studio
// only has it for 64 bit, so we add it ourselves for vc++/32-bit. An
// alternative would be to always use this routine, but that generates worse
// code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
[[nodiscard]] static inline __m128i _mm_set1_epi64x(uint64_t val)
{
	uint32_t hi = val >> 32; // upper 32 bits of the pattern
	uint32_t lo = val >>  0; // lower 32 bits of the pattern
	// _mm_set_epi32() takes its arguments from most- to least-significant,
	// so this places 'val' in both 64-bit lanes of the result.
	return _mm_set_epi32(hi, lo, hi, lo);
}
#endif
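
// Illustration (values chosen only for this example): the shim above
// broadcasts a 64-bit constant into both 64-bit lanes of the 128-bit result,
// given that _mm_set_epi32() lists its arguments from most- to
// least-significant:
//
//     _mm_set1_epi64x(0x0123456789ABCDEF)
//         == _mm_set_epi32(0x01234567, 0x89ABCDEF, 0x01234567, 0x89ABCDEF)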

static inline void memset_64_SSE(
	uint64_t* out, size_t num64, uint64_t val64)
{
	if (unlikely(num64 == 0)) return;

	// Align at 16-byte boundary.
	if (unlikely(size_t(out) & 8)) {
		out[0] = val64;
		++out; --num64;
	}

	__m128i val128 = _mm_set1_epi64x(val64);
	uint64_t* e = out + num64 - 3;
	for (; out < e; out += 4) {
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 0), val128);
		_mm_store_si128(reinterpret_cast<__m128i*>(out + 2), val128);
	}
	if (unlikely(num64 & 2)) {
		_mm_store_si128(reinterpret_cast<__m128i*>(out), val128);
		out += 2;
	}
	if (unlikely(num64 & 1)) {
		out[0] = val64;
	}
}
#endif

static inline void memset_64(
	uint64_t* out, size_t num64, uint64_t val64)
{
	assert((size_t(out) % 8) == 0); // must be 8-byte aligned

#ifdef __SSE2__
	memset_64_SSE(out, num64, val64);
	return;
#endif
	uint64_t* e = out + num64 - 3;
	for (; out < e; out += 4) {
		out[0] = val64;
		out[1] = val64;
		out[2] = val64;
		out[3] = val64;
	}
	if (unlikely(num64 & 2)) {
		out[0] = val64;
		out[1] = val64;
		out += 2;
	}
	if (unlikely(num64 & 1)) {
		out[0] = val64;
	}
}
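
// Usage sketch for memset_64() (hypothetical buffer, for illustration only):
//
//     alignas(16) uint64_t buf[5];
//     memset_64(buf, 5, 0x1122334455667788ULL);
//     // buf[0] .. buf[4] now all hold 0x1122334455667788
//
// The destination must be 8-byte aligned (see the assert above); the SSE2
// path additionally re-aligns itself to 16 bytes by emitting one scalar
// store first when needed.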

static inline void memset_32_2(
	uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned
	if (unlikely(num32 == 0)) return;

	// Align at 8-byte boundary.
	if (unlikely(size_t(out) & 4)) {
		out[0] = val1; // start at odd pixel
		++out; --num32;
	}

	uint64_t val64 = OPENMSX_BIGENDIAN ? (uint64_t(val0) << 32) | val1
	                                   : val0 | (uint64_t(val1) << 32);
	memset_64(reinterpret_cast<uint64_t*>(out), num32 / 2, val64);

	if (unlikely(num32 & 1)) {
		out[num32 - 1] = val0;
	}
}
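
// Behaviour sketch for memset_32_2() (hypothetical values, for illustration):
// the pair (val0, val1) is repeated with val0 at 8-byte-aligned positions and
// val1 at the following ("odd") positions, so a fill starting on an odd
// 32-bit slot begins with val1:
//
//     alignas(8) uint32_t buf[4];
//     memset_32_2(buf, 4, 0xAAAAAAAA, 0xBBBBBBBB);
//     // buf = { 0xAAAAAAAA, 0xBBBBBBBB, 0xAAAAAAAA, 0xBBBBBBBB }
//
//     memset_32_2(buf + 1, 3, 0xAAAAAAAA, 0xBBBBBBBB);
//     // buf[1..3] = { 0xBBBBBBBB, 0xAAAAAAAA, 0xBBBBBBBB }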

static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
{
	assert((size_t(out) % 4) == 0); // must be 4-byte aligned

#if ASM_X86
#if defined _MSC_VER
	// VC++'s __stosd intrinsic results in emulator benchmarks
	// running about 7% faster than with memset_32_2, streaming or not,
	// and about 3% faster than the C code below.
	__stosd(reinterpret_cast<unsigned long*>(out), val32, num32);
#else
	memset_32_2(out, num32, val32, val32);
#endif
#else
	uint32_t* e = out + num32 - 7;
	for (; out < e; out += 8) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out[4] = val32;
		out[5] = val32;
		out[6] = val32;
		out[7] = val32;
	}
	if (unlikely(num32 & 4)) {
		out[0] = val32;
		out[1] = val32;
		out[2] = val32;
		out[3] = val32;
		out += 4;
	}
	if (unlikely(num32 & 2)) {
		out[0] = val32;
		out[1] = val32;
		out += 2;
	}
	if (unlikely(num32 & 1)) {
		out[0] = val32;
	}
#endif
}

static inline void memset_16_2(
	uint16_t* out, size_t num16, uint16_t val0, uint16_t val1)
{
	if (unlikely(num16 == 0)) return;

	// Align at 4-byte boundary.
	if (unlikely(size_t(out) & 2)) {
		out[0] = val1; // start at odd pixel
		++out; --num16;
	}

	uint32_t val32 = OPENMSX_BIGENDIAN ? (uint32_t(val0) << 16) | val1
	                                   : val0 | (uint32_t(val1) << 16);
	memset_32(reinterpret_cast<uint32_t*>(out), num16 / 2, val32);

	if (unlikely(num16 & 1)) {
		out[num16 - 1] = val0;
	}
}

static inline void memset_16(uint16_t* out, size_t num16, uint16_t val16)
{
	memset_16_2(out, num16, val16, val16);
}

template<typename Pixel> void MemSet<Pixel>::operator()(
	Pixel* out, size_t num, Pixel val) const
{
	if (sizeof(Pixel) == 2) {
		memset_16(reinterpret_cast<uint16_t*>(out), num, val);
	} else if (sizeof(Pixel) == 4) {
		memset_32(reinterpret_cast<uint32_t*>(out), num, val);
	} else {
		UNREACHABLE;
	}
}

template<typename Pixel> void MemSet2<Pixel>::operator()(
	Pixel* out, size_t num, Pixel val0, Pixel val1) const
{
	if (sizeof(Pixel) == 2) {
		memset_16_2(reinterpret_cast<uint16_t*>(out), num, val0, val1);
	} else if (sizeof(Pixel) == 4) {
		memset_32_2(reinterpret_cast<uint32_t*>(out), num, val0, val1);
	} else {
		UNREACHABLE;
	}
}

// Force template instantiation
template struct MemSet <uint16_t>;
template struct MemSet <uint32_t>;
template struct MemSet2<uint16_t>;
template struct MemSet2<uint32_t>;
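
// Usage sketch for the functors instantiated above (hypothetical pixel
// buffer, for illustration only):
//
//     uint32_t line[320];
//     MemSet<uint32_t>()(line, 320, 0xFF00FF00);               // solid fill
//     MemSet2<uint32_t>()(line, 320, 0xFF000000, 0xFFFFFFFF);  // alternating fill
//
// MemSet2 writes val0/val1 alternately; as in memset_32_2() above, val0 lands
// on the even (8-byte-aligned) positions.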


// Aligned memory (de)allocation.

// Helper class to keep track of aligned/unaligned pointer pairs
class AllocMap
{
public:
	AllocMap(const AllocMap&) = delete;
	AllocMap& operator=(const AllocMap&) = delete;

	static AllocMap& instance() {
		static AllocMap oneInstance;
		return oneInstance;
	}

	void insert(void* aligned, void* unaligned) {
		if (!aligned) return;
		assert(ranges::none_of(allocMap, EqualTupleValue<0>(aligned)));
		allocMap.emplace_back(aligned, unaligned);
	}

	void* remove(void* aligned) {
		if (!aligned) return nullptr;
		// LIFO order is more likely than FIFO -> search backwards
		auto it = rfind_if_unguarded(allocMap,
		                             EqualTupleValue<0>(aligned));
		// return the associated unaligned value
		void* unaligned = it->second;
		move_pop_back(allocMap, it);
		return unaligned;
	}

private:
	AllocMap() = default;
	~AllocMap() {
		assert(allocMap.empty());
	}

	// typically contains 5-10 items, so (unsorted) vector is fine
	std::vector<std::pair<void*, void*>> allocMap;
};
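
// Bookkeeping sketch (hypothetical pointers a1/u1, a2/u2, for illustration):
//
//     AllocMap::instance().insert(a1, u1);
//     AllocMap::instance().insert(a2, u2);        // map = {(a1,u1), (a2,u2)}
//     void* u = AllocMap::instance().remove(a2);  // searches backwards,
//                                                 // returns u2, erases entry
//
// Searching backwards favours the common case where the most recently
// created allocation is also the first one to be freed.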

void* mallocAligned(size_t alignment, size_t size)
{
	assert("must be a power of 2" && Math::ispow2(alignment));
	assert(alignment >= sizeof(void*));
#if HAVE_POSIX_MEMALIGN
	void* aligned = nullptr;
	if (posix_memalign(&aligned, alignment, size)) {
		throw std::bad_alloc();
	}
	#if defined DEBUG
	AllocMap::instance().insert(aligned, aligned);
	#endif
	return aligned;
#elif defined _MSC_VER
	void* result = _aligned_malloc(size, alignment);
	if (!result && size) throw std::bad_alloc();
	return result;
#else
	auto t = alignment - 1;
	void* unaligned = malloc(size + t);
	if (!unaligned) {
		throw std::bad_alloc();
	}
	auto aligned = reinterpret_cast<void*>(
		(reinterpret_cast<size_t>(unaligned) + t) & ~t);
	AllocMap::instance().insert(aligned, unaligned);
	return aligned;
#endif
}
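
// Worked example for the generic fallback branch above (hypothetical
// addresses, for illustration): with alignment = 16, t = 15. If
// malloc(size + 15) returns 0x1003, then (0x1003 + 15) & ~15 = 0x1010 is
// handed to the caller, and AllocMap remembers that free() must later be
// called with the original 0x1003.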

void freeAligned(void* aligned)
{
#if HAVE_POSIX_MEMALIGN
	#if defined DEBUG
	AllocMap::instance().remove(aligned);
	#endif
	free(aligned);
#elif defined _MSC_VER
	_aligned_free(aligned);
#else
	void* unaligned = AllocMap::instance().remove(aligned);
	free(unaligned);
#endif
}
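
// Usage sketch (hypothetical sizes, for illustration only): memory obtained
// from mallocAligned() must be released with freeAligned(), never with plain
// free(), so that the matching branch (posix_memalign / _aligned_malloc /
// AllocMap fallback) is used on both sides:
//
//     void* p = mallocAligned(64, 4096); // 4 kB, 64-byte aligned
//     // ... use p ...
//     freeAligned(p);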

} // namespace openmsx::MemoryOps