openMSX
MemoryOps.cc
Go to the documentation of this file.
1 #include "MemoryOps.hh"
2 #include "likely.hh"
3 #include "build-info.hh"
4 #include "systemfuncs.hh"
5 #include "Math.hh"
6 #include "ranges.hh"
7 #include "stl.hh"
8 #include "unreachable.hh"
9 #include <utility>
10 #include <vector>
11 #include <cassert>
12 #include <cstdlib>
13 #include <cstdint>
14 #include <new> // for std::bad_alloc
15 #if ASM_X86 && defined _MSC_VER
16 #include <intrin.h> // for __stosd intrinsic
17 #endif
18 #ifdef __SSE2__
19 #include <emmintrin.h>
20 #endif
21 
22 namespace openmsx::MemoryOps {
23 
24 #ifdef __SSE2__
25 #if ASM_X86_32 && defined _MSC_VER
26 // Gcc has the _mm_set1_epi64x() function for both 32 and 64 bit. Visual studio
27 // only has it for 64 bit. So we add it ourselves for vc++/32-bit. An
28 // alternative would be to always use this routine, but this generates worse
29 // code than the real _mm_set1_epi64x() function for gcc (both 32 and 64 bit).
30 static inline __m128i _mm_set1_epi64x(uint64_t val)
31 {
32  uint32_t low = val >> 32;
33  uint32_t high = val >> 0;
34  return _mm_set_epi32(low, high, low, high);
35 }
36 #endif
37 
38 static inline void memset_64_SSE(
39  uint64_t* out, size_t num64, uint64_t val64)
40 {
41  if (unlikely(num64 == 0)) return;
42 
43  // Align at 16-byte boundary.
44  if (unlikely(size_t(out) & 8)) {
45  out[0] = val64;
46  ++out; --num64;
47  }
48 
49  __m128i val128 = _mm_set1_epi64x(val64);
50  uint64_t* e = out + num64 - 3;
51  for (; out < e; out += 4) {
52  _mm_store_si128(reinterpret_cast<__m128i*>(out + 0), val128);
53  _mm_store_si128(reinterpret_cast<__m128i*>(out + 2), val128);
54  }
55  if (unlikely(num64 & 2)) {
56  _mm_store_si128(reinterpret_cast<__m128i*>(out), val128);
57  out += 2;
58  }
59  if (unlikely(num64 & 1)) {
60  out[0] = val64;
61  }
62 }
63 #endif
64 
65 static inline void memset_64(
66  uint64_t* out, size_t num64, uint64_t val64)
67 {
68  assert((size_t(out) % 8) == 0); // must be 8-byte aligned
69 
70 #ifdef __SSE2__
71  memset_64_SSE(out, num64, val64);
72  return;
73 #endif
74  uint64_t* e = out + num64 - 3;
75  for (; out < e; out += 4) {
76  out[0] = val64;
77  out[1] = val64;
78  out[2] = val64;
79  out[3] = val64;
80  }
81  if (unlikely(num64 & 2)) {
82  out[0] = val64;
83  out[1] = val64;
84  out += 2;
85  }
86  if (unlikely(num64 & 1)) {
87  out[0] = val64;
88  }
89 }
90 
91 static inline void memset_32_2(
92  uint32_t* out, size_t num32, uint32_t val0, uint32_t val1)
93 {
94  assert((size_t(out) % 4) == 0); // must be 4-byte aligned
95  if (unlikely(num32 == 0)) return;
96 
97  // Align at 8-byte boundary.
98  if (unlikely(size_t(out) & 4)) {
99  out[0] = val1; // start at odd pixel
100  ++out; --num32;
101  }
102 
103  uint64_t val64 = OPENMSX_BIGENDIAN ? (uint64_t(val0) << 32) | val1
104  : val0 | (uint64_t(val1) << 32);
105  memset_64(reinterpret_cast<uint64_t*>(out), num32 / 2, val64);
106 
107  if (unlikely(num32 & 1)) {
108  out[num32 - 1] = val0;
109  }
110 }
111 
112 static inline void memset_32(uint32_t* out, size_t num32, uint32_t val32)
113 {
114  assert((size_t(out) % 4) == 0); // must be 4-byte aligned
115 
116 #if ASM_X86
117 #if defined _MSC_VER
118  // VC++'s __stosd intrinsic results in emulator benchmarks
119  // running about 7% faster than with memset_32_2, streaming or not,
120  // and about 3% faster than the C code below.
121  __stosd(reinterpret_cast<unsigned long*>(out), val32, num32);
122 #else
123  memset_32_2(out, num32, val32, val32);
124 #endif
125 #else
126  uint32_t* e = out + num32 - 7;
127  for (; out < e; out += 8) {
128  out[0] = val32;
129  out[1] = val32;
130  out[2] = val32;
131  out[3] = val32;
132  out[4] = val32;
133  out[5] = val32;
134  out[6] = val32;
135  out[7] = val32;
136  }
137  if (unlikely(num32 & 4)) {
138  out[0] = val32;
139  out[1] = val32;
140  out[2] = val32;
141  out[3] = val32;
142  out += 4;
143  }
144  if (unlikely(num32 & 2)) {
145  out[0] = val32;
146  out[1] = val32;
147  out += 2;
148  }
149  if (unlikely(num32 & 1)) {
150  out[0] = val32;
151  }
152 #endif
153 }
154 
155 static inline void memset_16_2(
156  uint16_t* out, size_t num16, uint16_t val0, uint16_t val1)
157 {
158  if (unlikely(num16 == 0)) return;
159 
160  // Align at 4-byte boundary.
161  if (unlikely(size_t(out) & 2)) {
162  out[0] = val1; // start at odd pixel
163  ++out; --num16;
164  }
165 
166  uint32_t val32 = OPENMSX_BIGENDIAN ? (uint32_t(val0) << 16) | val1
167  : val0 | (uint32_t(val1) << 16);
168  memset_32(reinterpret_cast<uint32_t*>(out), num16 / 2, val32);
169 
170  if (unlikely(num16 & 1)) {
171  out[num16 - 1] = val0;
172  }
173 }
174 
175 static inline void memset_16(uint16_t* out, size_t num16, uint16_t val16)
176 {
177  memset_16_2(out, num16, val16, val16);
178 }
179 
180 template<typename Pixel> void MemSet<Pixel>::operator()(
181  Pixel* out, size_t num, Pixel val) const
182 {
183  if (sizeof(Pixel) == 2) {
184  memset_16(reinterpret_cast<uint16_t*>(out), num, val);
185  } else if (sizeof(Pixel) == 4) {
186  memset_32(reinterpret_cast<uint32_t*>(out), num, val);
187  } else {
188  UNREACHABLE;
189  }
190 }
191 
192 template<typename Pixel> void MemSet2<Pixel>::operator()(
193  Pixel* out, size_t num, Pixel val0, Pixel val1) const
194 {
195  if (sizeof(Pixel) == 2) {
196  memset_16_2(reinterpret_cast<uint16_t*>(out), num, val0, val1);
197  } else if (sizeof(Pixel) == 4) {
198  memset_32_2(reinterpret_cast<uint32_t*>(out), num, val0, val1);
199  } else {
200  UNREACHABLE;
201  }
202 }
203 
204 // Force template instantiation
205 template struct MemSet <uint16_t>;
206 template struct MemSet <uint32_t>;
207 template struct MemSet2<uint16_t>;
208 template struct MemSet2<uint32_t>;
209 
210 
211 
215 // Helper class to keep track of aligned/unaligned pointer pairs
216 class AllocMap
217 {
218 public:
219  static AllocMap& instance() {
220  static AllocMap oneInstance;
221  return oneInstance;
222  }
223 
224  void insert(void* aligned, void* unaligned) {
225  if (!aligned) return;
226  assert(ranges::none_of(allocMap, EqualTupleValue<0>(aligned)));
227  allocMap.emplace_back(aligned, unaligned);
228  }
229 
230  void* remove(void* aligned) {
231  if (!aligned) return nullptr;
232  // LIFO order is more likely than FIFO -> search backwards
233  auto it = rfind_if_unguarded(allocMap,
234  EqualTupleValue<0>(aligned));
235  // return the associated unaligned value
236  void* unaligned = it->second;
237  move_pop_back(allocMap, it);
238  return unaligned;
239  }
240 
241 private:
242  AllocMap() = default;
243  ~AllocMap() {
244  assert(allocMap.empty());
245  }
246 
247  // typically contains 5-10 items, so (unsorted) vector is fine
248  std::vector<std::pair<void*, void*>> allocMap;
249 };
250 
251 void* mallocAligned(size_t alignment, size_t size)
252 {
253  assert("must be a power of 2" && Math::ispow2(alignment));
254  assert(alignment >= sizeof(void*));
255 #if HAVE_POSIX_MEMALIGN
256  void* aligned;
257  if (posix_memalign(&aligned, alignment, size)) {
258  throw std::bad_alloc();
259  }
260  #if defined DEBUG
261  AllocMap::instance().insert(aligned, aligned);
262  #endif
263  return aligned;
264 #elif defined _MSC_VER
265  void* result = _aligned_malloc(size, alignment);
266  if (!result && size) throw std::bad_alloc();
267  return result;
268 #else
269  auto t = alignment - 1;
270  void* unaligned = malloc(size + t);
271  if (!unaligned) {
272  throw std::bad_alloc();
273  }
274  auto aligned = reinterpret_cast<void*>(
275  (reinterpret_cast<size_t>(unaligned) + t) & ~t);
276  AllocMap::instance().insert(aligned, unaligned);
277  return aligned;
278 #endif
279 }
280 
281 void freeAligned(void* aligned)
282 {
283 #if HAVE_POSIX_MEMALIGN
284  #if defined DEBUG
285  AllocMap::instance().remove(aligned);
286  #endif
287  free(aligned);
288 #elif defined _MSC_VER
289  return _aligned_free(aligned);
290 #else
291  void* unaligned = AllocMap::instance().remove(aligned);
292  free(unaligned);
293 #endif
294 }
295 
296 } // namespace openmsx::MemoryOps
MemoryOps.hh
openmsx::MemoryOps::AllocMap::insert
void insert(void *, void *unaligned)
Definition: MemoryOps.cc:224
openmsx::MemoryOps::MemSet2::operator()
void operator()(Pixel *out, size_t num, Pixel val0, Pixel val1) const
Definition: MemoryOps.cc:192
unlikely
#define unlikely(x)
Definition: likely.hh:15
openmsx::MemoryOps::freeAligned
void freeAligned(void *)
Definition: MemoryOps.cc:281
utf8::unchecked::size
size_t size(std::string_view utf8)
Definition: utf8_unchecked.hh:227
t
TclObject t
Definition: TclObject_test.cc:264
ranges.hh
Math::ispow2
constexpr bool ispow2(T x) noexcept
Is the given number an integral power of two? That is, does it have exactly one 1-bit in binary repre...
Definition: Math.hh:57
openmsx::MemoryOps::AllocMap::instance
static AllocMap & instance()
Definition: MemoryOps.cc:219
systemfuncs.hh
likely.hh
move_pop_back
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:177
openmsx::Pixel
uint32_t Pixel
Definition: GLHQLiteScaler.cc:98
UNREACHABLE
#define UNREACHABLE
Definition: unreachable.hh:38
openmsx::MemoryOps::MemSet2
Definition: MemoryOps.hh:12
openmsx::MemoryOps::MemSet
Definition: MemoryOps.hh:8
openmsx::MemoryOps::mallocAligned
void * mallocAligned(size_t alignment, size_t size)
Definition: MemoryOps.cc:251
build-info.hh
ranges::none_of
bool none_of(InputRange &&range, UnaryPredicate pred)
Definition: ranges.hh:131
openmsx::MemoryOps::AllocMap::remove
void * remove(void *)
Definition: MemoryOps.cc:230
openmsx::MemoryOps
Definition: MemoryOps.cc:22
stl.hh
openmsx::MemoryOps::AllocMap
Aligned memory (de)allocation.
Definition: MemoryOps.cc:217
unreachable.hh
Math.hh
openmsx::MemoryOps::MemSet::operator()
void operator()(Pixel *out, size_t num, Pixel val) const
Definition: MemoryOps.cc:180
rfind_if_unguarded
auto rfind_if_unguarded(RANGE &range, PRED pred)
Definition: stl.hh:160