openMSX
Scanline.cc
Go to the documentation of this file.
1 #include "Scanline.hh"
2 #include "PixelOperations.hh"
3 #include "enumerate.hh"
4 #include "unreachable.hh"
5 #include "xrange.hh"
6 #include <cassert>
7 #include <cstddef>
8 #include <cstring>
9 #ifdef __SSE2__
10 #include <emmintrin.h>
11 #endif
12 
13 namespace openmsx {
14 
15 // class Multiply<uint16_t>
16 
18  : pixelOps(pixelOps_)
19 {
20  factor = 0;
21  memset(tab, 0, sizeof(tab));
22 }
23 
25 {
26  if (f == factor) return;
27  factor = f;
28 
29  for (auto [p, t] : enumerate(tab)) {
30  auto pix = uint32_t(p);
31  t = ((((pix & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask()) |
32  ((((pix & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask()) |
33  ((((pix & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask());
34  }
35 }
36 
37 inline uint16_t Multiply<uint16_t>::multiply(uint16_t p, unsigned f) const
38 {
39  unsigned r = (((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask();
40  unsigned g = (((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask();
41  unsigned b = (((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask();
42  return r | g | b;
43 }
44 
45 inline uint16_t Multiply<uint16_t>::multiply(uint16_t p) const
46 {
47  return tab[p];
48 }
49 
50 inline const uint16_t* Multiply<uint16_t>::getTable() const
51 {
52  return tab;
53 }
54 
55 
56 // class Multiply<uint32_t>
57 
59 {
60 }
61 
63 {
64  factor = f;
65 }
66 
67 inline uint32_t Multiply<uint32_t>::multiply(uint32_t p, unsigned f) const
68 {
70 }
71 
72 inline uint32_t Multiply<uint32_t>::multiply(uint32_t p) const
73 {
74  return multiply(p, factor);
75 }
76 
77 const uint32_t* Multiply<uint32_t>::getTable() const
78 {
79  UNREACHABLE; return nullptr;
80 }
81 
82 
83 #ifdef __SSE2__
84 
85 // 32bpp
/** Blend and darken one 16-byte group of pixel data.
  * The two inputs are averaged per byte (rounding up), every byte of the
  * result is then scaled by the matching 16-bit lane of 'f' (high half of
  * the 16x16 product), and the 16 result bytes are written to 'out'.
  * All three pointers are accessed with aligned 128-bit loads/stores.
  */
static inline void drawSSE2_1(
	const char* __restrict in1, const char* __restrict in2,
	char* __restrict out, __m128i f)
{
	const __m128i srcA = _mm_load_si128(reinterpret_cast<const __m128i*>(in1));
	const __m128i srcB = _mm_load_si128(reinterpret_cast<const __m128i*>(in2));
	const __m128i blended = _mm_avg_epu8(srcA, srcB);

	// Widen the bytes to 16 bits so they can be multiplied by the factor.
	const __m128i zero = _mm_setzero_si128();
	const __m128i darkLo = _mm_mulhi_epu16(_mm_unpacklo_epi8(blended, zero), f);
	const __m128i darkHi = _mm_mulhi_epu16(_mm_unpackhi_epi8(blended, zero), f);

	// Narrow back to bytes and store.
	_mm_store_si128(reinterpret_cast<__m128i*>(out),
	                _mm_packus_epi16(darkLo, darkHi));
}
101 static inline void drawSSE2(
102  const uint32_t* __restrict in1_,
103  const uint32_t* __restrict in2_,
104  uint32_t* __restrict out_,
105  unsigned factor,
106  size_t width,
107  PixelOperations<uint32_t>& /*dummy*/,
108  Multiply<uint32_t>& /*dummy*/)
109 {
110  width *= sizeof(uint32_t); // in bytes
111  assert(width >= 64);
112  assert((reinterpret_cast<uintptr_t>(in1_) % sizeof(__m128i)) == 0);
113  assert((reinterpret_cast<uintptr_t>(in2_) % sizeof(__m128i)) == 0);
114  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
115  const auto* in1 = reinterpret_cast<const char*>(in1_) + width;
116  const auto* in2 = reinterpret_cast<const char*>(in2_) + width;
117  auto* out = reinterpret_cast< char*>(out_) + width;
118 
119  __m128i f = _mm_set1_epi16(factor << 8);
120  ptrdiff_t x = -ptrdiff_t(width);
121  do {
122  drawSSE2_1(in1 + x + 0, in2 + x + 0, out + x + 0, f);
123  drawSSE2_1(in1 + x + 16, in2 + x + 16, out + x + 16, f);
124  drawSSE2_1(in1 + x + 32, in2 + x + 32, out + x + 32, f);
125  drawSSE2_1(in1 + x + 48, in2 + x + 48, out + x + 48, f);
126  x += 64;
127  } while (x < 0);
128 }
129 
130 // 16bpp
131 static inline void drawSSE2(
132  const uint16_t* __restrict in1_,
133  const uint16_t* __restrict in2_,
134  uint16_t* __restrict out_,
135  unsigned factor,
136  size_t width,
137  PixelOperations<uint16_t>& pixelOps,
138  Multiply<uint16_t>& darkener)
139 {
140  width *= sizeof(uint16_t); // in bytes
141  assert(width >= 16);
142  const auto* in1 = reinterpret_cast<const char*>(in1_) + width;
143  const auto* in2 = reinterpret_cast<const char*>(in2_) + width;
144  auto* out = reinterpret_cast< char*>(out_) + width;
145 
146  darkener.setFactor(factor);
147  const uint16_t* table = darkener.getTable();
148  __m128i mask = _mm_set1_epi16(pixelOps.getBlendMask());
149 
150  ptrdiff_t x = -ptrdiff_t(width);
151  do {
152  __m128i a = *reinterpret_cast<const __m128i*>(in1 + x);
153  __m128i b = *reinterpret_cast<const __m128i*>(in2 + x);
154  __m128i c = _mm_add_epi16(
155  _mm_and_si128(a, b),
156  _mm_srli_epi16(
157  _mm_and_si128(mask, _mm_xor_si128(a, b)),
158  1));
159  *reinterpret_cast<__m128i*>(out + x) = _mm_set_epi16(
160  table[_mm_extract_epi16(c, 7)],
161  table[_mm_extract_epi16(c, 6)],
162  table[_mm_extract_epi16(c, 5)],
163  table[_mm_extract_epi16(c, 4)],
164  table[_mm_extract_epi16(c, 3)],
165  table[_mm_extract_epi16(c, 2)],
166  table[_mm_extract_epi16(c, 1)],
167  table[_mm_extract_epi16(c, 0)]);
168  // An alternative for the above statement is this block (this
169  // is close to what we has in our old MMX routine). On gcc this
170  // generates significantly shorter (25%) but also significantly
171  // slower (30%) code. On clang both alternatives generate
172  // identical code, comparable in size to the fast gcc version
173  // (but still a bit faster).
174  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 0)], 0);
175  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 1)], 1);
176  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 2)], 2);
177  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 3)], 3);
178  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 4)], 4);
179  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 5)], 5);
180  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 6)], 6);
181  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 7)], 7);
182  //*reinterpret_cast<__m128i*>(out + x) = c;
183 
184  x += 16;
185  } while (x < 0);
186 }
187 
188 #endif
189 
190 
191 // class Scanline
192 
193 template<typename Pixel>
195  : darkener(pixelOps_)
196  , pixelOps(pixelOps_)
197 {
198 }
199 
200 template<typename Pixel>
202  const Pixel* __restrict src1, const Pixel* __restrict src2,
203  Pixel* __restrict dst, unsigned factor, size_t width)
204 {
205 #ifdef __SSE2__
206  drawSSE2(src1, src2, dst, factor, width, pixelOps, darkener);
207 #else
208  // non-SSE2 routine, both 16bpp and 32bpp
209  darkener.setFactor(factor);
210  for (auto x : xrange(width)) {
211  dst[x] = darkener.multiply(
212  pixelOps.template blend<1, 1>(src1[x], src2[x]));
213  }
214 #endif
215 }
216 
217 template<typename Pixel>
218 Pixel Scanline<Pixel>::darken(Pixel p, unsigned factor) const
219 {
220  return darkener.multiply(p, factor);
221 }
222 
223 template<typename Pixel>
224 Pixel Scanline<Pixel>::darken(Pixel p1, Pixel p2, unsigned factor) const
225 {
226  return darkener.multiply(pixelOps.template blend<1, 1>(p1, p2), factor);
227 }
228 
229 // Force template instantiation.
230 #if HAVE_16BPP
231 template class Scanline<uint16_t>;
232 #endif
233 #if HAVE_32BPP
234 template class Scanline<uint32_t>;
235 #endif
236 
237 } // namespace openmsx
int g
TclObject t
Helper class to perform 'pixel x scalar' calculations.
Definition: Scanline.hh:12
static Pixel multiply(Pixel p, unsigned x)
Perform a component wise multiplication of a pixel with an 8-bit fractional value: result = (pixel * ...
Helper class to draw scanlines.
Definition: Scanline.hh:44
Pixel darken(Pixel p, unsigned factor) const
Darken one pixel.
Definition: Scanline.cc:218
void draw(const Pixel *src1, const Pixel *src2, Pixel *dst, unsigned factor, size_t width)
Draws a scanline.
Definition: Scanline.cc:201
Scanline(const PixelOperations< Pixel > &pixelOps)
Definition: Scanline.cc:194
constexpr auto enumerate(Iterable &&iterable)
Heavily inspired by Nathan Reed's blog post: Python-Like enumerate() In C++17 http://reedbeta....
Definition: enumerate.hh:28
This file implemented 3 utility functions:
Definition: Autofire.cc:9
constexpr Table table
Definition: CPUCore.cc:260
uint32_t Pixel
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:118
constexpr nibble mask[4][13]
Definition: RP5C01.cc:34
#define UNREACHABLE
Definition: unreachable.hh:38
constexpr auto xrange(T e)
Definition: xrange.hh:155