openMSX
Scanline.cc
Go to the documentation of this file.
1 #include "Scanline.hh"
2 #include "PixelOperations.hh"
3 #include "unreachable.hh"
4 #include <cassert>
5 #include <cstddef>
6 #include <cstring>
7 #ifdef __SSE2__
8 #include <emmintrin.h>
9 #endif
10 
11 namespace openmsx {
12 
13 // class Multiply<uint16_t>
14 
// Constructor in the Multiply<uint16_t> section (its signature line is not
// present in this listing). Stores the pixel-operations helper and starts with
// factor 0 and an all-zero table. The zeroed table is exactly what factor 0
// produces, which matters because setFactor() skips the table rebuild when
// the requested factor equals the current one.
16  : pixelOps(pixelOps_)
17 {
18  factor = 0;
19  memset(tab, 0, sizeof(tab));
20 }
21 
// Body of the 16bpp setFactor (its signature line is not present in this
// listing; 'f' is the requested darkening factor).
23 {
// The table already corresponds to this factor: skip the 65536-entry rebuild.
24  if (f == factor) {
25  return;
26  }
27  factor = f;
28 
// Precompute the darkened value of every possible 16-bit pixel. Each of the
// R, G and B channels is isolated with its mask, multiplied by f, shifted
// right by 8 bits and masked again so the product stays in its own channel.
29  for (unsigned p = 0; p < 0x10000; ++p) {
30  tab[p] = ((((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask()) |
31  ((((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask()) |
32  ((((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask());
33  }
34 }
35 
36 inline uint16_t Multiply<uint16_t>::multiply(uint16_t p, unsigned f) const
37 {
38  unsigned r = (((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask();
39  unsigned g = (((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask();
40  unsigned b = (((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask();
41  return r | g | b;
42 }
43 
44 inline uint16_t Multiply<uint16_t>::multiply(uint16_t p) const
45 {
46  return tab[p];
47 }
48 
49 inline const uint16_t* Multiply<uint16_t>::getTable() const
50 {
51  return tab;
52 }
53 
54 
55 // class Multiply<uint32_t>
56 
// Empty body; presumably the Multiply<uint32_t> constructor — its signature
// line is not present in this listing (TODO confirm against the real source).
58 {
59 }
60 
// Body in the Multiply<uint32_t> section whose signature line is not present
// in this listing (presumably setFactor). It only records the factor; the
// single-argument multiply() of this section reads it back on every call.
62 {
63  factor = f;
64 }
65 
// Darken a single 32bpp pixel 'p' by factor 'f'.
// NOTE(review): the statement on listing line 68 is absent from this listing,
// so as shown the function has no return statement. Restore the original
// statement from the real source rather than guessing it.
66 inline uint32_t Multiply<uint32_t>::multiply(uint32_t p, unsigned f) const
67 {
69 }
70 
71 inline uint32_t Multiply<uint32_t>::multiply(uint32_t p) const
72 {
73  return multiply(p, factor);
74 }
75 
76 const uint32_t* Multiply<uint32_t>::getTable() const
77 {
78  UNREACHABLE; return nullptr;
79 }
80 
81 
82 #ifdef __SSE2__
83 
84 // 32bpp
// Blend and darken one 16-byte block.
//  in1, in2: the two input blocks, both must be 16-byte aligned
//  out:      destination block, must be 16-byte aligned
//  f:        per-16-bit-lane multiplier; each output byte becomes the high
//            16 bits of (averaged byte * lane value)
static inline void drawSSE2_1(
	const char* __restrict in1, const char* __restrict in2,
	      char* __restrict out, __m128i f)
{
	const __m128i srcA = _mm_load_si128(reinterpret_cast<const __m128i*>(in1));
	const __m128i srcB = _mm_load_si128(reinterpret_cast<const __m128i*>(in2));

	// Per-byte (rounded) average of the two inputs.
	const __m128i blended = _mm_avg_epu8(srcA, srcB);

	// Widen the bytes to 16 bits so they can be multiplied, keeping only
	// the high half of each 16x16 product.
	const __m128i zero = _mm_setzero_si128();
	const __m128i lo = _mm_mulhi_epu16(_mm_unpacklo_epi8(blended, zero), f);
	const __m128i hi = _mm_mulhi_epu16(_mm_unpackhi_epi8(blended, zero), f);

	// Narrow back to bytes (with unsigned saturation) and store.
	_mm_store_si128(reinterpret_cast<__m128i*>(out), _mm_packus_epi16(lo, hi));
}
100 static inline void drawSSE2(
101  const uint32_t* __restrict in1_,
102  const uint32_t* __restrict in2_,
103  uint32_t* __restrict out_,
104  unsigned factor,
105  size_t width,
106  PixelOperations<uint32_t>& /*dummy*/,
107  Multiply<uint32_t>& /*dummy*/)
108 {
109  width *= sizeof(uint32_t); // in bytes
110  assert(width >= 64);
111  assert((reinterpret_cast<uintptr_t>(in1_) % sizeof(__m128i)) == 0);
112  assert((reinterpret_cast<uintptr_t>(in2_) % sizeof(__m128i)) == 0);
113  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
114  auto* in1 = reinterpret_cast<const char*>(in1_) + width;
115  auto* in2 = reinterpret_cast<const char*>(in2_) + width;
116  auto* out = reinterpret_cast< char*>(out_) + width;
117 
118  __m128i f = _mm_set1_epi16(factor << 8);
119  ptrdiff_t x = -ptrdiff_t(width);
120  do {
121  drawSSE2_1(in1 + x + 0, in2 + x + 0, out + x + 0, f);
122  drawSSE2_1(in1 + x + 16, in2 + x + 16, out + x + 16, f);
123  drawSSE2_1(in1 + x + 32, in2 + x + 32, out + x + 32, f);
124  drawSSE2_1(in1 + x + 48, in2 + x + 48, out + x + 48, f);
125  x += 64;
126  } while (x < 0);
127 }
128 
129 // 16bpp
130 static inline void drawSSE2(
131  const uint16_t* __restrict in1_,
132  const uint16_t* __restrict in2_,
133  uint16_t* __restrict out_,
134  unsigned factor,
135  size_t width,
136  PixelOperations<uint16_t>& pixelOps,
137  Multiply<uint16_t>& darkener)
138 {
139  width *= sizeof(uint16_t); // in bytes
140  assert(width >= 16);
141  auto* in1 = reinterpret_cast<const char*>(in1_) + width;
142  auto* in2 = reinterpret_cast<const char*>(in2_) + width;
143  auto* out = reinterpret_cast< char*>(out_) + width;
144 
145  darkener.setFactor(factor);
146  const uint16_t* table = darkener.getTable();
147  __m128i mask = _mm_set1_epi16(pixelOps.getBlendMask());
148 
149  ptrdiff_t x = -ptrdiff_t(width);
150  do {
151  __m128i a = *reinterpret_cast<const __m128i*>(in1 + x);
152  __m128i b = *reinterpret_cast<const __m128i*>(in2 + x);
153  __m128i c = _mm_add_epi16(
154  _mm_and_si128(a, b),
155  _mm_srli_epi16(
156  _mm_and_si128(mask, _mm_xor_si128(a, b)),
157  1));
158  *reinterpret_cast<__m128i*>(out + x) = _mm_set_epi16(
159  table[_mm_extract_epi16(c, 7)],
160  table[_mm_extract_epi16(c, 6)],
161  table[_mm_extract_epi16(c, 5)],
162  table[_mm_extract_epi16(c, 4)],
163  table[_mm_extract_epi16(c, 3)],
164  table[_mm_extract_epi16(c, 2)],
165  table[_mm_extract_epi16(c, 1)],
166  table[_mm_extract_epi16(c, 0)]);
167  // An alternative for the above statement is this block (this
168  // is close to what we has in our old MMX routine). On gcc this
169  // generates significantly shorter (25%) but also significantly
170  // slower (30%) code. On clang both alternatives generate
171  // identical code, comparable in size to the fast gcc version
172  // (but still a bit faster).
173  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 0)], 0);
174  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 1)], 1);
175  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 2)], 2);
176  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 3)], 3);
177  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 4)], 4);
178  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 5)], 5);
179  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 6)], 6);
180  //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 7)], 7);
181  //*reinterpret_cast<__m128i*>(out + x) = c;
182 
183  x += 16;
184  } while (x < 0);
185 }
186 
187 #endif
188 
189 
190 // class Scanline
191 
// Scanline constructor (the line carrying its name and parameter list is not
// present in this listing). The same pixelOps_ argument initializes both the
// darkener and the pixelOps member.
192 template <class Pixel>
194  : darkener(pixelOps_)
195  , pixelOps(pixelOps_)
196 {
197 }
198 
// Scanline<Pixel>::draw (the line carrying the function name is not present
// in this listing). Writes to 'dst' the blend of 'src1' and 'src2', darkened
// by 'factor', for 'width' pixels.
199 template <class Pixel>
201  const Pixel* __restrict src1, const Pixel* __restrict src2,
202  Pixel* __restrict dst, unsigned factor, size_t width)
203 {
204 #ifdef __SSE2__
// The argument types select the 16bpp or the 32bpp drawSSE2 overload; both
// assert a minimum line width.
205  drawSSE2(src1, src2, dst, factor, width, pixelOps, darkener);
206 #else
207  // non-SSE2 routine, both 16bpp and 32bpp
208  darkener.setFactor(factor);
// NOTE(review): 'x' is unsigned while 'width' is size_t.
209  for (unsigned x = 0; x < width; ++x) {
210  dst[x] = darkener.multiply(
211  pixelOps.template blend<1, 1>(src1[x], src2[x]));
212  }
213 #endif
214 }
215 
// Scanline<Pixel>::darken for a single pixel (its signature line is not
// present in this listing): returns 'p' darkened by 'factor' via the
// darkener's two-argument multiply().
216 template <class Pixel>
218 {
219  return darkener.multiply(p, factor);
220 }
221 
222 template <class Pixel>
223 Pixel Scanline<Pixel>::darken(Pixel p1, Pixel p2, unsigned factor)
224 {
225  return darkener.multiply(pixelOps.template blend<1, 1>(p1, p2), factor);
226 }
227 
228 // Force template instantiation.
229 #if HAVE_16BPP
230 template class Scanline<uint16_t>;
231 #endif
232 #if HAVE_32BPP
233 template class Scanline<uint32_t>;
234 #endif
235 
236 } // namespace openmsx
uint16_t multiply(uint16_t p, unsigned factor) const
Definition: Scanline.cc:36
Scanline(const PixelOperations< Pixel > &pixelOps)
Definition: Scanline.cc:193
Pixel getBlendMask() const
Returns a constant that is useful to calculate the average of two pixel values.
void setFactor(unsigned f)
Definition: Scanline.cc:22
const uint16_t * getTable() const
Definition: Scanline.cc:49
void draw(const Pixel *src1, const Pixel *src2, Pixel *dst, unsigned factor, size_t width)
Draws a scanline.
Definition: Scanline.cc:200
uint32_t Pixel
Helper class to perform 'pixel x scalar' calculations.
Definition: Scanline.hh:12
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
Helper class to draw scanlines.
Definition: Scanline.hh:43
Pixel darken(Pixel p, unsigned factor)
Darken one pixel.
Definition: Scanline.cc:217
static Pixel multiply(Pixel p, unsigned x)
Perform a component wise multiplication of a pixel with an 8-bit fractional value: result = (pixel * ...
int g
#define UNREACHABLE
Definition: unreachable.hh:38