openMSX
Simple2xScaler.cc
Go to the documentation of this file.
1 #include "Simple2xScaler.hh"
3 #include "LineScalers.hh"
4 #include "RawFrame.hh"
5 #include "ScalerOutput.hh"
6 #include "RenderSettings.hh"
7 #include "vla.hh"
8 #include <cassert>
9 #include <cstddef>
10 #include <cstdint>
11 #ifdef __SSE2__
12 #include <emmintrin.h>
13 #endif
14 
15 namespace openmsx {
16 
17 // class Simple2xScaler
18 
19 template<typename Pixel>
21  const PixelOperations<Pixel>& pixelOps_,
22  RenderSettings& renderSettings)
23  : Scaler2<Pixel>(pixelOps_)
24  , settings(renderSettings)
25  , pixelOps(pixelOps_)
26  , mult1(pixelOps)
27  , mult2(pixelOps)
28  , mult3(pixelOps)
29  , scanline(pixelOps)
30 {
31 }
32 
33 template<typename Pixel>
35  FrameSource& src, unsigned srcStartY, unsigned srcEndY,
36  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
37 {
38  int scanlineFactor = settings.getScanlineFactor();
39 
40  unsigned dstHeight = dst.getHeight();
41  unsigned stopDstY = (dstEndY == dstHeight)
42  ? dstEndY : dstEndY - 2;
43  unsigned srcY = srcStartY, dstY = dstStartY;
44  for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
45  auto color0 = src.getLineColor<Pixel>(srcY);
46  dst.fillLine(dstY + 0, color0);
47  Pixel color1 = scanline.darken(color0, scanlineFactor);
48  dst.fillLine(dstY + 1, color1);
49  }
50  if (dstY != dstHeight) {
51  unsigned nextLineWidth = src.getLineWidth(srcY + 1);
52  assert(src.getLineWidth(srcY) == 1);
53  assert(nextLineWidth != 1);
54  this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
55  dst, dstY, dstEndY);
56  }
57 }
58 
#ifdef __SSE2__

// Combines upper-half of 'x' with lower half of 'y'.
// The result holds the upper 64 bits of 'x' in its lower half and the lower
// 64 bits of 'y' in its upper half.
static inline __m128i shuffle(__m128i x, __m128i y)
{
	// _mm_shuffle_pd() actually shuffles 64-bit floating point values, we
	// need to shuffle integers. Though floats and ints are stored in the
	// same xmmN registers. So this instruction does the right thing.
	// However (some?) x86 CPUs keep the float and integer interpretations
	// of these registers in different physical locations in the chip and
	// there is some overhead on switching between these interpretations.
	// So the casts in the statement below don't generate any instructions,
	// but they still can cause overhead on (some?) CPUs.
	return _mm_castpd_si128(_mm_shuffle_pd(
		_mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
}

// 32bpp SSE2 implementation of the horizontal "blur while doubling" filter.
//
// in_    input line, 'width' pixels, must be 16-byte aligned
// out_   output line, 2 * 'width' pixels, must be 16-byte aligned
// c1_    weight of the neighbouring pixel
// c2_    weight of the pixel itself
// width  number of input pixels, at least 8 (two SSE registers)
//
// Per 8-bit channel this computes
//    out[2*i + 0] = (c1 * in[i-1] + c2 * in[i]) >> 8
//    out[2*i + 1] = (c1 * in[i+1] + c2 * in[i]) >> 8
// where the out-of-range neighbours in[-1] and in[width] are replaced by the
// edge pixels in[0] and in[width-1].
//
// The loop advances in steps of 4 input pixels, so this assumes 'width' is a
// multiple of 4 -- this is not asserted.
static void blur1on2_SSE2(
	const uint32_t* __restrict in_, uint32_t* __restrict out_,
	unsigned c1_, unsigned c2_, size_t width)
{
	width *= sizeof(uint32_t); // in bytes
	assert(width >= (2 * sizeof(__m128i)));
	assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);

	// 'x' is a negative byte offset that counts up towards zero. 'in' and
	// 'out' are biased so that they point at the LAST group of the input
	// (4 pixels) and of the output (8 pixels); 'in + x' and 'out + 2 * x'
	// then address the current group.
	ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
	const auto* in  = reinterpret_cast<const char*>(in_ ) -     x;
	      auto* out = reinterpret_cast<      char*>(out_) - 2 * x;

	// Setup first iteration
	__m128i c1 = _mm_set1_epi16(c1_);
	__m128i c2 = _mm_set1_epi16(c2_);
	__m128i zero = _mm_setzero_si128();

	// Prime the pipeline with the FIRST group of pixels. This must be
	// 'in + x' (== in_): 'in' on its own addresses the last group of the
	// line, which would make the leftmost output pixels a copy of the
	// right edge and leave input pixels 0..3 unread.
	__m128i abcd = *reinterpret_cast<const __m128i*>(in + x);
	__m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
	// There is no pixel to the left of 'a', use 'a' itself instead.
	__m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
	__m128i d1a1 = _mm_mullo_epi16(c1, d0a0);

	// Each iteration reads 4 pixels and generates 8 pixels
	do {
		// At the start of each iteration these variables are live:
		//   abcd, a0b0, d1a1
		__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
		__m128i b0c0 = shuffle(a0b0, c0d0);
		__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
		__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
		__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
		__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
		__m128i abab = _mm_packus_epi16(daab, abbc);
		// 0xd8 swaps the two middle pixels to restore left-to-right order.
		*reinterpret_cast<__m128i*>(out + 2 * x) =
			_mm_shuffle_epi32(abab, 0xd8);
		// Fetch the next 4 pixels, their first pixel is needed as the
		// right neighbour of 'd'.
		abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
		a0b0 = _mm_unpacklo_epi8(abcd, zero);
		__m128i d0a0_= shuffle(c0d0, a0b0);
		__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
		d1a1         = _mm_mullo_epi16(c1, d0a0_);
		__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
		__m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
		__m128i cdcd = _mm_packus_epi16(bccd, cdda);
		*reinterpret_cast<__m128i*>(out + 2 * x + 16) =
			_mm_shuffle_epi32(cdcd, 0xd8);
		x += 16;
	} while (x < 0);

	// Last iteration (because this doesn't need to read new input)
	__m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
	__m128i b0c0 = shuffle(a0b0, c0d0);
	__m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
	__m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
	__m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
	__m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
	__m128i abab = _mm_packus_epi16(daab, abbc);
	*reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
	// There is no pixel to the right of 'd', use 'd' itself instead.
	__m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
	__m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
	__m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
	__m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
	__m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
	__m128i cdcd = _mm_packus_epi16(bccd, cddd);
	*reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
}

#endif
145 
146 template<typename Pixel>
147 void Simple2xScaler<Pixel>::blur1on2(
148  const Pixel* __restrict pIn, Pixel* __restrict pOut,
149  unsigned alpha, size_t srcWidth)
150 {
151  /* This routine is functionally equivalent to the following:
152  *
153  * void blur1on2(const Pixel* pIn, Pixel* pOut, unsigned alpha)
154  * {
155  * unsigned c1 = alpha / 4;
156  * unsigned c2 = 256 - c1;
157  *
158  * Pixel prev, curr, next;
159  * prev = curr = pIn[0];
160  *
161  * unsigned x = 0;
162  * for (; x < (srcWidth - 1); ++x) {
163  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
164  * Pixel next = pIn[x + 1];
165  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
166  * prev = curr;
167  * curr = next;
168  * }
169  *
170  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
171  * next = curr;
172  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
173  * }
174  */
175 
176  if (alpha == 0) {
177  Scale_1on2<Pixel> scale;
178  scale(pIn, pOut, 2 * srcWidth);
179  return;
180  }
181 
182  assert(alpha <= 256);
183  unsigned c1 = alpha / 4;
184  unsigned c2 = 256 - c1;
185 
186 #ifdef __SSE2__
187  if constexpr (sizeof(Pixel) == 4) {
188  // SSE2, only 32bpp
189  blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
190  return;
191  }
192 #endif
193  // C++ routine, both 16bpp and 32bpp.
194  // The loop is 2x unrolled and all common subexpressions and redundant
195  // assignments have been eliminated. 1 iteration generates 4 pixels.
196  mult1.setFactor32(c1);
197  mult2.setFactor32(c2);
198 
199  Pixel p0 = pIn[0];
200  Pixel p1;
201  unsigned f0 = mult1.mul32(p0);
202  unsigned f1 = f0;
203  unsigned tmp;
204 
205  size_t x = 0;
206  for (; x < (srcWidth - 2); x += 2) {
207  tmp = mult2.mul32(p0);
208  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
209 
210  p1 = pIn[x + 1];
211  f1 = mult1.mul32(p1);
212  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
213 
214  tmp = mult2.mul32(p1);
215  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
216 
217  p0 = pIn[x + 2];
218  f0 = mult1.mul32(p0);
219  pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
220  }
221 
222  tmp = mult2.mul32(p0);
223  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
224 
225  p1 = pIn[x + 1];
226  f1 = mult1.mul32(p1);
227  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
228 
229  tmp = mult2.mul32(p1);
230  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
231 
232  pOut[2 * x + 3] = p1;
233 }
234 
235 #ifdef __SSE2__
236 
237 // 32bpp
238 static void blur1on1_SSE2(
239  const uint32_t* __restrict in_, uint32_t* __restrict out_,
240  unsigned c1_, unsigned c2_, size_t width)
241 {
242  width *= sizeof(uint32_t); // in bytes
243  assert(width >= (2 * sizeof(__m128i)));
244  assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
245  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
246 
247  ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
248  const auto* in = reinterpret_cast<const char*>(in_ ) - x;
249  auto* out = reinterpret_cast< char*>(out_) - x;
250 
251  // Setup first iteration
252  __m128i c1 = _mm_set1_epi16(c1_);
253  __m128i c2 = _mm_set1_epi16(c2_);
254  __m128i zero = _mm_setzero_si128();
255 
256  __m128i abcd = *reinterpret_cast<const __m128i*>(in);
257  __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
258  __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
259 
260  // Each iteration reads 4 pixels and generates 4 pixels
261  do {
262  // At the start of each iteration these variables are live:
263  // abcd, a0b0, d0a0
264  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
265  __m128i b0c0 = shuffle(a0b0, c0d0);
266  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
267  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
268  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
269  abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
270  a0b0 = _mm_unpacklo_epi8(abcd, zero);
271  d0a0 = shuffle(c0d0, a0b0);
272  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
273  __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
274  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
275  *reinterpret_cast<__m128i*>(out + x) =
276  _mm_packus_epi16(aabb, ccdd);
277  x += 16;
278  } while (x < 0);
279 
280  // Last iteration (because this doesn't need to read new input)
281  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
282  __m128i b0c0 = shuffle(a0b0, c0d0);
283  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
284  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
285  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
286  __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
287  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
288  __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
289  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
290  *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
291 }
292 
293 #endif
294 template<typename Pixel>
295 void Simple2xScaler<Pixel>::blur1on1(
296  const Pixel* __restrict pIn, Pixel* __restrict pOut,
297  unsigned alpha, size_t srcWidth)
298 {
299  /* This routine is functionally equivalent to the following:
300  *
301  * void blur1on1(const Pixel* pIn, Pixel* pOut, unsigned alpha)
302  * {
303  * unsigned c1 = alpha / 4;
304  * unsigned c2 = 256 - alpha / 2;
305  *
306  * Pixel prev, curr, next;
307  * prev = curr = pIn[0];
308  *
309  * unsigned x = 0;
310  * for (; x < (srcWidth - 1); ++x) {
311  * next = pIn[x + 1];
312  * pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
313  * prev = curr;
314  * curr = next;
315  * }
316  *
317  * next = curr;
318  * pOut[x] = c1 * prev + c2 * curr + c1 * next;
319  * }
320  */
321 
322  if (alpha == 0) {
323  Scale_1on1<Pixel> copy;
324  copy(pIn, pOut, srcWidth);
325  return;
326  }
327 
328  unsigned c1 = alpha / 4;
329  unsigned c2 = 256 - alpha / 2;
330 
331 #ifdef __SSE2__
332  if constexpr (sizeof(Pixel) == 4) {
333  // SSE2, only 32bpp
334  blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
335  return;
336  }
337 #endif
338  // C++ routine, both 16bpp and 32bpp.
339  // The loop is 2x unrolled and all common subexpressions and redundant
340  // assignments have been eliminated. 1 iteration generates 2 pixels.
341  mult1.setFactor32(c1);
342  mult3.setFactor32(c2);
343 
344  Pixel p0 = pIn[0];
345  Pixel p1;
346  unsigned f0 = mult1.mul32(p0);
347  unsigned f1 = f0;
348 
349  size_t x = 0;
350  for (; x < (srcWidth - 2); x += 2) {
351  p1 = pIn[x + 1];
352  unsigned t0 = mult1.mul32(p1);
353  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
354  f0 = t0;
355 
356  p0 = pIn[x + 2];
357  unsigned t1 = mult1.mul32(p0);
358  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
359  f1 = t1;
360  }
361 
362  p1 = pIn[x + 1];
363  unsigned t0 = mult1.mul32(p1);
364  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
365 
366  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
367 }
368 
369 template<typename Pixel>
370 void Simple2xScaler<Pixel>::drawScanline(
371  const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
372  unsigned dstWidth)
373 {
374  if (factor != 255) {
375  scanline.draw(in1, in2, out, factor, dstWidth);
376  } else {
377  Scale_1on1<Pixel> scale;
378  scale(in1, out, dstWidth);
379  }
380 }
381 
382 template<typename Pixel>
383 void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
384  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
385  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
386 {
387  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
388  int blur = settings.getBlurFactor();
389  int scanlineFactor = settings.getScanlineFactor();
390 
391  unsigned dstY = dstStartY;
392  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
393  auto* dstLine0 = dst.acquireLine(dstY + 0);
394  blur1on2(srcLine, dstLine0, blur, srcWidth);
395 
396  for (; dstY < dstEndY - 2; dstY += 2) {
397  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
398  auto* dstLine2 = dst.acquireLine(dstY + 2);
399  blur1on2(srcLine, dstLine2, blur, srcWidth);
400 
401  auto* dstLine1 = dst.acquireLine(dstY + 1);
402  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
403  2 * srcWidth);
404 
405  dst.releaseLine(dstY + 0, dstLine0);
406  dst.releaseLine(dstY + 1, dstLine1);
407  dstLine0 = dstLine2;
408  }
409 
410  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
411  VLA_SSE_ALIGNED(Pixel, buf2, 2 * srcWidth);
412  blur1on2(srcLine, buf2, blur, srcWidth);
413 
414  auto* dstLine1 = dst.acquireLine(dstY + 1);
415  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, 2 * srcWidth);
416  dst.releaseLine(dstY + 0, dstLine0);
417  dst.releaseLine(dstY + 1, dstLine1);
418 }
419 
420 template<typename Pixel>
421 void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
422  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
423  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
424 {
425  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
426  int blur = settings.getBlurFactor();
427  int scanlineFactor = settings.getScanlineFactor();
428 
429  unsigned dstY = dstStartY;
430  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
431  auto* dstLine0 = dst.acquireLine(dstY);
432  blur1on1(srcLine, dstLine0, blur, srcWidth);
433 
434  for (; dstY < dstEndY - 2; dstY += 2) {
435  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
436  auto* dstLine2 = dst.acquireLine(dstY + 2);
437  blur1on1(srcLine, dstLine2, blur, srcWidth);
438 
439  auto* dstLine1 = dst.acquireLine(dstY + 1);
440  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
441  srcWidth);
442 
443  dst.releaseLine(dstY + 0, dstLine0);
444  dst.releaseLine(dstY + 1, dstLine1);
445  dstLine0 = dstLine2;
446  }
447 
448  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
449  VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
450  blur1on1(srcLine, buf2, blur, srcWidth);
451 
452  auto* dstLine1 = dst.acquireLine(dstY + 1);
453  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, srcWidth);
454  dst.releaseLine(dstY + 0, dstLine0);
455  dst.releaseLine(dstY + 1, dstLine1);
456 }
457 
458 template<typename Pixel>
459 void Simple2xScaler<Pixel>::scaleImage(
460  FrameSource& src, const RawFrame* superImpose,
461  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
462  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
463 {
464  if (superImpose) {
465  // Note: this implementation is different from the openGL
466  // version. Here we first alpha-blend and then scale, so the
467  // video layer will also get blurred (and possibly down-scaled
468  // to MSX resolution). The openGL version will only blur the
469  // MSX frame, then blend with the video frame and then apply
470  // scanlines. I think the openGL version is visually slightly
471  // better, but much more work to implement in software (in
472  // openGL shaders it's very easy). Maybe we can improve this
473  // later (if required at all).
474  SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
475  srcWidth = sf.getLineWidth(srcStartY);
476  this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
477  dst, dstStartY, dstEndY);
478  } else {
479  this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
480  dst, dstStartY, dstEndY);
481  }
482 }
483 
// Force template instantiation.
// The member functions above are defined in this source file, so the scaler
// is explicitly instantiated here for every pixel width enabled in the build.
#if HAVE_16BPP
template class Simple2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Simple2xScaler<uint32_t>;
#endif
491 
492 } // namespace openmsx
Interface for getting lines from a video frame.
Definition: FrameSource.hh:16
Pixel getLineColor(unsigned line) const
Get the (single) color of the given line.
Definition: FrameSource.hh:75
virtual unsigned getLineWidth(unsigned line) const =0
Gets the number of display pixels on the given line.
Class containing all settings for renderers.
Base class for 2x scalers.
Definition: Scaler2.hh:12
virtual unsigned getHeight() const =0
virtual void fillLine(unsigned y, Pixel color)=0
Scaler which assigns the color of the original pixel to all pixels in the 2x2 square.
Simple2xScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implements 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:118
auto copy(InputRange &&range, OutputIter out)
Definition: ranges.hh:179
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44