openMSX
Simple2xScaler.cc
Go to the documentation of this file.
1 #include "Simple2xScaler.hh"
3 #include "LineScalers.hh"
4 #include "RawFrame.hh"
5 #include "ScalerOutput.hh"
6 #include "RenderSettings.hh"
7 #include "unreachable.hh"
8 #include "vla.hh"
9 #include <cassert>
10 #include <cstddef>
11 #include <cstdint>
12 #ifdef __SSE2__
13 #include <emmintrin.h>
14 #endif
15 
16 namespace openmsx {
17 
18 // class Simple2xScaler
19 
20 template<typename Pixel>
22  const PixelOperations<Pixel>& pixelOps_,
23  RenderSettings& renderSettings)
24  : Scaler2<Pixel>(pixelOps_)
25  , settings(renderSettings)
26  , pixelOps(pixelOps_)
27  , mult1(pixelOps)
28  , mult2(pixelOps)
29  , mult3(pixelOps)
30  , scanline(pixelOps)
31 {
32 }
33 
34 template<typename Pixel>
36  FrameSource& src, unsigned srcStartY, unsigned srcEndY,
37  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
38 {
39  int scanlineFactor = settings.getScanlineFactor();
40 
41  unsigned dstHeight = dst.getHeight();
42  unsigned stopDstY = (dstEndY == dstHeight)
43  ? dstEndY : dstEndY - 2;
44  unsigned srcY = srcStartY, dstY = dstStartY;
45  for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
46  auto color0 = src.getLineColor<Pixel>(srcY);
47  dst.fillLine(dstY + 0, color0);
48  Pixel color1 = scanline.darken(color0, scanlineFactor);
49  dst.fillLine(dstY + 1, color1);
50  }
51  if (dstY != dstHeight) {
52  unsigned nextLineWidth = src.getLineWidth(srcY + 1);
53  assert(src.getLineWidth(srcY) == 1);
54  assert(nextLineWidth != 1);
55  this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
56  dst, dstY, dstEndY);
57  }
58 }
59 
60 #ifdef __SSE2__
61 
// Builds a vector whose low 64 bits are the upper half of 'x' and whose
// high 64 bits are the lower half of 'y'.
static inline __m128i shuffle(__m128i x, __m128i y)
{
	// _mm_shuffle_pd() is formally an operation on 64-bit floating point
	// lanes, while we want to move integer data. Both kinds of value live
	// in the same xmmN registers, so the instruction produces the right
	// bits. The casts below emit no instructions, but (some?) x86 CPUs
	// keep the float and integer views of a register in different physical
	// locations, so switching between the two views can still carry some
	// overhead on those CPUs.
	const __m128d xAsDouble = _mm_castsi128_pd(x);
	const __m128d yAsDouble = _mm_castsi128_pd(y);
	const __m128d combined  = _mm_shuffle_pd(xAsDouble, yAsDouble, 1);
	return _mm_castpd_si128(combined);
}
76 
77 // 32bpp
78 static void blur1on2_SSE2(
79  const uint32_t* __restrict in_, uint32_t* __restrict out_,
80  unsigned c1_, unsigned c2_, size_t width)
81 {
82  width *= sizeof(uint32_t); // in bytes
83  assert(width >= (2 * sizeof(__m128i)));
84  assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
85  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
86 
87  ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
88  const auto* in = reinterpret_cast<const char*>(in_ ) - x;
89  auto* out = reinterpret_cast< char*>(out_) - 2 * x;
90 
91  // Setup first iteration
92  __m128i c1 = _mm_set1_epi16(c1_);
93  __m128i c2 = _mm_set1_epi16(c2_);
94  __m128i zero = _mm_setzero_si128();
95 
96  __m128i abcd = *reinterpret_cast<const __m128i*>(in);
97  __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
98  __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
99  __m128i d1a1 = _mm_mullo_epi16(c1, d0a0);
100 
101  // Each iteration reads 4 pixels and generates 8 pixels
102  do {
103  // At the start of each iteration these variables are live:
104  // abcd, a0b0, d1a1
105  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
106  __m128i b0c0 = shuffle(a0b0, c0d0);
107  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
108  __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
109  __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
110  __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
111  __m128i abab = _mm_packus_epi16(daab, abbc);
112  *reinterpret_cast<__m128i*>(out + 2 * x) =
113  _mm_shuffle_epi32(abab, 0xd8);
114  abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
115  a0b0 = _mm_unpacklo_epi8(abcd, zero);
116  __m128i d0a0_= shuffle(c0d0, a0b0);
117  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
118  d1a1 = _mm_mullo_epi16(c1, d0a0_);
119  __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
120  __m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
121  __m128i cdcd = _mm_packus_epi16(bccd, cdda);
122  *reinterpret_cast<__m128i*>(out + 2 * x + 16) =
123  _mm_shuffle_epi32(cdcd, 0xd8);
124  x += 16;
125  } while (x < 0);
126 
127  // Last iteration (because this doesn't need to read new input)
128  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
129  __m128i b0c0 = shuffle(a0b0, c0d0);
130  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
131  __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
132  __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
133  __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
134  __m128i abab = _mm_packus_epi16(daab, abbc);
135  *reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
136  __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
137  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
138  __m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
139  __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
140  __m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
141  __m128i cdcd = _mm_packus_epi16(bccd, cddd);
142  *reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
143 }
144 
145 #endif
146 
147 template<typename Pixel>
148 void Simple2xScaler<Pixel>::blur1on2(
149  const Pixel* __restrict pIn, Pixel* __restrict pOut,
150  unsigned alpha, size_t srcWidth)
151 {
152  /* This routine is functionally equivalent to the following:
153  *
154  * void blur1on2(const Pixel* pIn, Pixel* pOut, unsigned alpha)
155  * {
156  * unsigned c1 = alpha / 4;
157  * unsigned c2 = 256 - c1;
158  *
159  * Pixel prev, curr, next;
160  * prev = curr = pIn[0];
161  *
162  * unsigned x = 0;
163  * for (; x < (srcWidth - 1); ++x) {
164  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
165  * Pixel next = pIn[x + 1];
166  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
167  * prev = curr;
168  * curr = next;
169  * }
170  *
171  * pOut[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
172  * next = curr;
173  * pOut[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
174  * }
175  */
176 
177  if (alpha == 0) {
178  Scale_1on2<Pixel> scale;
179  scale(pIn, pOut, 2 * srcWidth);
180  return;
181  }
182 
183  assert(alpha <= 256);
184  unsigned c1 = alpha / 4;
185  unsigned c2 = 256 - c1;
186 
187 #ifdef __SSE2__
188  if constexpr (sizeof(Pixel) == 4) {
189  // SSE2, only 32bpp
190  blur1on2_SSE2(pIn, pOut, c1, c2, srcWidth);
191  return;
192  }
193 #endif
194  // C++ routine, both 16bpp and 32bpp.
195  // The loop is 2x unrolled and all common subexpressions and redundant
196  // assignments have been eliminated. 1 iteration generates 4 pixels.
197  mult1.setFactor32(c1);
198  mult2.setFactor32(c2);
199 
200  Pixel p0 = pIn[0];
201  Pixel p1;
202  unsigned f0 = mult1.mul32(p0);
203  unsigned f1 = f0;
204  unsigned tmp;
205 
206  size_t x = 0;
207  for (; x < (srcWidth - 2); x += 2) {
208  tmp = mult2.mul32(p0);
209  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
210 
211  p1 = pIn[x + 1];
212  f1 = mult1.mul32(p1);
213  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
214 
215  tmp = mult2.mul32(p1);
216  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
217 
218  p0 = pIn[x + 2];
219  f0 = mult1.mul32(p0);
220  pOut[2 * x + 3] = mult1.conv32(f0 + tmp);
221  }
222 
223  tmp = mult2.mul32(p0);
224  pOut[2 * x + 0] = mult1.conv32(f1 + tmp);
225 
226  p1 = pIn[x + 1];
227  f1 = mult1.mul32(p1);
228  pOut[2 * x + 1] = mult1.conv32(f1 + tmp);
229 
230  tmp = mult2.mul32(p1);
231  pOut[2 * x + 2] = mult1.conv32(f0 + tmp);
232 
233  pOut[2 * x + 3] = p1;
234 }
235 
236 #ifdef __SSE2__
237 
238 // 32bpp
239 static void blur1on1_SSE2(
240  const uint32_t* __restrict in_, uint32_t* __restrict out_,
241  unsigned c1_, unsigned c2_, size_t width)
242 {
243  width *= sizeof(uint32_t); // in bytes
244  assert(width >= (2 * sizeof(__m128i)));
245  assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
246  assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
247 
248  ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
249  const auto* in = reinterpret_cast<const char*>(in_ ) - x;
250  auto* out = reinterpret_cast< char*>(out_) - x;
251 
252  // Setup first iteration
253  __m128i c1 = _mm_set1_epi16(c1_);
254  __m128i c2 = _mm_set1_epi16(c2_);
255  __m128i zero = _mm_setzero_si128();
256 
257  __m128i abcd = *reinterpret_cast<const __m128i*>(in);
258  __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
259  __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
260 
261  // Each iteration reads 4 pixels and generates 4 pixels
262  do {
263  // At the start of each iteration these variables are live:
264  // abcd, a0b0, d0a0
265  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
266  __m128i b0c0 = shuffle(a0b0, c0d0);
267  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
268  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
269  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
270  abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
271  a0b0 = _mm_unpacklo_epi8(abcd, zero);
272  d0a0 = shuffle(c0d0, a0b0);
273  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
274  __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
275  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
276  *reinterpret_cast<__m128i*>(out + x) =
277  _mm_packus_epi16(aabb, ccdd);
278  x += 16;
279  } while (x < 0);
280 
281  // Last iteration (because this doesn't need to read new input)
282  __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
283  __m128i b0c0 = shuffle(a0b0, c0d0);
284  __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
285  __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
286  __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
287  __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
288  __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
289  __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
290  __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
291  *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
292 }
293 
294 #endif
295 template<typename Pixel>
296 void Simple2xScaler<Pixel>::blur1on1(
297  const Pixel* __restrict pIn, Pixel* __restrict pOut,
298  unsigned alpha, size_t srcWidth)
299 {
300  /* This routine is functionally equivalent to the following:
301  *
302  * void blur1on1(const Pixel* pIn, Pixel* pOut, unsigned alpha)
303  * {
304  * unsigned c1 = alpha / 4;
305  * unsigned c2 = 256 - alpha / 2;
306  *
307  * Pixel prev, curr, next;
308  * prev = curr = pIn[0];
309  *
310  * unsigned x = 0;
311  * for (; x < (srcWidth - 1); ++x) {
312  * next = pIn[x + 1];
313  * pOut[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
314  * prev = curr;
315  * curr = next;
316  * }
317  *
318  * next = curr;
319  * pOut[x] = c1 * prev + c2 * curr + c1 * next;
320  * }
321  */
322 
323  if (alpha == 0) {
324  Scale_1on1<Pixel> copy;
325  copy(pIn, pOut, srcWidth);
326  return;
327  }
328 
329  unsigned c1 = alpha / 4;
330  unsigned c2 = 256 - alpha / 2;
331 
332 #ifdef __SSE2__
333  if constexpr (sizeof(Pixel) == 4) {
334  // SSE2, only 32bpp
335  blur1on1_SSE2(pIn, pOut, c1, c2, srcWidth);
336  return;
337  }
338 #endif
339  // C++ routine, both 16bpp and 32bpp.
340  // The loop is 2x unrolled and all common subexpressions and redundant
341  // assignments have been eliminated. 1 iteration generates 2 pixels.
342  mult1.setFactor32(c1);
343  mult3.setFactor32(c2);
344 
345  Pixel p0 = pIn[0];
346  Pixel p1;
347  unsigned f0 = mult1.mul32(p0);
348  unsigned f1 = f0;
349 
350  size_t x = 0;
351  for (; x < (srcWidth - 2); x += 2) {
352  p1 = pIn[x + 1];
353  unsigned t0 = mult1.mul32(p1);
354  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
355  f0 = t0;
356 
357  p0 = pIn[x + 2];
358  unsigned t1 = mult1.mul32(p0);
359  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
360  f1 = t1;
361  }
362 
363  p1 = pIn[x + 1];
364  unsigned t0 = mult1.mul32(p1);
365  pOut[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
366 
367  pOut[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
368 }
369 
370 template<typename Pixel>
371 void Simple2xScaler<Pixel>::drawScanline(
372  const Pixel* in1, const Pixel* in2, Pixel* out, int factor,
373  unsigned dstWidth)
374 {
375  if (factor != 255) {
376  scanline.draw(in1, in2, out, factor, dstWidth);
377  } else {
378  Scale_1on1<Pixel> scale;
379  scale(in1, out, dstWidth);
380  }
381 }
382 
383 template<typename Pixel>
384 void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
385  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
386  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
387 {
388  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
389  int blur = settings.getBlurFactor();
390  int scanlineFactor = settings.getScanlineFactor();
391 
392  unsigned dstY = dstStartY;
393  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
394  auto* dstLine0 = dst.acquireLine(dstY + 0);
395  blur1on2(srcLine, dstLine0, blur, srcWidth);
396 
397  for (; dstY < dstEndY - 2; dstY += 2) {
398  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
399  auto* dstLine2 = dst.acquireLine(dstY + 2);
400  blur1on2(srcLine, dstLine2, blur, srcWidth);
401 
402  auto* dstLine1 = dst.acquireLine(dstY + 1);
403  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
404  2 * srcWidth);
405 
406  dst.releaseLine(dstY + 0, dstLine0);
407  dst.releaseLine(dstY + 1, dstLine1);
408  dstLine0 = dstLine2;
409  }
410 
411  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
412  VLA_SSE_ALIGNED(Pixel, buf2, 2 * srcWidth);
413  blur1on2(srcLine, buf2, blur, srcWidth);
414 
415  auto* dstLine1 = dst.acquireLine(dstY + 1);
416  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, 2 * srcWidth);
417  dst.releaseLine(dstY + 0, dstLine0);
418  dst.releaseLine(dstY + 1, dstLine1);
419 }
420 
421 template<typename Pixel>
422 void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
423  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
424  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
425 {
426  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
427  int blur = settings.getBlurFactor();
428  int scanlineFactor = settings.getScanlineFactor();
429 
430  unsigned dstY = dstStartY;
431  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
432  auto* dstLine0 = dst.acquireLine(dstY);
433  blur1on1(srcLine, dstLine0, blur, srcWidth);
434 
435  for (; dstY < dstEndY - 2; dstY += 2) {
436  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
437  auto* dstLine2 = dst.acquireLine(dstY + 2);
438  blur1on1(srcLine, dstLine2, blur, srcWidth);
439 
440  auto* dstLine1 = dst.acquireLine(dstY + 1);
441  drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor,
442  srcWidth);
443 
444  dst.releaseLine(dstY + 0, dstLine0);
445  dst.releaseLine(dstY + 1, dstLine1);
446  dstLine0 = dstLine2;
447  }
448 
449  srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
450  VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
451  blur1on1(srcLine, buf2, blur, srcWidth);
452 
453  auto* dstLine1 = dst.acquireLine(dstY + 1);
454  drawScanline(dstLine0, buf2, dstLine1, scanlineFactor, srcWidth);
455  dst.releaseLine(dstY + 0, dstLine0);
456  dst.releaseLine(dstY + 1, dstLine1);
457 }
458 
459 template<typename Pixel>
460 void Simple2xScaler<Pixel>::scaleImage(
461  FrameSource& src, const RawFrame* superImpose,
462  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
463  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
464 {
465  if (superImpose) {
466  // Note: this implementation is different from the openGL
467  // version. Here we first alpha-blend and then scale, so the
468  // video layer will also get blurred (and possibly down-scaled
469  // to MSX resolution). The openGL version will only blur the
470  // MSX frame, then blend with the video frame and then apply
471  // scanlines. I think the openGL version is visually slightly
472  // better, but much more work to implement in software (in
473  // openGL shaders it's very easy). Maybe we can improve this
474  // later (if required at all).
475  SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
476  srcWidth = sf.getLineWidth(srcStartY);
477  this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
478  dst, dstStartY, dstEndY);
479  } else {
480  this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
481  dst, dstStartY, dstEndY);
482  }
483 }
484 
// Explicit instantiations: the member functions above are defined in this
// translation unit only, so the class is instantiated here for each pixel
// width that is enabled in the build configuration.
#if HAVE_16BPP
template class Simple2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Simple2xScaler<uint32_t>;
#endif
492 
493 } // namespace openmsx
Interface for getting lines from a video frame.
Definition: FrameSource.hh:15
Pixel getLineColor(unsigned line) const
Get the (single) color of the given line.
Definition: FrameSource.hh:74
virtual unsigned getLineWidth(unsigned line) const =0
Gets the number of display pixels on the given line.
Class containing all settings for renderers.
Base class for 2x scalers.
Definition: Scaler2.hh:12
virtual unsigned getHeight() const =0
virtual void fillLine(unsigned y, Pixel color)=0
Scaler which assigns the color of the original pixel to all pixels in the 2x2 square.
Simple2xScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:5
uint32_t Pixel
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:124
auto copy(InputRange &&range, OutputIter out)
Definition: ranges.hh:149
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44