openMSX
Simple3xScaler.cc
Go to the documentation of this file.
1 #include "Simple3xScaler.hh"
3 #include "LineScalers.hh"
4 #include "RawFrame.hh"
5 #include "ScalerOutput.hh"
6 #include "RenderSettings.hh"
7 #include "vla.hh"
8 #include <cstdint>
9 #ifdef __SSE2__
10 #include <emmintrin.h>
11 #endif
12 
13 namespace openmsx {
14 
15 template<typename Pixel>
17  const PixelOperations<Pixel>& pixelOps_,
18  const RenderSettings& settings_)
19  : Scaler3<Pixel>(pixelOps_)
20  , pixelOps(pixelOps_)
21  , scanline(pixelOps_)
22  , blur_1on3(pixelOps_)
23  , settings(settings_)
24 {
25 }
26 
// NOTE(review): listing line 28, i.e. the declaration this template header
// introduces, is missing from this extract. It is presumably another member
// of Simple3xScaler<Pixel>; restore it from the original source file.
27 template<typename Pixel>
29 
// Vertical 1 -> 3 scaling: every source line produces three destination
// lines (the scaled line, a copy of it, and a scanline row drawn from this
// line and the next one).
//
// NOTE(review): listing lines 31, 34 and 44 are missing from this extract.
// Other functions in this file call
//   doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op)
// so this is presumably that definition, with the horizontal line scaler
// 'scale' as its last parameter (compare doScale2). The declaration of the
// 'copy' callable used below is absent as well -- confirm all three against
// the original source.
30 template<typename Pixel>
32  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
33  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY,
35 {
// Scratch buffer that getLinePtr() may use to hand back a source line.
36  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
37  int scanlineFactor = settings.getScanlineFactor();
38  unsigned dstWidth = dst.getWidth();
39  unsigned y = dstStartY;
// Prologue: scale the first source line into row y+0 ...
40  auto* srcLine = src.getLinePtr(srcStartY++, srcWidth, buf);
41  auto* dstLine0 = dst.acquireLine(y + 0);
42  scale(srcLine, dstLine0, dstWidth);
43 
// ... and duplicate it into row y+1.
45  auto* dstLine1 = dst.acquireLine(y + 1);
46  copy(dstLine0, dstLine1, dstWidth);
47 
// Steady state: rows y+0 and y+1 are already filled. Produce the next
// source line in rows y+3 / y+4 first, because row y+2 is drawn from the
// rows above (dstLine0) and below (dstLine3) it.
48  for (/* */; (y + 4) < dstEndY; y += 3, srcStartY += 1) {
49  srcLine = src.getLinePtr(srcStartY, srcWidth, buf);
50  auto* dstLine3 = dst.acquireLine(y + 3);
51  scale(srcLine, dstLine3, dstWidth);
52 
53  auto* dstLine4 = dst.acquireLine(y + 4);
54  copy(dstLine3, dstLine4, dstWidth);
55 
56  auto* dstLine2 = dst.acquireLine(y + 2);
57  scanline.draw(dstLine0, dstLine3, dstLine2,
58  scanlineFactor, dstWidth);
59 
60  dst.releaseLine(y + 0, dstLine0);
61  dst.releaseLine(y + 1, dstLine1);
62  dst.releaseLine(y + 2, dstLine2);
// The freshly produced rows become rows 0/1 of the next iteration.
63  dstLine0 = dstLine3;
64  dstLine1 = dstLine4;
65  }
// Epilogue: the following source line is scaled into a temporary buffer
// (not into dst); it is only needed as the lower input of the last
// scanline row y+2.
66  srcLine = src.getLinePtr(srcStartY, srcWidth, buf);
67  VLA_SSE_ALIGNED(Pixel, buf2, dstWidth);
68  scale(srcLine, buf2, dstWidth);
69 
70  auto* dstLine2 = dst.acquireLine(y + 2);
71  scanline.draw(dstLine0, buf2, dstLine2, scanlineFactor, dstWidth);
72  dst.releaseLine(y + 0, dstLine0);
73  dst.releaseLine(y + 1, dstLine1);
74  dst.releaseLine(y + 2, dstLine2);
75 }
76 
77 template<typename Pixel>
78 void Simple3xScaler<Pixel>::doScale2(FrameSource& src,
79  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
80  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY,
81  PolyLineScaler<Pixel>& scale)
82 {
83  VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
84  int scanlineFactor = settings.getScanlineFactor();
85  unsigned dstWidth = dst.getWidth();
86  for (unsigned srcY = srcStartY, dstY = dstStartY; dstY < dstEndY;
87  srcY += 2, dstY += 3) {
88  auto* srcLine0 = src.getLinePtr(srcY + 0, srcWidth, buf);
89  auto* dstLine0 = dst.acquireLine(dstY + 0);
90  scale(srcLine0, dstLine0, dstWidth);
91 
92  auto* srcLine1 = src.getLinePtr(srcY + 1, srcWidth, buf);
93  auto* dstLine2 = dst.acquireLine(dstY + 2);
94  scale(srcLine1, dstLine2, dstWidth);
95 
96  auto* dstLine1 = dst.acquireLine(dstY + 1);
97  scanline.draw(dstLine0, dstLine2, dstLine1,
98  scanlineFactor, dstWidth);
99 
100  dst.releaseLine(dstY + 0, dstLine0);
101  dst.releaseLine(dstY + 1, dstLine1);
102  dst.releaseLine(dstY + 2, dstLine2);
103  }
104 }
105 
106 template<typename Pixel>
107 void Simple3xScaler<Pixel>::scale2x1to9x3(FrameSource& src,
108  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
109  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
110 {
111  PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
112  doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
113 }
114 
115 template<typename Pixel>
116 void Simple3xScaler<Pixel>::scale2x2to9x3(FrameSource& src,
117  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
118  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
119 {
120  PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
121  doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
122 }
123 
124 template<typename Pixel>
125 void Simple3xScaler<Pixel>::scale1x1to3x3(FrameSource& src,
126  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
127  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
128 {
129  if (unsigned blur = settings.getBlurFactor() / 3) {
130  blur_1on3.setBlur(blur);
131  PolyScaleRef<Pixel, Blur_1on3<Pixel>> op(blur_1on3);
132  doScale1(src, srcStartY, srcEndY, srcWidth,
133  dst, dstStartY, dstEndY, op);
134  } else {
135  // No blurring: this is an optimization but it's also needed
136  // for correctness (otherwise there's an overflow in 0.16 fixed
137  // point arithmetic).
138  PolyScale<Pixel, Scale_1on3<Pixel>> op;
139  doScale1(src, srcStartY, srcEndY, srcWidth,
140  dst, dstStartY, dstEndY, op);
141  }
142 }
143 
144 template<typename Pixel>
145 void Simple3xScaler<Pixel>::scale1x2to3x3(FrameSource& src,
146  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
147  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
148 {
149  PolyScale<Pixel, Scale_1on3<Pixel>> op;
150  doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
151 }
152 
153 template<typename Pixel>
154 void Simple3xScaler<Pixel>::scale4x1to9x3(FrameSource& src,
155  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
156  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
157 {
158  PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
159  doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
160 }
161 
162 template<typename Pixel>
163 void Simple3xScaler<Pixel>::scale4x2to9x3(FrameSource& src,
164  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
165  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
166 {
167  PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
168  doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
169 }
170 
171 template<typename Pixel>
172 void Simple3xScaler<Pixel>::scale2x1to3x3(FrameSource& src,
173  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
174  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
175 {
176  PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
177  doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
178 }
179 
180 template<typename Pixel>
181 void Simple3xScaler<Pixel>::scale2x2to3x3(FrameSource& src,
182  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
183  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
184 {
185  PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
186  doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
187 }
188 
189 template<typename Pixel>
190 void Simple3xScaler<Pixel>::scale8x1to9x3(FrameSource& src,
191  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
192  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
193 {
194  PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
195  doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
196 }
197 
198 template<typename Pixel>
199 void Simple3xScaler<Pixel>::scale8x2to9x3(FrameSource& src,
200  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
201  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
202 {
203  PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
204  doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
205 }
206 
207 template<typename Pixel>
208 void Simple3xScaler<Pixel>::scale4x1to3x3(FrameSource& src,
209  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
210  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
211 {
212  PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
213  doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
214 }
215 
216 template<typename Pixel>
217 void Simple3xScaler<Pixel>::scale4x2to3x3(FrameSource& src,
218  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
219  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
220 {
221  PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
222  doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
223 }
224 
225 template<typename Pixel>
226 void Simple3xScaler<Pixel>::scaleBlank1to3(
227  FrameSource& src, unsigned srcStartY, unsigned srcEndY,
228  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
229 {
230  int scanlineFactor = settings.getScanlineFactor();
231 
232  unsigned dstHeight = dst.getHeight();
233  unsigned stopDstY = (dstEndY == dstHeight)
234  ? dstEndY : dstEndY - 3;
235  unsigned srcY = srcStartY, dstY = dstStartY;
236  for (/* */; dstY < stopDstY; srcY += 1, dstY += 3) {
237  auto color0 = src.getLineColor<Pixel>(srcY);
238  Pixel color1 = scanline.darken(color0, scanlineFactor);
239  dst.fillLine(dstY + 0, color0);
240  dst.fillLine(dstY + 1, color0);
241  dst.fillLine(dstY + 2, color1);
242  }
243  if (dstY != dstHeight) {
244  unsigned nextLineWidth = src.getLineWidth(srcY + 1);
245  assert(src.getLineWidth(srcY) == 1);
246  assert(nextLineWidth != 1);
247  this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
248  dst, dstY, dstEndY);
249  }
250 }
251 
252 template<typename Pixel>
253 void Simple3xScaler<Pixel>::scaleBlank2to3(
254  FrameSource& src, unsigned srcStartY, unsigned /*srcEndY*/,
255  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
256 {
257  int scanlineFactor = settings.getScanlineFactor();
258  for (unsigned srcY = srcStartY, dstY = dstStartY;
259  dstY < dstEndY; srcY += 2, dstY += 3) {
260  auto color0 = src.getLineColor<Pixel>(srcY + 0);
261  auto color1 = src.getLineColor<Pixel>(srcY + 1);
262  Pixel color01 = scanline.darken(color0, color1, scanlineFactor);
263  dst.fillLine(dstY + 0, color0);
264  dst.fillLine(dstY + 1, color01);
265  dst.fillLine(dstY + 2, color1);
266  }
267 }
268 
269 
270 // class Blur_1on3
271 
272 template<typename Pixel>
274  : mult0(pixelOps)
275  , mult1(pixelOps)
276  , mult2(pixelOps)
277  , mult3(pixelOps)
278 {
279 }
280 
// SSE2 implementation of the 1 -> 3 horizontal blur, 32-bit pixels only.
//   in_      : source pixels; must be 16-byte aligned (asserted)
//   out_     : destination; must be 16-byte aligned (asserted). The stores
//              below cover exactly 3 * srcWidth pixels.
//   srcWidth : number of source pixels; must be a multiple of 4 and at
//              least 8 (asserted)
// The filter strength is read from the member 'blur'.
// NOTE(review): shuffle<> is a helper that is not defined in this file.
281 #ifdef __SSE2__
282 template<typename Pixel>
283 void Blur_1on3<Pixel>::blur_SSE(const Pixel* in_, Pixel* out_, size_t srcWidth)
284 {
285  if constexpr (sizeof(Pixel) != 4) {
286  assert(false); return; // only 32-bpp
287  }
288 
289  assert((srcWidth % 4) == 0);
290  assert(srcWidth >= 8);
291  assert((size_t(in_ ) % 16) == 0);
292  assert((size_t(out_) % 16) == 0);
293 
// Filter weights as 16-bit fractions (0x10000 stands for 1.0). They are
// applied with _mm_mulhi_epu16, which keeps the upper 16 bits of every
// 16x16-bit product.
294  unsigned alpha = blur * 256;
295  unsigned c0 = alpha / 2;
296  unsigned c1 = alpha + c0;
297  unsigned c2 = 0x10000 - c1;
298  unsigned c3 = 0x10000 - alpha;
// Each constant holds one weight in its lower and another in its upper
// 64 bits; _mm_shuffle_epi32(.., 0x4E) swaps the two 64-bit halves.
299  __m128i C0C1 = _mm_set_epi16(c1, c1, c1, c1, c0, c0, c0, c0);
300  __m128i C1C0 = _mm_shuffle_epi32(C0C1, 0x4E);
301  __m128i C2C3 = _mm_set_epi16(c3, c3, c3, c3, c2, c2, c2, c2);
302  __m128i C3C2 = _mm_shuffle_epi32(C2C3, 0x4E);
303 
// 'in' / 'out' point at the last group of 4 source / 12 output pixels;
// 'x' is a negative byte offset relative to 'in' that counts up to 0
// (the matching output offset is 3 * x).
304  size_t tmp = srcWidth - 4;
305  const auto* in = reinterpret_cast<const char*>(in_ + tmp);
306  auto* out = reinterpret_cast< char*>(out_ + 3 * tmp);
307  auto x = -ptrdiff_t(tmp * sizeof(Pixel));
308 
309  __m128i ZERO = _mm_setzero_si128();
310 
311  // Prepare first iteration (duplicate left border pixel)
312  __m128i abcd = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x));
313  __m128i a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
314  __m128i a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
315  __m128i a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
316  __m128i d1d0 = _mm_shuffle_epi32(a0a1, 0x4E); // left border
317 
318  // At the start of each iteration the following vars are live:
319  // abcd, a_b_, a_a_, a0a1, d1d0
320  // Each iteration reads 4 and produces 12 pixels.
321  do {
322  // p01
323  __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
324  __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
325  __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
326  __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
327  __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
328  // p23
329  __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
330  __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
331  __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
332  __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
333  __m128i p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
334  __m128i p0123 = _mm_packus_epi16(p01, p23);
335  _mm_store_si128(reinterpret_cast<__m128i*>(out + 3 * x + 0),
336  p0123);
337 
338  // p45
339  __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
340  __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
341  __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
342  __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
343  __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
344  // p67
345  __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
346  __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
347  d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
348  __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
349  __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
350  __m128i p4567 = _mm_packus_epi16(p45, p67);
351  _mm_store_si128(reinterpret_cast<__m128i*>(out + 3 * x + 16),
352  p4567);
353 
354  // p89
355  __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
356  __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
357  __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
358  __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
359  __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
360  // pab
361  __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
// Load the next group of 4 source pixels now: its first pixel contributes
// to the last two output pixels of the current group.
362  abcd = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x + 16));
363  a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
364  a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
365  a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
366  __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
367  __m128i p89ab = _mm_packus_epi16(p89, pab);
368  _mm_store_si128(reinterpret_cast<__m128i*>(out + 3 * x + 32),
369  p89ab);
370 
371  x += 16;
372  } while (x < 0);
373 
// x is 0 here, so the remaining stores address 'out' directly. There is no
// further group to load; the right neighbour term is taken from d1d0.
374  // Last iteration (duplicate right border pixel)
375  // p01
376  __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
377  __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
378  __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
379  __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
380  __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
381  // p23
382  __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
383  __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
384  __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
385  __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
386  __m128i p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
387  __m128i p0123 = _mm_packus_epi16(p01, p23);
388  _mm_store_si128(reinterpret_cast<__m128i*>(out + 0),
389  p0123);
390 
391  // p45
392  __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
393  __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
394  __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
395  __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
396  __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
397  // p67
398  __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
399  __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
400  d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
401  __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
402  __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
403  __m128i p4567 = _mm_packus_epi16(p45, p67);
404  _mm_store_si128(reinterpret_cast<__m128i*>(out + 16),
405  p4567);
406 
407  // p89
408  __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
409  __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
410  __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
411  __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
412  __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
413  // pab
414  __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
415  a0a1 = _mm_shuffle_epi32(d1d0, 0x4E); // right border
416  __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
417  __m128i p89ab = _mm_packus_epi16(p89, pab);
418  _mm_store_si128(reinterpret_cast<__m128i*>(out + 32),
419  p89ab);
420 }
421 #endif
422 
423 template<typename Pixel>
425  const Pixel* __restrict in, Pixel* __restrict out,
426  size_t dstWidth)
427 {
428  /* The following code is equivalent to this loop. It is 2x unrolled
429  * and common subexpressions have been eliminated. The last iteration
430  * is also moved outside the for loop.
431  *
432  * unsigned c0 = blur / 2;
433  * unsigned c1 = c0 + blur;
434  * unsigned c2 = 256 - c1;
435  * unsigned c3 = 256 - 2 * c0;
436  * Pixel prev, curr, next;
437  * prev = curr = next = in[0];
438  * size_t srcWidth = dstWidth / 3;
439  * for (auto x : xrange(srcWidth)) {
440  * if (x != (srcWidth - 1)) next = in[x + 1];
441  * out[3 * x + 0] = mul(c1, prev) + mul(c2, curr);
442  * out[3 * x + 1] = mul(c0, prev) + mul(c3, curr) + mul(c0, next);
443  * out[3 * x + 2] = mul(c2, curr) + mul(c1, next);
444  * prev = curr;
445  * curr = next;
446  * }
447  */
448  size_t srcWidth = dstWidth / 3;
449 #ifdef __SSE2__
450  if constexpr (sizeof(Pixel) == 4) {
451  blur_SSE(in, out, srcWidth);
452  return;
453  }
454 #endif
455 
456  // C++ routine, both 16bpp and 32bpp
457  unsigned c0 = blur / 2;
458  unsigned c1 = blur + c0;
459  unsigned c2 = 256 - c1;
460  unsigned c3 = 256 - 2 * c0;
461  mult0.setFactor32(c0);
462  mult1.setFactor32(c1);
463  mult2.setFactor32(c2);
464  mult3.setFactor32(c3);
465 
466  Pixel p0 = in[0];
467  Pixel p1;
468  uint32_t f0 = mult0.mul32(p0);
469  uint32_t f1 = mult1.mul32(p0);
470  uint32_t g0 = f0;
471  uint32_t g1 = f1;
472 
473  size_t x = 0;
474  for (; x < (srcWidth - 2); x += 2) {
475  uint32_t g2 = mult2.mul32(p0);
476  out[3 * x + 0] = mult0.conv32(g2 + f1);
477  p1 = in[x + 1];
478  uint32_t t0 = mult0.mul32(p1);
479  out[3 * x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
480  f0 = t0;
481  f1 = mult1.mul32(p1);
482  out[3 * x + 2] = mult0.conv32(g2 + f1);
483 
484  uint32_t f2 = mult2.mul32(p1);
485  out[3 * x + 3] = mult0.conv32(f2 + g1);
486  p0 = in[x + 2];
487  uint32_t t1 = mult0.mul32(p0);
488  out[3 * x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + t1);
489  g0 = t1;
490  g1 = mult1.mul32(p0);
491  out[3 * x + 5] = mult0.conv32(g1 + f2);
492  }
493  uint32_t g2 = mult2.mul32(p0);
494  out[3 * x + 0] = mult0.conv32(g2 + f1);
495  p1 = in[x + 1];
496  uint32_t t0 = mult0.mul32(p1);
497  out[3 * x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
498  f0 = t0;
499  f1 = mult1.mul32(p1);
500  out[3 * x + 2] = mult0.conv32(g2 + f1);
501 
502  uint32_t f2 = mult2.mul32(p1);
503  out[3 * x + 3] = mult0.conv32(f2 + g1);
504  out[3 * x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + f0);
505  out[3 * x + 5] = p1;
506 }
507 
// Scale a range of lines from 'src' into 'dst', optionally blending a
// superimposed frame into the source first.
//
// NOTE(review): the line that names this function (listing line 509) is
// missing from this extract. It is presumably the scaleImage() member of
// Simple3xScaler<Pixel> -- confirm against the original source.
508 template<typename Pixel>
510  FrameSource& src, const RawFrame* superImpose,
511  unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
512  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
513 {
514  if (superImpose) {
// Wrap the source together with the superimposed frame; the line width is
// then re-read from the wrapper rather than taken from the caller.
515  SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
516  srcWidth = sf.getLineWidth(srcStartY);
517  this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
518  dst, dstStartY, dstEndY);
519  } else {
// No superimposed frame: scale the source as given.
520  this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
521  dst, dstStartY, dstEndY);
522  }
523 }
524 
// The member templates above are defined in this .cc file, so the class is
// explicitly instantiated here, once per pixel type enabled through the
// HAVE_16BPP / HAVE_32BPP build macros.
525 // Force template instantiation.
526 #if HAVE_16BPP
527 template class Simple3xScaler<uint16_t>;
528 #endif
529 #if HAVE_32BPP
530 template class Simple3xScaler<uint32_t>;
531 #endif
532 
533 } // namespace openmsx
void operator()(const Pixel *in, Pixel *out, size_t dstWidth)
Blur_1on3(const PixelOperations< Pixel > &pixelOps)
Interface for getting lines from a video frame.
Definition: FrameSource.hh:16
const Pixel * getLinePtr(int line, unsigned width, Pixel *buf) const
Gets a pointer to the pixels of the given line number.
Definition: FrameSource.hh:92
Polymorphic line scaler.
Definition: LineScalers.hh:285
A video frame as output by the VDP scanline conversion unit, before any postprocessing filters are applied.
Definition: RawFrame.hh:14
Class containing all settings for renderers.
Base class for 3x scalers.
Definition: Scaler3.hh:12
virtual Pixel * acquireLine(unsigned y)=0
virtual unsigned getWidth() const =0
virtual void releaseLine(unsigned y, Pixel *buf)=0
Simple3xScaler(const PixelOperations< Pixel > &pixelOps, const RenderSettings &settings)
This class represents a frame that is the (per-pixel) alpha-blend of a (laser-disc) video frame and a...
mat23 p23(vec2(2, 3), vec2(4, 5), vec2(0, 7))
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implements 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:118
auto copy(InputRange &&range, OutputIter out)
Definition: ranges.hh:179
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44