openMSX
Simple3xScaler.cc
Go to the documentation of this file.
1#include "Simple3xScaler.hh"
3#include "LineScalers.hh"
4#include "RawFrame.hh"
5#include "ScalerOutput.hh"
6#include "RenderSettings.hh"
7#include "narrow.hh"
8#include "vla.hh"
9#include <cstdint>
10#ifdef __SSE2__
11#include <emmintrin.h>
12#endif
13
14namespace openmsx {
15
// Constructor: passes pixelOps_ on to the Scaler3<Pixel> base and uses it to
// initialise the pixelOps, scanline and blur_1on3 members; `settings` is
// initialised from settings_. The body itself is empty.
// NOTE(review): listing line 17 (the declarator that opens this parameter
// list) is missing from this extract -- restore it from the real source file.
16template<std::unsigned_integral Pixel>
18 const PixelOperations<Pixel>& pixelOps_,
19 const RenderSettings& settings_)
20 : Scaler3<Pixel>(pixelOps_)
21 , pixelOps(pixelOps_)
22 , scanline(pixelOps_)
23 , blur_1on3(pixelOps_)
24 , settings(settings_)
25{
26}
27
28template<std::unsigned_integral Pixel>
30
// Vertical 1 -> 3 pass: every source line is horizontally scaled once and
// yields three output rows -- the scaled line, a copy of it, and a third row
// produced by scanline.draw() from this scaled line and the next one.
// NOTE(review): listing lines 32, 35 and 45 are missing from this extract.
// Judging by the call sites (doScale1(src, ..., op)) they presumably hold the
// function declarator, the trailing `scale` parameter and the declaration of
// `copy` -- confirm against the real source file.
31template<std::unsigned_integral Pixel>
33 unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
34 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY,
36{
// Scratch storage handed to src.getLine() for every source line.
37 VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
38 int scanlineFactor = settings.getScanlineFactor();
39 unsigned dstWidth = dst.getWidth();
40 unsigned y = dstStartY;
// Prime the pipeline: scale the first source line into row y and duplicate
// it into row y + 1.
41 auto srcLine = src.getLine(srcStartY++, buf);
42 auto dstLine0 = dst.acquireLine(y + 0);
43 scale(srcLine, dstLine0);
44
46 auto dstLine1 = dst.acquireLine(y + 1);
47 copy(dstLine0, dstLine1);
48
// Each iteration scales the *next* source line straight into rows y + 3 and
// y + 4, which is why the loop only runs while row y + 4 is still inside
// the destination range.
49 for (/* */; (y + 4) < dstEndY; y += 3, srcStartY += 1) {
50 srcLine = src.getLine(srcStartY, buf);
51 auto dstLine3 = dst.acquireLine(y + 3);
52 scale(srcLine, dstLine3);
53
54 auto dstLine4 = dst.acquireLine(y + 4);
55 copy(dstLine3, dstLine4);
56
// Row y + 2 is drawn from the current and the next scaled line.
57 auto dstLine2 = dst.acquireLine(y + 2);
58 scanline.draw(dstLine0, dstLine3,
59 dstLine2, scanlineFactor);
60
61 dst.releaseLine(y + 0, dstLine0);
62 dst.releaseLine(y + 1, dstLine1);
63 dst.releaseLine(y + 2, dstLine2);
// The rows just filled for the next line become the current pair.
64 dstLine0 = dstLine3;
65 dstLine1 = dstLine4;
66 }
// Final group: the following source line is scaled into a temporary buffer
// instead of an output row; it only serves as the second input of the last
// scanline row.
67 srcLine = src.getLine(srcStartY, buf);
68 VLA_SSE_ALIGNED(Pixel, buf2, dstWidth);
69 scale(srcLine, buf2);
70
71 auto dstLine2 = dst.acquireLine(y + 2);
72 scanline.draw(dstLine0, buf2, dstLine2, scanlineFactor);
73 dst.releaseLine(y + 0, dstLine0);
74 dst.releaseLine(y + 1, dstLine1);
75 dst.releaseLine(y + 2, dstLine2);
76}
77
78template<std::unsigned_integral Pixel>
79void Simple3xScaler<Pixel>::doScale2(FrameSource& src,
80 unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
81 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY,
82 PolyLineScaler<Pixel>& scale)
83{
84 VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
85 int scanlineFactor = settings.getScanlineFactor();
86 for (unsigned srcY = srcStartY, dstY = dstStartY; dstY < dstEndY;
87 srcY += 2, dstY += 3) {
88 auto srcLine0 = src.getLine(srcY + 0, buf);
89 auto dstLine0 = dst.acquireLine(dstY + 0);
90 scale(srcLine0, dstLine0);
91
92 auto srcLine1 = src.getLine(srcY + 1, buf);
93 auto dstLine2 = dst.acquireLine(dstY + 2);
94 scale(srcLine1, dstLine2);
95
96 auto dstLine1 = dst.acquireLine(dstY + 1);
97 scanline.draw(dstLine0, dstLine2, dstLine1,
98 scanlineFactor);
99
100 dst.releaseLine(dstY + 0, dstLine0);
101 dst.releaseLine(dstY + 1, dstLine1);
102 dst.releaseLine(dstY + 2, dstLine2);
103 }
104}
105
106template<std::unsigned_integral Pixel>
107void Simple3xScaler<Pixel>::scale2x1to9x3(FrameSource& src,
108 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
109 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
110{
111 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
112 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
113}
114
115template<std::unsigned_integral Pixel>
116void Simple3xScaler<Pixel>::scale2x2to9x3(FrameSource& src,
117 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
118 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
119{
120 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
121 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
122}
123
124template<std::unsigned_integral Pixel>
125void Simple3xScaler<Pixel>::scale1x1to3x3(FrameSource& src,
126 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
127 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
128{
129 if (unsigned blur = settings.getBlurFactor() / 3) {
130 blur_1on3.setBlur(blur);
131 PolyScaleRef<Pixel, Blur_1on3<Pixel>> op(blur_1on3);
132 doScale1(src, srcStartY, srcEndY, srcWidth,
133 dst, dstStartY, dstEndY, op);
134 } else {
135 // No blurring: this is an optimization but it's also needed
136 // for correctness (otherwise there's an overflow in 0.16 fixed
137 // point arithmetic).
138 PolyScale<Pixel, Scale_1on3<Pixel>> op;
139 doScale1(src, srcStartY, srcEndY, srcWidth,
140 dst, dstStartY, dstEndY, op);
141 }
142}
143
144template<std::unsigned_integral Pixel>
145void Simple3xScaler<Pixel>::scale1x2to3x3(FrameSource& src,
146 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
147 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
148{
149 PolyScale<Pixel, Scale_1on3<Pixel>> op;
150 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
151}
152
153template<std::unsigned_integral Pixel>
154void Simple3xScaler<Pixel>::scale4x1to9x3(FrameSource& src,
155 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
156 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
157{
158 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
159 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
160}
161
162template<std::unsigned_integral Pixel>
163void Simple3xScaler<Pixel>::scale4x2to9x3(FrameSource& src,
164 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
165 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
166{
167 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
168 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
169}
170
171template<std::unsigned_integral Pixel>
172void Simple3xScaler<Pixel>::scale2x1to3x3(FrameSource& src,
173 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
174 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
175{
176 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
177 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
178}
179
180template<std::unsigned_integral Pixel>
181void Simple3xScaler<Pixel>::scale2x2to3x3(FrameSource& src,
182 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
183 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
184{
185 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
186 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
187}
188
189template<std::unsigned_integral Pixel>
190void Simple3xScaler<Pixel>::scale8x1to9x3(FrameSource& src,
191 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
192 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
193{
194 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
195 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
196}
197
198template<std::unsigned_integral Pixel>
199void Simple3xScaler<Pixel>::scale8x2to9x3(FrameSource& src,
200 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
201 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
202{
203 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
204 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
205}
206
207template<std::unsigned_integral Pixel>
208void Simple3xScaler<Pixel>::scale4x1to3x3(FrameSource& src,
209 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
210 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
211{
212 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
213 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
214}
215
216template<std::unsigned_integral Pixel>
217void Simple3xScaler<Pixel>::scale4x2to3x3(FrameSource& src,
218 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
219 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
220{
221 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
222 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
223}
224
225template<std::unsigned_integral Pixel>
226void Simple3xScaler<Pixel>::scaleBlank1to3(
227 FrameSource& src, unsigned srcStartY, unsigned srcEndY,
228 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
229{
230 int scanlineFactor = settings.getScanlineFactor();
231
232 unsigned dstHeight = dst.getHeight();
233 unsigned stopDstY = (dstEndY == dstHeight)
234 ? dstEndY : dstEndY - 3;
235 unsigned srcY = srcStartY, dstY = dstStartY;
236 for (/* */; dstY < stopDstY; srcY += 1, dstY += 3) {
237 auto color0 = src.getLineColor<Pixel>(srcY);
238 Pixel color1 = scanline.darken(color0, scanlineFactor);
239 dst.fillLine(dstY + 0, color0);
240 dst.fillLine(dstY + 1, color0);
241 dst.fillLine(dstY + 2, color1);
242 }
243 if (dstY != dstHeight) {
244 unsigned nextLineWidth = src.getLineWidth(srcY + 1);
245 assert(src.getLineWidth(srcY) == 1);
246 assert(nextLineWidth != 1);
247 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
248 dst, dstY, dstEndY);
249 }
250}
251
252template<std::unsigned_integral Pixel>
253void Simple3xScaler<Pixel>::scaleBlank2to3(
254 FrameSource& src, unsigned srcStartY, unsigned /*srcEndY*/,
255 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
256{
257 int scanlineFactor = settings.getScanlineFactor();
258 for (unsigned srcY = srcStartY, dstY = dstStartY;
259 dstY < dstEndY; srcY += 2, dstY += 3) {
260 auto color0 = src.getLineColor<Pixel>(srcY + 0);
261 auto color1 = src.getLineColor<Pixel>(srcY + 1);
262 Pixel color01 = scanline.darken(color0, color1, scanlineFactor);
263 dst.fillLine(dstY + 0, color0);
264 dst.fillLine(dstY + 1, color01);
265 dst.fillLine(dstY + 2, color1);
266 }
267}
268
269
270// class Blur_1on3
271
// Constructor: initialises the four multiplier members mult0..mult3 from
// pixelOps. The body itself is empty.
// NOTE(review): listing line 273 (the constructor declarator) is missing from
// this extract -- restore it from the real source file.
272template<std::unsigned_integral Pixel>
274 : mult0(pixelOps)
275 , mult1(pixelOps)
276 , mult2(pixelOps)
277 , mult3(pixelOps)
278{
279}
280
281#ifdef __SSE2__
// SSE2 implementation of the 1 -> 3 horizontal blur for 4-byte pixels.
// Reads srcWidth pixels from in_ and writes 3 * srcWidth pixels to out_.
// Preconditions (checked by the asserts below): srcWidth is a multiple of 4
// and at least 8, and both pointers are 16-byte aligned.
// Input is processed in groups of 4 pixels (12 output pixels each). The loop
// handles every group except the last; the last group is written out after
// the loop so that the right border pixel can be duplicated.
282template<std::unsigned_integral Pixel>
283void Blur_1on3<Pixel>::blur_SSE(const Pixel* in_, Pixel* out_, size_t srcWidth)
284{
285 if constexpr (sizeof(Pixel) != 4) {
286 assert(false); return; // only 32-bpp
287 }
288
// srcWidth >= 8 matters: the do/while below always runs once and pre-loads
// the group after the one it is writing.
289 assert((srcWidth % 4) == 0);
290 assert(srcWidth >= 8);
291 assert((size_t(in_ ) % 16) == 0);
292 assert((size_t(out_) % 16) == 0);
293
// Blend weights. They are used with _mm_mulhi_epu16 (which keeps the upper
// 16 bits of each 16x16-bit product), so a coefficient c acts as the
// fraction c / 0x10000. These are the c0..c3 of the scalar path in
// operator() scaled by 256 (up to the integer rounding of blur / 2 there).
294 unsigned alpha = blur * 256;
295 auto c0 = narrow_cast<int16_t>(alpha / 2);
296 auto c1 = narrow_cast<int16_t>(alpha + c0);
297 auto c2 = narrow_cast<int16_t>(0x10000 - c1);
298 auto c3 = narrow_cast<int16_t>(0x10000 - alpha);
// Each constant holds two coefficients, one per 64-bit half; shuffle 0x4E
// swaps the two halves.
299 __m128i C0C1 = _mm_set_epi16(c1, c1, c1, c1, c0, c0, c0, c0);
300 __m128i C1C0 = _mm_shuffle_epi32(C0C1, 0x4E);
301 __m128i C2C3 = _mm_set_epi16(c3, c3, c3, c3, c2, c2, c2, c2);
302 __m128i C3C2 = _mm_shuffle_epi32(C2C3, 0x4E);
303
// `in` / `out` point at the LAST 4-pixel group (and its 12 output pixels);
// x is a negative byte offset that counts up to 0, so `in + x` starts at
// the first input pixel and the loop condition is a plain sign test.
304 size_t tmp = srcWidth - 4;
305 const auto* in = reinterpret_cast<const char*>(in_ + tmp);
306 auto* out = reinterpret_cast< char*>(out_ + 3 * tmp);
307 auto x = -ptrdiff_t(tmp * sizeof(Pixel));
308
309 __m128i ZERO = _mm_setzero_si128();
310
311 // Prepare first iteration (duplicate left border pixel)
312 __m128i abcd = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x));
313 __m128i a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
314 __m128i a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
315 __m128i a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
316 __m128i d1d0 = _mm_shuffle_epi32(a0a1, 0x4E); // left border
317
318 // At the start of each iteration the following vars are live:
319 // abcd, a_b_, a_a_, a0a1, d1d0
320 // Each iteration reads 4 and produces 12 pixels.
// Naming: a..d are the four pixels of the current group; a digit suffix is
// the coefficient applied (a2a3 = a*c2 in the low half, a*c3 in the high
// half); `xx` marks a zeroed half.
// NOTE(review): shuffle<> is a helper that is not defined in this extract.
321 do {
322 // p01
323 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
324 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
325 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
326 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
327 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
328 // p23
329 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
330 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
331 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
332 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
333 __m128i p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
334 __m128i p0123 = _mm_packus_epi16(p01, p23);
335 _mm_store_si128(reinterpret_cast<__m128i*>(out + 3 * x + 0),
336 p0123);
337
338 // p45
339 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
340 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
341 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
342 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
343 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
344 // p67
345 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
346 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
347 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
348 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
349 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
350 __m128i p4567 = _mm_packus_epi16(p45, p67);
351 _mm_store_si128(reinterpret_cast<__m128i*>(out + 3 * x + 16),
352 p4567);
353
354 // p89
355 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
356 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
357 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
358 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
359 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
360 // pab
// The last output pixel of this group needs the first pixel of the next
// group, so the next group is loaded here and stays live for the next
// iteration (or for the tail below).
361 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
362 abcd = _mm_load_si128(reinterpret_cast<const __m128i*>(in + x + 16));
363 a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
364 a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
365 a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
366 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
367 __m128i p89ab = _mm_packus_epi16(p89, pab);
368 _mm_store_si128(reinterpret_cast<__m128i*>(out + 3 * x + 32),
369 p89ab);
370
371 x += 16;
372 } while (x < 0);
373
// Here x == 0, so the stores below address out + 0 .. out + 32 directly.
// The code repeats the loop body, except that no further input is loaded.
374 // Last iteration (duplicate right border pixel)
375 // p01
376 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
377 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
378 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
379 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
380 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
381 // p23
382 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
383 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
384 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
385 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
386 __m128i p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
387 __m128i p0123 = _mm_packus_epi16(p01, p23);
388 _mm_store_si128(reinterpret_cast<__m128i*>(out + 0),
389 p0123);
390
391 // p45
392 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
393 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
394 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
395 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
396 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
397 // p67
398 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
399 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
400 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
401 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
402 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
403 __m128i p4567 = _mm_packus_epi16(p45, p67);
404 _mm_store_si128(reinterpret_cast<__m128i*>(out + 16),
405 p4567);
406
407 // p89
408 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
409 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
410 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
411 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
412 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
413 // pab
// Instead of a next group, the weighted last pixel (d) is reused with its
// halves swapped.
414 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
415 a0a1 = _mm_shuffle_epi32(d1d0, 0x4E); // right border
416 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
417 __m128i p89ab = _mm_packus_epi16(p89, pab);
418 _mm_store_si128(reinterpret_cast<__m128i*>(out + 32),
419 p89ab);
420}
421#endif
422
// Blurring 1 -> 3 line scaler: writes three output pixels for every input
// pixel, each a weighted mix of the input pixel and its left/right
// neighbours; the border pixels are duplicated at both ends.
// 4-byte pixels are delegated to blur_SSE() when compiled with __SSE2__;
// everything else uses the scalar code below.
// NOTE(review): the scalar code handles input in pairs and computes its loop
// bound as in.size() - 2 in size_t, so it appears to require an even
// in.size() of at least 2, and it writes out[0 .. 3 * in.size() - 1].
// Nothing here checks that -- confirm that callers guarantee it.
423template<std::unsigned_integral Pixel>
424void Blur_1on3<Pixel>::operator()(std::span<const Pixel> in, std::span<Pixel> out)
425{
426 /* The following code is equivalent to this loop. It is 2x unrolled
427 * and common subexpressions have been eliminated. The last iteration
428 * is also moved outside the for loop.
429 *
430 * unsigned c0 = blur / 2;
431 * unsigned c1 = c0 + blur;
432 * unsigned c2 = 256 - c1;
433 * unsigned c3 = 256 - 2 * c0;
434 * Pixel prev, curr, next;
435 * prev = curr = next = in[0];
436 * size_t srcWidth = dstWidth / 3;
437 * for (auto x : xrange(srcWidth)) {
438 * if (x != (srcWidth - 1)) next = in[x + 1];
439 * out[3 * x + 0] = mul(c1, prev) + mul(c2, curr);
440 * out[3 * x + 1] = mul(c0, prev) + mul(c3, curr) + mul(c0, next);
441 * out[3 * x + 2] = mul(c2, curr) + mul(c1, next);
442 * prev = curr;
443 * curr = next;
444 * }
445 */
446#ifdef __SSE2__
447 if constexpr (sizeof(Pixel) == 4) {
448 blur_SSE(in.data(), out.data(), in.size());
449 return;
450 }
451#endif
452
453 // C++ routine, both 16bpp and 32bpp
// One multiplier object per weight; the weights are on a 0..256 scale.
454 unsigned c0 = blur / 2;
455 unsigned c1 = blur + c0;
456 unsigned c2 = 256 - c1;
457 unsigned c3 = 256 - 2 * c0;
458 mult0.setFactor32(c0);
459 mult1.setFactor32(c1);
460 mult2.setFactor32(c2);
461 mult3.setFactor32(c3);
462
// Carried state: f0/f1 hold c0/c1 times the pixel left of p0, g0/g1 hold
// c0/c1 times the pixel left of p1. Before the first pixel both pairs are
// seeded from in[0], which duplicates the left border.
463 Pixel p0 = in[0];
464 Pixel p1;
465 uint32_t f0 = mult0.mul32(p0);
466 uint32_t f1 = mult1.mul32(p0);
467 uint32_t g0 = f0;
468 uint32_t g1 = f1;
469
470 size_t srcWidth = in.size();
471 size_t x = 0;
// Two input pixels (p0 = in[x], p1 = in[x + 1]) and six output pixels per
// iteration; the final pair is handled after the loop.
472 for (; x < (srcWidth - 2); x += 2) {
473 uint32_t g2 = mult2.mul32(p0);
474 out[3 * x + 0] = mult0.conv32(g2 + f1);
475 p1 = in[x + 1];
476 uint32_t t0 = mult0.mul32(p1);
477 out[3 * x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
478 f0 = t0;
479 f1 = mult1.mul32(p1);
480 out[3 * x + 2] = mult0.conv32(g2 + f1);
481
482 uint32_t f2 = mult2.mul32(p1);
483 out[3 * x + 3] = mult0.conv32(f2 + g1);
484 p0 = in[x + 2];
485 uint32_t t1 = mult0.mul32(p0);
486 out[3 * x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + t1);
487 g0 = t1;
488 g1 = mult1.mul32(p0);
489 out[3 * x + 5] = mult0.conv32(g1 + f2);
490 }
// Final pair. There is no pixel to the right of p1, so p1 acts as its own
// right neighbour: f0 (= c0 * p1) is reused in out[3x + 4], and out[3x + 5]
// is p1 itself because its two weights c2 and c1 add up to 256.
491 uint32_t g2 = mult2.mul32(p0);
492 out[3 * x + 0] = mult0.conv32(g2 + f1);
493 p1 = in[x + 1];
494 uint32_t t0 = mult0.mul32(p1);
495 out[3 * x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
496 f0 = t0;
497 f1 = mult1.mul32(p1);
498 out[3 * x + 2] = mult0.conv32(g2 + f1);
499
500 uint32_t f2 = mult2.mul32(p1);
501 out[3 * x + 3] = mult0.conv32(f2 + g1);
502 out[3 * x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + f0);
503 out[3 * x + 5] = p1;
504}
505
// Entry point that optionally blends in a superimposed frame before scaling.
// With a non-null superImpose, the source is wrapped in a
// SuperImposedVideoFrame, srcWidth is replaced by the wrapped frame's line
// width at srcStartY, and the wrapped frame is passed to dispatchScale().
// Otherwise src is passed to dispatchScale() unchanged.
// NOTE(review): listing line 507 (the function declarator) is missing from
// this extract -- restore it from the real source file.
506template<std::unsigned_integral Pixel>
508 FrameSource& src, const RawFrame* superImpose,
509 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
510 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
511{
512 if (superImpose) {
513 SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
514 srcWidth = sf.getLineWidth(srcStartY);
515 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
516 dst, dstStartY, dstEndY);
517 } else {
518 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
519 dst, dstStartY, dstEndY);
520 }
521}
522
523// Force template instantiation.
// Explicitly instantiates Simple3xScaler for each pixel type that is enabled
// through the HAVE_16BPP / HAVE_32BPP preprocessor switches.
524#if HAVE_16BPP
525template class Simple3xScaler<uint16_t>;
526#endif
527#if HAVE_32BPP
528template class Simple3xScaler<uint32_t>;
529#endif
530
531} // namespace openmsx
void operator()(std::span< const Pixel > in, std::span< Pixel > out)
Blur_1on3(const PixelOperations< Pixel > &pixelOps)
Interface for getting lines from a video frame.
Definition: FrameSource.hh:20
std::span< const Pixel > getLine(int line, std::span< Pixel > buf) const
Gets a pointer to the pixels of the given line number.
Definition: FrameSource.hh:96
Polymorphic line scaler.
Definition: LineScalers.hh:286
A video frame as output by the VDP scanline conversion unit, before any postprocessing filters are ap...
Definition: RawFrame.hh:15
Class containing all settings for renderers.
Base class for 3x scalers.
Definition: Scaler3.hh:12
virtual unsigned getWidth() const =0
virtual void releaseLine(unsigned y, std::span< Pixel > buf)=0
virtual std::span< Pixel > acquireLine(unsigned y)=0
Simple3xScaler(const PixelOperations< Pixel > &pixelOps, const RenderSettings &settings)
This class represents a frame that is the (per-pixel) alpha-blend of a (laser-disc) video frame and a...
mat23 p23(vec2(2, 3), vec2(4, 5), vec2(0, 7))
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
auto copy(InputRange &&range, OutputIter out)
Definition: ranges.hh:232
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:50