10 #include <emmintrin.h>
// Fragment of a constructor's member-initializer list: the `blur_1on3`
// member is initialised from `pixelOps_`.
// NOTE(review): the constructor header and the remaining initialisers are not
// visible in this listing -- presumably this is Simple3xScaler<Pixel>'s
// constructor; confirm against the full file.
15 template<std::
unsigned_
integral Pixel>
22 , blur_1on3(pixelOps_)
27 template<std::
unsigned_
integral Pixel>
// Line-scaling helper: horizontally scales source lines with `scale` and
// emits destination lines with a scanline effect drawn between them.
// NOTE(review): the line carrying this function's name is missing from this
// listing; presumably this is the `doScale1` helper that the scaleNx1toMx3
// wrappers call -- confirm. The declarations of buf, buf2, dstWidth and the
// dstLineN pointers are likewise not visible here.
30 template<std::
unsigned_
integral Pixel>
32 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
37 int scanlineFactor = settings.getScanlineFactor();
39 unsigned y = dstStartY;
// Fetch the first source line; the post-increment makes every later
// getLinePtr(srcStartY, ...) call read the following source line.
40 auto* srcLine = src.
getLinePtr(srcStartY++, srcWidth, buf);
// The scaled line is produced once and then duplicated.
42 scale(srcLine, dstLine0, dstWidth);
46 copy(dstLine0, dstLine1, dstWidth);
// Each iteration advances one source line and three destination lines.
48 for (; (y + 4) < dstEndY; y += 3, srcStartY += 1) {
49 srcLine = src.
getLinePtr(srcStartY, srcWidth, buf);
51 scale(srcLine, dstLine3, dstWidth);
54 copy(dstLine3, dstLine4, dstWidth);
// dstLine2 is drawn by the scanline object from dstLine0 and dstLine3.
57 scanline.draw(dstLine0, dstLine3, dstLine2,
58 scanlineFactor, dstWidth);
// NOTE(review): presumably the tail after the loop (the closing brace is not
// visible): the next source line is scaled into the scratch buffer buf2 and
// used only as the second input of the final scanline.draw call.
66 srcLine = src.
getLinePtr(srcStartY, srcWidth, buf);
68 scale(srcLine, buf2, dstWidth);
71 scanline.draw(dstLine0, buf2, dstLine2, scanlineFactor, dstWidth);
// Converts pairs of source lines into triples of destination lines.
//
// src / srcStartY / srcWidth: frame to read from, first source line, and the
//     width passed to getLinePtr.
// (unnamed third unsigned parameter): unused in the visible code.
// dst / dstStartY / dstEndY: output to write to; the loop runs while
//     dstY < dstEndY.
// scale: horizontal line scaler applied to each source line.
//
// NOTE(review): the declaration of `buf` and the function's braces are not
// visible in this listing.
77 template<std::
unsigned_
integral Pixel>
78 void Simple3xScaler<Pixel>::doScale2(FrameSource& src,
79 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
80 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY,
81 PolyLineScaler<Pixel>&
scale)
84 int scanlineFactor = settings.getScanlineFactor();
85 unsigned dstWidth = dst.getWidth();
// Two source lines are consumed and three destination lines produced per
// iteration.
86 for (
unsigned srcY = srcStartY, dstY = dstStartY; dstY < dstEndY;
87 srcY += 2, dstY += 3) {
// Destination line 0 is the scaled first source line.
88 auto* srcLine0 = src.getLinePtr(srcY + 0, srcWidth, buf);
89 auto* dstLine0 = dst.acquireLine(dstY + 0);
90 scale(srcLine0, dstLine0, dstWidth);
// Destination line 2 is the scaled second source line.
92 auto* srcLine1 = src.getLinePtr(srcY + 1, srcWidth, buf);
93 auto* dstLine2 = dst.acquireLine(dstY + 2);
94 scale(srcLine1, dstLine2, dstWidth);
// Destination line 1 (the middle one) is drawn from lines 0 and 2.
96 auto* dstLine1 = dst.acquireLine(dstY + 1);
97 scanline.draw(dstLine0, dstLine2, dstLine1,
98 scanlineFactor, dstWidth);
// Every acquired line is handed back to the output.
100 dst.releaseLine(dstY + 0, dstLine0);
101 dst.releaseLine(dstY + 1, dstLine1);
102 dst.releaseLine(dstY + 2, dstLine2);
// Builds a Scale_2on9 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale1.
// NOTE(review): judging by the name, presumably maps 2x1 source pixels onto
// 9x3 output pixels -- confirm.
106 template<std::
unsigned_
integral Pixel>
107 void Simple3xScaler<Pixel>::scale2x1to9x3(FrameSource& src,
108 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
109 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
111 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
112 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_2on9 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale2, which consumes two source lines per
// three destination lines.
115 template<std::
unsigned_
integral Pixel>
116 void Simple3xScaler<Pixel>::scale2x2to9x3(FrameSource& src,
117 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
118 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
120 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
121 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Forwards to doScale1 with either the blurring line scaler (Blur_1on3) or
// the plain Scale_1on3 scaler, depending on the configured blur factor.
124 template<std::
unsigned_
integral Pixel>
125 void Simple3xScaler<Pixel>::scale1x1to3x3(FrameSource& src,
126 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
127 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
// The blur setting is divided by 3; when the result is zero the condition is
// false and the Blur_1on3 path is skipped.
129 if (
unsigned blur = settings.getBlurFactor() / 3) {
// The member blur_1on3 is reconfigured and wrapped by reference
// (PolyScaleRef) rather than constructing a new scaler.
130 blur_1on3.setBlur(blur);
131 PolyScaleRef<Pixel, Blur_1on3<Pixel>> op(blur_1on3);
132 doScale1(src, srcStartY, srcEndY, srcWidth,
133 dst, dstStartY, dstEndY, op);
// NOTE(review): presumably the else-branch of the test above; the `else`
// itself is not visible in this listing.
138 PolyScale<Pixel, Scale_1on3<Pixel>> op;
139 doScale1(src, srcStartY, srcEndY, srcWidth,
140 dst, dstStartY, dstEndY, op);
// Builds a default-constructed Scale_1on3 line scaler and forwards all
// arguments unchanged to doScale2. Unlike scale1x1to3x3, no blur path is
// taken here.
144 template<std::
unsigned_
integral Pixel>
145 void Simple3xScaler<Pixel>::scale1x2to3x3(FrameSource& src,
146 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
147 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
149 PolyScale<Pixel, Scale_1on3<Pixel>> op;
150 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_4on9 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale1.
153 template<std::
unsigned_
integral Pixel>
154 void Simple3xScaler<Pixel>::scale4x1to9x3(FrameSource& src,
155 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
156 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
158 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
159 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_4on9 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale2, which consumes two source lines per
// three destination lines.
162 template<std::
unsigned_
integral Pixel>
163 void Simple3xScaler<Pixel>::scale4x2to9x3(FrameSource& src,
164 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
165 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
167 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
168 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_2on3 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale1.
171 template<std::
unsigned_
integral Pixel>
172 void Simple3xScaler<Pixel>::scale2x1to3x3(FrameSource& src,
173 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
174 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
176 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
177 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_2on3 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale2, which consumes two source lines per
// three destination lines.
180 template<std::
unsigned_
integral Pixel>
181 void Simple3xScaler<Pixel>::scale2x2to3x3(FrameSource& src,
182 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
183 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
185 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
186 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_8on9 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale1.
189 template<std::
unsigned_
integral Pixel>
190 void Simple3xScaler<Pixel>::scale8x1to9x3(FrameSource& src,
191 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
192 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
194 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
195 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_8on9 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale2, which consumes two source lines per
// three destination lines.
198 template<std::
unsigned_
integral Pixel>
199 void Simple3xScaler<Pixel>::scale8x2to9x3(FrameSource& src,
200 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
201 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
203 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
204 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_4on3 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale1.
207 template<std::
unsigned_
integral Pixel>
208 void Simple3xScaler<Pixel>::scale4x1to3x3(FrameSource& src,
209 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
210 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
212 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
213 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Builds a Scale_4on3 line scaler (constructed from pixelOps) and forwards
// all arguments unchanged to doScale2, which consumes two source lines per
// three destination lines.
216 template<std::
unsigned_
integral Pixel>
217 void Simple3xScaler<Pixel>::scale4x2to3x3(FrameSource& src,
218 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
219 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
221 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
222 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Fills destination lines from single-colour source lines: each source line
// yields three destination lines, the third one darkened by the scanline
// factor.
//
// src / srcStartY / srcEndY: frame to read line colours from and the source
//     line range (srcEndY is only forwarded to dispatchScale).
// dst / dstStartY / dstEndY: output to fill and the destination line range.
225 template<std::
unsigned_
integral Pixel>
226 void Simple3xScaler<Pixel>::scaleBlank1to3(
227 FrameSource& src,
unsigned srcStartY,
unsigned srcEndY,
228 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
230 int scanlineFactor = settings.getScanlineFactor();
// Unless the range ends at the bottom of the output, the loop stops three
// destination lines early; the remainder is handled after the loop.
// NOTE(review): dstEndY - 3 is unsigned arithmetic -- presumably dstEndY >= 3
// whenever it differs from dstHeight; confirm against callers.
232 unsigned dstHeight = dst.getHeight();
233 unsigned stopDstY = (dstEndY == dstHeight)
234 ? dstEndY : dstEndY - 3;
235 unsigned srcY = srcStartY, dstY = dstStartY;
236 for (; dstY < stopDstY; srcY += 1, dstY += 3) {
237 auto color0 = src.getLineColor<
Pixel>(srcY);
238 Pixel color1 = scanline.darken(color0, scanlineFactor);
// Two lines in the plain colour, one in the darkened colour.
239 dst.fillLine(dstY + 0, color0);
240 dst.fillLine(dstY + 1, color0);
241 dst.fillLine(dstY + 2, color1);
// When the loop did not reach the bottom of the output, the work is handed
// to dispatchScale using the width of the following source line. The asserts
// require the current line to have width 1 and the next one not to.
// NOTE(review): the remaining arguments of this call are cut off in this
// listing.
243 if (dstY != dstHeight) {
244 unsigned nextLineWidth = src.getLineWidth(srcY + 1);
245 assert(src.getLineWidth(srcY) == 1);
246 assert(nextLineWidth != 1);
247 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
// Fills destination lines from pairs of single-colour source lines: every
// two source lines yield three destination lines.
//
// src / srcStartY: frame to read line colours from and the first source line.
// (unnamed third unsigned parameter): unused in the visible code.
// dst / dstStartY / dstEndY: output to fill; the loop runs while
//     dstY < dstEndY.
252 template<std::
unsigned_
integral Pixel>
253 void Simple3xScaler<Pixel>::scaleBlank2to3(
254 FrameSource& src,
unsigned srcStartY,
unsigned ,
255 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
257 int scanlineFactor = settings.getScanlineFactor();
258 for (
unsigned srcY = srcStartY, dstY = dstStartY;
259 dstY < dstEndY; srcY += 2, dstY += 3) {
260 auto color0 = src.getLineColor<
Pixel>(srcY + 0);
261 auto color1 = src.getLineColor<
Pixel>(srcY + 1);
// The middle line's colour is derived from both source colours via the
// two-colour overload of scanline.darken.
262 Pixel color01 = scanline.darken(color0, color1, scanlineFactor);
263 dst.fillLine(dstY + 0, color0);
264 dst.fillLine(dstY + 1, color01);
265 dst.fillLine(dstY + 2, color1);
272 template<std::
unsigned_
integral Pixel>
// SSE2 implementation of a horizontal 1-to-3 blur for 4-byte pixels.
// NOTE(review): the line carrying this function's name and parameter list is
// missing from this listing; presumably this is the `blur_SSE(in, out,
// srcWidth)` helper called by the scalar blur routine -- confirm. The loop
// construct and the second argument of every _mm_store_si128 call are also
// not visible here.
//
// Register naming used below: a, b, c, d are the four pixels of one 16-byte
// load; each register name lists its low 64-bit half first and its high half
// second; a digit N means "multiplied by coefficient cN"; `_` marks bytes
// widened to 16 bits; `xx` is a zeroed half.
282 template<std::
unsigned_
integral Pixel>
// Only 4-byte pixels are supported; any other pixel size asserts and returns.
285 if constexpr (
sizeof(
Pixel) != 4) {
286 assert(
false);
return;
// Preconditions: width a multiple of 4 and at least 8, both buffers 16-byte
// aligned (required by the aligned load/store intrinsics used below).
289 assert((srcWidth % 4) == 0);
290 assert(srcWidth >= 8);
291 assert((
size_t(in_ ) % 16) == 0);
292 assert((
size_t(out_) % 16) == 0);
// Blend weights for use with _mm_mulhi_epu16, which keeps the upper 16 bits
// of each 16x16-bit product. By construction c1 + c2 == 0x10000 and
// c0 + c3 + c0 == 0x10000.
294 unsigned alpha = blur * 256;
295 unsigned c0 = alpha / 2;
296 unsigned c1 = alpha + c0;
297 unsigned c2 = 0x10000 - c1;
298 unsigned c3 = 0x10000 - alpha;
// _mm_set_epi16 takes its arguments from the highest lane to the lowest, so
// C0C1 holds c0 in its low half and c1 in its high half; shuffle 0x4E swaps
// the two 64-bit halves.
299 __m128i C0C1 = _mm_set_epi16(c1, c1, c1, c1, c0, c0, c0, c0);
300 __m128i C1C0 = _mm_shuffle_epi32(C0C1, 0x4E);
301 __m128i C2C3 = _mm_set_epi16(c3, c3, c3, c3, c2, c2, c2, c2);
302 __m128i C3C2 = _mm_shuffle_epi32(C2C3, 0x4E);
// Both pointers are advanced to the last group of 4 input pixels (and the
// matching 12 output pixels); x is the negative byte offset back to the
// start, so `in + x` initially addresses the first input pixel.
304 size_t tmp = srcWidth - 4;
305 const auto* in =
reinterpret_cast<const char*
>(in_ + tmp);
306 auto* out =
reinterpret_cast< char*
>(out_ + 3 * tmp);
307 auto x = -ptrdiff_t(tmp *
sizeof(
Pixel));
309 __m128i ZERO = _mm_setzero_si128();
// Load the first 4 pixels and pre-compute pixel a's c0/c1 products. d1d0 is
// seeded from a0a1 with its halves swapped, i.e. the first pixel stands in
// for its missing left neighbour.
312 __m128i abcd = _mm_load_si128(
reinterpret_cast<const __m128i*
>(in +
x));
313 __m128i a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
314 __m128i a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
315 __m128i a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
316 __m128i d1d0 = _mm_shuffle_epi32(a0a1, 0x4E);
// NOTE(review): from here on presumably the body of the main loop (the loop
// header is not visible); stores are addressed relative to x.
// Output pixels 0..3 of the group.
323 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
324 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
325 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
326 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
327 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
329 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
330 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
331 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
332 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
333 __m128i
p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
334 __m128i p0123 = _mm_packus_epi16(p01,
p23);
335 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 *
x + 0),
// Output pixels 4..7 of the group.
339 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
340 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
341 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
342 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
343 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
345 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
346 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
347 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
348 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
349 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
350 __m128i p4567 = _mm_packus_epi16(p45, p67);
351 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 *
x + 16),
// Output pixels 8..11 of the group. The last of them needs the first pixel
// of the NEXT group, so the next 16 input bytes are loaded here and their
// a0a1 products carry over into the following group.
355 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
356 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
357 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
358 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
359 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
361 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
362 abcd = _mm_load_si128(
reinterpret_cast<const __m128i*
>(in +
x + 16));
363 a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
364 a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
365 a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
366 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
367 __m128i p89ab = _mm_packus_epi16(p89, pab);
368 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 *
x + 32),
// NOTE(review): presumably the final group, handled outside the loop (the
// same names are declared again, so this must be a separate scope). Stores
// go to `out` directly, without the x offset.
376 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
377 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
378 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
379 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
380 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
382 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
383 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
384 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
385 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
386 __m128i
p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
387 __m128i p0123 = _mm_packus_epi16(p01,
p23);
388 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 0),
392 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
393 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
394 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
395 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
396 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
398 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
399 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
400 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
401 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
402 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
403 __m128i p4567 = _mm_packus_epi16(p45, p67);
404 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 16),
408 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
409 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
410 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
411 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
412 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
414 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
// No further input is loaded here: the right-hand neighbour term is taken
// from the last pixel itself (d1d0 with its halves swapped).
415 a0a1 = _mm_shuffle_epi32(d1d0, 0x4E);
416 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
417 __m128i p89ab = _mm_packus_epi16(p89, pab);
418 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 32),
// Horizontal 1-to-3 blur of one line: every input pixel yields three output
// pixels (srcWidth is derived as dstWidth / 3).
// NOTE(review): the line carrying this function's name is missing from this
// listing; presumably this is Blur_1on3<Pixel>::operator()(in, out, dstWidth)
// -- confirm. The declarations of x, p0, p1, g0, g1 and the loads that update
// p0/p1 are also not visible here.
423 template<std::
unsigned_
integral Pixel>
425 const Pixel* __restrict in,
Pixel* __restrict out,
448 size_t srcWidth = dstWidth / 3;
// 4-byte pixels are handed to the SSE routine.
// NOTE(review): presumably followed by an early return and possibly guarded
// by a preprocessor check; neither is visible here.
450 if constexpr (
sizeof(
Pixel) == 4) {
451 blur_SSE(in, out, srcWidth);
// Scalar path: blend weights on a 0..256 scale. By construction
// c1 + c2 == 256 and c0 + c3 + c0 == 256.
457 unsigned c0 = blur / 2;
458 unsigned c1 = blur + c0;
459 unsigned c2 = 256 - c1;
460 unsigned c3 = 256 - 2 * c0;
461 mult0.setFactor32(c0);
462 mult1.setFactor32(c1);
463 mult2.setFactor32(c2);
464 mult3.setFactor32(c3);
// Products of the first pixel, carried into the loop as the "previous
// pixel" contribution.
468 uint32_t f0 = mult0.mul32(p0);
469 uint32_t f1 = mult1.mul32(p0);
// Two input pixels are consumed and six output pixels written per
// iteration.
// NOTE(review): srcWidth - 2 is unsigned arithmetic -- presumably
// srcWidth >= 2 always holds; confirm against callers.
474 for (;
x < (srcWidth - 2);
x += 2) {
475 uint32_t g2 = mult2.mul32(p0);
476 out[3 *
x + 0] = mult0.conv32(g2 + f1);
478 uint32_t t0 = mult0.mul32(p1);
479 out[3 *
x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
481 f1 = mult1.mul32(p1);
482 out[3 *
x + 2] = mult0.conv32(g2 + f1);
484 uint32_t f2 = mult2.mul32(p1);
485 out[3 *
x + 3] = mult0.conv32(f2 + g1);
487 uint32_t t1 = mult0.mul32(p0);
488 out[3 *
x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + t1);
490 g1 = mult1.mul32(p0);
491 out[3 *
x + 5] = mult0.conv32(g1 + f2);
// NOTE(review): presumably the tail after the loop, handling the last two
// input pixels. Only five stores (offsets 0..4) are visible; the store for
// offset 5 is presumably on a line missing from this listing.
493 uint32_t g2 = mult2.mul32(p0);
494 out[3 *
x + 0] = mult0.conv32(g2 + f1);
496 uint32_t t0 = mult0.mul32(p1);
497 out[3 *
x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
499 f1 = mult1.mul32(p1);
500 out[3 *
x + 2] = mult0.conv32(g2 + f1);
502 uint32_t f2 = mult2.mul32(p1);
503 out[3 *
x + 3] = mult0.conv32(f2 + g1);
504 out[3 *
x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + f0);
// Entry point that forwards a scaling request to dispatchScale.
// NOTE(review): the line carrying this function's name, the condition that
// selects between the two calls, and the definition of `sf` are all missing
// from this listing; confirm against the full file.
508 template<std::
unsigned_
integral Pixel>
511 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
// First path: the line width is re-read from `sf` and `sf` is used as the
// frame source in place of `src`.
516 srcWidth = sf.getLineWidth(srcStartY);
517 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
518 dst, dstStartY, dstEndY);
// Second path: `src` and the caller's srcWidth are passed through unchanged.
520 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
521 dst, dstStartY, dstEndY);
// Explicit instantiations of Simple3xScaler for 16-bit and 32-bit pixel
// types.
// NOTE(review): the listing skips the lines just before each instantiation --
// presumably preprocessor guards; confirm.
527 template class Simple3xScaler<uint16_t>;
530 template class Simple3xScaler<uint32_t>;
void operator()(const Pixel *in, Pixel *out, size_t dstWidth)
Blur_1on3(const PixelOperations< Pixel > &pixelOps)
Interface for getting lines from a video frame.
const Pixel * getLinePtr(int line, unsigned width, Pixel *buf) const
Gets a pointer to the pixels of the given line number.
A video frame as output by the VDP scanline conversion unit, before any postprocessing filters are applied.
Class containing all settings for renderers.
Base class for 3x scalers.
virtual Pixel * acquireLine(unsigned y)=0
virtual unsigned getWidth() const =0
virtual void releaseLine(unsigned y, Pixel *buf)=0
Simple3xScaler(const PixelOperations< Pixel > &pixelOps, const RenderSettings &settings)
~Simple3xScaler() override
This class represents a frame that is the (per-pixel) alpha-blend of a (laser-disc) video frame and a...
mat23 p23(vec2(2, 3), vec2(4, 5), vec2(0, 7))
constexpr mat4 scale(const vec3 &xyz)
This file implemented 3 utility functions:
constexpr KeyMatrixPosition x
Keyboard bindings.
auto copy(InputRange &&range, OutputIter out)
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)