16template<std::
unsigned_
integral Pixel>
23 , blur_1on3(pixelOps_)
28template<std::
unsigned_
integral Pixel>
31template<std::
unsigned_
integral Pixel>
33 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
38 int scanlineFactor = settings.getScanlineFactor();
40 unsigned y = dstStartY;
41 auto srcLine = src.
getLine(srcStartY++, buf);
43 scale(srcLine, dstLine0);
47 copy(dstLine0, dstLine1);
49 for (; (y + 4) < dstEndY; y += 3, srcStartY += 1) {
50 srcLine = src.
getLine(srcStartY, buf);
52 scale(srcLine, dstLine3);
55 copy(dstLine3, dstLine4);
58 scanline.draw(dstLine0, dstLine3,
59 dstLine2, scanlineFactor);
67 srcLine = src.
getLine(srcStartY, buf);
72 scanline.draw(dstLine0, buf2, dstLine2, scanlineFactor);
// doScale2: vertical driver that turns every two source lines into three
// output lines (the loop below advances srcY by 2 and dstY by 3).
//   src, srcStartY            - frame to read from and the first line to read
//   (unnamed unsigned)        - not used by this function
//   srcWidth                  - not referenced in the lines visible here
//   dst, dstStartY, dstEndY   - output and the half-open line range to fill
//   scale                     - callable that converts one source line into
//                               one output line
// NOTE(review): this text comes from a numbered listing. The leading numbers
// are listing line numbers, and the gaps in that numbering mean the braces,
// the declaration of `buf` and the tail of the scanline.draw() call are not
// shown here -- restore them from the repository before compiling.
78template<std::
unsigned_
integral Pixel>
79void Simple3xScaler<Pixel>::doScale2(FrameSource& src,
80 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
81 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY,
82 PolyLineScaler<Pixel>&
scale)
// Scanline setting is read once, outside the loop.
85 int scanlineFactor = settings.getScanlineFactor();
86 for (
unsigned srcY = srcStartY, dstY = dstStartY; dstY < dstEndY;
87 srcY += 2, dstY += 3) {
// First source line of the pair -> first output line of the triple.
88 auto srcLine0 = src.getLine(srcY + 0, buf);
89 auto dstLine0 = dst.acquireLine(dstY + 0);
90 scale(srcLine0, dstLine0);
// Second source line of the pair -> third output line of the triple.
92 auto srcLine1 = src.getLine(srcY + 1, buf);
93 auto dstLine2 = dst.acquireLine(dstY + 2);
94 scale(srcLine1, dstLine2);
// The middle output line is handed to scanline.draw() together with the two
// outer lines; presumably draw() writes dstLine1 from them -- TODO confirm.
96 auto dstLine1 = dst.acquireLine(dstY + 1);
97 scanline.draw(dstLine0, dstLine2, dstLine1,
// (the remaining argument(s) of the call above are missing from this listing)
// Every acquired line is released again, under the same line index.
100 dst.releaseLine(dstY + 0, dstLine0);
101 dst.releaseLine(dstY + 1, dstLine1);
102 dst.releaseLine(dstY + 2, dstLine2);
106template<std::
unsigned_
integral Pixel>
107void Simple3xScaler<Pixel>::scale2x1to9x3(FrameSource& src,
108 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
109 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
111 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
112 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
115template<std::
unsigned_
integral Pixel>
116void Simple3xScaler<Pixel>::scale2x2to9x3(FrameSource& src,
117 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
118 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
120 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
121 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
124template<std::
unsigned_
integral Pixel>
125void Simple3xScaler<Pixel>::scale1x1to3x3(FrameSource& src,
126 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
127 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
129 if (
unsigned blur = settings.getBlurFactor() / 3) {
130 blur_1on3.setBlur(blur);
131 PolyScaleRef<Pixel, Blur_1on3<Pixel>> op(blur_1on3);
132 doScale1(src, srcStartY, srcEndY, srcWidth,
133 dst, dstStartY, dstEndY, op);
138 PolyScale<Pixel, Scale_1on3<Pixel>> op;
139 doScale1(src, srcStartY, srcEndY, srcWidth,
140 dst, dstStartY, dstEndY, op);
144template<std::
unsigned_
integral Pixel>
145void Simple3xScaler<Pixel>::scale1x2to3x3(FrameSource& src,
146 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
147 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
149 PolyScale<Pixel, Scale_1on3<Pixel>> op;
150 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
153template<std::
unsigned_
integral Pixel>
154void Simple3xScaler<Pixel>::scale4x1to9x3(FrameSource& src,
155 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
156 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
158 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
159 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
162template<std::
unsigned_
integral Pixel>
163void Simple3xScaler<Pixel>::scale4x2to9x3(FrameSource& src,
164 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
165 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
167 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
168 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
171template<std::
unsigned_
integral Pixel>
172void Simple3xScaler<Pixel>::scale2x1to3x3(FrameSource& src,
173 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
174 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
176 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
177 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
180template<std::
unsigned_
integral Pixel>
181void Simple3xScaler<Pixel>::scale2x2to3x3(FrameSource& src,
182 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
183 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
185 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
186 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
189template<std::
unsigned_
integral Pixel>
190void Simple3xScaler<Pixel>::scale8x1to9x3(FrameSource& src,
191 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
192 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
194 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
195 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
198template<std::
unsigned_
integral Pixel>
199void Simple3xScaler<Pixel>::scale8x2to9x3(FrameSource& src,
200 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
201 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
203 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
204 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
207template<std::
unsigned_
integral Pixel>
208void Simple3xScaler<Pixel>::scale4x1to3x3(FrameSource& src,
209 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
210 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
212 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
213 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
216template<std::
unsigned_
integral Pixel>
217void Simple3xScaler<Pixel>::scale4x2to3x3(FrameSource& src,
218 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
219 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
221 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
222 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// scaleBlank1to3: fills three output lines for every single-colour source
// line. The first two output lines get the source line's colour, the third
// gets that colour passed through scanline.darken() with the scanline factor.
//   src, srcStartY, srcEndY   - frame to read from and its line range
//   dst, dstStartY, dstEndY   - output and the line range to fill
// NOTE(review): this text comes from a numbered listing. The leading numbers
// are listing line numbers, and the gaps in that numbering mean the braces
// and the end of the dispatchScale() call are not shown here -- restore them
// from the repository before compiling.
225template<std::
unsigned_
integral Pixel>
226void Simple3xScaler<Pixel>::scaleBlank1to3(
227 FrameSource& src,
unsigned srcStartY,
unsigned srcEndY,
228 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
230 int scanlineFactor = settings.getScanlineFactor();
// Unless the range ends exactly at the bottom of the output, stop the fill
// loop three output lines (one source line) before dstEndY.
232 unsigned dstHeight = dst.getHeight();
233 unsigned stopDstY = (dstEndY == dstHeight)
234 ? dstEndY : dstEndY - 3;
235 unsigned srcY = srcStartY, dstY = dstStartY;
236 for (; dstY < stopDstY; srcY += 1, dstY += 3) {
237 auto color0 = src.getLineColor<
Pixel>(srcY);
238 Pixel color1 = scanline.darken(color0, scanlineFactor);
239 dst.fillLine(dstY + 0, color0);
240 dst.fillLine(dstY + 1, color0);
241 dst.fillLine(dstY + 2, color1);
// Whatever is left over is handed to dispatchScale(), using the width of the
// line after srcY. The asserts require the line at srcY to have width 1 and
// the following line to have a different width.
243 if (dstY != dstHeight) {
244 unsigned nextLineWidth = src.getLineWidth(srcY + 1);
245 assert(src.getLineWidth(srcY) == 1);
246 assert(nextLineWidth != 1);
247 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
// (the remaining arguments of the call above are missing from this listing)
252template<std::
unsigned_
integral Pixel>
253void Simple3xScaler<Pixel>::scaleBlank2to3(
254 FrameSource& src,
unsigned srcStartY,
unsigned ,
255 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
257 int scanlineFactor = settings.getScanlineFactor();
258 for (
unsigned srcY = srcStartY, dstY = dstStartY;
259 dstY < dstEndY; srcY += 2, dstY += 3) {
260 auto color0 = src.getLineColor<
Pixel>(srcY + 0);
261 auto color1 = src.getLineColor<
Pixel>(srcY + 1);
262 Pixel color01 = scanline.darken(color0, color1, scanlineFactor);
263 dst.fillLine(dstY + 0, color0);
264 dst.fillLine(dstY + 1, color01);
265 dst.fillLine(dstY + 2, color1);
272template<std::
unsigned_
integral Pixel>
282template<std::
unsigned_
integral Pixel>
285 if constexpr (
sizeof(
Pixel) != 4) {
286 assert(
false);
return;
289 assert((srcWidth % 4) == 0);
290 assert(srcWidth >= 8);
291 assert((
size_t(in_ ) % 16) == 0);
292 assert((
size_t(out_) % 16) == 0);
294 unsigned alpha = blur * 256;
295 auto c0 = narrow_cast<int16_t>(alpha / 2);
296 auto c1 = narrow_cast<int16_t>(alpha + c0);
297 auto c2 = narrow_cast<int16_t>(0x10000 - c1);
298 auto c3 = narrow_cast<int16_t>(0x10000 - alpha);
299 __m128i C0C1 = _mm_set_epi16(c1, c1, c1, c1, c0, c0, c0, c0);
300 __m128i C1C0 = _mm_shuffle_epi32(C0C1, 0x4E);
301 __m128i C2C3 = _mm_set_epi16(c3, c3, c3, c3, c2, c2, c2, c2);
302 __m128i C3C2 = _mm_shuffle_epi32(C2C3, 0x4E);
304 size_t tmp = srcWidth - 4;
305 const auto* in =
reinterpret_cast<const char*
>(in_ + tmp);
306 auto* out =
reinterpret_cast< char*
>(out_ + 3 * tmp);
307 auto x = -ptrdiff_t(tmp *
sizeof(
Pixel));
309 __m128i ZERO = _mm_setzero_si128();
312 __m128i abcd = _mm_load_si128(
reinterpret_cast<const __m128i*
>(in + x));
313 __m128i a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
314 __m128i a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
315 __m128i a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
316 __m128i d1d0 = _mm_shuffle_epi32(a0a1, 0x4E);
323 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
324 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
325 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
326 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
327 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
329 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
330 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
331 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
332 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
333 __m128i
p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
334 __m128i p0123 = _mm_packus_epi16(p01,
p23);
335 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 * x + 0),
339 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
340 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
341 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
342 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
343 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
345 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
346 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
347 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
348 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
349 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
350 __m128i p4567 = _mm_packus_epi16(p45, p67);
351 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 * x + 16),
355 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
356 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
357 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
358 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
359 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
361 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
362 abcd = _mm_load_si128(
reinterpret_cast<const __m128i*
>(in + x + 16));
363 a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
364 a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
365 a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
366 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
367 __m128i p89ab = _mm_packus_epi16(p89, pab);
368 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 * x + 32),
376 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
377 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
378 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
379 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
380 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
382 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
383 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
384 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
385 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
386 __m128i
p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
387 __m128i p0123 = _mm_packus_epi16(p01,
p23);
388 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 0),
392 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
393 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
394 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
395 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
396 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
398 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
399 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
400 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
401 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
402 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
403 __m128i p4567 = _mm_packus_epi16(p45, p67);
404 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 16),
408 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
409 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
410 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
411 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
412 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
414 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
415 a0a1 = _mm_shuffle_epi32(d1d0, 0x4E);
416 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
417 __m128i p89ab = _mm_packus_epi16(p89, pab);
418 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 32),
423template<std::
unsigned_
integral Pixel>
447 if constexpr (
sizeof(
Pixel) == 4) {
448 blur_SSE(in.data(), out.data(), in.size());
454 unsigned c0 = blur / 2;
455 unsigned c1 = blur + c0;
456 unsigned c2 = 256 - c1;
457 unsigned c3 = 256 - 2 * c0;
458 mult0.setFactor32(c0);
459 mult1.setFactor32(c1);
460 mult2.setFactor32(c2);
461 mult3.setFactor32(c3);
465 uint32_t f0 = mult0.mul32(p0);
466 uint32_t f1 = mult1.mul32(p0);
470 size_t srcWidth = in.size();
472 for (; x < (srcWidth - 2); x += 2) {
473 uint32_t g2 = mult2.mul32(p0);
474 out[3 * x + 0] = mult0.conv32(g2 + f1);
476 uint32_t t0 = mult0.mul32(p1);
477 out[3 * x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
479 f1 = mult1.mul32(p1);
480 out[3 * x + 2] = mult0.conv32(g2 + f1);
482 uint32_t f2 = mult2.mul32(p1);
483 out[3 * x + 3] = mult0.conv32(f2 + g1);
485 uint32_t t1 = mult0.mul32(p0);
486 out[3 * x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + t1);
488 g1 = mult1.mul32(p0);
489 out[3 * x + 5] = mult0.conv32(g1 + f2);
491 uint32_t g2 = mult2.mul32(p0);
492 out[3 * x + 0] = mult0.conv32(g2 + f1);
494 uint32_t t0 = mult0.mul32(p1);
495 out[3 * x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
497 f1 = mult1.mul32(p1);
498 out[3 * x + 2] = mult0.conv32(g2 + f1);
500 uint32_t f2 = mult2.mul32(p1);
501 out[3 * x + 3] = mult0.conv32(f2 + g1);
502 out[3 * x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + f0);
506template<std::
unsigned_
integral Pixel>
509 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
514 srcWidth = sf.getLineWidth(srcStartY);
515 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
516 dst, dstStartY, dstEndY);
518 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
519 dst, dstStartY, dstEndY);
525template class Simple3xScaler<uint16_t>;
528template class Simple3xScaler<uint32_t>;
void operator()(std::span< const Pixel > in, std::span< Pixel > out)
Blur_1on3(const PixelOperations< Pixel > &pixelOps)
Interface for getting lines from a video frame.
std::span< const Pixel > getLine(int line, std::span< Pixel > buf) const
Gets a pointer to the pixels of the given line number.
A video frame as output by the VDP scanline conversion unit, before any postprocessing filters are ap...
Class containing all settings for renderers.
Base class for 3x scalers.
virtual unsigned getWidth() const =0
virtual void releaseLine(unsigned y, std::span< Pixel > buf)=0
virtual std::span< Pixel > acquireLine(unsigned y)=0
Simple3xScaler(const PixelOperations< Pixel > &pixelOps, const RenderSettings &settings)
~Simple3xScaler() override
This class represents a frame that is the (per-pixel) alpha-blend of a (laser-disc) video frame and a...
mat23 p23(vec2(2, 3), vec2(4, 5), vec2(0, 7))
constexpr mat4 scale(const vec3 &xyz)
This file implemented 3 utility functions:
auto copy(InputRange &&range, OutputIter out)
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)