// Fragment of a templated definition that carries a member-initializer list;
// this appears to be the Simple2xScaler<Pixel> constructor (TODO confirm: the
// signature lines are missing, embedded listing numbers jump from 20 to 25).
20template<std::
unsigned_
integral Pixel>
// Initializes the `settings` member from the `renderSettings` argument.
25 , settings(renderSettings)
// Fragment of a Simple2xScaler<Pixel> member that walks source lines and
// destination lines in lock-step (one source line per two destination lines).
// NOTE(review): the line carrying the function name and several body lines are
// missing from this listing (embedded numbers skip 35, 37-38, 40-41, ...), so
// only the visible statements are described here.
34template<std::
unsigned_
integral Pixel>
36 FrameSource& src,
unsigned srcStartY,
unsigned srcEndY,
// Scanline darkening factor comes from the render settings.
39 int scanlineFactor = settings.getScanlineFactor();
// Stop two destination lines early unless the range ends exactly at
// dstHeight.
// NOTE(review): dstEndY is unsigned, so `dstEndY - 2` wraps around when
// dstEndY < 2 — confirm callers never pass such a range.
42 unsigned stopDstY = (dstEndY == dstHeight)
43 ? dstEndY : dstEndY - 2;
44 unsigned srcY = srcStartY, dstY = dstStartY;
// Each iteration consumes one source line and advances two destination lines.
45 for (; dstY < stopDstY; srcY += 1, dstY += 2) {
// color1 is color0 darkened by the scanline factor (color0 is defined on a
// line that is not visible here).
48 Pixel color1 = scanline.darken(color0, scanlineFactor);
// If the loop stopped before the bottom of the destination, the remaining
// source lines starting at srcY are handed to dispatchScale().
51 if (dstY != dstHeight) {
54 assert(nextLineWidth != 1);
55 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
// Returns a vector whose low 64 bits are the upper 64 bits of 'x' and whose
// high 64 bits are the lower 64 bits of 'y' (that is what _mm_shuffle_pd with
// immediate 1 selects). The casts only reinterpret the integer registers as
// double vectors and back so the double-precision shuffle can be used.
63static inline __m128i shuffle(__m128i x, __m128i y)
73 return _mm_castpd_si128(_mm_shuffle_pd(
74 _mm_castsi128_pd(x), _mm_castsi128_pd(y), 1));
// SSE2 horizontal blur that turns each 32-bit input pixel into two output
// pixels (4 input pixels -> 8 output pixels per vector pair).
//   in_, out_ : 16-byte aligned pixel buffers (asserted below)
//   c1_, c2_  : 16-bit weights; every weighted sum is shifted right by 8
//   width     : number of input pixels; must cover at least two __m128i
// Naming: 'abcd' holds four pixels a,b,c,d; 'a0b0' holds pixels a and b with
// each 8-bit channel widened to 16 bits; a trailing 1/2 means "multiplied by
// c1/c2".
// NOTE(review): the loop header, braces and the x update are missing from this
// listing (embedded numbers skip 100-104 and 124-127).
78static void blur1on2_SSE2(
79 const uint32_t* __restrict in_, uint32_t* __restrict out_,
80 unsigned c1_,
unsigned c2_,
size_t width)
// From here on 'width' is a byte count.
82 width *=
sizeof(uint32_t);
83 assert(width >= (2 *
sizeof(__m128i)));
84 assert((
reinterpret_cast<uintptr_t
>(in_ ) %
sizeof(__m128i)) == 0);
85 assert((
reinterpret_cast<uintptr_t
>(out_) %
sizeof(__m128i)) == 0);
// x is a negative byte offset; 'in' and 'out' are biased so that 'in + x' is
// the first input vector and 'out + 2 * x' is the start of the output.
87 ptrdiff_t x = -ptrdiff_t(width -
sizeof(__m128i));
88 const auto* in =
reinterpret_cast<const char*
>(in_ ) - x;
89 auto* out =
reinterpret_cast< char*
>(out_) - 2 * x;
// Broadcast the two weights to all eight 16-bit lanes.
92 __m128i c1 = _mm_set1_epi16(narrow<int16_t>(c1_));
93 __m128i c2 = _mm_set1_epi16(narrow<int16_t>(c2_));
94 __m128i zero = _mm_setzero_si128();
// NOTE(review): this initial load dereferences 'in', which (given the bias
// above) is the LAST input vector, while the reload further down uses
// 'in + x + 16'. Confirm whether 'in + x' (the first vector) was intended.
96 __m128i abcd = *
reinterpret_cast<const __m128i*
>(in);
// Widen pixels a,b to 16 bits per channel.
97 __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
// 0x44 duplicates the low 64 bits: there is no pixel to the left of the first
// one, so pixel 'a' stands in for it.
98 __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
99 __m128i d1a1 = _mm_mullo_epi16(c1, d0a0);
// --- first half: outputs for pixels a and b ---
105 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
106 __m128i b0c0 = shuffle(a0b0, c0d0);
107 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
108 __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
// daab = (c1*d + c2*a, c1*a + c2*b) >> 8; abbc = (c2*a + c1*b, c2*b + c1*c) >> 8
109 __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
110 __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
111 __m128i abab = _mm_packus_epi16(daab, abbc);
// 0xd8 swaps the two middle 32-bit elements so the two results for pixel a
// precede the two results for pixel b.
112 *
reinterpret_cast<__m128i*
>(out + 2 * x) =
113 _mm_shuffle_epi32(abab, 0xd8);
// Load the next four input pixels; their first pixel is the right-hand
// neighbour of the current pixel d.
114 abcd = *
reinterpret_cast<const __m128i*
>(in + x + 16);
115 a0b0 = _mm_unpacklo_epi8(abcd, zero);
// --- second half: outputs for pixels c and d ---
116 __m128i d0a0_= shuffle(c0d0, a0b0);
117 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
118 d1a1 = _mm_mullo_epi16(c1, d0a0_);
119 __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
120 __m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
121 __m128i cdcd = _mm_packus_epi16(bccd, cdda);
122 *
reinterpret_cast<__m128i*
>(out + 2 * x + 16) =
123 _mm_shuffle_epi32(cdcd, 0xd8);
// --- final vector: same computation, but nothing new is loaded; results go
// to 'out' and 'out + 16' (the last 32 output bytes) ---
128 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
129 __m128i b0c0 = shuffle(a0b0, c0d0);
130 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
131 __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
132 __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
133 __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
134 __m128i abab = _mm_packus_epi16(daab, abbc);
135 *
reinterpret_cast<__m128i*
>(out) = _mm_shuffle_epi32(abab, 0xd8);
// 0xee duplicates the high 64 bits: the last pixel 'd' stands in for its
// missing right-hand neighbour.
136 __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
137 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
138 __m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
139 __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
140 __m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
141 __m128i cdcd = _mm_packus_epi16(bccd, cddd);
142 *
reinterpret_cast<__m128i*
>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
// Horizontally blurs one line while doubling its width: every input pixel
// yields two output pixels, each a weighted mix of that pixel and one of its
// neighbours.
//   in    : source pixels
//   out   : destination, exactly twice as long as 'in' (asserted)
//   alpha : blur amount, at most 256 (asserted)
// NOTE(review): many lines are missing from this listing (embedded numbers
// skip 152-177, 179-182, 191-196, ...); the declarations of p0, p1, f1, x,
// mult1 and mult2 and all closing braces are not visible.
147template<std::
unsigned_
integral Pixel>
148void Simple2xScaler<Pixel>::blur1on2(
149 std::span<const Pixel> in, std::span<Pixel> out,
unsigned alpha)
151 assert((2 * in.size()) == out.size());
// Plain 1->2 scaler object; its use is on lines not visible here
// (presumably a no-blur shortcut — TODO confirm).
178 Scale_1on2<Pixel>
scale;
183 assert(alpha <= 256);
// c1 weights the neighbouring pixel, c2 the pixel itself; c1 + c2 == 256.
184 unsigned c1 = alpha / 4;
185 unsigned c2 = 256 - c1;
// Pixels that are 4 bytes wide are handed to the SSE2 routine.
188 if constexpr (
sizeof(
Pixel) == 4) {
190 blur1on2_SSE2(in.data(), out.data(), c1, c2, in.size());
// Scalar path: mult1 multiplies by c1, mult2 by c2.
197 mult1.setFactor32(c1);
198 mult2.setFactor32(c2);
202 unsigned f0 = mult1.mul32(p0);
205 size_t srcWidth = in.size();
// Two input pixels (four output pixels) per iteration.
// NOTE(review): srcWidth is a size_t, so 'srcWidth - 2' wraps around when the
// input holds fewer than 2 pixels — confirm callers guarantee size >= 2.
207 for (; x < (srcWidth - 2); x += 2) {
208 unsigned tmp1 = mult2.mul32(p0);
209 out[2 * x + 0] = mult1.conv32(f1 + tmp1);
212 f1 = mult1.mul32(p1);
213 out[2 * x + 1] = mult1.conv32(f1 + tmp1);
215 unsigned tmp2 = mult2.mul32(p1);
216 out[2 * x + 2] = mult1.conv32(f0 + tmp2);
219 f0 = mult1.mul32(p0);
220 out[2 * x + 3] = mult1.conv32(f0 + tmp2);
// Same steps repeated once more for the final pixel pair (the statement
// writing out[2 * x + 3] is not visible in this listing).
223 unsigned tmp1 = mult2.mul32(p0);
224 out[2 * x + 0] = mult1.conv32(f1 + tmp1);
227 f1 = mult1.mul32(p1);
228 out[2 * x + 1] = mult1.conv32(f1 + tmp1);
230 unsigned tmp2 = mult2.mul32(p1);
231 out[2 * x + 2] = mult1.conv32(f0 + tmp2);
// SSE2 horizontal blur at unchanged width: each output pixel is
// (c1 * left + c2 * centre + c1 * right) >> 8, four pixels per vector.
//   in_, out_ : 16-byte aligned pixel buffers (asserted below)
//   c1_, c2_  : 16-bit weights for the neighbours and the centre pixel
//   width     : number of pixels; must cover at least two __m128i
// Naming follows blur1on2_SSE2: 'a0b0' is pixels a,b widened to 16 bits per
// channel, a trailing 2 means "multiplied by c2".
// NOTE(review): the loop header, braces and the x update are missing from this
// listing (embedded numbers skip 260-264 and 278-281).
239static void blur1on1_SSE2(
240 const uint32_t* __restrict in_, uint32_t* __restrict out_,
241 unsigned c1_,
unsigned c2_,
size_t width)
// From here on 'width' is a byte count.
243 width *=
sizeof(uint32_t);
244 assert(width >= (2 *
sizeof(__m128i)));
245 assert((
reinterpret_cast<uintptr_t
>(in_ ) %
sizeof(__m128i)) == 0);
246 assert((
reinterpret_cast<uintptr_t
>(out_) %
sizeof(__m128i)) == 0);
// x is a negative byte offset; 'in + x' / 'out + x' address the first vector.
248 ptrdiff_t x = -ptrdiff_t(width -
sizeof(__m128i));
249 const auto* in =
reinterpret_cast<const char*
>(in_ ) - x;
250 auto* out =
reinterpret_cast< char*
>(out_) - x;
253 __m128i c1 = _mm_set1_epi16(narrow<int16_t>(c1_));
254 __m128i c2 = _mm_set1_epi16(narrow<int16_t>(c2_));
255 __m128i zero = _mm_setzero_si128();
// NOTE(review): this initial load dereferences 'in', which (given the bias
// above) is the LAST input vector, while the reload further down uses
// 'in + x + 16'. Confirm whether 'in + x' (the first vector) was intended.
257 __m128i abcd = *
reinterpret_cast<const __m128i*
>(in);
258 __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
// 0x44 duplicates the low 64 bits: pixel 'a' stands in for its missing
// left-hand neighbour.
259 __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
// --- outputs for pixels a and b ---
265 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
266 __m128i b0c0 = shuffle(a0b0, c0d0);
267 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
// dbac = c1 * (left + right) for pixels a and b.
268 __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
269 __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
// Load the next four pixels; their first pixel is the right-hand neighbour of
// the current pixel d.
270 abcd = *
reinterpret_cast<const __m128i*
>(in + x + 16);
271 a0b0 = _mm_unpacklo_epi8(abcd, zero);
// --- outputs for pixels c and d ---
272 d0a0 = shuffle(c0d0, a0b0);
273 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
274 __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
275 __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
// Pack the four 16-bit-per-channel results back to four 32-bit pixels.
276 *
reinterpret_cast<__m128i*
>(out + x) =
277 _mm_packus_epi16(aabb, ccdd);
// --- final vector: same computation, but nothing new is loaded ---
282 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
283 __m128i b0c0 = shuffle(a0b0, c0d0);
284 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
285 __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
286 __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
// 0xee duplicates the high 64 bits: the last pixel 'd' stands in for its
// missing right-hand neighbour.
287 __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
288 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
289 __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
290 __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
291 *
reinterpret_cast<__m128i*
>(out) = _mm_packus_epi16(aabb, ccdd);
// Horizontally blurs one line at unchanged width: each output pixel mixes the
// input pixel with its left and right neighbours.
//   in    : source pixels
//   out   : destination pixels
//   alpha : blur amount
// NOTE(review): many lines are missing from this listing (embedded numbers
// skip 298-322, 324-327, 335-340, ...); the declarations of p0, p1, f1, x,
// mult1 and mult3 and all closing braces are not visible.
295template<std::
unsigned_
integral Pixel>
296void Simple2xScaler<Pixel>::blur1on1(
297 std::span<const Pixel> in, std::span<Pixel> out,
unsigned alpha)
// Plain 1->1 copier object; its use is on lines not visible here
// (presumably a no-blur shortcut — TODO confirm).
323 Scale_1on1<Pixel>
copy;
// c1 weights each of the two neighbours, c2 the centre pixel.
328 unsigned c1 = alpha / 4;
329 unsigned c2 = 256 - alpha / 2;
// Pixels that are 4 bytes wide are handed to the SSE2 routine.
332 if constexpr (
sizeof(
Pixel) == 4) {
334 blur1on1_SSE2(in.data(), out.data(), c1, c2, in.size());
// Scalar path: mult1 multiplies by c1, mult3 by c2.
341 mult1.setFactor32(c1);
342 mult3.setFactor32(c2);
346 unsigned f0 = mult1.mul32(p0);
349 size_t srcWidth = in.size();
// Two pixels per iteration.
// NOTE(review): srcWidth is a size_t, so 'srcWidth - 2' wraps around when the
// input holds fewer than 2 pixels — confirm callers guarantee size >= 2.
351 for (; x < (srcWidth - 2); x += 2) {
// out = c1-weighted previous + c2-weighted current + c1-weighted next.
353 unsigned t0 = mult1.mul32(p1);
354 out[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
358 unsigned t1 = mult1.mul32(p0);
359 out[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
// Final pixel pair: the last output reuses t0 (the c1-weighted p1) as its
// right-hand term, since no further pixel is read.
364 unsigned t0 = mult1.mul32(p1);
365 out[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
367 out[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
// Produces the in-between output line 'out' from the two neighbouring lines
// 'in1' and 'in2', using scanline factor 'factor'.
// NOTE(review): the branch structure is missing from this listing (embedded
// numbers skip 373-374, 376, 378-381); only the two visible statements are
// described.
370template<std::
unsigned_
integral Pixel>
371void Simple2xScaler<Pixel>::drawScanline(
372 std::span<const Pixel> in1, std::span<const Pixel> in2, std::span<Pixel> out,
int factor)
// Delegates to the scanline helper with both neighbouring lines.
375 scanline.draw(in1, in2, out, factor);
// Plain 1->1 scaler object; its use is on a line not visible here
// (presumably an alternative path that copies a line — TODO confirm).
377 Scale_1on1<Pixel>
scale;
// Scales source lines to double width and double height: even destination
// lines are blurred, width-doubled source lines (blur1on2); odd destination
// lines are scanlines drawn from the lines above and below.
// The second 'unsigned' parameter (the source end line) is unnamed and unused.
// NOTE(review): the declarations of 'buf' and 'buf2', the update of dstLine0
// inside the loop and all braces are missing from this listing.
382template<std::
unsigned_
integral Pixel>
383void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
384 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
385 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
388 int blur = settings.getBlurFactor();
389 int scanlineFactor = settings.getScanlineFactor();
// Prime the pipeline: blur the first source line into the first output line.
391 unsigned dstY = dstStartY;
392 auto srcLine = src.getLine(srcStartY++, buf);
393 auto dstLine0 = dst.acquireLine(dstY + 0);
394 blur1on2(srcLine, dstLine0, blur);
// NOTE(review): dstEndY is unsigned, so 'dstEndY - 2' wraps around when
// dstEndY < 2 — confirm callers never pass such a range.
396 for (; dstY < dstEndY - 2; dstY += 2) {
// Blur the next source line directly into output line dstY + 2 ...
397 srcLine = src.getLine(srcStartY++, buf);
398 auto dstLine2 = dst.acquireLine(dstY + 2);
399 blur1on2(srcLine, dstLine2, blur);
// ... then fill line dstY + 1 from its two neighbours.
401 auto dstLine1 = dst.acquireLine(dstY + 1);
402 drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor);
404 dst.releaseLine(dstY + 0, dstLine0);
405 dst.releaseLine(dstY + 1, dstLine1);
// Last line pair: the following source line is blurred into the temporary
// 'buf2' instead of an output line and only serves as the lower neighbour of
// the final scanline.
409 srcLine = src.getLine(srcStartY++, buf);
411 blur1on2(srcLine, buf2, blur);
413 auto dstLine1 = dst.acquireLine(dstY + 1);
414 drawScanline(dstLine0, buf2, dstLine1, scanlineFactor);
415 dst.releaseLine(dstY + 0, dstLine0);
416 dst.releaseLine(dstY + 1, dstLine1);
// Scales source lines to unchanged width and double height: even destination
// lines are blurred source lines (blur1on1); odd destination lines are
// scanlines drawn from the lines above and below.
// The second 'unsigned' parameter (the source end line) is unnamed and unused.
// NOTE(review): the declarations of 'buf' and 'buf2', the update of dstLine0
// inside the loop and all braces are missing from this listing.
419template<std::
unsigned_
integral Pixel>
420void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
421 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
422 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
425 int blur = settings.getBlurFactor();
426 int scanlineFactor = settings.getScanlineFactor();
// Prime the pipeline: blur the first source line into the first output line.
428 unsigned dstY = dstStartY;
429 auto srcLine = src.getLine(srcStartY++, buf);
430 auto dstLine0 = dst.acquireLine(dstY);
431 blur1on1(srcLine, dstLine0, blur);
// NOTE(review): dstEndY is unsigned, so 'dstEndY - 2' wraps around when
// dstEndY < 2 — confirm callers never pass such a range.
433 for (; dstY < dstEndY - 2; dstY += 2) {
// Blur the next source line directly into output line dstY + 2 ...
434 srcLine = src.getLine(srcStartY++, buf);
435 auto dstLine2 = dst.acquireLine(dstY + 2);
436 blur1on1(srcLine, dstLine2, blur);
// ... then fill line dstY + 1 from its two neighbours.
438 auto dstLine1 = dst.acquireLine(dstY + 1);
439 drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor);
441 dst.releaseLine(dstY + 0, dstLine0);
442 dst.releaseLine(dstY + 1, dstLine1);
// Last line pair: the following source line is blurred into the temporary
// 'buf2' instead of an output line and only serves as the lower neighbour of
// the final scanline.
446 srcLine = src.getLine(srcStartY++, buf);
448 blur1on1(srcLine, buf2, blur);
450 auto dstLine1 = dst.acquireLine(dstY + 1);
451 drawScanline(dstLine0, buf2, dstLine1, scanlineFactor);
452 dst.releaseLine(dstY + 0, dstLine0);
453 dst.releaseLine(dstY + 1, dstLine1);
// Entry point for scaling a range of source lines into 'dst'. Either scales
// 'src' directly, or first wraps it together with the 'superImpose' frame and
// scales the combined source.
// NOTE(review): the condition selecting between the two paths is missing from
// this listing (embedded numbers skip 461-471); presumably it tests whether
// 'superImpose' is non-null, since the first path dereferences it — confirm.
456template<std::
unsigned_
integral Pixel>
457void Simple2xScaler<Pixel>::scaleImage(
458 FrameSource& src,
const RawFrame* superImpose,
459 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
460 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
// Superimpose path: the line width is re-queried from the combined source
// before dispatching.
472 SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
473 srcWidth = sf.getLineWidth(srcStartY);
474 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
475 dst, dstStartY, dstEndY);
// Plain path: dispatch on the original source with the caller's width.
477 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
478 dst, dstStartY, dstEndY);
// Explicit instantiations of the scaler for 16-bit and 32-bit pixel types.
// NOTE(review): embedded listing numbers jump 484 -> 487, so lines between
// the two instantiations (possibly preprocessor guards) are not visible.
484template class Simple2xScaler<uint16_t>;
487template class Simple2xScaler<uint32_t>;
Interface for getting lines from a video frame.
Pixel getLineColor(unsigned line) const
Get the (single) color of the given line.
virtual unsigned getLineWidth(unsigned line) const =0
Gets the number of display pixels on the given line.
Class containing all settings for renderers.
Base class for 2x scalers.
virtual unsigned getHeight() const =0
virtual void fillLine(unsigned y, Pixel color)=0
Scaler which assigns the color of the original pixel to all pixels in the 2x2 square.
Simple2xScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
constexpr mat4 scale(const vec3 &xyz)
This file implements 3 utility functions:
auto copy(InputRange &&range, OutputIter out)
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)