12 #include <emmintrin.h>
// Stores the given blur amount by assigning blur_ to `blur`.
21 inline void setBlur(
unsigned blur_) { blur = blur_; }
// Declaration of the SSE blur routine.
//   in_      - input pixels
//   out_     - output pixels
//   srcWidth - width of the input (presumably counted in pixels — confirm
//              against the definition)
30 void blur_SSE(
const Pixel* in_,
Pixel* out_,
size_t srcWidth);
35 template<
typename Pixel>
47 template<
typename Pixel>
// NOTE(review): the declaration line of this function and several body lines
// are not visible in this excerpt; judging by the callers it is presumably
// doScale1 — confirm. The visible statements scale source lines into
// destination lines and draw scanlines between them.
50 template<
typename Pixel>
52 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
57 int scanlineFactor = settings.getScanlineFactor();
59 unsigned y = dstStartY;
// Fetch the first source line; srcStartY is post-incremented, so afterwards
// it refers to the next source line.
60 auto* srcLine = src.
getLinePtr(srcStartY++, srcWidth, buf);
// Scale that line into dstLine0, then (presumably) duplicate it into dstLine1.
62 scale(srcLine, dstLine0, dstWidth);
66 copy(dstLine0, dstLine1, dstWidth);
// Each pass consumes one source line and advances three destination lines.
68 for (; (y + 4) < dstEndY; y += 3, srcStartY += 1) {
69 srcLine = src.
getLinePtr(srcStartY, srcWidth, buf);
// The next source line goes to dstLine3 and is passed on to dstLine4.
71 scale(srcLine, dstLine3, dstWidth);
74 copy(dstLine3, dstLine4, dstWidth);
// dstLine2 (between the two scaled lines) receives the scanline drawn from
// dstLine0 and dstLine3.
77 scanline.draw(dstLine0, dstLine3, dstLine2,
78 scanlineFactor, dstWidth);
// One more source line is scaled into buf2 rather than into a destination
// line; it only serves as the second input of the scanline drawn into dstLine2.
// NOTE(review): presumably this runs after the loop — the closing brace is not
// visible here.
86 srcLine = src.
getLinePtr(srcStartY, srcWidth, buf);
88 scale(srcLine, buf2, dstWidth);
91 scanline.draw(dstLine0, buf2, dstLine2, scanlineFactor, dstWidth);
// Converts pairs of source lines into triples of destination lines.
//
// For every step the source lines srcY and srcY+1 are run through `scale`
// into destination lines dstY and dstY+2; destination line dstY+1 is produced
// by scanline.draw() from those two scaled lines.
//
//   src        - frame to read source lines from
//   srcStartY  - first source line
//   (unnamed)  - third parameter is unused here
//   srcWidth   - width passed to getLinePtr() for every source line
//   dst        - output whose lines are acquired, written and released
//   dstStartY  - first destination line
//   dstEndY    - loop runs while dstY < dstEndY
//   scale      - line scaler applied to each source line
97 template<
typename Pixel>
98 void Simple3xScaler<Pixel>::doScale2(FrameSource& src,
99 unsigned srcStartY,
unsigned ,
unsigned srcWidth,
100 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY,
101 PolyLineScaler<Pixel>&
scale)
// NOTE(review): `buf`, used below as the getLinePtr() work buffer, is not
// declared in the lines visible here.
104 int scanlineFactor = settings.getScanlineFactor();
105 unsigned dstWidth = dst.getWidth();
// Two source lines in, three destination lines out per iteration.
106 for (
unsigned srcY = srcStartY, dstY = dstStartY; dstY < dstEndY;
107 srcY += 2, dstY += 3) {
// First source line -> first destination line of the triple.
108 auto* srcLine0 = src.getLinePtr(srcY + 0, srcWidth, buf);
109 auto* dstLine0 = dst.acquireLine(dstY + 0);
110 scale(srcLine0, dstLine0, dstWidth);
// Second source line -> last destination line of the triple. The same `buf`
// is handed to getLinePtr() again; srcLine0 has already been consumed above.
112 auto* srcLine1 = src.getLinePtr(srcY + 1, srcWidth, buf);
113 auto* dstLine2 = dst.acquireLine(dstY + 2);
114 scale(srcLine1, dstLine2, dstWidth);
// Middle destination line is drawn from the two scaled lines around it.
116 auto* dstLine1 = dst.acquireLine(dstY + 1);
117 scanline.draw(dstLine0, dstLine2, dstLine1,
118 scanlineFactor, dstWidth);
// Hand all three lines back to the output.
120 dst.releaseLine(dstY + 0, dstLine0);
121 dst.releaseLine(dstY + 1, dstLine1);
122 dst.releaseLine(dstY + 2, dstLine2);
126 template<
typename Pixel>
127 void Simple3xScaler<Pixel>::scale2x1to9x3(FrameSource& src,
128 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
129 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
131 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
132 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
135 template<
typename Pixel>
136 void Simple3xScaler<Pixel>::scale2x2to9x3(FrameSource& src,
137 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
138 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
140 PolyScale<Pixel, Scale_2on9<Pixel>> op(pixelOps);
141 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
144 template<
typename Pixel>
145 void Simple3xScaler<Pixel>::scale1x1to3x3(FrameSource& src,
146 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
147 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
149 if (
unsigned blur = settings.getBlurFactor() / 3) {
150 blur_1on3->setBlur(blur);
151 PolyScaleRef<Pixel, Blur_1on3<Pixel>> op(*blur_1on3);
152 doScale1(src, srcStartY, srcEndY, srcWidth,
153 dst, dstStartY, dstEndY, op);
158 PolyScale<Pixel, Scale_1on3<Pixel>> op;
159 doScale1(src, srcStartY, srcEndY, srcWidth,
160 dst, dstStartY, dstEndY, op);
164 template<
typename Pixel>
165 void Simple3xScaler<Pixel>::scale1x2to3x3(FrameSource& src,
166 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
167 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
169 PolyScale<Pixel, Scale_1on3<Pixel>> op;
170 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
173 template<
typename Pixel>
174 void Simple3xScaler<Pixel>::scale4x1to9x3(FrameSource& src,
175 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
176 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
178 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
179 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
182 template<
typename Pixel>
183 void Simple3xScaler<Pixel>::scale4x2to9x3(FrameSource& src,
184 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
185 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
187 PolyScale<Pixel, Scale_4on9<Pixel>> op(pixelOps);
188 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
191 template<
typename Pixel>
192 void Simple3xScaler<Pixel>::scale2x1to3x3(FrameSource& src,
193 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
194 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
196 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
197 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
200 template<
typename Pixel>
201 void Simple3xScaler<Pixel>::scale2x2to3x3(FrameSource& src,
202 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
203 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
205 PolyScale<Pixel, Scale_2on3<Pixel>> op(pixelOps);
206 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
209 template<
typename Pixel>
210 void Simple3xScaler<Pixel>::scale8x1to9x3(FrameSource& src,
211 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
212 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
214 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
215 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
218 template<
typename Pixel>
219 void Simple3xScaler<Pixel>::scale8x2to9x3(FrameSource& src,
220 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
221 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
223 PolyScale<Pixel, Scale_8on9<Pixel>> op(pixelOps);
224 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
227 template<
typename Pixel>
228 void Simple3xScaler<Pixel>::scale4x1to3x3(FrameSource& src,
229 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
230 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
232 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
233 doScale1(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
236 template<
typename Pixel>
237 void Simple3xScaler<Pixel>::scale4x2to3x3(FrameSource& src,
238 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
239 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
241 PolyScale<Pixel, Scale_4on3<Pixel>> op(pixelOps);
242 doScale2(src, srcStartY, srcEndY, srcWidth, dst, dstStartY, dstEndY, op);
// Turns single-colour source lines into destination lines, three per source
// line: each source line's colour (getLineColor) fills two destination lines
// as-is and a third one with the colour darkened by the scanline factor.
//
//   src        - frame providing the line colours
//   srcStartY  - first source line
//   srcEndY    - forwarded to dispatchScale() for the remainder
//   dst        - output to fill
//   dstStartY  - first destination line
//   dstEndY    - end of the destination range
245 template<
typename Pixel>
246 void Simple3xScaler<Pixel>::scaleBlank1to3(
247 FrameSource& src,
unsigned srcStartY,
unsigned srcEndY,
248 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
250 int scanlineFactor = settings.getScanlineFactor();
252 unsigned dstHeight = dst.getHeight();
// Unless the range ends exactly at the bottom of the output, the fill loop
// stops three destination lines before dstEndY.
// NOTE(review): dstEndY - 3 is unsigned arithmetic; presumably dstEndY >= 3
// whenever this branch is taken — confirm.
253 unsigned stopDstY = (dstEndY == dstHeight)
254 ? dstEndY : dstEndY - 3;
255 unsigned srcY = srcStartY, dstY = dstStartY;
// One source line in, three destination lines out per iteration.
256 for (; dstY < stopDstY; srcY += 1, dstY += 3) {
257 auto color0 = src.getLineColor<
Pixel>(srcY);
258 Pixel color1 = scanline.darken(color0, scanlineFactor);
259 dst.fillLine(dstY + 0, color0);
260 dst.fillLine(dstY + 1, color0);
261 dst.fillLine(dstY + 2, color1);
// If the loop did not reach the bottom of the output, the remaining lines are
// handed to dispatchScale() using the width of the following source line.
263 if (dstY != dstHeight) {
264 unsigned nextLineWidth = src.getLineWidth(srcY + 1);
// The current line is expected to have width 1, the next one a different width.
265 assert(src.getLineWidth(srcY) == 1);
266 assert(nextLineWidth != 1);
267 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
272 template<
typename Pixel>
273 void Simple3xScaler<Pixel>::scaleBlank2to3(
274 FrameSource& src,
unsigned srcStartY,
unsigned ,
275 ScalerOutput<Pixel>& dst,
unsigned dstStartY,
unsigned dstEndY)
277 int scanlineFactor = settings.getScanlineFactor();
278 for (
unsigned srcY = srcStartY, dstY = dstStartY;
279 dstY < dstEndY; srcY += 2, dstY += 3) {
280 auto color0 = src.getLineColor<
Pixel>(srcY + 0);
281 auto color1 = src.getLineColor<
Pixel>(srcY + 1);
282 Pixel color01 = scanline.darken(color0, color1, scanlineFactor);
283 dst.fillLine(dstY + 0, color0);
284 dst.fillLine(dstY + 1, color01);
285 dst.fillLine(dstY + 2, color1);
292 template<
typename Pixel>
// SSE2 blur kernel: every group of 4 input pixels (16 bytes) yields 12 output
// pixels (three 16-byte stores).
// NOTE(review): the declaration line, the loop construct and the value
// operands of the stores are not visible in this excerpt; judging by the names
// used below (in_, out_, srcWidth) this is presumably the definition of
// blur_SSE — confirm.
302 template<
typename Pixel>
// Only 4-byte pixels are handled; any other pixel size trips the assert and
// returns.
305 if (
sizeof(
Pixel) != 4) {
306 assert(
false);
return;
// Preconditions: width is a multiple of 4 and at least 8; both buffers are
// 16-byte aligned (needed by the aligned loads/stores below).
309 assert((srcWidth % 4) == 0);
310 assert(srcWidth >= 8);
311 assert((
size_t(in_ ) % 16) == 0);
312 assert((
size_t(out_) % 16) == 0);
// Blend weights. They are applied with _mm_mulhi_epu16, which keeps the high
// 16 bits of each 16x16-bit product, i.e. they act as fractions of 0x10000.
// By construction c1 + c2 == 0x10000 and alpha + c3 == 0x10000.
// NOTE(review): with blur == 0, c3 would be 0x10000, which does not fit a
// 16-bit lane — presumably this routine is never called with a zero blur;
// confirm.
314 unsigned alpha = blur * 256;
315 unsigned c0 = alpha / 2;
316 unsigned c1 = alpha + c0;
317 unsigned c2 = 0x10000 - c1;
318 unsigned c3 = 0x10000 - alpha;
// Each constant holds one weight in its low four 16-bit lanes and another in
// its high four; the 0x4E shuffle swaps the two 64-bit halves.
319 __m128i C0C1 = _mm_set_epi16(c1, c1, c1, c1, c0, c0, c0, c0);
320 __m128i C1C0 = _mm_shuffle_epi32(C0C1, 0x4E);
321 __m128i C2C3 = _mm_set_epi16(c3, c3, c3, c3, c2, c2, c2, c2);
322 __m128i C3C2 = _mm_shuffle_epi32(C2C3, 0x4E);
// `in`/`out` are byte pointers to the last group of 4 input pixels and its 12
// output pixels; x is a negative byte offset such that in + x and out + 3 * x
// address the start of the buffers.
324 size_t tmp = srcWidth - 4;
325 const auto* in =
reinterpret_cast<const char*
>(in_ + tmp);
326 auto* out =
reinterpret_cast< char*
>(out_ + 3 * tmp);
327 auto x = -ptrdiff_t(tmp *
sizeof(
Pixel));
329 __m128i ZERO = _mm_setzero_si128();
// Load the first four pixels (a, b, c, d) and widen a and b from 8-bit to
// 16-bit channels; a_a_ holds pixel a twice.
332 __m128i abcd = _mm_load_si128(
reinterpret_cast<const __m128i*
>(in +
x));
333 __m128i a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
334 __m128i a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
335 __m128i a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
// There is no previous pixel yet: the "previous pixel" terms are seeded from
// pixel a itself.
336 __m128i d1d0 = _mm_shuffle_epi32(a0a1, 0x4E);
// --- Output pixels 0..3 of the group (stored at out + 3 * x + 0) ---
343 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
344 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
345 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
346 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
347 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
349 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
350 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
351 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
352 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
353 __m128i
p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
// Pack the 16-bit sums back to 8-bit channels with unsigned saturation.
354 __m128i p0123 = _mm_packus_epi16(p01,
p23);
355 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 *
x + 0),
// --- Output pixels 4..7 of the group (stored at out + 3 * x + 16) ---
359 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
360 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
361 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
362 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
363 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
365 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
366 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
367 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
368 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
369 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
370 __m128i p4567 = _mm_packus_epi16(p45, p67);
371 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 *
x + 16),
// --- Output pixels 8..11 of the group (stored at out + 3 * x + 32) ---
375 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
376 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
377 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
378 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
379 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
381 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
// The last output pixel of the group needs the next input pixel, so the next
// four input pixels are loaded here and abcd/a_b_/a_a_/a0a1 are refreshed.
382 abcd = _mm_load_si128(
reinterpret_cast<const __m128i*
>(in +
x + 16));
383 a_b_ = _mm_unpacklo_epi8(abcd, ZERO);
384 a_a_ = _mm_unpacklo_epi64(a_b_, a_b_);
385 a0a1 = _mm_mulhi_epu16(a_a_, C0C1);
386 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
387 __m128i p89ab = _mm_packus_epi16(p89, pab);
388 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 3 *
x + 32),
// --- Final group: same arithmetic, stored at out + 0 / + 16 / + 32 ---
// NOTE(review): the redeclarations below imply that the statements above sit
// in their own scope (presumably a loop body that is not visible here).
396 __m128i a2a3 = _mm_mulhi_epu16(a_a_, C2C3);
397 __m128i b_b_ = _mm_unpackhi_epi64(a_b_, a_b_);
398 __m128i b1b0 = _mm_mulhi_epu16(b_b_, C1C0);
399 __m128i xxb0 = _mm_unpackhi_epi64(ZERO, b1b0);
400 __m128i p01 = _mm_add_epi16(_mm_add_epi16(d1d0, a2a3), xxb0);
402 __m128i xxa1 = _mm_unpackhi_epi64(ZERO, a0a1);
403 __m128i b3b2 = _mm_mulhi_epu16(b_b_, C3C2);
404 __m128i a2b2 = shuffle<0xE4>(a2a3, b3b2);
405 __m128i b1xx = _mm_unpacklo_epi64(b1b0, ZERO);
406 __m128i
p23 = _mm_add_epi16(_mm_add_epi16(xxa1, a2b2), b1xx);
407 __m128i p0123 = _mm_packus_epi16(p01,
p23);
408 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 0),
412 __m128i a0xx = _mm_unpacklo_epi64(a0a1, ZERO);
413 __m128i c_d_ = _mm_unpackhi_epi8(abcd, ZERO);
414 __m128i c_c_ = _mm_unpacklo_epi64(c_d_, c_d_);
415 __m128i c0c1 = _mm_mulhi_epu16(c_c_, C0C1);
416 __m128i p45 = _mm_add_epi16(_mm_add_epi16(a0xx, b3b2), c0c1);
418 __m128i c2c3 = _mm_mulhi_epu16(c_c_, C2C3);
419 __m128i d_d_ = _mm_unpackhi_epi64(c_d_, c_d_);
420 d1d0 = _mm_mulhi_epu16(d_d_, C1C0);
421 __m128i xxd0 = _mm_unpackhi_epi64(ZERO, d1d0);
422 __m128i p67 = _mm_add_epi16(_mm_add_epi16(b1b0, c2c3), xxd0);
423 __m128i p4567 = _mm_packus_epi16(p45, p67);
424 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 16),
428 __m128i xxc1 = _mm_unpackhi_epi64(ZERO, c0c1);
429 __m128i d3d2 = _mm_mulhi_epu16(d_d_, C3C2);
430 __m128i c2d2 = shuffle<0xE4>(c2c3, d3d2);
431 __m128i d1xx = _mm_unpacklo_epi64(d1d0, ZERO);
432 __m128i p89 = _mm_add_epi16(_mm_add_epi16(xxc1, c2d2), d1xx);
434 __m128i c0xx = _mm_unpacklo_epi64(c0c1, ZERO);
// No further input is loaded here: the "next pixel" terms are derived from the
// last pixel (d) itself by swapping the halves of d1d0.
435 a0a1 = _mm_shuffle_epi32(d1d0, 0x4E);
436 __m128i pab = _mm_add_epi16(_mm_add_epi16(c0xx, d3d2), a0a1);
437 __m128i p89ab = _mm_packus_epi16(p89, pab);
438 _mm_store_si128(
reinterpret_cast<__m128i*
>(out + 32),
// Blurring 1->3 line scaler: `in` is read as srcWidth = dstWidth / 3 pixels
// and `out` receives three output pixels per input pixel.
// NOTE(review): the declaration line and several statements (loads of p0/p1,
// some assignments, closing braces) are not visible in this excerpt;
// presumably this is the call operator of Blur_1on3 — confirm.
443 template<
typename Pixel>
445 const Pixel* __restrict in,
Pixel* __restrict out,
468 size_t srcWidth = dstWidth / 3;
// 4-byte pixels are handed to blur_SSE. Any preprocessor guard or return
// around this call is not visible here.
470 if (
sizeof(
Pixel) == 4) {
471 blur_SSE(in, out, srcWidth);
// Scalar path. Weights are on a 0..256 scale: c1 + c2 == 256 and
// 2 * c0 + c3 == 256.
477 unsigned c0 = blur / 2;
478 unsigned c1 = blur + c0;
479 unsigned c2 = 256 - c1;
480 unsigned c3 = 256 - 2 * c0;
481 mult0.setFactor32(c0);
482 mult1.setFactor32(c1);
483 mult2.setFactor32(c2);
484 mult3.setFactor32(c3);
// Weighted terms of the first pixel.
488 uint32_t f0 = mult0.mul32(p0);
489 uint32_t f1 = mult1.mul32(p0);
// Main loop: two source pixels in, six output pixels out per pass.
// NOTE(review): srcWidth - 2 is evaluated in size_t, so a srcWidth below 2
// would wrap around — confirm callers guarantee a minimum width.
494 for (;
x < (srcWidth - 2);
x += 2) {
495 uint32_t g2 = mult2.mul32(p0);
496 out[3 *
x + 0] = mult0.conv32(g2 + f1);
498 uint32_t t0 = mult0.mul32(p1);
499 out[3 *
x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
501 f1 = mult1.mul32(p1);
502 out[3 *
x + 2] = mult0.conv32(g2 + f1);
504 uint32_t f2 = mult2.mul32(p1);
505 out[3 *
x + 3] = mult0.conv32(f2 + g1);
507 uint32_t t1 = mult0.mul32(p0);
508 out[3 *
x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + t1);
510 g1 = mult1.mul32(p0);
511 out[3 *
x + 5] = mult0.conv32(g1 + f2);
// Same arithmetic once more for the final two source pixels; here the last
// visible output (3 * x + 4) uses f0 where the loop body uses a freshly
// computed t1.
// NOTE(review): presumably this runs after the loop — the closing brace is not
// visible here.
513 uint32_t g2 = mult2.mul32(p0);
514 out[3 *
x + 0] = mult0.conv32(g2 + f1);
516 uint32_t t0 = mult0.mul32(p1);
517 out[3 *
x + 1] = mult0.conv32(f0 + mult3.mul32(p0) + t0);
519 f1 = mult1.mul32(p1);
520 out[3 *
x + 2] = mult0.conv32(g2 + f1);
522 uint32_t f2 = mult2.mul32(p1);
523 out[3 *
x + 3] = mult0.conv32(f2 + g1);
524 out[3 *
x + 4] = mult0.conv32(g0 + mult3.mul32(p1) + f0);
// NOTE(review): the declaration line and the condition selecting between the
// two calls below are not visible in this excerpt. What is visible: the scale
// request is forwarded to dispatchScale() in one of two ways.
528 template<
typename Pixel>
531 unsigned srcStartY,
unsigned srcEndY,
unsigned srcWidth,
// Variant 1: scale from `sf` instead of `src`, after re-reading the line width
// of the first source line from `sf`.
536 srcWidth = sf.getLineWidth(srcStartY);
537 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
538 dst, dstStartY, dstEndY);
// Variant 2: scale directly from `src` with the width passed in.
540 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
541 dst, dstStartY, dstEndY);
// Explicit instantiations of the scaler for 16-bit and 32-bit pixel types.
547 template class Simple3xScaler<uint16_t>;
550 template class Simple3xScaler<uint32_t>;