openMSX
Simple2xScaler.cc
Go to the documentation of this file.
1#include "Simple2xScaler.hh"
3#include "LineScalers.hh"
4#include "RawFrame.hh"
5#include "ScalerOutput.hh"
6#include "RenderSettings.hh"
7#include "narrow.hh"
8#include "vla.hh"
9#include <cassert>
10#include <cstddef>
11#include <cstdint>
12#ifdef __SSE2__
13#include <emmintrin.h>
14#endif
15
16namespace openmsx {
17
18// class Simple2xScaler
19
20template<std::unsigned_integral Pixel>
22 const PixelOperations<Pixel>& pixelOps_,
23 RenderSettings& renderSettings)
24 : Scaler2<Pixel>(pixelOps_)
25 , settings(renderSettings)
26 , pixelOps(pixelOps_)
27 , mult1(pixelOps)
28 , mult2(pixelOps)
29 , mult3(pixelOps)
30 , scanline(pixelOps)
31{
32}
33
34template<std::unsigned_integral Pixel>
36 FrameSource& src, unsigned srcStartY, unsigned srcEndY,
37 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
38{
39 int scanlineFactor = settings.getScanlineFactor();
40
41 unsigned dstHeight = dst.getHeight();
42 unsigned stopDstY = (dstEndY == dstHeight)
43 ? dstEndY : dstEndY - 2;
44 unsigned srcY = srcStartY, dstY = dstStartY;
45 for (/* */; dstY < stopDstY; srcY += 1, dstY += 2) {
46 auto color0 = src.getLineColor<Pixel>(srcY);
47 dst.fillLine(dstY + 0, color0);
48 Pixel color1 = scanline.darken(color0, scanlineFactor);
49 dst.fillLine(dstY + 1, color1);
50 }
51 if (dstY != dstHeight) {
52 unsigned nextLineWidth = src.getLineWidth(srcY + 1);
53 assert(src.getLineWidth(srcY) == 1);
54 assert(nextLineWidth != 1);
55 this->dispatchScale(src, srcY, srcEndY, nextLineWidth,
56 dst, dstY, dstEndY);
57 }
58}
59
60#ifdef __SSE2__
61
// Builds a vector from the upper 64 bits of 'x' (placed in the lower half of
// the result) followed by the lower 64 bits of 'y' (placed in the upper half).
static inline __m128i shuffle(__m128i x, __m128i y)
{
	// There is no SSE2 integer instruction for this particular 64-bit
	// selection, so the double-precision shuffle is borrowed for it. Both
	// interpretations live in the same xmm registers, which makes the
	// result bit-exact, and the casts themselves compile to nothing.
	// Be aware though that (some?) x86 CPUs track float and integer data
	// in separate physical domains, so hopping between them may still cost
	// a little time on such chips.
	const __m128d xBits = _mm_castsi128_pd(x);
	const __m128d yBits = _mm_castsi128_pd(y);
	const __m128d mixed = _mm_shuffle_pd(xBits, yBits, 1);
	return _mm_castpd_si128(mixed);
}
76
77// 32bpp
78static void blur1on2_SSE2(
79 const uint32_t* __restrict in_, uint32_t* __restrict out_,
80 unsigned c1_, unsigned c2_, size_t width)
81{
82 width *= sizeof(uint32_t); // in bytes
83 assert(width >= (2 * sizeof(__m128i)));
84 assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
85 assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
86
87 ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
88 const auto* in = reinterpret_cast<const char*>(in_ ) - x;
89 auto* out = reinterpret_cast< char*>(out_) - 2 * x;
90
91 // Setup first iteration
92 __m128i c1 = _mm_set1_epi16(narrow<int16_t>(c1_));
93 __m128i c2 = _mm_set1_epi16(narrow<int16_t>(c2_));
94 __m128i zero = _mm_setzero_si128();
95
96 __m128i abcd = *reinterpret_cast<const __m128i*>(in);
97 __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
98 __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
99 __m128i d1a1 = _mm_mullo_epi16(c1, d0a0);
100
101 // Each iteration reads 4 pixels and generates 8 pixels
102 do {
103 // At the start of each iteration these variables are live:
104 // abcd, a0b0, d1a1
105 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
106 __m128i b0c0 = shuffle(a0b0, c0d0);
107 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
108 __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
109 __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
110 __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
111 __m128i abab = _mm_packus_epi16(daab, abbc);
112 *reinterpret_cast<__m128i*>(out + 2 * x) =
113 _mm_shuffle_epi32(abab, 0xd8);
114 abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
115 a0b0 = _mm_unpacklo_epi8(abcd, zero);
116 __m128i d0a0_= shuffle(c0d0, a0b0);
117 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
118 d1a1 = _mm_mullo_epi16(c1, d0a0_);
119 __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
120 __m128i cdda = _mm_srli_epi16(_mm_add_epi16(c2d2, d1a1), 8);
121 __m128i cdcd = _mm_packus_epi16(bccd, cdda);
122 *reinterpret_cast<__m128i*>(out + 2 * x + 16) =
123 _mm_shuffle_epi32(cdcd, 0xd8);
124 x += 16;
125 } while (x < 0);
126
127 // Last iteration (because this doesn't need to read new input)
128 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
129 __m128i b0c0 = shuffle(a0b0, c0d0);
130 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
131 __m128i b1c1 = _mm_mullo_epi16(c1, b0c0);
132 __m128i daab = _mm_srli_epi16(_mm_add_epi16(d1a1, a2b2), 8);
133 __m128i abbc = _mm_srli_epi16(_mm_add_epi16(a2b2, b1c1), 8);
134 __m128i abab = _mm_packus_epi16(daab, abbc);
135 *reinterpret_cast<__m128i*>(out) = _mm_shuffle_epi32(abab, 0xd8);
136 __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
137 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
138 __m128i d1d1 = _mm_mullo_epi16(c1, d0d0);
139 __m128i bccd = _mm_srli_epi16(_mm_add_epi16(b1c1, c2d2), 8);
140 __m128i cddd = _mm_srli_epi16(_mm_add_epi16(c2d2, d1d1), 8);
141 __m128i cdcd = _mm_packus_epi16(bccd, cddd);
142 *reinterpret_cast<__m128i*>(out + 16) = _mm_shuffle_epi32(cdcd, 0xd8);
143}
144
145#endif
146
147template<std::unsigned_integral Pixel>
148void Simple2xScaler<Pixel>::blur1on2(
149 std::span<const Pixel> in, std::span<Pixel> out, unsigned alpha)
150{
151 assert((2 * in.size()) == out.size());
152 /* This routine is functionally equivalent to the following:
153 *
154 * void blur1on2(const Pixel* in, Pixel* out, unsigned alpha)
155 * {
156 * unsigned c1 = alpha / 4;
157 * unsigned c2 = 256 - c1;
158 *
159 * Pixel prev, curr, next;
160 * prev = curr = in[0];
161 *
162 * unsigned x = 0;
163 * for (; x < (srcWidth - 1); ++x) {
164 * out[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
165 * Pixel next = in[x + 1];
166 * out[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
167 * prev = curr;
168 * curr = next;
169 * }
170 *
171 * out[2 * x + 0] = (c1 * prev + c2 * curr) >> 8;
172 * next = curr;
173 * out[2 * x + 1] = (c1 * next + c2 * curr) >> 8;
174 * }
175 */
176
177 if (alpha == 0) {
178 Scale_1on2<Pixel> scale;
179 scale(in, out);
180 return;
181 }
182
183 assert(alpha <= 256);
184 unsigned c1 = alpha / 4;
185 unsigned c2 = 256 - c1;
186
187#ifdef __SSE2__
188 if constexpr (sizeof(Pixel) == 4) {
189 // SSE2, only 32bpp
190 blur1on2_SSE2(in.data(), out.data(), c1, c2, in.size());
191 return;
192 }
193#endif
194 // C++ routine, both 16bpp and 32bpp.
195 // The loop is 2x unrolled and all common subexpressions and redundant
196 // assignments have been eliminated. 1 iteration generates 4 pixels.
197 mult1.setFactor32(c1);
198 mult2.setFactor32(c2);
199
200 Pixel p0 = in[0];
201 Pixel p1;
202 unsigned f0 = mult1.mul32(p0);
203 unsigned f1 = f0;
204
205 size_t srcWidth = in.size();
206 size_t x = 0;
207 for (; x < (srcWidth - 2); x += 2) {
208 unsigned tmp1 = mult2.mul32(p0);
209 out[2 * x + 0] = mult1.conv32(f1 + tmp1);
210
211 p1 = in[x + 1];
212 f1 = mult1.mul32(p1);
213 out[2 * x + 1] = mult1.conv32(f1 + tmp1);
214
215 unsigned tmp2 = mult2.mul32(p1);
216 out[2 * x + 2] = mult1.conv32(f0 + tmp2);
217
218 p0 = in[x + 2];
219 f0 = mult1.mul32(p0);
220 out[2 * x + 3] = mult1.conv32(f0 + tmp2);
221 }
222
223 unsigned tmp1 = mult2.mul32(p0);
224 out[2 * x + 0] = mult1.conv32(f1 + tmp1);
225
226 p1 = in[x + 1];
227 f1 = mult1.mul32(p1);
228 out[2 * x + 1] = mult1.conv32(f1 + tmp1);
229
230 unsigned tmp2 = mult2.mul32(p1);
231 out[2 * x + 2] = mult1.conv32(f0 + tmp2);
232
233 out[2 * x + 3] = p1;
234}
235
236#ifdef __SSE2__
237
238// 32bpp
239static void blur1on1_SSE2(
240 const uint32_t* __restrict in_, uint32_t* __restrict out_,
241 unsigned c1_, unsigned c2_, size_t width)
242{
243 width *= sizeof(uint32_t); // in bytes
244 assert(width >= (2 * sizeof(__m128i)));
245 assert((reinterpret_cast<uintptr_t>(in_ ) % sizeof(__m128i)) == 0);
246 assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
247
248 ptrdiff_t x = -ptrdiff_t(width - sizeof(__m128i));
249 const auto* in = reinterpret_cast<const char*>(in_ ) - x;
250 auto* out = reinterpret_cast< char*>(out_) - x;
251
252 // Setup first iteration
253 __m128i c1 = _mm_set1_epi16(narrow<int16_t>(c1_));
254 __m128i c2 = _mm_set1_epi16(narrow<int16_t>(c2_));
255 __m128i zero = _mm_setzero_si128();
256
257 __m128i abcd = *reinterpret_cast<const __m128i*>(in);
258 __m128i a0b0 = _mm_unpacklo_epi8(abcd, zero);
259 __m128i d0a0 = _mm_shuffle_epi32(a0b0, 0x44);
260
261 // Each iteration reads 4 pixels and generates 4 pixels
262 do {
263 // At the start of each iteration these variables are live:
264 // abcd, a0b0, d0a0
265 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
266 __m128i b0c0 = shuffle(a0b0, c0d0);
267 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
268 __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
269 __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
270 abcd = *reinterpret_cast<const __m128i*>(in + x + 16);
271 a0b0 = _mm_unpacklo_epi8(abcd, zero);
272 d0a0 = shuffle(c0d0, a0b0);
273 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
274 __m128i bdca = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0a0));
275 __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdca, c2d2), 8);
276 *reinterpret_cast<__m128i*>(out + x) =
277 _mm_packus_epi16(aabb, ccdd);
278 x += 16;
279 } while (x < 0);
280
281 // Last iteration (because this doesn't need to read new input)
282 __m128i c0d0 = _mm_unpackhi_epi8(abcd, zero);
283 __m128i b0c0 = shuffle(a0b0, c0d0);
284 __m128i a2b2 = _mm_mullo_epi16(c2, a0b0);
285 __m128i dbac = _mm_mullo_epi16(c1, _mm_add_epi16(d0a0, b0c0));
286 __m128i aabb = _mm_srli_epi16(_mm_add_epi16(dbac, a2b2), 8);
287 __m128i d0d0 = _mm_shuffle_epi32(c0d0, 0xee);
288 __m128i c2d2 = _mm_mullo_epi16(c2, c0d0);
289 __m128i bdcd = _mm_mullo_epi16(c1, _mm_add_epi16(b0c0, d0d0));
290 __m128i ccdd = _mm_srli_epi16(_mm_add_epi16(bdcd, c2d2), 8);
291 *reinterpret_cast<__m128i*>(out) = _mm_packus_epi16(aabb, ccdd);
292}
293
294#endif
295template<std::unsigned_integral Pixel>
296void Simple2xScaler<Pixel>::blur1on1(
297 std::span<const Pixel> in, std::span<Pixel> out, unsigned alpha)
298{
299 /* This routine is functionally equivalent to the following:
300 *
301 * void blur1on1(const Pixel* in, Pixel* out, unsigned alpha)
302 * {
303 * unsigned c1 = alpha / 4;
304 * unsigned c2 = 256 - alpha / 2;
305 *
306 * Pixel prev, curr, next;
307 * prev = curr = in[0];
308 *
309 * unsigned x = 0;
310 * for (; x < (srcWidth - 1); ++x) {
311 * next = in[x + 1];
312 * out[x] = (c1 * prev + c2 * curr + c1 * next) >> 8;
313 * prev = curr;
314 * curr = next;
315 * }
316 *
317 * next = curr;
318 * out[x] = c1 * prev + c2 * curr + c1 * next;
319 * }
320 */
321
322 if (alpha == 0) {
323 Scale_1on1<Pixel> copy;
324 copy(in, out);
325 return;
326 }
327
328 unsigned c1 = alpha / 4;
329 unsigned c2 = 256 - alpha / 2;
330
331#ifdef __SSE2__
332 if constexpr (sizeof(Pixel) == 4) {
333 // SSE2, only 32bpp
334 blur1on1_SSE2(in.data(), out.data(), c1, c2, in.size());
335 return;
336 }
337#endif
338 // C++ routine, both 16bpp and 32bpp.
339 // The loop is 2x unrolled and all common subexpressions and redundant
340 // assignments have been eliminated. 1 iteration generates 2 pixels.
341 mult1.setFactor32(c1);
342 mult3.setFactor32(c2);
343
344 Pixel p0 = in[0];
345 Pixel p1;
346 unsigned f0 = mult1.mul32(p0);
347 unsigned f1 = f0;
348
349 size_t srcWidth = in.size();
350 size_t x = 0;
351 for (; x < (srcWidth - 2); x += 2) {
352 p1 = in[x + 1];
353 unsigned t0 = mult1.mul32(p1);
354 out[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
355 f0 = t0;
356
357 p0 = in[x + 2];
358 unsigned t1 = mult1.mul32(p0);
359 out[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t1);
360 f1 = t1;
361 }
362
363 p1 = in[x + 1];
364 unsigned t0 = mult1.mul32(p1);
365 out[x] = mult1.conv32(f0 + mult3.mul32(p0) + t0);
366
367 out[x + 1] = mult1.conv32(f1 + mult3.mul32(p1) + t0);
368}
369
370template<std::unsigned_integral Pixel>
371void Simple2xScaler<Pixel>::drawScanline(
372 std::span<const Pixel> in1, std::span<const Pixel> in2, std::span<Pixel> out, int factor)
373{
374 if (factor != 255) {
375 scanline.draw(in1, in2, out, factor);
376 } else {
377 Scale_1on1<Pixel> scale;
378 scale(in1, out);
379 }
380}
381
382template<std::unsigned_integral Pixel>
383void Simple2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
384 unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
385 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
386{
387 VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
388 int blur = settings.getBlurFactor();
389 int scanlineFactor = settings.getScanlineFactor();
390
391 unsigned dstY = dstStartY;
392 auto srcLine = src.getLine(srcStartY++, buf);
393 auto dstLine0 = dst.acquireLine(dstY + 0);
394 blur1on2(srcLine, dstLine0, blur);
395
396 for (; dstY < dstEndY - 2; dstY += 2) {
397 srcLine = src.getLine(srcStartY++, buf);
398 auto dstLine2 = dst.acquireLine(dstY + 2);
399 blur1on2(srcLine, dstLine2, blur);
400
401 auto dstLine1 = dst.acquireLine(dstY + 1);
402 drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor);
403
404 dst.releaseLine(dstY + 0, dstLine0);
405 dst.releaseLine(dstY + 1, dstLine1);
406 dstLine0 = dstLine2;
407 }
408
409 srcLine = src.getLine(srcStartY++, buf);
410 VLA_SSE_ALIGNED(Pixel, buf2, 2 * size_t(srcWidth));
411 blur1on2(srcLine, buf2, blur);
412
413 auto dstLine1 = dst.acquireLine(dstY + 1);
414 drawScanline(dstLine0, buf2, dstLine1, scanlineFactor);
415 dst.releaseLine(dstY + 0, dstLine0);
416 dst.releaseLine(dstY + 1, dstLine1);
417}
418
419template<std::unsigned_integral Pixel>
420void Simple2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
421 unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
422 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
423{
424 VLA_SSE_ALIGNED(Pixel, buf, srcWidth);
425 int blur = settings.getBlurFactor();
426 int scanlineFactor = settings.getScanlineFactor();
427
428 unsigned dstY = dstStartY;
429 auto srcLine = src.getLine(srcStartY++, buf);
430 auto dstLine0 = dst.acquireLine(dstY);
431 blur1on1(srcLine, dstLine0, blur);
432
433 for (; dstY < dstEndY - 2; dstY += 2) {
434 srcLine = src.getLine(srcStartY++, buf);
435 auto dstLine2 = dst.acquireLine(dstY + 2);
436 blur1on1(srcLine, dstLine2, blur);
437
438 auto dstLine1 = dst.acquireLine(dstY + 1);
439 drawScanline(dstLine0, dstLine2, dstLine1, scanlineFactor);
440
441 dst.releaseLine(dstY + 0, dstLine0);
442 dst.releaseLine(dstY + 1, dstLine1);
443 dstLine0 = dstLine2;
444 }
445
446 srcLine = src.getLine(srcStartY++, buf);
447 VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
448 blur1on1(srcLine, buf2, blur);
449
450 auto dstLine1 = dst.acquireLine(dstY + 1);
451 drawScanline(dstLine0, buf2, dstLine1, scanlineFactor);
452 dst.releaseLine(dstY + 0, dstLine0);
453 dst.releaseLine(dstY + 1, dstLine1);
454}
455
456template<std::unsigned_integral Pixel>
457void Simple2xScaler<Pixel>::scaleImage(
458 FrameSource& src, const RawFrame* superImpose,
459 unsigned srcStartY, unsigned srcEndY, unsigned srcWidth,
460 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
461{
462 if (superImpose) {
463 // Note: this implementation is different from the openGL
464 // version. Here we first alpha-blend and then scale, so the
465 // video layer will also get blurred (and possibly down-scaled
466 // to MSX resolution). The openGL version will only blur the
467 // MSX frame, then blend with the video frame and then apply
468 // scanlines. I think the openGL version is visually slightly
469 // better, but much more work to implement in software (in
470 // openGL shaders it's very easy). Maybe we can improve this
471 // later (if required at all).
472 SuperImposedVideoFrame<Pixel> sf(src, *superImpose, pixelOps);
473 srcWidth = sf.getLineWidth(srcStartY);
474 this->dispatchScale(sf, srcStartY, srcEndY, srcWidth,
475 dst, dstStartY, dstEndY);
476 } else {
477 this->dispatchScale(src, srcStartY, srcEndY, srcWidth,
478 dst, dstStartY, dstEndY);
479 }
480}
481
482// Force template instantiation.
483#if HAVE_16BPP
484template class Simple2xScaler<uint16_t>;
485#endif
486#if HAVE_32BPP
487template class Simple2xScaler<uint32_t>;
488#endif
489
490} // namespace openmsx
Interface for getting lines from a video frame.
Definition: FrameSource.hh:20
Pixel getLineColor(unsigned line) const
Get the (single) color of the given line.
Definition: FrameSource.hh:79
virtual unsigned getLineWidth(unsigned line) const =0
Gets the number of display pixels on the given line.
Class containing all settings for renderers.
Base class for 2x scalers.
Definition: Scaler2.hh:12
virtual unsigned getHeight() const =0
virtual void fillLine(unsigned y, Pixel color)=0
Scaler which assigns the color of the original pixel to all pixels in the 2x2 square.
Simple2xScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
auto copy(InputRange &&range, OutputIter out)
Definition: ranges.hh:232
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:50