openMSX
Scale2xScaler.cc
Go to the documentation of this file.
1/*
2Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
3openMSX adaptation by Maarten ter Huurne
4
5This file is based on code from the Scale2x project.
6This modified version is licensed under GPL; the original code is dual-licensed
7under GPL and under a custom license.
8
9Visit the Scale2x site for info:
10 http://scale2x.sourceforge.net/
11*/
12
13#include "Scale2xScaler.hh"
14#include "FrameSource.hh"
15#include "ScalerOutput.hh"
16#include "narrow.hh"
17#include "unreachable.hh"
18#include "vla.hh"
19#include "xrange.hh"
20#include <cassert>
21#include <cstddef>
22#include <cstdint>
23#ifdef __SSE2__
24#include "emmintrin.h" // SSE2
25#ifdef __SSSE3__
26#include "tmmintrin.h" // SSSE3 (supplemental SSE3)
27#endif
28#endif
29
30namespace openmsx {
31
32#ifdef __SSE2__
33
// Take an (unaligned) word from a certain position out of two adjacent
// (aligned) words: the result starts BYTES bytes into 'low' and is
// completed with the low bytes of 'high'. This either maps directly to
// the _mm_alignr_epi8() intrinsic or emulates that behavior with
// shift+or on plain SSE2.
template<int BYTES, int TMP = sizeof(__m128i) - BYTES>
[[nodiscard]] static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
	return _mm_alignr_epi8(high, low, BYTES);
#else
	// Emulation: place the low TMP bytes of 'high' in the top of the
	// result and the top (16 - BYTES) bytes of 'low' in the bottom.
	__m128i fromHigh = _mm_slli_si128(high, TMP);
	__m128i fromLow  = _mm_srli_si128(low, BYTES);
	return _mm_or_si128(fromHigh, fromLow);
#endif
}
48
// Select bits from either one of the two inputs depending on the value of
// the corresponding bit in a selection mask: mask bit 0 picks the bit from
// 'a0', mask bit 1 picks the bit from 'a1'.
[[nodiscard]] static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
	// The traditional formula would be:
	//     (a0 & ~mask) | (a1 & mask)
	// which (via and-not) also takes 3 x86 instructions. Instead use the
	// equivalent:
	//     ((a0 ^ a1) & mask) ^ a0
	// Here every operation is commutative, which matters on a 2-operand
	// instruction set like x86: it gives the compiler more freedom in
	// register allocation and common-subexpression elimination.
	__m128i diff   = _mm_xor_si128(a0, a1);
	__m128i picked = _mm_and_si128(diff, mask);
	return _mm_xor_si128(picked, a0);
}
64
65// These three functions are abstracted to work either on 16bpp or 32bpp.
66template<std::unsigned_integral Pixel> [[nodiscard]] static inline __m128i isEqual(__m128i x, __m128i y)
67{
68 if constexpr (sizeof(Pixel) == 4) {
69 return _mm_cmpeq_epi32(x, y);
70 } else if constexpr (sizeof(Pixel) == 2) {
71 return _mm_cmpeq_epi16(x, y);
72 } else {
74 }
75}
76template<std::unsigned_integral Pixel> [[nodiscard]] static inline __m128i unpacklo(__m128i x, __m128i y)
77{
78 if constexpr (sizeof(Pixel) == 4) {
79 return _mm_unpacklo_epi32(x, y);
80 } else if constexpr (sizeof(Pixel) == 2) {
81 return _mm_unpacklo_epi16(x, y);
82 } else {
84 }
85}
86template<std::unsigned_integral Pixel> [[nodiscard]] static inline __m128i unpackhi(__m128i x, __m128i y)
87{
88 if constexpr (sizeof(Pixel) == 4) {
89 return _mm_unpackhi_epi32(x, y);
90 } else if constexpr (sizeof(Pixel) == 2) {
91 return _mm_unpackhi_epi16(x, y);
92 } else {
94 }
95}
96
// Scale one 'unit'. A unit is 8x16bpp or 4x32bpp pixels.
//
// Applies the Scale2x kernel to every pixel E of 'mid', with neighbors
//        top
//  left   E   right        expanded to the 2x2 block   a b
//       bottom                                         c d
// where e.g. a = (left == top && top != bottom && top != right) ? top : E.
// All four conditions are evaluated branch-free as SSE masks.
//
// 'prev'/'next' are the units to the left/right of 'mid' (needed for the
// left/right neighbors at the unit borders). When DOUBLE_X, out0/out1 each
// receive 2 registers (horizontally doubled); otherwise 1 register each
// (only the 'a'/'c' sub-pixels, i.e. no horizontal doubling).
template<std::unsigned_integral Pixel, bool DOUBLE_X> static inline void scale1(
	__m128i top, __m128i bottom,
	__m128i prev, __m128i mid, __m128i next,
	__m128i* out0, __m128i* out1)
{
	// Shift one pixel in from the neighboring unit to form the
	// left/right neighbor of every pixel in 'mid'.
	__m128i left = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
	__m128i right = align< sizeof(Pixel)>(next, mid);

	// Pairwise equality masks between the 5 neighbors.
	__m128i teqb = isEqual<Pixel>(top, bottom);
	__m128i leqt = isEqual<Pixel>(left, top);
	__m128i reqt = isEqual<Pixel>(right, top);
	__m128i leqb = isEqual<Pixel>(left, bottom);
	__m128i reqb = isEqual<Pixel>(right, bottom);

	// cndX = "replace center with the top/bottom neighbor" condition, e.g.
	// cndA = leqt && !(teqb || reqt)  ==  left==top && top!=bottom && right!=top.
	__m128i cndA = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
	__m128i cndB = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
	__m128i cndC = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
	__m128i cndD = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);

	// The four sub-pixels of each 2x2 output block (still packed per lane).
	__m128i a = select(mid, top, cndA);
	__m128i b = select(mid, top, cndB);
	__m128i c = select(mid, bottom, cndC);
	__m128i d = select(mid, bottom, cndD);

	if constexpr (DOUBLE_X) {
		// Interleave a/b (and c/d) to lay the sub-pixels out
		// side-by-side in the horizontally doubled output lines.
		out0[0] = unpacklo<Pixel>(a, b);
		out0[1] = unpackhi<Pixel>(a, b);
		out1[0] = unpacklo<Pixel>(c, d);
		out1[1] = unpackhi<Pixel>(c, d);
	} else {
		out0[0] = a;
		out1[0] = c;
	}
}
132
// Scale 1 input line (plus the line above and below) to 2 output lines,
// optionally doubling the amount of pixels within the output lines.
//
// Works in units of one __m128i register (8x16bpp or 4x32bpp pixels).
// The first and last unit get border handling: the missing left/right
// neighbor is effectively replaced so the edge pixel compares against
// itself. SHIFT is the byte-shift that moves one pixel across a register.
template<bool DOUBLE_X, std::unsigned_integral Pixel,
	int SHIFT = sizeof(__m128i) - sizeof(Pixel)>
static inline void scaleSSE(
	Pixel* __restrict out0_, // top output line
	Pixel* __restrict out1_, // bottom output line
	const Pixel* __restrict in0_, // top input line
	const Pixel* __restrict in1_, // middle input line
	const Pixel* __restrict in2_, // bottom input line
	size_t width)
{
	// Must be properly aligned.
	assert((reinterpret_cast<uintptr_t>(in0_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(in1_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(in2_ ) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out0_) % sizeof(__m128i)) == 0);
	assert((reinterpret_cast<uintptr_t>(out1_) % sizeof(__m128i)) == 0);

	// Must be a (strict positive) multiple of 16 bytes.
	width *= sizeof(Pixel); // width in bytes
	assert((width % sizeof(__m128i)) == 0);
	assert(width > 1);
	width -= sizeof(__m128i); // handle last unit special

	constexpr size_t SCALE = DOUBLE_X ? 2 : 1;

	// Generated code seems more efficient when all address calculations
	// are done in bytes. Negative loop counter allows for a more efficient
	// loop-end test.
	const auto* in0 = reinterpret_cast<const char*>(in0_ ) + width;
	const auto* in1 = reinterpret_cast<const char*>(in1_ ) + width;
	const auto* in2 = reinterpret_cast<const char*>(in2_ ) + width;
	auto* out0 = reinterpret_cast< char*>(out0_) + SCALE * width;
	auto* out1 = reinterpret_cast< char*>(out1_) + SCALE * width;
	ptrdiff_t x = -ptrdiff_t(width); // runs from -width up to 0

	// Setup for first unit: fake a 'prev' unit whose last pixel equals
	// the first input pixel, so the left border compares against itself.
	__m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
	__m128i mid = _mm_slli_si128(next, SHIFT);

	// Central units
	do {
		__m128i top = *reinterpret_cast<const __m128i*>(in0 + x);
		__m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
		__m128i prev = mid;
		mid = next;
		next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
		scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
		                        reinterpret_cast<__m128i*>(out0 + SCALE * x),
		                        reinterpret_cast<__m128i*>(out1 + SCALE * x));
		x += sizeof(__m128i);
	} while (x < 0);
	assert(x == 0);

	// Last unit: fake a 'next' unit whose first pixel equals the last
	// input pixel, so the right border compares against itself.
	__m128i top = *reinterpret_cast<const __m128i*>(in0);
	__m128i bottom = *reinterpret_cast<const __m128i*>(in2);
	__m128i prev = mid;
	mid = next;
	next = _mm_srli_si128(next, SHIFT);
	scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
	                        reinterpret_cast<__m128i*>(out0),
	                        reinterpret_cast<__m128i*>(out1));
}
198
199#endif
200
201
202template<std::unsigned_integral Pixel>
204 : Scaler2<Pixel>(pixelOps_)
205{
206}
207
208template<std::unsigned_integral Pixel>
210 std::span<Pixel> dst0, std::span<Pixel> dst1,
211 std::span<const Pixel> src0, std::span<const Pixel> src1, std::span<const Pixel> src2)
212{
213 auto srcWidth = src0.size();
214 assert(src0.size() == srcWidth);
215 assert(src1.size() == srcWidth);
216 assert(src2.size() == srcWidth);
217 assert(dst0.size() == 2 * srcWidth);
218 assert(dst1.size() == 2 * srcWidth);
219
220 // For some reason, for the c++ version, processing the two output
221 // lines separately is faster than merging them in a single loop (even
222 // though a single loop only has to fetch the inputs once and can
223 // eliminate some common sub-expressions). For the asm version the
224 // situation is reversed.
225#ifdef __SSE2__
226 scaleSSE<true>(dst0.data(), dst1.data(), src0.data(), src1.data(), src2.data(), srcWidth);
227#else
228 scaleLineHalf_1on2(dst0, src0, src1, src2);
229 scaleLineHalf_1on2(dst1, src2, src1, src0);
230#endif
231}
232
233template<std::unsigned_integral Pixel>
234void Scale2xScaler<Pixel>::scaleLineHalf_1on2(
235 std::span<Pixel> dst,
236 std::span<const Pixel> src0, std::span<const Pixel> src1, std::span<const Pixel> src2)
237{
238 auto srcWidth = src0.size();
239 // n m is expanded to a b
240 // w m e c d
241 // s a = (w == n) && (s != n) && (e != n) ? n : m
242 // b = .. swap w/e
243 // c = .. swap n/s
244 // d = .. swap w/e n/s
245
246 // First pixel.
247 Pixel mid = src1[0];
248 Pixel right = src1[1];
249 dst[0] = mid;
250 dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;
251
252 // Central pixels.
253 for (auto x : xrange(1u, srcWidth - 1)) {
254 Pixel left = mid;
255 mid = right;
256 right = src1[x + 1];
257 Pixel top = src0[x];
258 Pixel bot = src2[x];
259 dst[2 * x + 0] = (left == top && right != top && bot != top) ? top : mid;
260 dst[2 * x + 1] = (right == top && left != top && bot != top) ? top : mid;
261 }
262
263 // Last pixel.
264 dst[2 * srcWidth - 2] =
265 (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
266 ? src0[srcWidth - 1] : right;
267 dst[2 * srcWidth - 1] =
268 src1[srcWidth - 1];
269}
270
271template<std::unsigned_integral Pixel>
272inline void Scale2xScaler<Pixel>::scaleLine_1on1(
273 std::span<Pixel> dst0, std::span<Pixel> dst1,
274 std::span<const Pixel> src0, std::span<const Pixel> src1, std::span<const Pixel> src2)
275{
276 auto srcWidth = src0.size();
277 assert(src0.size() == srcWidth);
278 assert(src1.size() == srcWidth);
279 assert(src2.size() == srcWidth);
280 assert(dst0.size() == srcWidth);
281 assert(dst1.size() == srcWidth);
282
283#ifdef __SSE2__
284 scaleSSE<false>(dst0.data(), dst1.data(), src0.data(), src1.data(), src2.data(), srcWidth);
285#else
286 scaleLineHalf_1on1(dst0, src0, src1, src2);
287 scaleLineHalf_1on1(dst1, src2, src1, src0);
288#endif
289}
290
291template<std::unsigned_integral Pixel>
292void Scale2xScaler<Pixel>::scaleLineHalf_1on1(
293 std::span<Pixel> dst,
294 std::span<const Pixel> src0, std::span<const Pixel> src1, std::span<const Pixel> src2)
295{
296 auto srcWidth = src0.size();
297 // ab ef
298 // x0 12 34 5x
299 // cd gh
300
301 // First pixel.
302 Pixel mid = src1[0];
303 Pixel right = src1[1];
304 dst[0] = mid;
305
306 // Central pixels.
307 for (auto x : xrange(1u, srcWidth - 1)) {
308 Pixel left = mid;
309 mid = right;
310 right = src1[x + 1];
311 Pixel top = src0[x];
312 Pixel bot = src2[x];
313 dst[x] = (left == top && right != top && bot != top) ? top : mid;
314 }
315
316 // Last pixel.
317 dst[srcWidth - 1] =
318 (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
319 ? src0[srcWidth - 1] : right;
320}
321
322template<std::unsigned_integral Pixel>
324 unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
325 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
326{
327 VLA_SSE_ALIGNED(Pixel, buf0, srcWidth);
328 VLA_SSE_ALIGNED(Pixel, buf1, srcWidth);
329 VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
330
331 auto srcY = narrow<int>(srcStartY);
332 auto srcPrev = src.getLine(srcY - 1, buf0);
333 auto srcCurr = src.getLine(srcY + 0, buf1);
334
335 for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
336 auto srcNext = src.getLine(srcY + 1, buf2);
337 auto dstUpper = dst.acquireLine(dstY + 0);
338 auto dstLower = dst.acquireLine(dstY + 1);
339 scaleLine_1on2(dstUpper, dstLower,
340 srcPrev, srcCurr, srcNext);
341 dst.releaseLine(dstY + 0, dstUpper);
342 dst.releaseLine(dstY + 1, dstLower);
343 srcPrev = srcCurr;
344 srcCurr = srcNext;
345 std::swap(buf0, buf1);
346 std::swap(buf1, buf2);
347 }
348}
349
350template<std::unsigned_integral Pixel>
352 unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
353 ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
354{
355 VLA_SSE_ALIGNED(Pixel, buf0, srcWidth);
356 VLA_SSE_ALIGNED(Pixel, buf1, srcWidth);
357 VLA_SSE_ALIGNED(Pixel, buf2, srcWidth);
358
359 auto srcY = narrow<int>(srcStartY);
360 auto srcPrev = src.getLine(srcY - 1, buf0);
361 auto srcCurr = src.getLine(srcY + 0, buf1);
362
363 for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
364 auto srcNext = src.getLine(srcY + 1, buf2);
365 auto dstUpper = dst.acquireLine(dstY + 0);
366 auto dstLower = dst.acquireLine(dstY + 1);
367 scaleLine_1on1(dstUpper, dstLower,
368 srcPrev, srcCurr, srcNext);
369 dst.releaseLine(dstY + 0, dstUpper);
370 dst.releaseLine(dstY + 1, dstLower);
371 srcPrev = srcCurr;
372 srcCurr = srcNext;
373 std::swap(buf0, buf1);
374 std::swap(buf1, buf2);
375 }
376}
377
// Force template instantiation.
// Only 16bpp and 32bpp pixel formats exist; this also makes the
// UNREACHABLE branches in isEqual/unpacklo/unpackhi safe.
#if HAVE_16BPP
template class Scale2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Scale2xScaler<uint32_t>;
#endif
385
386} // namespace openmsx
Interface for getting lines from a video frame.
Definition: FrameSource.hh:20
std::span< const Pixel > getLine(int line, std::span< Pixel > buf) const
Gets a pointer to the pixels of the given line number.
Definition: FrameSource.hh:96
Runs the Scale2x scaler algorithm.
void scale1x1to2x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY) override
void scale1x1to1x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY) override
Scale2xScaler(const PixelOperations< Pixel > &pixelOps)
Base class for 2x scalers.
Definition: Scaler2.hh:12
virtual void releaseLine(unsigned y, std::span< Pixel > buf)=0
virtual std::span< Pixel > acquireLine(unsigned y)=0
This file implements 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
void swap(openmsx::MemBuffer< T > &l, openmsx::MemBuffer< T > &r) noexcept
Definition: MemBuffer.hh:202
uint32_t next(octet_iterator &it, octet_iterator end)
#define UNREACHABLE
Definition: unreachable.hh:38
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:50
constexpr auto xrange(T e)
Definition: xrange.hh:132