openMSX
Scale2xScaler.cc
Go to the documentation of this file.
1 /*
2 Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
3 openMSX adaptation by Maarten ter Huurne
4 
5 This file is based on code from the Scale2x project.
6 This modified version is licensed under GPL; the original code is dual-licensed
7 under GPL and under a custom license.
8 
9 Visit the Scale2x site for info:
10  http://scale2x.sourceforge.net/
11 */
12 
13 #include "Scale2xScaler.hh"
14 #include "FrameSource.hh"
15 #include "ScalerOutput.hh"
16 #include "unreachable.hh"
17 #include "vla.hh"
18 #include "xrange.hh"
19 #include <cassert>
20 #include <cstddef>
21 #include <cstdint>
22 #ifdef __SSE2__
23 #include "emmintrin.h" // SSE2
24 #ifdef __SSSE3__
25 #include "tmmintrin.h" // SSSE3 (supplemental SSE3)
26 #endif
27 #endif
28 
29 namespace openmsx {
30 
31 #ifdef __SSE2__
32 
// Take an (unaligned) word from a certain position out of two adjacent
// (aligned) words. This either maps directly to the _mm_alignr_epi8()
// intrinsic or emulates that behavior with shift+or.
//
// Returns the 16-byte word found BYTES bytes above the start of 'low',
// i.e. the low (16 - BYTES) bytes come from the top of 'low' and the
// top BYTES bytes come from the bottom of 'high'.
template<int BYTES, int TMP = sizeof(__m128i) - BYTES>
[[nodiscard]] static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
	return _mm_alignr_epi8(high, low, BYTES);
#else
	// SSE2 fallback: (high << TMP bytes) | (low >> BYTES bytes).
	return _mm_or_si128(
		_mm_slli_si128(high, TMP),
		_mm_srli_si128(low, BYTES));
#endif
}
47 
// Select bits from either one of the two inputs depending on the value of the
// corresponding bit in a selection mask: returns a bit of 'a1' where the mask
// bit is 1 and a bit of 'a0' where it is 0.
[[nodiscard]] static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
	// The traditional formula is:
	//   (a0 & ~mask) | (a1 & mask)
	// This can use the and-not instruction, so it's only 3 x86 asm
	// instructions. However this implementation uses the formula:
	//   ((a0 ^ a1) & mask) ^ a0
	// This also generates 3 instructions, but the advantage is that all
	// operations are commutative. This matters on a 2-operand instruction
	// set like x86. In this particular case it results in better register
	// allocation and more common subexpression elimination.
	return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a0, a1), mask), a0);
}
63 
64 // These three functions are abstracted to work either on 16bpp or 32bpp.
65 template<std::unsigned_integral Pixel> [[nodiscard]] static inline __m128i isEqual(__m128i x, __m128i y)
66 {
67  if constexpr (sizeof(Pixel) == 4) {
68  return _mm_cmpeq_epi32(x, y);
69  } else if constexpr (sizeof(Pixel) == 2) {
70  return _mm_cmpeq_epi16(x, y);
71  } else {
73  }
74 }
75 template<std::unsigned_integral Pixel> [[nodiscard]] static inline __m128i unpacklo(__m128i x, __m128i y)
76 {
77  if constexpr (sizeof(Pixel) == 4) {
78  return _mm_unpacklo_epi32(x, y);
79  } else if constexpr (sizeof(Pixel) == 2) {
80  return _mm_unpacklo_epi16(x, y);
81  } else {
83  }
84 }
85 template<std::unsigned_integral Pixel> [[nodiscard]] static inline __m128i unpackhi(__m128i x, __m128i y)
86 {
87  if constexpr (sizeof(Pixel) == 4) {
88  return _mm_unpackhi_epi32(x, y);
89  } else if constexpr (sizeof(Pixel) == 2) {
90  return _mm_unpackhi_epi16(x, y);
91  } else {
93  }
94 }
95 
96 // Scale one 'unit'. A unit is 8x16bpp or 4x32bpp pixels.
97 template<std::unsigned_integral Pixel, bool DOUBLE_X> static inline void scale1(
98  __m128i top, __m128i bottom,
99  __m128i prev, __m128i mid, __m128i next,
100  __m128i* out0, __m128i* out1)
101 {
102  __m128i left = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
103  __m128i right = align< sizeof(Pixel)>(next, mid);
104 
105  __m128i teqb = isEqual<Pixel>(top, bottom);
106  __m128i leqt = isEqual<Pixel>(left, top);
107  __m128i reqt = isEqual<Pixel>(right, top);
108  __m128i leqb = isEqual<Pixel>(left, bottom);
109  __m128i reqb = isEqual<Pixel>(right, bottom);
110 
111  __m128i cndA = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
112  __m128i cndB = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
113  __m128i cndC = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
114  __m128i cndD = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);
115 
116  __m128i a = select(mid, top, cndA);
117  __m128i b = select(mid, top, cndB);
118  __m128i c = select(mid, bottom, cndC);
119  __m128i d = select(mid, bottom, cndD);
120 
121  if constexpr (DOUBLE_X) {
122  out0[0] = unpacklo<Pixel>(a, b);
123  out0[1] = unpackhi<Pixel>(a, b);
124  out1[0] = unpacklo<Pixel>(c, d);
125  out1[1] = unpackhi<Pixel>(c, d);
126  } else {
127  out0[0] = a;
128  out1[0] = c;
129  }
130 }
131 
132 // Scale 1 input line (plus the line above and below) to 2 output lines,
133 // optionally doubling the amount of pixels within the output lines.
134 template<bool DOUBLE_X, std::unsigned_integral Pixel,
135  int SHIFT = sizeof(__m128i) - sizeof(Pixel)>
136 static inline void scaleSSE(
137  Pixel* __restrict out0_, // top output line
138  Pixel* __restrict out1_, // bottom output line
139  const Pixel* __restrict in0_, // top input line
140  const Pixel* __restrict in1_, // middle output line
141  const Pixel* __restrict in2_, // bottom output line
142  size_t width)
143 {
144  // Must be properly aligned.
145  assert((reinterpret_cast<uintptr_t>(in0_ ) % sizeof(__m128i)) == 0);
146  assert((reinterpret_cast<uintptr_t>(in1_ ) % sizeof(__m128i)) == 0);
147  assert((reinterpret_cast<uintptr_t>(in2_ ) % sizeof(__m128i)) == 0);
148  assert((reinterpret_cast<uintptr_t>(out0_) % sizeof(__m128i)) == 0);
149  assert((reinterpret_cast<uintptr_t>(out1_) % sizeof(__m128i)) == 0);
150 
151  // Must be a (strict positive) multiple of 16 bytes.
152  width *= sizeof(Pixel); // width in bytes
153  assert((width % sizeof(__m128i)) == 0);
154  assert(width > 1);
155  width -= sizeof(__m128i); // handle last unit special
156 
157  constexpr size_t SCALE = DOUBLE_X ? 2 : 1;
158 
159  // Generated code seems more efficient when all address calculations
160  // are done in bytes. Negative loop counter allows for a more efficient
161  // loop-end test.
162  const auto* in0 = reinterpret_cast<const char*>(in0_ ) + width;
163  const auto* in1 = reinterpret_cast<const char*>(in1_ ) + width;
164  const auto* in2 = reinterpret_cast<const char*>(in2_ ) + width;
165  auto* out0 = reinterpret_cast< char*>(out0_) + SCALE * width;
166  auto* out1 = reinterpret_cast< char*>(out1_) + SCALE * width;
167  ptrdiff_t x = -ptrdiff_t(width);
168 
169  // Setup for first unit
170  __m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
171  __m128i mid = _mm_slli_si128(next, SHIFT);
172 
173  // Central units
174  do {
175  __m128i top = *reinterpret_cast<const __m128i*>(in0 + x);
176  __m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
177  __m128i prev = mid;
178  mid = next;
179  next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
180  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
181  reinterpret_cast<__m128i*>(out0 + SCALE * x),
182  reinterpret_cast<__m128i*>(out1 + SCALE * x));
183  x += sizeof(__m128i);
184  } while (x < 0);
185  assert(x == 0);
186 
187  // Last unit
188  __m128i top = *reinterpret_cast<const __m128i*>(in0);
189  __m128i bottom = *reinterpret_cast<const __m128i*>(in2);
190  __m128i prev = mid;
191  mid = next;
192  next = _mm_srli_si128(next, SHIFT);
193  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
194  reinterpret_cast<__m128i*>(out0),
195  reinterpret_cast<__m128i*>(out1));
196 }
197 
198 #endif
199 
200 
201 template<std::unsigned_integral Pixel>
203  : Scaler2<Pixel>(pixelOps_)
204 {
205 }
206 
207 template<std::unsigned_integral Pixel>
209  Pixel* __restrict dst0, Pixel* __restrict dst1,
210  const Pixel* __restrict src0, const Pixel* __restrict src1,
211  const Pixel* __restrict src2, size_t srcWidth) __restrict
212 {
213  // For some reason, for the c++ version, processing the two output
214  // lines separately is faster than merging them in a single loop (even
215  // though a single loop only has to fetch the inputs once and can
216  // eliminate some common sub-expressions). For the asm version the
217  // situation is reversed.
218 #ifdef __SSE2__
219  scaleSSE<true>(dst0, dst1, src0, src1, src2, srcWidth);
220 #else
221  scaleLineHalf_1on2(dst0, src0, src1, src2, srcWidth);
222  scaleLineHalf_1on2(dst1, src2, src1, src0, srcWidth);
223 #endif
224 }
225 
226 template<std::unsigned_integral Pixel>
227 void Scale2xScaler<Pixel>::scaleLineHalf_1on2(
228  Pixel* __restrict dst, const Pixel* __restrict src0,
229  const Pixel* __restrict src1, const Pixel* __restrict src2,
230  size_t srcWidth) __restrict
231 {
232  // n m is expanded to a b
233  // w m e c d
234  // s a = (w == n) && (s != n) && (e != n) ? n : m
235  // b = .. swap w/e
236  // c = .. swap n/s
237  // d = .. swap w/e n/s
238 
239  // First pixel.
240  Pixel mid = src1[0];
241  Pixel right = src1[1];
242  dst[0] = mid;
243  dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;
244 
245  // Central pixels.
246  for (auto x : xrange(1u, srcWidth - 1)) {
247  Pixel left = mid;
248  mid = right;
249  right = src1[x + 1];
250  Pixel top = src0[x];
251  Pixel bot = src2[x];
252  dst[2 * x + 0] = (left == top && right != top && bot != top) ? top : mid;
253  dst[2 * x + 1] = (right == top && left != top && bot != top) ? top : mid;
254  }
255 
256  // Last pixel.
257  dst[2 * srcWidth - 2] =
258  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
259  ? src0[srcWidth - 1] : right;
260  dst[2 * srcWidth - 1] =
261  src1[srcWidth - 1];
262 }
263 
264 template<std::unsigned_integral Pixel>
265 inline void Scale2xScaler<Pixel>::scaleLine_1on1(
266  Pixel* __restrict dst0, Pixel* __restrict dst1,
267  const Pixel* __restrict src0, const Pixel* __restrict src1,
268  const Pixel* __restrict src2, size_t srcWidth) __restrict
269 {
270 #ifdef __SSE2__
271  scaleSSE<false>(dst0, dst1, src0, src1, src2, srcWidth);
272 #else
273  scaleLineHalf_1on1(dst0, src0, src1, src2, srcWidth);
274  scaleLineHalf_1on1(dst1, src2, src1, src0, srcWidth);
275 #endif
276 }
277 
278 template<std::unsigned_integral Pixel>
279 void Scale2xScaler<Pixel>::scaleLineHalf_1on1(
280  Pixel* __restrict dst, const Pixel* __restrict src0,
281  const Pixel* __restrict src1, const Pixel* __restrict src2,
282  size_t srcWidth) __restrict
283 {
284  // ab ef
285  // x0 12 34 5x
286  // cd gh
287 
288  // First pixel.
289  Pixel mid = src1[0];
290  Pixel right = src1[1];
291  dst[0] = mid;
292 
293  // Central pixels.
294  for (auto x : xrange(1u, srcWidth - 1)) {
295  Pixel left = mid;
296  mid = right;
297  right = src1[x + 1];
298  Pixel top = src0[x];
299  Pixel bot = src2[x];
300  dst[x] = (left == top && right != top && bot != top) ? top : mid;
301  }
302 
303  // Last pixel.
304  dst[srcWidth - 1] =
305  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
306  ? src0[srcWidth - 1] : right;
307 }
308 
309 template<std::unsigned_integral Pixel>
311  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
312  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
313 {
314  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
315  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
316  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
317 
318  int srcY = srcStartY;
319  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
320  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
321 
322  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
323  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
324  auto* dstUpper = dst.acquireLine(dstY + 0);
325  auto* dstLower = dst.acquireLine(dstY + 1);
326  scaleLine_1on2(dstUpper, dstLower,
327  srcPrev, srcCurr, srcNext,
328  srcWidth);
329  dst.releaseLine(dstY + 0, dstUpper);
330  dst.releaseLine(dstY + 1, dstLower);
331  srcPrev = srcCurr;
332  srcCurr = srcNext;
333  std::swap(buf0, buf1);
334  std::swap(buf1, buf2);
335  }
336 }
337 
338 template<std::unsigned_integral Pixel>
340  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
341  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
342 {
343  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
344  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
345  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
346 
347  int srcY = srcStartY;
348  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
349  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
350 
351  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
352  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
353  auto* dstUpper = dst.acquireLine(dstY + 0);
354  auto* dstLower = dst.acquireLine(dstY + 1);
355  scaleLine_1on1(dstUpper, dstLower,
356  srcPrev, srcCurr, srcNext,
357  srcWidth);
358  dst.releaseLine(dstY + 0, dstUpper);
359  dst.releaseLine(dstY + 1, dstLower);
360  srcPrev = srcCurr;
361  srcCurr = srcNext;
362  std::swap(buf0, buf1);
363  std::swap(buf1, buf2);
364  }
365 }
366 
// Force template instantiation.
#if HAVE_16BPP
template class Scale2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Scale2xScaler<uint32_t>;
#endif
374 
375 } // namespace openmsx
Interface for getting lines from a video frame.
Definition: FrameSource.hh:17
const Pixel * getLinePtr(int line, unsigned width, Pixel *buf) const
Gets a pointer to the pixels of the given line number.
Definition: FrameSource.hh:93
Runs the Scale2x scaler algorithm.
void scale1x1to2x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY) override
void scale1x1to1x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY) override
Scale2xScaler(const PixelOperations< Pixel > &pixelOps)
Base class for 2x scalers.
Definition: Scaler2.hh:12
virtual Pixel * acquireLine(unsigned y)=0
virtual void releaseLine(unsigned y, Pixel *buf)=0
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr int SCALE
Definition: ArkanoidPad.cc:24
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:127
constexpr nibble mask[4][13]
Definition: RP5C01.cc:34
uint32_t next(octet_iterator &it, octet_iterator end)
#define UNREACHABLE
Definition: unreachable.hh:38
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44
constexpr auto xrange(T e)
Definition: xrange.hh:133