// openMSX — Scale2xScaler.cc (Scale2x scaler implementation)
/*
Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
openMSX adaptation by Maarten ter Huurne

This file is based on code from the Scale2x project.
This modified version is licensed under GPL; the original code is dual-licensed
under GPL and under a custom license.

Visit the Scale2x site for info:
  http://scale2x.sourceforge.net/
*/
12 
13 #include "Scale2xScaler.hh"
14 #include "FrameSource.hh"
15 #include "ScalerOutput.hh"
16 #include "unreachable.hh"
17 #include "vla.hh"
18 #include <algorithm>
19 #include <cassert>
20 #include <cstddef>
21 #include <cstdint>
22 #ifdef __SSE2__
23 #include "emmintrin.h" // SSE2
24 #ifdef __SSSE3__
25 #include "tmmintrin.h" // SSSE3 (supplemental SSE3)
26 #endif
27 #endif
28 
29 namespace openmsx {
30 
31 #ifdef __SSE2__
32 
// Take an (unaligned) word from a certain position out of two adjacent
// (aligned) words. This either maps directly to the _mm_alignr_epi8()
// intrinsic or emulates that behavior with shifts and an OR.
template<int BYTES> static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
	return _mm_alignr_epi8(high, low, BYTES);
#else
	// Emulation: take the top (16 - BYTES) bytes of 'low' and the bottom
	// BYTES bytes of 'high'.
	// Workaround gcc-4.8 bug: calculate 'sizeof(__m128i) - BYTES' in a
	// separate expression. See
	//   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59071
	static const int TMP = sizeof(__m128i) - BYTES;
	return _mm_or_si128(
		_mm_slli_si128(high, TMP),
		_mm_srli_si128(low, BYTES));
#endif
}
50 
// Bit-wise selection: for every bit, the result takes the bit from 'a1'
// where the corresponding 'mask' bit is set, and from 'a0' where it is
// clear.
static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
	// The traditional formula would be:
	//   (a0 & ~mask) | (a1 & mask)
	// which (thanks to and-not) is also 3 x86 instructions. We instead
	// compute the equivalent:
	//   ((a0 ^ a1) & mask) ^ a0
	// Here every operation is commutative, which matters on a 2-operand
	// instruction set like x86: it gives better register allocation and
	// more common-subexpression elimination.
	__m128i diff   = _mm_xor_si128(a0, a1);
	__m128i masked = _mm_and_si128(diff, mask);
	return _mm_xor_si128(masked, a0);
}
66 
// These three functions are abstracted to work either on 16bpp or 32bpp.

// Per-pixel equality test: each pixel lane of the result is all-ones where
// the corresponding lanes of 'x' and 'y' are equal, all-zeros otherwise.
template<typename Pixel> static inline __m128i isEqual(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 2 || sizeof(Pixel) == 4,
	              "only 16bpp and 32bpp pixels are supported");
	if (sizeof(Pixel) == 4) {
		return _mm_cmpeq_epi32(x, y);
	} else if (sizeof(Pixel) == 2) {
		return _mm_cmpeq_epi16(x, y);
	} else {
		// Unreachable: guarded by the static_assert above. Still return
		// a value so every path of this non-void function has one.
		return _mm_setzero_si128();
	}
}
// Interleave the pixels from the low halves of 'x' and 'y':
// result = { x0, y0, x1, y1, ... }
template<typename Pixel> static inline __m128i unpacklo(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 2 || sizeof(Pixel) == 4,
	              "only 16bpp and 32bpp pixels are supported");
	if (sizeof(Pixel) == 4) {
		return _mm_unpacklo_epi32(x, y);
	} else if (sizeof(Pixel) == 2) {
		return _mm_unpacklo_epi16(x, y);
	} else {
		// Unreachable: guarded by the static_assert above. Still return
		// a value so every path of this non-void function has one.
		return _mm_setzero_si128();
	}
}
// Interleave the pixels from the high halves of 'x' and 'y':
// result = { x2, y2, x3, y3, ... } (for 32bpp).
template<typename Pixel> static inline __m128i unpackhi(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 2 || sizeof(Pixel) == 4,
	              "only 16bpp and 32bpp pixels are supported");
	if (sizeof(Pixel) == 4) {
		return _mm_unpackhi_epi32(x, y);
	} else if (sizeof(Pixel) == 2) {
		return _mm_unpackhi_epi16(x, y);
	} else {
		// Unreachable: guarded by the static_assert above. Still return
		// a value so every path of this non-void function has one.
		return _mm_setzero_si128();
	}
}
98 
99 // Scale one 'unit'. A unit is 8x16bpp or 4x32bpp pixels.
100 // Workaround: it's more logical to pass the parameters
101 // 'top', 'bottom', 'prev', 'mid' and 'next'
102 // by value instead of by reference. Though this triggers a compile error in
103 // the 32-bit build on 'Visual Studio 2012 Version 11.0.60315.01 Update 2'
104 // Passing those parameter by-reference works around that compiler bug. I did
105 // verify that gcc still generates equally efficient code.
106 template<typename Pixel, bool DOUBLE_X> static inline void scale1(
107  __m128i& top, __m128i& bottom,
108  __m128i& prev, __m128i& mid, __m128i& next,
109  __m128i* out0, __m128i* out1)
110 {
111  __m128i left = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
112  __m128i right = align< sizeof(Pixel)>(next, mid);
113 
114  __m128i teqb = isEqual<Pixel>(top, bottom);
115  __m128i leqt = isEqual<Pixel>(left, top);
116  __m128i reqt = isEqual<Pixel>(right, top);
117  __m128i leqb = isEqual<Pixel>(left, bottom);
118  __m128i reqb = isEqual<Pixel>(right, bottom);
119 
120  __m128i cnda = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
121  __m128i cndb = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
122  __m128i cndc = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
123  __m128i cndd = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);
124 
125  __m128i a = select(mid, top, cnda);
126  __m128i b = select(mid, top, cndb);
127  __m128i c = select(mid, bottom, cndc);
128  __m128i d = select(mid, bottom, cndd);
129 
130  if (DOUBLE_X) {
131  out0[0] = unpacklo<Pixel>(a, b);
132  out0[1] = unpackhi<Pixel>(a, b);
133  out1[0] = unpacklo<Pixel>(c, d);
134  out1[1] = unpackhi<Pixel>(c, d);
135  } else {
136  out0[0] = a;
137  out1[0] = c;
138  }
139 }
140 
141 // Scale 1 input line (plus the line above and below) to 2 output lines,
142 // optionally doubling the amount of pixels within the output lines.
143 template<bool DOUBLE_X, typename Pixel> static inline void scaleSSE(
144  Pixel* __restrict out0_, // top output line
145  Pixel* __restrict out1_, // bottom output line
146  const Pixel* __restrict in0_, // top input line
147  const Pixel* __restrict in1_, // middle output line
148  const Pixel* __restrict in2_, // bottom output line
149  size_t width)
150 {
151  // Must be properly aligned.
152  assert((reinterpret_cast<uintptr_t>(in0_ ) % sizeof(__m128i)) == 0);
153  assert((reinterpret_cast<uintptr_t>(in1_ ) % sizeof(__m128i)) == 0);
154  assert((reinterpret_cast<uintptr_t>(in2_ ) % sizeof(__m128i)) == 0);
155  assert((reinterpret_cast<uintptr_t>(out0_) % sizeof(__m128i)) == 0);
156  assert((reinterpret_cast<uintptr_t>(out1_) % sizeof(__m128i)) == 0);
157 
158  // Must be a (strict positive) multiple of 16 bytes.
159  width *= sizeof(Pixel); // width in bytes
160  assert((width % sizeof(__m128i)) == 0);
161  assert(width > 1);
162  width -= sizeof(__m128i); // handle last unit special
163 
164  static const int SHIFT = sizeof(__m128i) - sizeof(Pixel);
165  static const size_t SCALE = DOUBLE_X ? 2 : 1;
166 
167  // Generated code seems more efficient when all address calculations
168  // are done in bytes. Negative loop counter allows for a more efficient
169  // loop-end test.
170  auto* in0 = reinterpret_cast<const char*>(in0_ ) + width;
171  auto* in1 = reinterpret_cast<const char*>(in1_ ) + width;
172  auto* in2 = reinterpret_cast<const char*>(in2_ ) + width;
173  auto* out0 = reinterpret_cast< char*>(out0_) + SCALE * width;
174  auto* out1 = reinterpret_cast< char*>(out1_) + SCALE * width;
175  ptrdiff_t x = -ptrdiff_t(width);
176 
177  // Setup for first unit
178  __m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
179  __m128i mid = _mm_slli_si128(next, SHIFT);
180 
181  // Central units
182  do {
183  __m128i top = *reinterpret_cast<const __m128i*>(in0 + x);
184  __m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
185  __m128i prev = mid;
186  mid = next;
187  next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
188  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
189  reinterpret_cast<__m128i*>(out0 + SCALE * x),
190  reinterpret_cast<__m128i*>(out1 + SCALE * x));
191  x += sizeof(__m128i);
192  } while (x < 0);
193  assert(x == 0);
194 
195  // Last unit
196  __m128i top = *reinterpret_cast<const __m128i*>(in0);
197  __m128i bottom = *reinterpret_cast<const __m128i*>(in2);
198  __m128i prev = mid;
199  mid = next;
200  next = _mm_srli_si128(next, SHIFT);
201  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
202  reinterpret_cast<__m128i*>(out0),
203  reinterpret_cast<__m128i*>(out1));
204 }
205 
206 #endif
207 
208 
209 template <class Pixel>
211  : Scaler2<Pixel>(pixelOps_)
212 {
213 }
214 
215 template <class Pixel>
217  Pixel* __restrict dst0, Pixel* __restrict dst1,
218  const Pixel* __restrict src0, const Pixel* __restrict src1,
219  const Pixel* __restrict src2, size_t srcWidth) __restrict
220 {
221  // For some reason, for the c++ version, processing the two output
222  // lines separately is faster than merging them in a single loop (even
223  // though a single loop only has to fetch the inputs once and can
224  // eliminate some common sub-expressions). For the asm version the
225  // situation is reversed.
226 #ifdef __SSE2__
227  scaleSSE<true>(dst0, dst1, src0, src1, src2, srcWidth);
228 #else
229  scaleLineHalf_1on2(dst0, src0, src1, src2, srcWidth);
230  scaleLineHalf_1on2(dst1, src2, src1, src0, srcWidth);
231 #endif
232 }
233 
234 template <class Pixel>
236  Pixel* __restrict dst, const Pixel* __restrict src0,
237  const Pixel* __restrict src1, const Pixel* __restrict src2,
238  size_t srcWidth) __restrict
239 {
240  // n m is expaned to a b
241  // w m e c d
242  // s a = (w == n) && (s != n) && (e != n) ? n : m
243  // b = .. swap w/e
244  // c = .. swap n/s
245  // d = .. swap w/e n/s
246 
247  // First pixel.
248  Pixel mid = src1[0];
249  Pixel right = src1[1];
250  dst[0] = mid;
251  dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;
252 
253  // Central pixels.
254  for (unsigned x = 1; x < srcWidth - 1; ++x) {
255  Pixel left = mid;
256  mid = right;
257  right = src1[x + 1];
258  Pixel top = src0[x];
259  Pixel bot = src2[x];
260  dst[2 * x + 0] = (left == top && right != top && bot != top) ? top : mid;
261  dst[2 * x + 1] = (right == top && left != top && bot != top) ? top : mid;
262  }
263 
264  // Last pixel.
265  dst[2 * srcWidth - 2] =
266  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
267  ? src0[srcWidth - 1] : right;
268  dst[2 * srcWidth - 1] =
269  src1[srcWidth - 1];
270 }
271 
272 template <class Pixel>
274  Pixel* __restrict dst0, Pixel* __restrict dst1,
275  const Pixel* __restrict src0, const Pixel* __restrict src1,
276  const Pixel* __restrict src2, size_t srcWidth) __restrict
277 {
278 #ifdef __SSE2__
279  scaleSSE<false>(dst0, dst1, src0, src1, src2, srcWidth);
280 #else
281  scaleLineHalf_1on1(dst0, src0, src1, src2, srcWidth);
282  scaleLineHalf_1on1(dst1, src2, src1, src0, srcWidth);
283 #endif
284 }
285 
286 template <class Pixel>
288  Pixel* __restrict dst, const Pixel* __restrict src0,
289  const Pixel* __restrict src1, const Pixel* __restrict src2,
290  size_t srcWidth) __restrict
291 {
292  // ab ef
293  // x0 12 34 5x
294  // cd gh
295 
296  // First pixel.
297  Pixel mid = src1[0];
298  Pixel right = src1[1];
299  dst[0] = mid;
300 
301  // Central pixels.
302  for (unsigned x = 1; x < srcWidth - 1; ++x) {
303  Pixel left = mid;
304  mid = right;
305  right = src1[x + 1];
306  Pixel top = src0[x];
307  Pixel bot = src2[x];
308  dst[x] = (left == top && right != top && bot != top) ? top : mid;
309  }
310 
311  // Last pixel.
312  dst[srcWidth - 1] =
313  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
314  ? src0[srcWidth - 1] : right;
315 }
316 
317 template <class Pixel>
319  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
320  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
321 {
322  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
323  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
324  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
325 
326  int srcY = srcStartY;
327  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
328  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
329 
330  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
331  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
332  auto* dstUpper = dst.acquireLine(dstY + 0);
333  auto* dstLower = dst.acquireLine(dstY + 1);
334  scaleLine_1on2(dstUpper, dstLower,
335  srcPrev, srcCurr, srcNext,
336  srcWidth);
337  dst.releaseLine(dstY + 0, dstUpper);
338  dst.releaseLine(dstY + 1, dstLower);
339  srcPrev = srcCurr;
340  srcCurr = srcNext;
341  std::swap(buf0, buf1);
342  std::swap(buf1, buf2);
343  }
344 }
345 
346 template <class Pixel>
348  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
349  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
350 {
351  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
352  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
353  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
354 
355  int srcY = srcStartY;
356  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
357  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
358 
359  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
360  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
361  auto* dstUpper = dst.acquireLine(dstY + 0);
362  auto* dstLower = dst.acquireLine(dstY + 1);
363  scaleLine_1on1(dstUpper, dstLower,
364  srcPrev, srcCurr, srcNext,
365  srcWidth);
366  dst.releaseLine(dstY + 0, dstUpper);
367  dst.releaseLine(dstY + 1, dstLower);
368  srcPrev = srcCurr;
369  srcCurr = srcNext;
370  std::swap(buf0, buf1);
371  std::swap(buf1, buf2);
372  }
373 }
374 
// Force template instantiation.
#if HAVE_16BPP
template class Scale2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Scale2xScaler<uint32_t>;
#endif
382 
383 } // namespace openmsx
/* Generated Doxygen cross-references (kept for reference, not code):
   - void scale1x1to2x2(FrameSource& src, unsigned srcStartY, unsigned srcEndY,
     unsigned srcWidth, ScalerOutput<Pixel>& dst, unsigned dstStartY,
     unsigned dstEndY) override
   - void scale1x1to1x2(FrameSource& src, unsigned srcStartY, unsigned srcEndY,
     unsigned srcWidth, ScalerOutput<Pixel>& dst, unsigned dstStartY,
     unsigned dstEndY) override
   - Scale2xScaler(const PixelOperations<Pixel>& pixelOps)
     Runs the Scale2x scaler algorithm.
   - Scaler2: base class for 2x scalers (Scaler2.hh:11); uint32_t Pixel.
   - FrameSource: interface for getting lines from a video frame
     (FrameSource.hh:14);
     const Pixel* getLinePtr(int line, unsigned width, Pixel* buf) const
     gets a pointer to the pixels of the given line number (FrameSource.hh:91).
   - ScalerOutput: virtual Pixel* acquireLine(unsigned y) = 0;
     virtual void releaseLine(unsigned y, Pixel* buf) = 0.
   - #define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH) (vla.hh:44)
   - #define UNREACHABLE (unreachable.hh:35)
   - "Thanks to enen for testing this on a real cartridge" (Autofire.cc:5)
*/