/*
Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
openMSX adaptation by Maarten ter Huurne

This file is based on code from the Scale2x project.
This modified version is licensed under GPL; the original code is dual-licensed
under GPL and under a custom license.

Visit the Scale2x site for info:
  http://scale2x.sourceforge.net/
*/

#include "Scale2xScaler.hh"
#include "FrameSource.hh"
#include "ScalerOutput.hh"
#include "unreachable.hh"
#include "vla.hh"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#ifdef __SSE2__
#include <emmintrin.h> // SSE2
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3 (supplemental SSE3)
#endif
#endif

namespace openmsx {

#ifdef __SSE2__

// Take an (unaligned) word from a certain position out of two adjacent
// (aligned) words. This either maps directly to the _mm_alignr_epi8()
// intrinsic or emulates that behavior.
template<int BYTES, int TMP = sizeof(__m128i) - BYTES>
static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
    return _mm_alignr_epi8(high, low, BYTES);
#else
    return _mm_or_si128(
        _mm_slli_si128(high, TMP),
        _mm_srli_si128(low, BYTES));
#endif
}
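
// For example, with 32bpp pixels: if
//   low  = [p0 p1 p2 p3]  and  high = [p4 p5 p6 p7]
// then align<4>(high, low) yields [p1 p2 p3 p4], i.e. bytes 4..19 of the
// 32-byte concatenation high:low. scale1() below uses align<12> and align<4>
// to build the 'left' and 'right' neighbor vectors from three aligned loads.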

// Select bits from either one of the two inputs depending on the value of the
// corresponding bit in a selection mask.
static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
    // The traditional formula is:
    //   (a0 & ~mask) | (a1 & mask)
    // This can use the and-not instruction, so it's only 3 x86 asm
    // instructions. However this implementation uses the formula:
    //   ((a0 ^ a1) & mask) ^ a0
    // This also generates 3 instructions, but the advantage is that all
    // operations are commutative. This matters on a 2-operand instruction
    // set like x86. In this particular case it results in better register
    // allocation and more common-subexpression elimination.
    return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a0, a1), mask), a0);
}
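
// Per-bit check of the equivalence: where a mask bit is 0 the formula gives
// ((a0 ^ a1) & 0) ^ a0 = a0, and where it is 1 it gives
// ((a0 ^ a1) & 1) ^ a0 = a0 ^ a1 ^ a0 = a1, exactly matching
// (a0 & ~mask) | (a1 & mask).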

// These three functions are abstracted to work on either 16bpp or 32bpp pixels.
template<typename Pixel> static inline __m128i isEqual(__m128i x, __m128i y)
{
    if (sizeof(Pixel) == 4) {
        return _mm_cmpeq_epi32(x, y);
    } else if (sizeof(Pixel) == 2) {
        return _mm_cmpeq_epi16(x, y);
    } else {
        UNREACHABLE; return x;
    }
}
template<typename Pixel> static inline __m128i unpacklo(__m128i x, __m128i y)
{
    if (sizeof(Pixel) == 4) {
        return _mm_unpacklo_epi32(x, y);
    } else if (sizeof(Pixel) == 2) {
        return _mm_unpacklo_epi16(x, y);
    } else {
        UNREACHABLE; return x;
    }
}
template<typename Pixel> static inline __m128i unpackhi(__m128i x, __m128i y)
{
    if (sizeof(Pixel) == 4) {
        return _mm_unpackhi_epi32(x, y);
    } else if (sizeof(Pixel) == 2) {
        return _mm_unpackhi_epi16(x, y);
    } else {
        UNREACHABLE; return x;
    }
}
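
// For example, with 32bpp pixels: unpacklo([a0 a1 a2 a3], [b0 b1 b2 b3])
// yields [a0 b0 a1 b1] and unpackhi yields [a2 b2 a3 b3]. scale1() uses this
// pair to interleave two result vectors when horizontally doubling pixels.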

// Scale one 'unit'. A unit is 8x 16bpp or 4x 32bpp pixels.
// Note: it would be more logical to pass the parameters
//   'top', 'bottom', 'prev', 'mid' and 'next'
// by value instead of by reference. However, that triggers a compile error in
// the 32-bit build on 'Visual Studio 2012 Version 11.0.60315.01 Update 2'.
// Passing those parameters by reference works around that compiler bug. I did
// verify that gcc still generates equally efficient code.
template<typename Pixel, bool DOUBLE_X> static inline void scale1(
    __m128i& top, __m128i& bottom,
    __m128i& prev, __m128i& mid, __m128i& next,
    __m128i* out0, __m128i* out1)
{
    __m128i left  = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
    __m128i right = align<                  sizeof(Pixel)>(next, mid);

    __m128i teqb = isEqual<Pixel>(top,   bottom);
    __m128i leqt = isEqual<Pixel>(left,  top);
    __m128i reqt = isEqual<Pixel>(right, top);
    __m128i leqb = isEqual<Pixel>(left,  bottom);
    __m128i reqb = isEqual<Pixel>(right, bottom);

    __m128i cnda = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
    __m128i cndb = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
    __m128i cndc = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
    __m128i cndd = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);

    __m128i a = select(mid, top,    cnda);
    __m128i b = select(mid, top,    cndb);
    __m128i c = select(mid, bottom, cndc);
    __m128i d = select(mid, bottom, cndd);

    if (DOUBLE_X) {
        out0[0] = unpacklo<Pixel>(a, b);
        out0[1] = unpackhi<Pixel>(a, b);
        out1[0] = unpacklo<Pixel>(c, d);
        out1[1] = unpackhi<Pixel>(c, d);
    } else {
        out0[0] = a;
        out1[0] = c;
    }
}
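
// Note how the masks map onto the scalar Scale2x rules used further down:
//   cnda = leqt & ~(teqb | reqt)
// i.e. (left == top) && (top != bottom) && (right != top), which is rule 'a'
// in the comment above scaleLineHalf_1on2(). cndb, cndc and cndd are the same
// rule with left/right and/or top/bottom swapped.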

// Scale 1 input line (plus the line above and below) to 2 output lines,
// optionally doubling the amount of pixels within the output lines.
template<bool DOUBLE_X, typename Pixel,
         int SHIFT = sizeof(__m128i) - sizeof(Pixel)>
static inline void scaleSSE(
    Pixel* __restrict out0_,      // top output line
    Pixel* __restrict out1_,      // bottom output line
    const Pixel* __restrict in0_, // top input line
    const Pixel* __restrict in1_, // middle input line
    const Pixel* __restrict in2_, // bottom input line
    size_t width)
{
    // Must be properly aligned.
    assert((reinterpret_cast<uintptr_t>(in0_ ) % sizeof(__m128i)) == 0);
    assert((reinterpret_cast<uintptr_t>(in1_ ) % sizeof(__m128i)) == 0);
    assert((reinterpret_cast<uintptr_t>(in2_ ) % sizeof(__m128i)) == 0);
    assert((reinterpret_cast<uintptr_t>(out0_) % sizeof(__m128i)) == 0);
    assert((reinterpret_cast<uintptr_t>(out1_) % sizeof(__m128i)) == 0);

    // Must be a strictly positive multiple of 16 bytes.
    width *= sizeof(Pixel); // width in bytes
    assert((width % sizeof(__m128i)) == 0);
    assert(width > 1);
    width -= sizeof(__m128i); // handle the last unit specially

    static const size_t SCALE = DOUBLE_X ? 2 : 1;

    // Generated code seems more efficient when all address calculations
    // are done in bytes. A negative loop counter allows for a more
    // efficient loop-end test.
    auto* in0  = reinterpret_cast<const char*>(in0_ ) + width;
    auto* in1  = reinterpret_cast<const char*>(in1_ ) + width;
    auto* in2  = reinterpret_cast<const char*>(in2_ ) + width;
    auto* out0 = reinterpret_cast<      char*>(out0_) + SCALE * width;
    auto* out1 = reinterpret_cast<      char*>(out1_) + SCALE * width;
    ptrdiff_t x = -ptrdiff_t(width);

    // Setup for the first unit.
    __m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
    __m128i mid  = _mm_slli_si128(next, SHIFT);

    // Central units.
    do {
        __m128i top    = *reinterpret_cast<const __m128i*>(in0 + x);
        __m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
        __m128i prev = mid;
        mid  = next;
        next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
        scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
            reinterpret_cast<__m128i*>(out0 + SCALE * x),
            reinterpret_cast<__m128i*>(out1 + SCALE * x));
        x += sizeof(__m128i);
    } while (x < 0);
    assert(x == 0);

    // Last unit.
    __m128i top    = *reinterpret_cast<const __m128i*>(in0);
    __m128i bottom = *reinterpret_cast<const __m128i*>(in2);
    __m128i prev = mid;
    mid  = next;
    next = _mm_srli_si128(next, SHIFT);
    scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
        reinterpret_cast<__m128i*>(out0),
        reinterpret_cast<__m128i*>(out1));
}
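
// Note on the prologue/epilogue above: the byte shifts place the line's first
// (respectively last) pixel into the neighbor slot that falls outside the
// line. With the missing neighbor equal to the edge pixel itself, the rules
// in scale1() reduce to exactly the special-cased first/last pixel handling
// of the scalar code below.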

#endif


template <class Pixel>
Scale2xScaler<Pixel>::Scale2xScaler(const PixelOperations<Pixel>& pixelOps_)
    : Scaler2<Pixel>(pixelOps_)
{
}

template <class Pixel>
void Scale2xScaler<Pixel>::scaleLine_1on2(
    Pixel* __restrict dst0, Pixel* __restrict dst1,
    const Pixel* __restrict src0, const Pixel* __restrict src1,
    const Pixel* __restrict src2, size_t srcWidth) __restrict
{
    // For some reason, for the C++ version, processing the two output
    // lines separately is faster than merging them in a single loop (even
    // though a single loop only has to fetch the inputs once and can
    // eliminate some common sub-expressions). For the SSE version the
    // situation is reversed.
#ifdef __SSE2__
    scaleSSE<true>(dst0, dst1, src0, src1, src2, srcWidth);
#else
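    // Note that the same helper computes both output lines: passing the
    // input lines in reverse order (src2, src1, src0) turns the 'top'
    // rules into the 'bottom' rules for the second output line.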
    scaleLineHalf_1on2(dst0, src0, src1, src2, srcWidth);
    scaleLineHalf_1on2(dst1, src2, src1, src0, srcWidth);
#endif
}

template <class Pixel>
void Scale2xScaler<Pixel>::scaleLineHalf_1on2(
    Pixel* __restrict dst, const Pixel* __restrict src0,
    const Pixel* __restrict src1, const Pixel* __restrict src2,
    size_t srcWidth) __restrict
{
    //   n      m is expanded to  a b
    // w m e                      c d
    //   s      a = (w == n) && (s != n) && (e != n) ? n : m
    //          b = .. swap w/e
    //          c = .. swap n/s
    //          d = .. swap w/e and n/s

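    // Worked example: with n = B, w = B, m = A, e = A, s = A (a diagonal
    // boundary between a B-region up-left and an A-region down-right),
    // m expands to
    //   B A
    //   A A
    // rounding the staircase into a smooth diagonal.
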
    // First pixel.
    Pixel mid   = src1[0];
    Pixel right = src1[1];
    dst[0] = mid;
    dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;

    // Central pixels.
    for (unsigned x = 1; x < srcWidth - 1; ++x) {
        Pixel left = mid;
        mid   = right;
        right = src1[x + 1];
        Pixel top = src0[x];
        Pixel bot = src2[x];
        dst[2 * x + 0] = (left  == top && right != top && bot != top) ? top : mid;
        dst[2 * x + 1] = (right == top && left  != top && bot != top) ? top : mid;
    }

    // Last pixel.
    dst[2 * srcWidth - 2] =
        (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
        ? src0[srcWidth - 1] : right;
    dst[2 * srcWidth - 1] = src1[srcWidth - 1];
}

template <class Pixel>
void Scale2xScaler<Pixel>::scaleLine_1on1(
    Pixel* __restrict dst0, Pixel* __restrict dst1,
    const Pixel* __restrict src0, const Pixel* __restrict src1,
    const Pixel* __restrict src2, size_t srcWidth) __restrict
{
#ifdef __SSE2__
    scaleSSE<false>(dst0, dst1, src0, src1, src2, srcWidth);
#else
    scaleLineHalf_1on1(dst0, src0, src1, src2, srcWidth);
    scaleLineHalf_1on1(dst1, src2, src1, src0, srcWidth);
#endif
}

template <class Pixel>
void Scale2xScaler<Pixel>::scaleLineHalf_1on1(
    Pixel* __restrict dst, const Pixel* __restrict src0,
    const Pixel* __restrict src1, const Pixel* __restrict src2,
    size_t srcWidth) __restrict
{
    //  ab ef
    // x0 12 34 5x
    //  cd gh

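    // Without horizontal doubling only one of the two Scale2x sub-pixels
    // per input pixel is kept: the code below produces sub-pixel 'a' for
    // this output line ('c' for the other line, via the swapped call).
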
    // First pixel.
    Pixel mid   = src1[0];
    Pixel right = src1[1];
    dst[0] = mid;

    // Central pixels.
    for (unsigned x = 1; x < srcWidth - 1; ++x) {
        Pixel left = mid;
        mid   = right;
        right = src1[x + 1];
        Pixel top = src0[x];
        Pixel bot = src2[x];
        dst[x] = (left == top && right != top && bot != top) ? top : mid;
    }

    // Last pixel.
    dst[srcWidth - 1] =
        (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
        ? src0[srcWidth - 1] : right;
}

template <class Pixel>
void Scale2xScaler<Pixel>::scale1x1to2x2(FrameSource& src,
    unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
    ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
{
    VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
    VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
    VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;

    int srcY = srcStartY;
    auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
    auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);

    for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
        auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
        auto* dstUpper = dst.acquireLine(dstY + 0);
        auto* dstLower = dst.acquireLine(dstY + 1);
        scaleLine_1on2(dstUpper, dstLower,
                       srcPrev, srcCurr, srcNext,
                       srcWidth);
        dst.releaseLine(dstY + 0, dstUpper);
        dst.releaseLine(dstY + 1, dstLower);
        srcPrev = srcCurr;
        srcCurr = srcNext;
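        // Rotate the three scratch buffers so the one whose data is no
        // longer referenced ends up as buf2, ready to back the next
        // getLinePtr() call; no pixel data is copied.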
        std::swap(buf0, buf1);
        std::swap(buf1, buf2);
    }
}

template <class Pixel>
void Scale2xScaler<Pixel>::scale1x1to1x2(FrameSource& src,
    unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
    ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
{
    VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
    VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
    VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;

    int srcY = srcStartY;
    auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
    auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);

    for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
        auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
        auto* dstUpper = dst.acquireLine(dstY + 0);
        auto* dstLower = dst.acquireLine(dstY + 1);
        scaleLine_1on1(dstUpper, dstLower,
                       srcPrev, srcCurr, srcNext,
                       srcWidth);
        dst.releaseLine(dstY + 0, dstUpper);
        dst.releaseLine(dstY + 1, dstLower);
        srcPrev = srcCurr;
        srcCurr = srcNext;
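        // Same scratch-buffer rotation as in scale1x1to2x2() above.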
        std::swap(buf0, buf1);
        std::swap(buf1, buf2);
    }
}

// Force template instantiation.
#if HAVE_16BPP
template class Scale2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Scale2xScaler<uint32_t>;
#endif

} // namespace openmsx