openMSX
Scale2xScaler.cc
Go to the documentation of this file.
1 /*
2 Original code: Copyright (C) 2001-2003 Andrea Mazzoleni
3 openMSX adaptation by Maarten ter Huurne
4 
5 This file is based on code from the Scale2x project.
6 This modified version is licensed under GPL; the original code is dual-licensed
7 under GPL and under a custom license.
8 
9 Visit the Scale2x site for info:
10  http://scale2x.sourceforge.net/
11 */
12 
13 #include "Scale2xScaler.hh"
14 #include "FrameSource.hh"
15 #include "ScalerOutput.hh"
16 #include "unreachable.hh"
17 #include "vla.hh"
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #ifdef __SSE2__
22 #include "emmintrin.h" // SSE2
23 #ifdef __SSSE3__
24 #include "tmmintrin.h" // SSSE3 (supplemental SSE3)
25 #endif
26 #endif
27 
28 namespace openmsx {
29 
30 #ifdef __SSE2__
31 
// Extract an (unaligned) 128-bit word that straddles two adjacent (aligned)
// words. Conceptually the 256-bit value 'high:low' is shifted right by BYTES
// bytes and the lower 128 bits are returned.
// With SSSE3 this is exactly the _mm_alignr_epi8() intrinsic. Without it the
// same result is built from two byte-shifts that are OR-ed together; TMP is
// the complementary shift count applied to 'high'.
template<int BYTES, int TMP = sizeof(__m128i) - BYTES>
[[nodiscard]] static inline __m128i align(__m128i high, __m128i low)
{
#ifdef __SSSE3__
	return _mm_alignr_epi8(high, low, BYTES);
#else
	// Bytes of 'high' that end up in the upper part of the result.
	const __m128i fromHigh = _mm_slli_si128(high, TMP);
	// Bytes of 'low' that end up in the lower part of the result.
	const __m128i fromLow = _mm_srli_si128(low, BYTES);
	return _mm_or_si128(fromHigh, fromLow);
#endif
}
46 
// Bit-wise blend of two inputs: every result bit is taken from 'a1' where the
// corresponding bit in 'mask' is set and from 'a0' where it is clear.
[[nodiscard]] static inline __m128i select(__m128i a0, __m128i a1, __m128i mask)
{
	// The textbook formula is
	//    (a0 & ~mask) | (a1 & mask)
	// which needs 3 x86 instructions thanks to and-not. This function uses
	//    ((a0 ^ a1) & mask) ^ a0
	// instead. It is also 3 instructions, but every operation is
	// commutative, which matters on a 2-operand instruction set like x86:
	// here it gives better register allocation and more common
	// subexpression elimination.
	const __m128i differing = _mm_xor_si128(a0, a1);
	const __m128i toFlip = _mm_and_si128(differing, mask);
	return _mm_xor_si128(toFlip, a0);
}
62 
// These three functions are abstracted to work either on 16bpp or 32bpp.

// Compare 'x' and 'y' per pixel-sized lane. A lane in the result is all-ones
// when the two lanes are equal and all-zeros otherwise. The lane width is
// selected by sizeof(Pixel): 4 -> 32-bit lanes, 2 -> 16-bit lanes.
// Any other pixel size is rejected at compile time, so every path through
// the function returns a value.
template<typename Pixel> [[nodiscard]] static inline __m128i isEqual(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 4 || sizeof(Pixel) == 2,
	              "only 16bpp and 32bpp pixels are supported");
	if constexpr (sizeof(Pixel) == 4) {
		return _mm_cmpeq_epi32(x, y);
	} else {
		return _mm_cmpeq_epi16(x, y);
	}
}
// Interleave the pixel-sized lanes from the lower halves of 'x' and 'y'
// (x0, y0, x1, y1, ...). The lane width is selected by sizeof(Pixel):
// 4 -> 32-bit lanes, 2 -> 16-bit lanes. Any other pixel size is rejected at
// compile time, so every path through the function returns a value.
template<typename Pixel> [[nodiscard]] static inline __m128i unpacklo(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 4 || sizeof(Pixel) == 2,
	              "only 16bpp and 32bpp pixels are supported");
	if constexpr (sizeof(Pixel) == 4) {
		return _mm_unpacklo_epi32(x, y);
	} else {
		return _mm_unpacklo_epi16(x, y);
	}
}
// Interleave the pixel-sized lanes from the upper halves of 'x' and 'y'.
// The lane width is selected by sizeof(Pixel): 4 -> 32-bit lanes,
// 2 -> 16-bit lanes. Any other pixel size is rejected at compile time, so
// every path through the function returns a value.
template<typename Pixel> [[nodiscard]] static inline __m128i unpackhi(__m128i x, __m128i y)
{
	static_assert(sizeof(Pixel) == 4 || sizeof(Pixel) == 2,
	              "only 16bpp and 32bpp pixels are supported");
	if constexpr (sizeof(Pixel) == 4) {
		return _mm_unpackhi_epi32(x, y);
	} else {
		return _mm_unpackhi_epi16(x, y);
	}
}
94 
95 // Scale one 'unit'. A unit is 8x16bpp or 4x32bpp pixels.
96 // Workaround: it's more logical to pass the parameters
97 // 'top', 'bottom', 'prev', 'mid' and 'next'
98 // by value instead of by reference. Though this triggers a compile error in
99 // the 32-bit build on 'Visual Studio 2012 Version 11.0.60315.01 Update 2'
100 // Passing those parameter by-reference works around that compiler bug. I did
101 // verify that gcc still generates equally efficient code.
102 template<typename Pixel, bool DOUBLE_X> static inline void scale1(
103  __m128i& top, __m128i& bottom,
104  __m128i& prev, __m128i& mid, __m128i& next,
105  __m128i* out0, __m128i* out1)
106 {
107  __m128i left = align<sizeof(__m128i) - sizeof(Pixel)>(mid, prev);
108  __m128i right = align< sizeof(Pixel)>(next, mid);
109 
110  __m128i teqb = isEqual<Pixel>(top, bottom);
111  __m128i leqt = isEqual<Pixel>(left, top);
112  __m128i reqt = isEqual<Pixel>(right, top);
113  __m128i leqb = isEqual<Pixel>(left, bottom);
114  __m128i reqb = isEqual<Pixel>(right, bottom);
115 
116  __m128i cnda = _mm_andnot_si128(_mm_or_si128(teqb, reqt), leqt);
117  __m128i cndb = _mm_andnot_si128(_mm_or_si128(teqb, leqt), reqt);
118  __m128i cndc = _mm_andnot_si128(_mm_or_si128(teqb, reqb), leqb);
119  __m128i cndd = _mm_andnot_si128(_mm_or_si128(teqb, leqb), reqb);
120 
121  __m128i a = select(mid, top, cnda);
122  __m128i b = select(mid, top, cndb);
123  __m128i c = select(mid, bottom, cndc);
124  __m128i d = select(mid, bottom, cndd);
125 
126  if (DOUBLE_X) {
127  out0[0] = unpacklo<Pixel>(a, b);
128  out0[1] = unpackhi<Pixel>(a, b);
129  out1[0] = unpacklo<Pixel>(c, d);
130  out1[1] = unpackhi<Pixel>(c, d);
131  } else {
132  out0[0] = a;
133  out1[0] = c;
134  }
135 }
136 
137 // Scale 1 input line (plus the line above and below) to 2 output lines,
138 // optionally doubling the amount of pixels within the output lines.
139 template<bool DOUBLE_X, typename Pixel,
140  int SHIFT = sizeof(__m128i) - sizeof(Pixel)>
141 static inline void scaleSSE(
142  Pixel* __restrict out0_, // top output line
143  Pixel* __restrict out1_, // bottom output line
144  const Pixel* __restrict in0_, // top input line
145  const Pixel* __restrict in1_, // middle output line
146  const Pixel* __restrict in2_, // bottom output line
147  size_t width)
148 {
149  // Must be properly aligned.
150  assert((reinterpret_cast<uintptr_t>(in0_ ) % sizeof(__m128i)) == 0);
151  assert((reinterpret_cast<uintptr_t>(in1_ ) % sizeof(__m128i)) == 0);
152  assert((reinterpret_cast<uintptr_t>(in2_ ) % sizeof(__m128i)) == 0);
153  assert((reinterpret_cast<uintptr_t>(out0_) % sizeof(__m128i)) == 0);
154  assert((reinterpret_cast<uintptr_t>(out1_) % sizeof(__m128i)) == 0);
155 
156  // Must be a (strict positive) multiple of 16 bytes.
157  width *= sizeof(Pixel); // width in bytes
158  assert((width % sizeof(__m128i)) == 0);
159  assert(width > 1);
160  width -= sizeof(__m128i); // handle last unit special
161 
162  constexpr size_t SCALE = DOUBLE_X ? 2 : 1;
163 
164  // Generated code seems more efficient when all address calculations
165  // are done in bytes. Negative loop counter allows for a more efficient
166  // loop-end test.
167  const auto* in0 = reinterpret_cast<const char*>(in0_ ) + width;
168  const auto* in1 = reinterpret_cast<const char*>(in1_ ) + width;
169  const auto* in2 = reinterpret_cast<const char*>(in2_ ) + width;
170  auto* out0 = reinterpret_cast< char*>(out0_) + SCALE * width;
171  auto* out1 = reinterpret_cast< char*>(out1_) + SCALE * width;
172  ptrdiff_t x = -ptrdiff_t(width);
173 
174  // Setup for first unit
175  __m128i next = *reinterpret_cast<const __m128i*>(in1 + x);
176  __m128i mid = _mm_slli_si128(next, SHIFT);
177 
178  // Central units
179  do {
180  __m128i top = *reinterpret_cast<const __m128i*>(in0 + x);
181  __m128i bottom = *reinterpret_cast<const __m128i*>(in2 + x);
182  __m128i prev = mid;
183  mid = next;
184  next = *reinterpret_cast<const __m128i*>(in1 + x + sizeof(__m128i));
185  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
186  reinterpret_cast<__m128i*>(out0 + SCALE * x),
187  reinterpret_cast<__m128i*>(out1 + SCALE * x));
188  x += sizeof(__m128i);
189  } while (x < 0);
190  assert(x == 0);
191 
192  // Last unit
193  __m128i top = *reinterpret_cast<const __m128i*>(in0);
194  __m128i bottom = *reinterpret_cast<const __m128i*>(in2);
195  __m128i prev = mid;
196  mid = next;
197  next = _mm_srli_si128(next, SHIFT);
198  scale1<Pixel, DOUBLE_X>(top, bottom, prev, mid, next,
199  reinterpret_cast<__m128i*>(out0),
200  reinterpret_cast<__m128i*>(out1));
201 }
202 
203 #endif
204 
205 
206 template<typename Pixel>
208  : Scaler2<Pixel>(pixelOps_)
209 {
210 }
211 
212 template<typename Pixel>
214  Pixel* __restrict dst0, Pixel* __restrict dst1,
215  const Pixel* __restrict src0, const Pixel* __restrict src1,
216  const Pixel* __restrict src2, size_t srcWidth) __restrict
217 {
218  // For some reason, for the c++ version, processing the two output
219  // lines separately is faster than merging them in a single loop (even
220  // though a single loop only has to fetch the inputs once and can
221  // eliminate some common sub-expressions). For the asm version the
222  // situation is reversed.
223 #ifdef __SSE2__
224  scaleSSE<true>(dst0, dst1, src0, src1, src2, srcWidth);
225 #else
226  scaleLineHalf_1on2(dst0, src0, src1, src2, srcWidth);
227  scaleLineHalf_1on2(dst1, src2, src1, src0, srcWidth);
228 #endif
229 }
230 
231 template<typename Pixel>
232 void Scale2xScaler<Pixel>::scaleLineHalf_1on2(
233  Pixel* __restrict dst, const Pixel* __restrict src0,
234  const Pixel* __restrict src1, const Pixel* __restrict src2,
235  size_t srcWidth) __restrict
236 {
237  // n m is expaned to a b
238  // w m e c d
239  // s a = (w == n) && (s != n) && (e != n) ? n : m
240  // b = .. swap w/e
241  // c = .. swap n/s
242  // d = .. swap w/e n/s
243 
244  // First pixel.
245  Pixel mid = src1[0];
246  Pixel right = src1[1];
247  dst[0] = mid;
248  dst[1] = (right == src0[0] && src2[0] != src0[0]) ? src0[0] : mid;
249 
250  // Central pixels.
251  for (size_t x = 1; x < srcWidth - 1; ++x) {
252  Pixel left = mid;
253  mid = right;
254  right = src1[x + 1];
255  Pixel top = src0[x];
256  Pixel bot = src2[x];
257  dst[2 * x + 0] = (left == top && right != top && bot != top) ? top : mid;
258  dst[2 * x + 1] = (right == top && left != top && bot != top) ? top : mid;
259  }
260 
261  // Last pixel.
262  dst[2 * srcWidth - 2] =
263  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
264  ? src0[srcWidth - 1] : right;
265  dst[2 * srcWidth - 1] =
266  src1[srcWidth - 1];
267 }
268 
269 template<typename Pixel>
270 inline void Scale2xScaler<Pixel>::scaleLine_1on1(
271  Pixel* __restrict dst0, Pixel* __restrict dst1,
272  const Pixel* __restrict src0, const Pixel* __restrict src1,
273  const Pixel* __restrict src2, size_t srcWidth) __restrict
274 {
275 #ifdef __SSE2__
276  scaleSSE<false>(dst0, dst1, src0, src1, src2, srcWidth);
277 #else
278  scaleLineHalf_1on1(dst0, src0, src1, src2, srcWidth);
279  scaleLineHalf_1on1(dst1, src2, src1, src0, srcWidth);
280 #endif
281 }
282 
283 template<typename Pixel>
284 void Scale2xScaler<Pixel>::scaleLineHalf_1on1(
285  Pixel* __restrict dst, const Pixel* __restrict src0,
286  const Pixel* __restrict src1, const Pixel* __restrict src2,
287  size_t srcWidth) __restrict
288 {
289  // ab ef
290  // x0 12 34 5x
291  // cd gh
292 
293  // First pixel.
294  Pixel mid = src1[0];
295  Pixel right = src1[1];
296  dst[0] = mid;
297 
298  // Central pixels.
299  for (size_t x = 1; x < srcWidth - 1; ++x) {
300  Pixel left = mid;
301  mid = right;
302  right = src1[x + 1];
303  Pixel top = src0[x];
304  Pixel bot = src2[x];
305  dst[x] = (left == top && right != top && bot != top) ? top : mid;
306  }
307 
308  // Last pixel.
309  dst[srcWidth - 1] =
310  (mid == src0[srcWidth - 1] && src2[srcWidth - 1] != src0[srcWidth - 1])
311  ? src0[srcWidth - 1] : right;
312 }
313 
314 template<typename Pixel>
316  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
317  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
318 {
319  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
320  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
321  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
322 
323  int srcY = srcStartY;
324  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
325  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
326 
327  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
328  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
329  auto* dstUpper = dst.acquireLine(dstY + 0);
330  auto* dstLower = dst.acquireLine(dstY + 1);
331  scaleLine_1on2(dstUpper, dstLower,
332  srcPrev, srcCurr, srcNext,
333  srcWidth);
334  dst.releaseLine(dstY + 0, dstUpper);
335  dst.releaseLine(dstY + 1, dstLower);
336  srcPrev = srcCurr;
337  srcCurr = srcNext;
338  std::swap(buf0, buf1);
339  std::swap(buf1, buf2);
340  }
341 }
342 
343 template<typename Pixel>
345  unsigned srcStartY, unsigned /*srcEndY*/, unsigned srcWidth,
346  ScalerOutput<Pixel>& dst, unsigned dstStartY, unsigned dstEndY)
347 {
348  VLA_SSE_ALIGNED(Pixel, buf0_, srcWidth); auto* buf0 = buf0_;
349  VLA_SSE_ALIGNED(Pixel, buf1_, srcWidth); auto* buf1 = buf1_;
350  VLA_SSE_ALIGNED(Pixel, buf2_, srcWidth); auto* buf2 = buf2_;
351 
352  int srcY = srcStartY;
353  auto* srcPrev = src.getLinePtr(srcY - 1, srcWidth, buf0);
354  auto* srcCurr = src.getLinePtr(srcY + 0, srcWidth, buf1);
355 
356  for (unsigned dstY = dstStartY; dstY < dstEndY; srcY += 1, dstY += 2) {
357  auto* srcNext = src.getLinePtr(srcY + 1, srcWidth, buf2);
358  auto* dstUpper = dst.acquireLine(dstY + 0);
359  auto* dstLower = dst.acquireLine(dstY + 1);
360  scaleLine_1on1(dstUpper, dstLower,
361  srcPrev, srcCurr, srcNext,
362  srcWidth);
363  dst.releaseLine(dstY + 0, dstUpper);
364  dst.releaseLine(dstY + 1, dstLower);
365  srcPrev = srcCurr;
366  srcCurr = srcNext;
367  std::swap(buf0, buf1);
368  std::swap(buf1, buf2);
369  }
370 }
371 
// Force template instantiation: the member functions are defined in this
// file, so the class is explicitly instantiated here for every pixel type
// that is enabled in the build (HAVE_16BPP / HAVE_32BPP).
#if HAVE_16BPP
template class Scale2xScaler<uint16_t>;
#endif
#if HAVE_32BPP
template class Scale2xScaler<uint32_t>;
#endif
379 
380 } // namespace openmsx
openmsx::Scale2xScaler::Scale2xScaler
Scale2xScaler(const PixelOperations< Pixel > &pixelOps)
Definition: Scale2xScaler.cc:207
openmsx::PixelOperations
Definition: PixelOperations.hh:144
FrameSource.hh
openmsx::ScalerOutput::releaseLine
virtual void releaseLine(unsigned y, Pixel *buf)=0
openmsx::FrameSource::getLinePtr
const Pixel * getLinePtr(int line, unsigned width, Pixel *buf) const
Gets a pointer to the pixels of the given line number.
Definition: FrameSource.hh:90
openmsx::Scaler2
Base class for 2x scalers.
Definition: Scaler2.hh:12
utf8::next
uint32_t next(octet_iterator &it, octet_iterator end)
Definition: utf8_checked.hh:146
openmsx::ScalerOutput
Definition: ScalerOutput.hh:7
vla.hh
openmsx::Pixel
uint32_t Pixel
Definition: GLHQLiteScaler.cc:98
UNREACHABLE
#define UNREACHABLE
Definition: unreachable.hh:38
openmsx::SCALE
constexpr int SCALE
Definition: ArkanoidPad.cc:29
ScalerOutput.hh
openmsx::ScalerOutput::acquireLine
virtual Pixel * acquireLine(unsigned y)=0
openmsx::x
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:1416
openmsx::FrameSource
Interface for getting lines from a video frame.
Definition: FrameSource.hh:14
openmsx::Scale2xScaler::scale1x1to1x2
void scale1x1to1x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY) override
Definition: Scale2xScaler.cc:344
openmsx::mask
constexpr nibble mask[4][13]
Definition: RP5C01.cc:33
openmsx::Scale2xScaler
Runs the Scale2x scaler algorithm.
Definition: Scale2xScaler.hh:12
unreachable.hh
VLA_SSE_ALIGNED
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44
openmsx::Scale2xScaler::scale1x1to2x2
void scale1x1to2x2(FrameSource &src, unsigned srcStartY, unsigned srcEndY, unsigned srcWidth, ScalerOutput< Pixel > &dst, unsigned dstStartY, unsigned dstEndY) override
Definition: Scale2xScaler.cc:315
openmsx
This file implemented 3 utility functions:
Definition: Autofire.cc:5
Scale2xScaler.hh