openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "SDLOutputSurface.hh"
9 #include "aligned.hh"
10 #include "checked_cast.hh"
11 #include "random.hh"
12 #include "xrange.hh"
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstdint>
17 #include <cstddef>
18 #include <numeric>
19 #ifdef __SSE2__
20 #include <emmintrin.h>
21 #endif
22 
23 namespace openmsx {
24 
// Exclusive upper bound for the per-line start offsets into noiseBuf that
// rotateFrames() picks. The constructor asserts that the logical screen
// width times sizeof(Pixel) is smaller than this value.
25 constexpr unsigned NOISE_SHIFT = 8192;
// The buffer is twice NOISE_SHIFT bytes long, leaving room for a line of
// noise after the largest start offset.
26 constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
// Signed noise bytes, (re)filled by preCalcNoise(). Declared with ALIGNAS_SSE
// because the SSE2 noise routine reads it with aligned 128-bit loads.
27 ALIGNAS_SSE static signed char noiseBuf[NOISE_BUF_SIZE];
28 
29 template<typename Pixel>
30 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
31 {
32  // We skip noise drawing if the factor is 0, so there is no point in
33  // initializing the random data in that case.
34  if (factor == 0.0f) return;
35 
36  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
37  // 4 element boundaries) must have the same value. Later optimizations
38  // depend on it.
39 
40  float scale[4];
41  if constexpr (sizeof(Pixel) == 4) {
42  // 32bpp
43  // TODO ATM we compensate for big endian here. A better
44  // alternative is to turn noiseBuf into an array of ints (it's
45  // now bytes) and in the 16bpp code extract R,G,B components
46  // from those ints
47  const auto p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
48  : 0x03020100);
49  // TODO we can also fill the array with 'factor' and only set
50  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
51  // way to get the position of the alpha byte (yet).
52  scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
53  scale[pixelOps.red (p)] = factor;
54  scale[pixelOps.green(p)] = factor;
55  scale[pixelOps.blue (p)] = factor;
56  } else {
57  // 16bpp
58  scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
59  scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
60  scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
61  scale[3] = 0.0f;
62  }
63 
64  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
65  std::normal_distribution<float> distribution(0.0f, 1.0f);
66  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
67  float r = distribution(generator);
68  noiseBuf[i + 0] = std::clamp(int(roundf(r * scale[0])), -128, 127);
69  noiseBuf[i + 1] = std::clamp(int(roundf(r * scale[1])), -128, 127);
70  noiseBuf[i + 2] = std::clamp(int(roundf(r * scale[2])), -128, 127);
71  noiseBuf[i + 3] = std::clamp(int(roundf(r * scale[3])), -128, 127);
72  }
73 }
74 
75 #ifdef __SSE2__
// Adds signed noise to one line of 32bpp pixels, in place (SSE2 version).
//   buf_  : pixels to modify; must be 16-byte aligned
//   noise : one signed noise byte per pixel byte; must be 16-byte aligned
//   width : number of pixels; width * 4 must be a multiple of 64.
//           A width of 0 leaves the buffer untouched.
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	// To each of the RGBA color components (a value in range [0..255]) we
	// want to add a signed noise value (in range [-128..127]) and also clip
	// the result to the range [0..255]. There is no SSE instruction that
	// directly performs this operation. But we can:
	//   - subtract 128 from the RGBA component to get a signed byte
	//   - perform the addition with signed saturation
	//   - add 128 to the result to get back to the unsigned byte range
	// For 8-bit values the following 3 expressions are equivalent:
	//   x + 128 == x - 128 == x ^ 128
	// So the expression becomes:
	//   signed_add_sat(value ^ 128, noise) ^ 128
	// The following loop does just that, though it processes 64 bytes per
	// iteration.
	constexpr ptrdiff_t BYTES_PER_ITERATION = 4 * sizeof(__m128i);

	auto x = static_cast<ptrdiff_t>(width * sizeof(uint32_t));
	assert((x & 63) == 0);
	// Both buffers are accessed with aligned 128-bit loads/stores.
	assert((uintptr_t(buf_)  & 15) == 0);
	assert((uintptr_t(noise) & 15) == 0);

	// Point just past the end of both buffers and count a negative offset
	// up towards zero; this needs only one loop variable.
	char* buf = reinterpret_cast<char*>(buf_) + x;
	char* nse = reinterpret_cast<char*>(noise) + x;
	x = -x;

	const __m128i b7 = _mm_set1_epi8(-128); // 0x80
	// The condition is checked before the first iteration so that an empty
	// line (width == 0) does not touch memory past the buffers.
	while (x < 0) {
		__m128i i0 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x +  0));
		__m128i i1 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 16));
		__m128i i2 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 32));
		__m128i i3 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 48));
		__m128i n0 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x +  0));
		__m128i n1 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 16));
		__m128i n2 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 32));
		__m128i n3 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 48));
		__m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
		__m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
		__m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
		__m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x +  0), o0);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 16), o1);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 32), o2);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 48), o3);
		x += BYTES_PER_ITERATION;
	}
}
120 #endif
121 
// Adds four signed 8-bit noise values to four unsigned 8-bit components that
// are packed in one 32-bit word. Each component is handled independently and
// is clipped to the range [0..255].
//   p : packed components (one unsigned byte each)
//   n : packed noise (one two's complement signed byte each)
// Returns the packed, clipped sums.
static constexpr uint32_t addNoise4(uint32_t p, uint32_t n)
{
	// Unclipped result (lower 8 bits of each component), computed without
	// letting a carry cross a byte boundary:
	//  - add only the lower 7 bits of every byte; the carry of that
	//    addition ends up in bit 7 of the same byte
	//  - bit 7 of the true sum is (p7 ^ n7 ^ carry-into-bit-7), so xor the
	//    top bits of p and n into the partial sum.
	// Note: computing 'p + n' and afterwards subtracting the carry-in bits
	// (0x01010100) is NOT equivalent. When a byte sums to 255 and also
	// receives a carry, the subtraction borrows from the next byte and
	// corrupts it (e.g. p=0x00000001, n=0x00FFFFFF).
	// alternative:
	//   uint32_t s20 = ((p & 0x00FF00FF) + (n & 0x00FF00FF)) & 0x00FF00FF;
	//   uint32_t s31 = ((p & 0xFF00FF00) + (n & 0xFF00FF00)) & 0xFF00FF00;
	//   uint32_t s   = s20 | s31;
	uint32_t s = ((p & 0x7F7F7F7F) + (n & 0x7F7F7F7F)) ^ ((p ^ n) & 0x80808080);

	// Underflow of a component happens ONLY
	//   WHEN input  component is in range [0, 127]
	//   AND  noise  component is negative
	//   AND  result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input  component is in range [128, 255]
	//   AND  noise  component is positive
	//   AND  result component is in range [0, 127]
	// Create a mask per component containing 00 for no under/overflow,
	//                                         FF for    under/overflow
	// ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	uint32_t t = (p ^ n) & (p ^ s) & 0x80808080;
	uint32_t u1 = t & s; // underflow (alternative: u1 = t & n)
	// Expand bit 7 of each flagged byte into a full 0xFF byte mask.
	// alternative1: uint32_t u2 = u1 | (u1 >> 1);
	//               uint32_t u4 = u2 | (u2 >> 2);
	//               uint32_t u8 = u4 | (u4 >> 4);
	// alternative2: uint32_t u8 = (u1 >> 7) * 0xFF;
	uint32_t u8 = (u1 << 1) - (u1 >> 7);

	uint32_t o1 = t & p; // overflow
	uint32_t o8 = (o1 << 1) - (o1 >> 7);

	// clip result: underflowed components become 00, overflowed ones FF
	return (s & (~u8)) | o8;
}
163 
164 template<typename Pixel>
165 void FBPostProcessor<Pixel>::drawNoiseLine(
166  Pixel* buf, signed char* noise, size_t width)
167 {
168 #ifdef __SSE2__
169  if constexpr (sizeof(Pixel) == 4) {
170  // cast to avoid compilation error in case of 16bpp (even
171  // though this code is dead in that case).
172  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
173  drawNoiseLineSse2(buf32, noise, width);
174  return;
175  }
176 #endif
177  // c++ version
178  if constexpr (sizeof(Pixel) == 4) {
179  // optimized version for 32bpp
180  auto* noise4 = reinterpret_cast<uint32_t*>(noise);
181  for (auto i : xrange(width)) {
182  buf[i] = addNoise4(buf[i], noise4[i]);
183  }
184  } else {
185  int mr = pixelOps.getMaxRed();
186  int mg = pixelOps.getMaxGreen();
187  int mb = pixelOps.getMaxBlue();
188  for (auto i : xrange(width)) {
189  Pixel p = buf[i];
190  int r = pixelOps.red(p);
191  int g = pixelOps.green(p);
192  int b = pixelOps.blue(p);
193 
194  r += noise[4 * i + 0];
195  g += noise[4 * i + 1];
196  b += noise[4 * i + 2];
197 
198  r = std::clamp(r, 0, mr);
199  g = std::clamp(g, 0, mg);
200  b = std::clamp(b, 0, mb);
201 
202  buf[i] = pixelOps.combine(r, g, b);
203  }
204  }
205 }
206 
207 template<typename Pixel>
208 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
209 {
210  if (renderSettings.getNoise() == 0.0f) return;
211 
212  auto& output = checked_cast<SDLOutputSurface&>(output_);
213  auto [w, h] = output.getLogicalSize();
214  auto pixelAccess = output.getDirectPixelAccess();
215  for (auto y : xrange(h)) {
216  auto* buf = pixelAccess.getLinePtr<Pixel>(y);
217  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
218  }
219 }
220 
221 template<typename Pixel>
222 void FBPostProcessor<Pixel>::update(const Setting& setting) noexcept
223 {
225  auto& noiseSetting = renderSettings.getNoiseSetting();
226  if (&setting == &noiseSetting) {
227  preCalcNoise(noiseSetting.getDouble());
228  }
229 }
230 
231 
// NOTE(review): the line carrying the start of this signature is missing from
// this listing; the member initializer list below suggests this is the
// FBPostProcessor constructor -- confirm against the header.
// Forwards all arguments to the PostProcessor base class, sizes noiseShift to
// the logical screen height and sets up pixelOps from the screen's pixel
// format.
232 template<typename Pixel>
234  Display& display_, OutputSurface& screen_, const std::string& videoSource,
235  unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
236  : PostProcessor(
237  motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
238  canDoInterlace_)
239  , noiseShift(screen.getLogicalHeight())
240  , pixelOps(screen.getPixelFormat())
241 {
 // Start with placeholder values for the cached scaler parameters; paint()
 // compares these against the current settings.
242  scaleAlgorithm = RenderSettings::NO_SCALER;
243  scaleFactor = unsigned(-1);
244  stretchWidth = unsigned(-1);
245 
 // Get notified about changes of the noise setting (see update()) and
 // compute the initial noise buffer.
246  auto& noiseSetting = renderSettings.getNoiseSetting();
247  noiseSetting.attach(*this);
248  preCalcNoise(noiseSetting.getDouble());
 // The per-line noise offsets are smaller than NOISE_SHIFT; check that a
 // line of pixels is as well.
249  assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
250 }
251 
// NOTE(review): the signature line is missing from this listing; presumably
// this is the destructor -- confirm against the header.
// Stops observing the noise setting (the counterpart of the attach() call
// made during construction).
252 template<typename Pixel>
254 {
255  renderSettings.getNoiseSetting().detach(*this);
256 }
257 
// NOTE(review): the signature line is missing from this listing; the body
// reads a parameter named 'output_', presumably this is
// paint(OutputSurface& output_) -- confirm against the header.
// Draws the current frame onto the given output surface: scales 'paintFrame'
// region by region, adds noise and flushes the frame buffer.
258 template<typename Pixel>
260 {
261  auto& output = checked_cast<SDLOutputSurface&>(output_);
 // With black-frame interleaving enabled, every other call only clears the
 // screen and returns.
262  if (renderSettings.getInterleaveBlackFrame()) {
263  interleaveCount ^= 1;
264  if (interleaveCount) {
265  output.clearScreen();
266  return;
267  }
268  }
269 
 // No frame available: nothing to draw.
270  if (!paintFrame) return;
271 
272  // New scaler algorithm selected? Or different horizontal stretch?
273  auto algo = renderSettings.getScaleAlgorithm();
274  unsigned factor = renderSettings.getScaleFactor();
275  unsigned inWidth = lrintf(renderSettings.getHorizontalStretch());
276  if ((scaleAlgorithm != algo) || (scaleFactor != factor) ||
277  (inWidth != stretchWidth) || (lastOutput != &output)) {
 // Remember the parameters the scaler objects are (re)built for.
278  scaleAlgorithm = algo;
279  scaleFactor = factor;
280  stretchWidth = inWidth;
281  lastOutput = &output;
 // NOTE(review): the heads of the next two statements are missing from
 // this listing; presumably they re-create 'currScaler' and
 // 'stretchScaler' (both used further down) -- confirm.
283  PixelOperations<Pixel>(output.getPixelFormat()),
284  renderSettings);
286  output, pixelOps, inWidth);
287  }
288 
289  // Scale image.
290  const unsigned srcHeight = paintFrame->getHeight();
291  const unsigned dstHeight = output.getLogicalHeight();
292 
 // Reduce the source:destination height ratio to the smallest whole-line
 // steps, so that source and destination positions advance together.
293  unsigned g = std::gcd(srcHeight, dstHeight);
294  unsigned srcStep = srcHeight / g;
295  unsigned dstStep = dstHeight / g;
296 
297  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
298  // on the PC screen, as a preparation for resizable output window.
299  unsigned srcStartY = 0;
300  unsigned dstStartY = 0;
301  while (dstStartY < dstHeight) {
302  // Currently this is true because the source frame height
303  // is always >= dstHeight/(dstStep/srcStep).
304  assert(srcStartY < srcHeight);
305 
306  // get region with equal lineWidth
307  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
308  unsigned srcEndY = srcStartY + srcStep;
309  unsigned dstEndY = dstStartY + dstStep;
 // Extend the region step by step for as long as the following lines
 // report the same width and both frames have lines left.
310  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
311  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
312  srcEndY += srcStep;
313  dstEndY += dstStep;
314  }
315 
316  // fill region
317  //fprintf(stderr, "post processing lines %d-%d: %d\n",
318  // srcStartY, srcEndY, lineWidth);
319  currScaler->scaleImage(
320  *paintFrame, superImposeVideoFrame,
321  srcStartY, srcEndY, lineWidth, // source
322  *stretchScaler, dstStartY, dstEndY); // dest
323 
324  // next region
325  srcStartY = srcEndY;
326  dstStartY = dstEndY;
327  }
328 
 // Noise is added on top of the scaled image, before it is flushed.
329  drawNoise(output);
330 
331  output.flushFrameBuffer();
332 }
333 
334 template<typename Pixel>
335 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
336  std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
337 {
338  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
339  std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
340  for (auto y : xrange(screen.getLogicalHeight())) {
341  noiseShift[y] = distribution(generator) * 16;
342  }
343 
344  return PostProcessor::rotateFrames(std::move(finishedFrame), time);
345 }
346 
347 
348 // Force template instantiation.
// The member functions of FBPostProcessor are defined in this file, so the
// pixel types in use are instantiated explicitly here. Each instantiation is
// only emitted when the corresponding HAVE_16BPP / HAVE_32BPP macro is set.
349 #if HAVE_16BPP
350 template class FBPostProcessor<uint16_t>;
351 #endif
352 #if HAVE_32BPP
353 template class FBPostProcessor<uint32_t>;
354 #endif
355 
356 } // namespace openmsx
BaseSetting * setting
Definition: Interpreter.cc:27
int g
TclObject t
#define ALIGNAS_SSE
Definition: aligned.hh:24
Represents the output window/screen of openMSX.
Definition: Display.hh:33
Rasterizer using SDL.
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
void paint(OutputSurface &output) override
Paint this layer.
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
A frame buffer where pixels can be written to.
int getLogicalWidth() const
Abstract base class for post processors.
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
RenderSettings & renderSettings
Render settings.
OutputSurface & screen
The surface which is visible to the user.
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
void attach(Observer< T > &observer)
Definition: Subject.hh:50
void update(const Setting &setting) noexcept override
Definition: VideoLayer.cc:50
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
constexpr vecN< N, T > clamp(const vecN< N, T > &x, const vecN< N, T > &minVal, const vecN< N, T > &maxVal)
Definition: gl_vec.hh:296
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr unsigned NOISE_SHIFT
constexpr unsigned NOISE_BUF_SIZE
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:118
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
static std::unique_ptr< ScalerOutput< Pixel > > create(SDLOutputSurface &output, PixelOperations< Pixel > pixelOps, unsigned inWidth)
constexpr auto xrange(T e)
Definition: xrange.hh:155