openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "SDLOutputSurface.hh"
9 #include "aligned.hh"
10 #include "checked_cast.hh"
11 #include "endian.hh"
12 #include "random.hh"
13 #include "xrange.hh"
14 #include <algorithm>
15 #include <cassert>
16 #include <cmath>
17 #include <cstdint>
18 #include <cstddef>
19 #include <numeric>
20 #ifdef __SSE2__
21 #include <emmintrin.h>
22 #endif
23 
24 namespace openmsx {
25 
26 constexpr unsigned NOISE_SHIFT = 8192;
27 constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
28 ALIGNAS_SSE static signed char noiseBuf[NOISE_BUF_SIZE];
29 
30 template<typename Pixel>
31 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
32 {
33  // We skip noise drawing if the factor is 0, so there is no point in
34  // initializing the random data in that case.
35  if (factor == 0.0f) return;
36 
37  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
38  // 4 element boundaries) must have the same value. Later optimizations
39  // depend on it.
40 
41  float scale[4];
42  if constexpr (sizeof(Pixel) == 4) {
43  // 32bpp
44  // TODO ATM we compensate for big endian here. A better
45  // alternative is to turn noiseBuf into an array of ints (it's
46  // now bytes) and in the 16bpp code extract R,G,B components
47  // from those ints
48  const auto p = Pixel(Endian::BIG ? 0x00010203 : 0x03020100);
49  // TODO we can also fill the array with 'factor' and only set
50  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
51  // way to get the position of the alpha byte (yet).
52  scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
53  scale[pixelOps.red (p)] = factor;
54  scale[pixelOps.green(p)] = factor;
55  scale[pixelOps.blue (p)] = factor;
56  } else {
57  // 16bpp
58  scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
59  scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
60  scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
61  scale[3] = 0.0f;
62  }
63 
64  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
65  std::normal_distribution<float> distribution(0.0f, 1.0f);
66  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
67  float r = distribution(generator);
68  noiseBuf[i + 0] = std::clamp(int(roundf(r * scale[0])), -128, 127);
69  noiseBuf[i + 1] = std::clamp(int(roundf(r * scale[1])), -128, 127);
70  noiseBuf[i + 2] = std::clamp(int(roundf(r * scale[2])), -128, 127);
71  noiseBuf[i + 3] = std::clamp(int(roundf(r * scale[3])), -128, 127);
72  }
73 }
74 
#ifdef __SSE2__
/**
 * Add signed per-byte noise to one line of 32bpp pixels, using SSE2.
 *
 * To each of the RGBA color components (a value in range [0..255]) we want
 * to add a signed noise value (in range [-128..127]) and clip the result to
 * the range [0..255]. There is no SSE instruction that directly performs
 * this operation. But we can:
 *  - subtract 128 from the component to get a signed byte,
 *  - perform the addition with signed saturation,
 *  - add 128 to the result to get back to the unsigned byte range.
 * For 8-bit values 'x + 128', 'x - 128' and 'x ^ 128' are equivalent, so the
 * expression becomes:
 *     signed_add_sat(value ^ 128, noise) ^ 128
 *
 * @param buf_  Start of the pixel line; must be 16-byte aligned.
 * @param noise Noise bytes, 4 per pixel; must be 16-byte aligned.
 * @param width Number of pixels; must be a multiple of 16 (the loop handles
 *              64 bytes per iteration). A width of 0 touches no memory.
 */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	const size_t numBytes = width * sizeof(uint32_t);
	assert((numBytes & 63) == 0);
	assert((uintptr_t(buf_) & 15) == 0);
	assert((uintptr_t(noise) & 15) == 0); // aligned loads are used below

	auto* buf = reinterpret_cast<char*>(buf_);
	const auto* nse = reinterpret_cast<const char*>(noise);

	const __m128i b7 = _mm_set1_epi8(-128); // 0x80 in every byte
	// A plain counted loop (instead of do/while) so that an empty line
	// really is a no-op.
	for (size_t x = 0; x < numBytes; x += 4 * sizeof(__m128i)) {
		auto* dst = reinterpret_cast<__m128i*>(buf + x);
		const auto* src = reinterpret_cast<const __m128i*>(nse + x);

		const __m128i i0 = _mm_load_si128(dst + 0);
		const __m128i i1 = _mm_load_si128(dst + 1);
		const __m128i i2 = _mm_load_si128(dst + 2);
		const __m128i i3 = _mm_load_si128(dst + 3);
		const __m128i n0 = _mm_load_si128(src + 0);
		const __m128i n1 = _mm_load_si128(src + 1);
		const __m128i n2 = _mm_load_si128(src + 2);
		const __m128i n3 = _mm_load_si128(src + 3);
		const __m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
		const __m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
		const __m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
		const __m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
		_mm_store_si128(dst + 0, o0);
		_mm_store_si128(dst + 1, o1);
		_mm_store_si128(dst + 2, o2);
		_mm_store_si128(dst + 3, o3);
	}
}
#endif
121 
/**
 * Add noise to the 4 components of a packed 32-bit pixel at once.
 *
 * Each byte of 'p' is an unsigned component [0..255], each byte of 'n' is a
 * signed (two's complement) noise value [-128..127]. Every component of the
 * result is the sum clipped to [0..255]; components do not influence each
 * other.
 *
 * @param p Packed pixel (4 unsigned bytes).
 * @param n Packed noise (4 signed bytes).
 * @return Packed pixel with clipped per-component sums.
 */
static constexpr uint32_t addNoise4(uint32_t p, uint32_t n)
{
	constexpr uint32_t MSB = 0x80808080; // top bit of every component

	// Unclipped result (lower 8 bits of each component).
	// Add the lower 7 bits of each component: that can never carry into
	// the neighboring component. Then fix up bit 7 with an xor (addition
	// without carry-out). Simply doing 'p + n' and subtracting the observed
	// carry-in bits afterwards is NOT correct: a carry that ripples through
	// a component whose own sum is 0xFF would be subtracted twice.
	uint32_t s = ((p & ~MSB) + (n & ~MSB)) ^ ((p ^ n) & MSB);

	// Underflow of a component happens ONLY
	//   WHEN input component is in range [0, 127]
	//   AND noise component is negative
	//   AND result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input component is in range [128, 255]
	//   AND noise component is positive
	//   AND result component is in range [0, 127]
	// Looking only at bit 7 of each component:
	//   ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	uint32_t t = (p ^ n) & (p ^ s) & MSB; // bit 7 set on under- or overflow

	// Expand a set bit 7 into a full 0xFF byte mask:
	//   (0x80 << 1) - (0x80 >> 7) == 0x100 - 0x01 == 0xFF
	// (for the top component the shifted-out bit wraps modulo 2^32, which
	// still yields 0xFF000000).
	uint32_t u1 = t & s; // underflow (alternative: u1 = t & n)
	uint32_t u8 = (u1 << 1) - (u1 >> 7);

	uint32_t o1 = t & p; // overflow
	uint32_t o8 = (o1 << 1) - (o1 >> 7);

	// clip result: underflowed components become 0x00, overflowed 0xFF
	return (s & (~u8)) | o8;
}
163 
164 template<typename Pixel>
165 void FBPostProcessor<Pixel>::drawNoiseLine(
166  Pixel* buf, signed char* noise, size_t width)
167 {
168 #ifdef __SSE2__
169  if constexpr (sizeof(Pixel) == 4) {
170  // cast to avoid compilation error in case of 16bpp (even
171  // though this code is dead in that case).
172  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
173  drawNoiseLineSse2(buf32, noise, width);
174  return;
175  }
176 #endif
177  // c++ version
178  if constexpr (sizeof(Pixel) == 4) {
179  // optimized version for 32bpp
180  auto* noise4 = reinterpret_cast<uint32_t*>(noise);
181  for (auto i : xrange(width)) {
182  buf[i] = addNoise4(buf[i], noise4[i]);
183  }
184  } else {
185  int mr = pixelOps.getMaxRed();
186  int mg = pixelOps.getMaxGreen();
187  int mb = pixelOps.getMaxBlue();
188  for (auto i : xrange(width)) {
189  Pixel p = buf[i];
190  int r = pixelOps.red(p);
191  int g = pixelOps.green(p);
192  int b = pixelOps.blue(p);
193 
194  r += noise[4 * i + 0];
195  g += noise[4 * i + 1];
196  b += noise[4 * i + 2];
197 
198  r = std::clamp(r, 0, mr);
199  g = std::clamp(g, 0, mg);
200  b = std::clamp(b, 0, mb);
201 
202  buf[i] = pixelOps.combine(r, g, b);
203  }
204  }
205 }
206 
207 template<typename Pixel>
208 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
209 {
210  if (renderSettings.getNoise() == 0.0f) return;
211 
212  auto& output = checked_cast<SDLOutputSurface&>(output_);
213  auto [w, h] = output.getLogicalSize();
214  auto pixelAccess = output.getDirectPixelAccess();
215  for (auto y : xrange(h)) {
216  auto* buf = pixelAccess.getLinePtr<Pixel>(y);
217  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
218  }
219 }
220 
221 template<typename Pixel>
222 void FBPostProcessor<Pixel>::update(const Setting& setting) noexcept
223 {
225  auto& noiseSetting = renderSettings.getNoiseSetting();
226  if (&setting == &noiseSetting) {
227  preCalcNoise(noiseSetting.getDouble());
228  }
229 }
230 
231 
232 template<typename Pixel>
234  Display& display_, OutputSurface& screen_, const std::string& videoSource,
235  unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
236  : PostProcessor(
237  motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
238  canDoInterlace_)
239  , scaleAlgorithm(RenderSettings::NO_SCALER)
240  , scaleFactor(unsigned(-1))
241  , stretchWidth(unsigned(-1))
242  , noiseShift(screen.getLogicalHeight())
243  , pixelOps(screen.getPixelFormat())
244 {
245  auto& noiseSetting = renderSettings.getNoiseSetting();
246  noiseSetting.attach(*this);
247  preCalcNoise(noiseSetting.getDouble());
248  assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
249 }
250 
251 template<typename Pixel>
253 {
254  renderSettings.getNoiseSetting().detach(*this);
255 }
256 
257 template<typename Pixel>
259 {
260  auto& output = checked_cast<SDLOutputSurface&>(output_);
261  if (renderSettings.getInterleaveBlackFrame()) {
262  interleaveCount ^= 1;
263  if (interleaveCount) {
264  output.clearScreen();
265  return;
266  }
267  }
268 
269  if (!paintFrame) return;
270 
271  // New scaler algorithm selected? Or different horizontal stretch?
272  auto algo = renderSettings.getScaleAlgorithm();
273  unsigned factor = renderSettings.getScaleFactor();
274  unsigned inWidth = lrintf(renderSettings.getHorizontalStretch());
275  if ((scaleAlgorithm != algo) || (scaleFactor != factor) ||
276  (inWidth != stretchWidth) || (lastOutput != &output)) {
277  scaleAlgorithm = algo;
278  scaleFactor = factor;
279  stretchWidth = inWidth;
280  lastOutput = &output;
282  PixelOperations<Pixel>(output.getPixelFormat()),
283  renderSettings);
285  output, pixelOps, inWidth);
286  }
287 
288  // Scale image.
289  const unsigned srcHeight = paintFrame->getHeight();
290  const unsigned dstHeight = output.getLogicalHeight();
291 
292  unsigned g = std::gcd(srcHeight, dstHeight);
293  unsigned srcStep = srcHeight / g;
294  unsigned dstStep = dstHeight / g;
295 
296  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
297  // on the PC screen, as a preparation for resizable output window.
298  unsigned srcStartY = 0;
299  unsigned dstStartY = 0;
300  while (dstStartY < dstHeight) {
301  // Currently this is true because the source frame height
302  // is always >= dstHeight/(dstStep/srcStep).
303  assert(srcStartY < srcHeight);
304 
305  // get region with equal lineWidth
306  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
307  unsigned srcEndY = srcStartY + srcStep;
308  unsigned dstEndY = dstStartY + dstStep;
309  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
310  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
311  srcEndY += srcStep;
312  dstEndY += dstStep;
313  }
314 
315  // fill region
316  //fprintf(stderr, "post processing lines %d-%d: %d\n",
317  // srcStartY, srcEndY, lineWidth);
318  currScaler->scaleImage(
319  *paintFrame, superImposeVideoFrame,
320  srcStartY, srcEndY, lineWidth, // source
321  *stretchScaler, dstStartY, dstEndY); // dest
322 
323  // next region
324  srcStartY = srcEndY;
325  dstStartY = dstEndY;
326  }
327 
328  drawNoise(output);
329 
330  output.flushFrameBuffer();
331 }
332 
333 template<typename Pixel>
334 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
335  std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
336 {
337  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
338  std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
339  for (auto y : xrange(screen.getLogicalHeight())) {
340  noiseShift[y] = distribution(generator) * 16;
341  }
342 
343  return PostProcessor::rotateFrames(std::move(finishedFrame), time);
344 }
345 
346 
// Explicitly instantiate the class template for every pixel format that is
// enabled in this build; the member definitions above live in this file.
#if HAVE_16BPP
template class FBPostProcessor<uint16_t>;
#endif
#if HAVE_32BPP
template class FBPostProcessor<uint32_t>;
#endif
354 
355 } // namespace openmsx
BaseSetting * setting
Definition: Interpreter.cc:27
int g
TclObject t
#define ALIGNAS_SSE
Definition: aligned.hh:24
Represents the output window/screen of openMSX.
Definition: Display.hh:33
Rasterizer using SDL.
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
void paint(OutputSurface &output) override
Paint this layer.
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
A frame buffer where pixels can be written to.
int getLogicalWidth() const
Abstract base class for post processors.
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
RenderSettings & renderSettings
Render settings.
OutputSurface & screen
The surface which is visible to the user.
Class containing all settings for renderers.
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
void attach(Observer< T > &observer)
Definition: Subject.hh:50
void update(const Setting &setting) noexcept override
Definition: VideoLayer.cc:50
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
constexpr bool BIG
Definition: endian.hh:12
constexpr vecN< N, T > clamp(const vecN< N, T > &x, const vecN< N, T > &minVal, const vecN< N, T > &maxVal)
Definition: gl_vec.hh:296
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr unsigned NOISE_SHIFT
constexpr unsigned NOISE_BUF_SIZE
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:127
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
static std::unique_ptr< ScalerOutput< Pixel > > create(SDLOutputSurface &output, PixelOperations< Pixel > pixelOps, unsigned inWidth)
constexpr auto xrange(T e)
Definition: xrange.hh:155