openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "OutputSurface.hh"
9 #include "IntegerSetting.hh"
10 #include "FloatSetting.hh"
11 #include "BooleanSetting.hh"
12 #include "EnumSetting.hh"
13 #include "Math.hh"
14 #include "aligned.hh"
15 #include "random.hh"
16 #include "xrange.hh"
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstddef>
21 #ifdef __SSE2__
22 #include <emmintrin.h>
23 #endif
24 
25 namespace openmsx {
26 
27 static const unsigned NOISE_SHIFT = 8192;
28 static const unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
29 SSE_ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE]);
30 
31 template <class Pixel>
32 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
33 {
34  // We skip noise drawing if the factor is 0, so there is no point in
35  // initializing the random data in that case.
36  if (factor == 0.0f) return;
37 
38  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
39  // 4 element boundaries) must have the same value. Later optimizations
40  // depend on it.
41 
42  float scale[4];
43  if (sizeof(Pixel) == 4) {
44  // 32bpp
45  // TODO ATM we compensate for big endian here. A better
46  // alternative is to turn noiseBuf into an array of ints (it's
47  // now bytes) and in the 16bpp code extract R,G,B components
48  // from those ints
49  const Pixel p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
50  : 0x03020100);
51  // TODO we can also fill the array with 'factor' and only set
52  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
53  // way to get the position of the alpha byte (yet).
54  scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
55  scale[pixelOps.red (p)] = factor;
56  scale[pixelOps.green(p)] = factor;
57  scale[pixelOps.blue (p)] = factor;
58  } else {
59  // 16bpp
60  scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
61  scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
62  scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
63  scale[3] = 0.0f;
64  }
65 
66  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
67  std::normal_distribution<float> distribution(0.0f, 1.0f);
68  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
69  float r = distribution(generator);
70  noiseBuf[i + 0] = Math::clip<-128, 127>(roundf(r * scale[0]));
71  noiseBuf[i + 1] = Math::clip<-128, 127>(roundf(r * scale[1]));
72  noiseBuf[i + 2] = Math::clip<-128, 127>(roundf(r * scale[2]));
73  noiseBuf[i + 3] = Math::clip<-128, 127>(roundf(r * scale[3]));
74  }
75 }
76 
#ifdef __SSE2__
/** Add signed noise to one line of 32bpp pixels, using SSE2.
 *
 * To each of the RGBA color components (a value in range [0..255]) we
 * want to add a signed noise value (in range [-128..127]) and also clip
 * the result to the range [0..255]. There is no SSE instruction that
 * directly performs this operation. But we can:
 *   - subtract 128 from the RGBA component to get a signed byte
 *   - perform the addition with signed saturation
 *   - add 128 to the result to get back to the unsigned byte range
 * For 8-bit values the following 3 expressions are equivalent:
 *   x + 128 == x - 128 == x ^ 128
 * So the expression becomes:
 *   signed_add_sat(value ^ 128, noise) ^ 128
 *
 * @param buf_   Pixels of the line, modified in place. Must be 16-byte
 *               aligned.
 * @param noise  Noise values, 4 signed bytes per pixel. Must be 16-byte
 *               aligned.
 * @param width  Number of pixels in the line. May be any value,
 *               including 0 and values that are not a multiple of 16.
 */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	// The 16-byte loads/stores in the main loop need aligned pointers.
	assert((uintptr_t(buf_ ) & 15) == 0);
	assert((uintptr_t(noise) & 15) == 0);

	const __m128i b7 = _mm_set1_epi8(-128); // 0x80

	// Main loop: full blocks of 64 bytes (= 16 pixels). A plain 'for'
	// loop (instead of do/while) makes sure nothing is touched when
	// there is no full block, e.g. for width == 0.
	const size_t numBlocks = width / 16;
	char* buf = reinterpret_cast<char*>(buf_);
	char* nse = reinterpret_cast<char*>(noise);
	for (size_t blk = 0; blk < numBlocks; ++blk, buf += 64, nse += 64) {
		__m128i i0 = _mm_load_si128(reinterpret_cast<__m128i*>(buf +  0));
		__m128i i1 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + 16));
		__m128i i2 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + 32));
		__m128i i3 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + 48));
		__m128i n0 = _mm_load_si128(reinterpret_cast<__m128i*>(nse +  0));
		__m128i n1 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + 16));
		__m128i n2 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + 32));
		__m128i n3 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + 48));
		__m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
		__m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
		__m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
		__m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf +  0), o0);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + 16), o1);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + 32), o2);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + 48), o3);
	}

	// Tail: the (at most 15) pixels that do not fill a whole block are
	// handled one at a time with the same saturating-add trick, so we
	// never read or write beyond the end of the line.
	for (size_t i = numBlocks * 16; i < width; ++i) {
		const signed char* nb = noise + 4 * i;
		// Assemble the 4 noise bytes in memory order (SSE2 implies a
		// little endian CPU).
		const uint32_t n32 =  uint32_t(uint8_t(nb[0]))
		                   | (uint32_t(uint8_t(nb[1])) <<  8)
		                   | (uint32_t(uint8_t(nb[2])) << 16)
		                   | (uint32_t(uint8_t(nb[3])) << 24);
		const __m128i p = _mm_cvtsi32_si128(static_cast<int>(buf_[i]));
		const __m128i n = _mm_cvtsi32_si128(static_cast<int>(n32));
		const __m128i o = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(p, b7), n), b7);
		buf_[i] = static_cast<uint32_t>(_mm_cvtsi128_si32(o));
	}
}
#endif
123 
/** Add 4 signed noise bytes to the 4 unsigned component bytes of a pixel.
 * Each of the 4 bytes is handled independently (no carry between bytes)
 * and the result of each byte is clipped to the range [0..255].
 * @param p  Pixel, 4 unsigned components of 8 bits each.
 * @param n  Noise, 4 signed (two's complement) values of 8 bits each.
 * @return   The pixel with the noise added to every component.
 */
static inline uint32_t addNoise4(uint32_t p, uint32_t n)
{
	const uint32_t SIGN_BITS  = 0x80808080; // bit 7 of every byte
	const uint32_t CARRY_BITS = 0x01010100; // bit 0 of bytes 1..3

	// Turn a word that has at most bit 7 set in each byte into a mask
	// with 0xFF in exactly those bytes (0x80 -> 0xFF, 0x00 -> 0x00).
	// Equivalent alternative: (signs >> 7) * 0xFF
	auto spread = [](uint32_t signs) -> uint32_t {
		return (signs << 1) - (signs >> 7);
	};

	// Byte-wise sum without clipping: do a plain 32-bit addition and
	// afterwards remove the carries that leaked into the next byte.
	// Equivalent alternative: add the even and the odd bytes separately,
	// mask both partial sums and OR them together.
	const uint32_t rawSum  = p + n;
	const uint32_t carryIn = (p ^ n ^ rawSum) & CARRY_BITS;
	const uint32_t sum     = rawSum - carryIn;

	// A component went out of range in exactly two situations:
	//  underflow: input in [0, 127], noise negative, result in [128, 255]
	//  overflow : input in [128, 255], noise positive, result in [0, 127]
	// Expressed on the sign bits (bit 7) of p, n and sum that is
	//   (~p & n & sum) | (p & ~n & ~sum)  ==  (p ^ n) & (p ^ sum)
	const uint32_t outOfRange = (p ^ n) & (p ^ sum) & SIGN_BITS;
	const uint32_t underflow  = outOfRange & sum; // or: outOfRange & n
	const uint32_t overflow   = outOfRange & p;

	// Clip: underflowed bytes become 0x00, overflowed bytes become 0xFF.
	return (sum & ~spread(underflow)) | spread(overflow);
}
165 
166 template <class Pixel>
167 void FBPostProcessor<Pixel>::drawNoiseLine(
168  Pixel* buf, signed char* noise, size_t width)
169 {
170 #ifdef __SSE2__
171  if (sizeof(Pixel) == 4) {
172  // cast to avoid compilation error in case of 16bpp (even
173  // though this code is dead in that case).
174  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
175  drawNoiseLineSse2(buf32, noise, width);
176  return;
177  }
178 #endif
179  // c++ version
180  if (sizeof(Pixel) == 4) {
181  // optimized version for 32bpp
182  auto noise4 = reinterpret_cast<uint32_t*>(noise);
183  for (unsigned i = 0; i < width; ++i) {
184  buf[i] = addNoise4(buf[i], noise4[i]);
185  }
186  } else {
187  int mr = pixelOps.getMaxRed();
188  int mg = pixelOps.getMaxGreen();
189  int mb = pixelOps.getMaxBlue();
190  for (unsigned i = 0; i < width; ++i) {
191  Pixel p = buf[i];
192  int r = pixelOps.red(p);
193  int g = pixelOps.green(p);
194  int b = pixelOps.blue(p);
195 
196  r += noise[4 * i + 0];
197  g += noise[4 * i + 1];
198  b += noise[4 * i + 2];
199 
200  r = std::min(std::max(r, 0), mr);
201  g = std::min(std::max(g, 0), mg);
202  b = std::min(std::max(b, 0), mb);
203 
204  buf[i] = pixelOps.combine(r, g, b);
205  }
206  }
207 }
208 
209 template <class Pixel>
210 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output)
211 {
212  if (renderSettings.getNoise() == 0.0f) return;
213 
214  unsigned h = output.getHeight();
215  unsigned w = output.getWidth();
216  output.lock();
217  for (unsigned y = 0; y < h; ++y) {
218  Pixel* buf = output.getLinePtrDirect<Pixel>(y);
219  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
220  }
221 }
222 
223 template <class Pixel>
224 void FBPostProcessor<Pixel>::update(const Setting& setting)
225 {
226  VideoLayer::update(setting);
227  auto& noiseSetting = renderSettings.getNoiseSetting();
228  if (&setting == &noiseSetting) {
229  preCalcNoise(noiseSetting.getDouble());
230  }
231 }
232 
233 
234 template <class Pixel>
236  Display& display_, OutputSurface& screen_, const std::string& videoSource,
237  unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
238  : PostProcessor(
239  motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
240  canDoInterlace_)
241  , noiseShift(screen.getHeight())
242  , pixelOps(screen.getSDLFormat())
243 {
244  scaleAlgorithm = RenderSettings::NO_SCALER;
245  scaleFactor = unsigned(-1);
246 
247  auto& noiseSetting = renderSettings.getNoiseSetting();
248  noiseSetting.attach(*this);
249  preCalcNoise(noiseSetting.getDouble());
250  assert((screen.getWidth() * sizeof(Pixel)) < NOISE_SHIFT);
251 }
252 
253 template <class Pixel>
255 {
257 }
258 
259 template <class Pixel>
261 {
263  interleaveCount ^= 1;
264  if (interleaveCount) {
265  output.clearScreen();
266  return;
267  }
268  }
269 
270  if (!paintFrame) return;
271 
272  // New scaler algorithm selected?
273  auto algo = renderSettings.getScaleAlgorithm();
274  unsigned factor = renderSettings.getScaleFactor();
275  if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
276  scaleAlgorithm = algo;
277  scaleFactor = factor;
281  }
282 
283  // Scale image.
284  const unsigned srcHeight = paintFrame->getHeight();
285  const unsigned dstHeight = output.getHeight();
286 
287  unsigned g = Math::gcd(srcHeight, dstHeight);
288  unsigned srcStep = srcHeight / g;
289  unsigned dstStep = dstHeight / g;
290 
291  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
292  // on the PC screen, as a preparation for resizable output window.
293  unsigned srcStartY = 0;
294  unsigned dstStartY = 0;
295  while (dstStartY < dstHeight) {
296  // Currently this is true because the source frame height
297  // is always >= dstHeight/(dstStep/srcStep).
298  assert(srcStartY < srcHeight);
299 
300  // get region with equal lineWidth
301  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
302  unsigned srcEndY = srcStartY + srcStep;
303  unsigned dstEndY = dstStartY + dstStep;
304  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
305  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
306  srcEndY += srcStep;
307  dstEndY += dstStep;
308  }
309 
310  // fill region
311  //fprintf(stderr, "post processing lines %d-%d: %d\n",
312  // srcStartY, srcEndY, lineWidth );
313  output.lock();
314  float horStretch = renderSettings.getHorizontalStretch();
315  unsigned inWidth = unsigned(horStretch + 0.5f);
316  std::unique_ptr<ScalerOutput<Pixel>> dst(
318  output, pixelOps, inWidth));
319  currScaler->scaleImage(
321  srcStartY, srcEndY, lineWidth, // source
322  *dst, dstStartY, dstEndY); // dest
323 
324  // next region
325  srcStartY = srcEndY;
326  dstStartY = dstEndY;
327  }
328 
329  drawNoise(output);
330 
331  output.flushFrameBuffer(); // for SDLGL-FBxx
332 }
333 
334 template <class Pixel>
335 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
336  std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
337 {
338  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
339  std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
340  for (auto y : xrange(screen.getHeight())) {
341  noiseShift[y] = distribution(generator) * 16;
342  }
343 
344  return PostProcessor::rotateFrames(std::move(finishedFrame), time);
345 }
346 
347 
// Explicitly instantiate the class template for every pixel format that is
// enabled in this build.
#if HAVE_32BPP
template class FBPostProcessor<uint32_t>;
#endif
#if HAVE_16BPP
template class FBPostProcessor<uint16_t>;
#endif
355 
356 } // namespace openmsx
void lock()
Lock this OutputSurface.
Represents the output window/screen of openMSX.
Definition: Display.hh:31
static unsigned getLineWidth(FrameSource *frame, unsigned y, unsigned step)
Returns the maximum width for lines [y..y+step).
virtual void clearScreen()=0
Clear screen (paint it black).
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
vecN< N, T > min(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:257
int clip(int x)
Clips x to the range [LO,HI].
Definition: Math.hh:34
bool getInterleaveBlackFrame() const
Is black frame interleaving enabled?
A frame buffer where pixels can be written to.
uint32_t Pixel
vecN< N, T > max(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:266
void paint(OutputSurface &output) override
Paint this layer.
mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
SSE_ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE])
void attach(Observer< T > &observer)
Definition: Subject.hh:52
RenderSettings & renderSettings
Render settings.
const SDL_PixelFormat & getSDLFormat() const
void update(const Setting &setting) override
Definition: VideoLayer.cc:49
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
virtual void flushFrameBuffer()
For SDLGL-FB-nn, copy frame buffer to OpenGL display.
unsigned gcd(unsigned a, unsigned b)
Calculate greatest common divider of two strictly positive integers.
Definition: Math.hh:67
unsigned getHeight() const
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
unsigned getWidth() const
OutputSurface & screen
The surface which is visible to the user.
std::minstd_rand0 & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
Abstract base class for post processors.
FrameSource * paintFrame
Represents a frame as it should be displayed.
const RawFrame * superImposeVideoFrame
Video frame on which to superimpose the (VDP) output.
float getHorizontalStretch() const
void detach(Observer< T > &observer)
Definition: Subject.hh:58
unsigned getHeight() const
Gets the number of lines in this frame.
Definition: FrameSource.hh:44
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
ScaleAlgorithm getScaleAlgorithm() const
The current scaling algorithm.
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
XRange< T > xrange(T e)
Definition: xrange.hh:98
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
Rasterizer using SDL.