openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "OutputSurface.hh"
9 #include "Math.hh"
10 #include "aligned.hh"
11 #include "random.hh"
12 #include "xrange.hh"
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstdint>
17 #include <cstddef>
18 #ifdef __SSE2__
19 #include <emmintrin.h>
20 #endif
21 
22 namespace openmsx {
23 
// Per-line start offsets into noiseBuf are picked by rotateFrames() as
// multiples of 16 in the range [0, NOISE_SHIFT).
24 static const unsigned NOISE_SHIFT = 8192;
// The buffer is twice NOISE_SHIFT bytes long, leaving room for the noise of
// one line after the largest possible start offset (the constructor asserts
// an upper bound on the line size in terms of NOISE_SHIFT).
25 static const unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
// Precalculated signed noise bytes, filled in by preCalcNoise() and consumed
// by drawNoiseLine(). drawNoiseLineSse2() reads it with aligned 16-byte
// loads; SSE_ALIGNED presumably provides that alignment -- TODO confirm
// against the macro definition.
26 SSE_ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE]);
27 
28 template <class Pixel>
29 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
30 {
31  // We skip noise drawing if the factor is 0, so there is no point in
32  // initializing the random data in that case.
33  if (factor == 0.0f) return;
34 
35  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
36  // 4 element boundaries) must have the same value. Later optimizations
37  // depend on it.
38 
39  float scale[4];
40  if (sizeof(Pixel) == 4) {
41  // 32bpp
42  // TODO ATM we compensate for big endian here. A better
43  // alternative is to turn noiseBuf into an array of ints (it's
44  // now bytes) and in the 16bpp code extract R,G,B components
45  // from those ints
46  const auto p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
47  : 0x03020100);
48  // TODO we can also fill the array with 'factor' and only set
49  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
50  // way to get the position of the alpha byte (yet).
51  scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
52  scale[pixelOps.red (p)] = factor;
53  scale[pixelOps.green(p)] = factor;
54  scale[pixelOps.blue (p)] = factor;
55  } else {
56  // 16bpp
57  scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
58  scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
59  scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
60  scale[3] = 0.0f;
61  }
62 
63  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
64  std::normal_distribution<float> distribution(0.0f, 1.0f);
65  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
66  float r = distribution(generator);
67  noiseBuf[i + 0] = Math::clip<-128, 127>(roundf(r * scale[0]));
68  noiseBuf[i + 1] = Math::clip<-128, 127>(roundf(r * scale[1]));
69  noiseBuf[i + 2] = Math::clip<-128, 127>(roundf(r * scale[2]));
70  noiseBuf[i + 3] = Math::clip<-128, 127>(roundf(r * scale[3]));
71  }
72 }
73 
#ifdef __SSE2__
/**
 * Add signed per-byte noise to one line of 32bpp pixels, clipping every
 * byte of the result to the range [0..255].
 *
 * @param buf_  Start of the pixel line, must be 16-byte aligned.
 * @param noise Start of the noise data (4 bytes per pixel), must be
 *              16-byte aligned.
 * @param width Number of pixels in the line. 'width * 4' must be a
 *              multiple of 64. A width of 0 leaves all memory untouched.
 */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	// To each of the RGBA color components (a value in range [0..255]) we
	// want to add a signed noise value (in range [-128..127]) and also clip
	// the result to the range [0..255]. There is no SSE instruction that
	// directly performs this operation. But we can:
	//   - subtract 128 from the RGBA component to get a signed byte
	//   - perform the addition with signed saturation
	//   - add 128 to the result to get back to the unsigned byte range
	// For 8-bit values the following 3 expressions are equivalent:
	//   x + 128 == x - 128 == x ^ 128
	// So the expression becomes:
	//   signed_add_sat(value ^ 128, noise) ^ 128
	// The following loop does just that, though it processes 64 bytes per
	// iteration.
	ptrdiff_t x = width * sizeof(uint32_t);
	assert((x & 63) == 0);
	assert((uintptr_t(buf_) & 15) == 0);
	// The noise data is read with aligned loads as well.
	assert((uintptr_t(noise) & 15) == 0);

	// Point both cursors at the end of the line and count a negative
	// byte offset up towards zero.
	char* buf = reinterpret_cast<char*>(buf_) + x;
	char* nse = reinterpret_cast<char*>(noise) + x;
	x = -x;

	const __m128i b7 = _mm_set1_epi8(-128); // 0x80
	// A pre-tested loop: an empty line (width == 0) must not touch any
	// memory. A post-tested loop would process 64 bytes past the line.
	while (x < 0) {
		__m128i i0 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x +  0));
		__m128i i1 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 16));
		__m128i i2 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 32));
		__m128i i3 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 48));
		__m128i n0 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x +  0));
		__m128i n1 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 16));
		__m128i n2 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 32));
		__m128i n3 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 48));
		__m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
		__m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
		__m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
		__m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x +  0), o0);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 16), o1);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 32), o2);
		_mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 48), o3);
		x += 4 * sizeof(__m128i);
	}
}
#endif
120 
/**
 * Add 4 packed signed noise bytes to 4 packed unsigned color bytes.
 * Every byte is handled independently and the result of each byte is
 * clipped to the range [0..255].
 *
 * @param p 4 unsigned components, each in range [0..255].
 * @param n 4 signed noise values, each in range [-128..127]
 *          (two's complement).
 * @return The 4 clipped sums, packed in the same byte order as 'p'.
 */
static inline uint32_t addNoise4(uint32_t p, uint32_t n)
{
	const uint32_t SIGN_BITS     = 0x80808080; // top bit of each byte
	const uint32_t CARRY_IN_BITS = 0x01010100; // carry into bytes 1..3

	// Step 1: per-byte sum modulo 256.
	// A plain 32-bit addition lets carries spill into the neighboring
	// byte. 'p ^ n ^ sum' recovers exactly those carry-in bits, so they
	// can be subtracted again.
	const uint32_t rawSum  = p + n;
	const uint32_t spilled = (p ^ n ^ rawSum) & CARRY_IN_BITS;
	const uint32_t wrapped = rawSum - spilled;

	// Step 2: detect the bytes that left the [0..255] range.
	// Underflow happens ONLY when
	//   the input is in range [0, 127], the noise is negative and the
	//   wrapped result is in range [128, 255].
	// Overflow happens ONLY when
	//   the input is in range [128, 255], the noise is positive and the
	//   wrapped result is in range [0, 127].
	// In both cases the top bit of the input differs from the top bit of
	// both the noise and the result:
	//   ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	const uint32_t clipped   = (p ^ n) & (p ^ wrapped) & SIGN_BITS;
	const uint32_t underBits = clipped & wrapped; // result top bit set
	const uint32_t overBits  = clipped & p;       // input top bit set

	// Step 3: widen each flagged 0x80 bit into a full 0xFF byte mask
	// (per byte: 0x100 - 0x01 == 0xFF).
	const uint32_t underMask = (underBits << 1) - (underBits >> 7);
	const uint32_t overMask  = (overBits  << 1) - (overBits  >> 7);

	// Step 4: force underflowed bytes to 0x00 and overflowed bytes to 0xFF.
	return (wrapped & ~underMask) | overMask;
}
162 
163 template <class Pixel>
164 void FBPostProcessor<Pixel>::drawNoiseLine(
165  Pixel* buf, signed char* noise, size_t width)
166 {
167 #ifdef __SSE2__
168  if (sizeof(Pixel) == 4) {
169  // cast to avoid compilation error in case of 16bpp (even
170  // though this code is dead in that case).
171  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
172  drawNoiseLineSse2(buf32, noise, width);
173  return;
174  }
175 #endif
176  // c++ version
177  if (sizeof(Pixel) == 4) {
178  // optimized version for 32bpp
179  auto noise4 = reinterpret_cast<uint32_t*>(noise);
180  for (size_t i = 0; i < width; ++i) {
181  buf[i] = addNoise4(buf[i], noise4[i]);
182  }
183  } else {
184  int mr = pixelOps.getMaxRed();
185  int mg = pixelOps.getMaxGreen();
186  int mb = pixelOps.getMaxBlue();
187  for (size_t i = 0; i < width; ++i) {
188  Pixel p = buf[i];
189  int r = pixelOps.red(p);
190  int g = pixelOps.green(p);
191  int b = pixelOps.blue(p);
192 
193  r += noise[4 * i + 0];
194  g += noise[4 * i + 1];
195  b += noise[4 * i + 2];
196 
197  r = std::min(std::max(r, 0), mr);
198  g = std::min(std::max(g, 0), mg);
199  b = std::min(std::max(b, 0), mb);
200 
201  buf[i] = pixelOps.combine(r, g, b);
202  }
203  }
204 }
205 
206 template <class Pixel>
207 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output)
208 {
209  if (renderSettings.getNoise() == 0.0f) return;
210 
211  unsigned h = output.getHeight();
212  unsigned w = output.getWidth();
213  output.lock();
214  for (unsigned y = 0; y < h; ++y) {
215  auto* buf = output.getLinePtrDirect<Pixel>(y);
216  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
217  }
218 }
219 
220 template <class Pixel>
221 void FBPostProcessor<Pixel>::update(const Setting& setting)
222 {
223  VideoLayer::update(setting);
224  auto& noiseSetting = renderSettings.getNoiseSetting();
225  if (&setting == &noiseSetting) {
226  preCalcNoise(noiseSetting.getDouble());
227  }
228 }
229 
230 
231 template <class Pixel>
233  Display& display_, OutputSurface& screen_, const std::string& videoSource,
234  unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
235  : PostProcessor(
236  motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
237  canDoInterlace_)
238  , noiseShift(screen.getHeight())
239  , pixelOps(screen.getSDLFormat())
240 {
241  scaleAlgorithm = RenderSettings::NO_SCALER;
242  scaleFactor = unsigned(-1);
243 
244  auto& noiseSetting = renderSettings.getNoiseSetting();
245  noiseSetting.attach(*this);
246  preCalcNoise(noiseSetting.getDouble());
247  assert((screen.getWidth() * sizeof(Pixel)) < NOISE_SHIFT);
248 }
249 
250 template <class Pixel>
252 {
254 }
255 
256 template <class Pixel>
258 {
260  interleaveCount ^= 1;
261  if (interleaveCount) {
262  output.clearScreen();
263  return;
264  }
265  }
266 
267  if (!paintFrame) return;
268 
269  // New scaler algorithm selected?
270  auto algo = renderSettings.getScaleAlgorithm();
271  unsigned factor = renderSettings.getScaleFactor();
272  if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
273  scaleAlgorithm = algo;
274  scaleFactor = factor;
278  }
279 
280  // Scale image.
281  const unsigned srcHeight = paintFrame->getHeight();
282  const unsigned dstHeight = output.getHeight();
283 
284  unsigned g = Math::gcd(srcHeight, dstHeight);
285  unsigned srcStep = srcHeight / g;
286  unsigned dstStep = dstHeight / g;
287 
288  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
289  // on the PC screen, as a preparation for resizable output window.
290  unsigned srcStartY = 0;
291  unsigned dstStartY = 0;
292  while (dstStartY < dstHeight) {
293  // Currently this is true because the source frame height
294  // is always >= dstHeight/(dstStep/srcStep).
295  assert(srcStartY < srcHeight);
296 
297  // get region with equal lineWidth
298  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
299  unsigned srcEndY = srcStartY + srcStep;
300  unsigned dstEndY = dstStartY + dstStep;
301  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
302  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
303  srcEndY += srcStep;
304  dstEndY += dstStep;
305  }
306 
307  // fill region
308  //fprintf(stderr, "post processing lines %d-%d: %d\n",
309  // srcStartY, srcEndY, lineWidth );
310  output.lock();
311  float horStretch = renderSettings.getHorizontalStretch();
312  unsigned inWidth = lrintf(horStretch);
313  std::unique_ptr<ScalerOutput<Pixel>> dst(
315  output, pixelOps, inWidth));
316  currScaler->scaleImage(
318  srcStartY, srcEndY, lineWidth, // source
319  *dst, dstStartY, dstEndY); // dest
320 
321  // next region
322  srcStartY = srcEndY;
323  dstStartY = dstEndY;
324  }
325 
326  drawNoise(output);
327 
328  output.flushFrameBuffer();
329 }
330 
331 template <class Pixel>
332 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
333  std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
334 {
335  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
336  std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
337  for (auto y : xrange(screen.getHeight())) {
338  noiseShift[y] = distribution(generator) * 16;
339  }
340 
341  return PostProcessor::rotateFrames(std::move(finishedFrame), time);
342 }
343 
344 
// Explicit instantiations of FBPostProcessor for the supported pixel types.
// The member functions above are defined in this source file, so presumably
// other translation units rely on these instantiations -- TODO confirm.
// Each pixel width is only instantiated when the corresponding HAVE_xxBPP
// macro (defined outside this file) evaluates to non-zero.
345 // Force template instantiation.
346 #if HAVE_16BPP
347 template class FBPostProcessor<uint16_t>;
348 #endif
349 #if HAVE_32BPP
350 template class FBPostProcessor<uint32_t>;
351 #endif
352 
353 } // namespace openmsx
void lock()
Lock this OutputSurface.
Definition: OutputSurface.cc:7
Represents the output window/screen of openMSX.
Definition: Display.hh:31
static unsigned getLineWidth(FrameSource *frame, unsigned y, unsigned step)
Returns the maximum width for lines [y..y+step).
virtual void clearScreen()=0
Clear screen (paint it black).
auto xrange(T e)
Definition: xrange.hh:170
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
vecN< N, T > min(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:269
int clip(int x)
Clips x to the range [LO,HI].
Definition: Math.hh:101
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
A frame buffer where pixels can be written to.
uint32_t Pixel
vecN< N, T > max(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:287
void paint(OutputSurface &output) override
Paint this layer.
mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
SSE_ALIGNED(static signed char noiseBuf[NOISE_BUF_SIZE])
void attach(Observer< T > &observer)
Definition: Subject.hh:49
RenderSettings & renderSettings
Render settings.
void update(const Setting &setting) override
Definition: VideoLayer.cc:49
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
virtual void flushFrameBuffer()
For SDLGL-FB-nn, copy frame buffer to OpenGL display.
ScaleAlgorithm getScaleAlgorithm() const
The current scaling algorithm.
unsigned gcd(unsigned a, unsigned b)
Calculate greatest common divider of two strictly positive integers.
Definition: Math.hh:134
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
OutputSurface & screen
The surface which is visible to the user.
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
int g
Abstract base class for post processors.
FrameSource * paintFrame
Represents a frame as it should be displayed.
float getHorizontalStretch() const
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
const RawFrame * superImposeVideoFrame
Video frame on which to superimpose the (VDP) output.
void detach(Observer< T > &observer)
Definition: Subject.hh:55
bool getInterleaveBlackFrame() const
Is black frame interleaving enabled?
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
TclObject t
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
unsigned getHeight() const
Gets the number of lines in this frame.
Definition: FrameSource.hh:44
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
Rasterizer using SDL.
const SDL_PixelFormat & getSDLFormat() const