openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "OutputSurface.hh"
9 #include "Math.hh"
10 #include "aligned.hh"
11 #include "random.hh"
12 #include "xrange.hh"
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstdint>
17 #include <cstddef>
18 #ifdef __SSE2__
19 #include <emmintrin.h>
20 #endif
21 
22 namespace openmsx {
23 
24 constexpr unsigned NOISE_SHIFT = 8192;
25 constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
26 alignas(SSE_ALIGNMENT) static signed char noiseBuf[NOISE_BUF_SIZE];
27 
28 template <class Pixel>
29 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
30 {
31  // We skip noise drawing if the factor is 0, so there is no point in
32  // initializing the random data in that case.
33  if (factor == 0.0f) return;
34 
35  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
36  // 4 element boundaries) must have the same value. Later optimizations
37  // depend on it.
38 
39  float scale[4];
40  if (sizeof(Pixel) == 4) {
41  // 32bpp
42  // TODO ATM we compensate for big endian here. A better
43  // alternative is to turn noiseBuf into an array of ints (it's
44  // now bytes) and in the 16bpp code extract R,G,B components
45  // from those ints
46  const auto p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
47  : 0x03020100);
48  // TODO we can also fill the array with 'factor' and only set
49  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
50  // way to get the position of the alpha byte (yet).
51  scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
52  scale[pixelOps.red (p)] = factor;
53  scale[pixelOps.green(p)] = factor;
54  scale[pixelOps.blue (p)] = factor;
55  } else {
56  // 16bpp
57  scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
58  scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
59  scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
60  scale[3] = 0.0f;
61  }
62 
63  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
64  std::normal_distribution<float> distribution(0.0f, 1.0f);
65  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
66  float r = distribution(generator);
67  noiseBuf[i + 0] = Math::clip<-128, 127>(roundf(r * scale[0]));
68  noiseBuf[i + 1] = Math::clip<-128, 127>(roundf(r * scale[1]));
69  noiseBuf[i + 2] = Math::clip<-128, 127>(roundf(r * scale[2]));
70  noiseBuf[i + 3] = Math::clip<-128, 127>(roundf(r * scale[3]));
71  }
72 }
73 
#ifdef __SSE2__
/**
 * Add pre-computed noise to one line of 32bpp pixels, using SSE2.
 *
 * @param buf_  Start of the pixel line. Must be 16-byte aligned.
 * @param noise Signed noise values, one byte per color component. Must be
 *              16-byte aligned and hold at least 'width * 4' bytes.
 * @param width Number of pixels. 'width * 4' must be a multiple of 64.
 *              A width of 0 leaves the buffer untouched.
 */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	// To each of the RGBA color components (a value in range [0..255]) we
	// want to add a signed noise value (in range [-128..127]) and also clip
	// the result to the range [0..255]. There is no SSE instruction that
	// directly performs this operation. But we can:
	//   - subtract 128 from the RGBA component to get a signed byte
	//   - perform the addition with signed saturation
	//   - add 128 to the result to get back to the unsigned byte range
	// For 8-bit values the following 3 expressions are equivalent:
	//   x + 128 == x - 128 == x ^ 128
	// So the expression becomes:
	//   signed_add_sat(value ^ 128, noise) ^ 128
	// The following loop does just that, though it processes 64 bytes per
	// iteration.
	const size_t numBytes = width * sizeof(uint32_t);
	assert((numBytes & 63) == 0);
	assert((uintptr_t(buf_)  & 15) == 0);
	assert((uintptr_t(noise) & 15) == 0); // aligned loads are used below

	auto*       pix = reinterpret_cast<__m128i*>(buf_);
	const auto* nse = reinterpret_cast<const __m128i*>(noise);
	const size_t numVecs = numBytes / sizeof(__m128i);

	const __m128i b7 = _mm_set1_epi8(-128); // 0x80 in every byte
	// Pre-tested loop: unlike a do/while this does nothing for width == 0.
	for (size_t v = 0; v < numVecs; v += 4) {
		const __m128i i0 = _mm_load_si128(pix + v + 0);
		const __m128i i1 = _mm_load_si128(pix + v + 1);
		const __m128i i2 = _mm_load_si128(pix + v + 2);
		const __m128i i3 = _mm_load_si128(pix + v + 3);
		const __m128i n0 = _mm_load_si128(nse + v + 0);
		const __m128i n1 = _mm_load_si128(nse + v + 1);
		const __m128i n2 = _mm_load_si128(nse + v + 2);
		const __m128i n3 = _mm_load_si128(nse + v + 3);
		const __m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
		const __m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
		const __m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
		const __m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
		_mm_store_si128(pix + v + 0, o0);
		_mm_store_si128(pix + v + 1, o1);
		_mm_store_si128(pix + v + 2, o2);
		_mm_store_si128(pix + v + 3, o3);
	}
}
#endif
120 
/**
 * Add noise to the 4 byte-sized components of a 32bpp pixel at once.
 *
 * Every byte of 'p' is treated as an unsigned value in [0..255], every
 * byte of 'n' as a signed value in [-128..127]. Each result byte is
 * 'p + n' clipped to [0..255]; components never influence each other.
 *
 * @param p Packed pixel (4 unsigned components).
 * @param n Packed noise (4 signed components).
 * @return The packed, per-component clipped sum.
 */
static inline uint32_t addNoise4(uint32_t p, uint32_t n)
{
	// Unclipped result (lower 8 bits of each component).
	// The even and the odd bytes are added separately, so a carry out of
	// one component lands in a zeroed neighbor byte and is masked away.
	//
	// Note: the shorter 'p + n' followed by subtracting the carry-in bits
	// '(p ^ n ^ (p + n)) & 0x01010100' is NOT equivalent: those bits are
	// the rippled carries, so a component that sums to exactly 255 and
	// also receives a carry passes a spurious carry on to the next one
	// (e.g. p=0x00007F80, n=0x00808080).
	uint32_t s20 = ((p & 0x00FF00FF) + (n & 0x00FF00FF)) & 0x00FF00FF;
	uint32_t s31 = ((p & 0xFF00FF00) + (n & 0xFF00FF00)) & 0xFF00FF00;
	uint32_t s = s20 | s31;

	// Underflow of a component happens ONLY
	//   WHEN input  component is in range [0, 127]
	//   AND  noise  component is negative
	//   AND  result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input  component is in range [128, 255]
	//   AND  noise  component is positive
	//   AND  result component is in range [0, 127]
	// Create a mask per component containing 00 for no under/overflow,
	//                                         FF for    under/overflow
	// ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	uint32_t t = (p ^ n) & (p ^ s) & 0x80808080;
	uint32_t u1 = t & s; // underflow (alternative: u1 = t & n)
	// Expand bit 7 of each component into a full 00/FF byte mask.
	// alternative1: uint32_t u2 = u1 | (u1 >> 1);
	//               uint32_t u4 = u2 | (u2 >> 2);
	//               uint32_t u8 = u4 | (u4 >> 4);
	// alternative2: uint32_t u8 = (u1 >> 7) * 0xFF;
	uint32_t u8 = (u1 << 1) - (u1 >> 7);

	uint32_t o1 = t & p; // overflow
	uint32_t o8 = (o1 << 1) - (o1 >> 7);

	// clip result: underflowed components become 00, overflowed ones FF
	return (s & (~u8)) | o8;
}
162 
163 template <class Pixel>
165  Pixel* buf, signed char* noise, size_t width)
166 {
167 #ifdef __SSE2__
168  if (sizeof(Pixel) == 4) {
169  // cast to avoid compilation error in case of 16bpp (even
170  // though this code is dead in that case).
171  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
172  drawNoiseLineSse2(buf32, noise, width);
173  return;
174  }
175 #endif
176  // c++ version
177  if (sizeof(Pixel) == 4) {
178  // optimized version for 32bpp
179  auto noise4 = reinterpret_cast<uint32_t*>(noise);
180  for (size_t i = 0; i < width; ++i) {
181  buf[i] = addNoise4(buf[i], noise4[i]);
182  }
183  } else {
184  int mr = pixelOps.getMaxRed();
185  int mg = pixelOps.getMaxGreen();
186  int mb = pixelOps.getMaxBlue();
187  for (size_t i = 0; i < width; ++i) {
188  Pixel p = buf[i];
189  int r = pixelOps.red(p);
190  int g = pixelOps.green(p);
191  int b = pixelOps.blue(p);
192 
193  r += noise[4 * i + 0];
194  g += noise[4 * i + 1];
195  b += noise[4 * i + 2];
196 
197  r = std::min(std::max(r, 0), mr);
198  g = std::min(std::max(g, 0), mg);
199  b = std::min(std::max(b, 0), mb);
200 
201  buf[i] = pixelOps.combine(r, g, b);
202  }
203  }
204 }
205 
206 template <class Pixel>
208 {
209  if (renderSettings.getNoise() == 0.0f) return;
210 
211  unsigned h = output.getHeight();
212  unsigned w = output.getWidth();
213  output.lock();
214  for (unsigned y = 0; y < h; ++y) {
215  auto* buf = output.getLinePtrDirect<Pixel>(y);
216  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
217  }
218 }
219 
220 template <class Pixel>
221 void FBPostProcessor<Pixel>::update(const Setting& setting)
222 {
223  VideoLayer::update(setting);
224  auto& noiseSetting = renderSettings.getNoiseSetting();
225  if (&setting == &noiseSetting) {
226  preCalcNoise(noiseSetting.getDouble());
227  }
228 }
229 
230 
231 template <class Pixel>
233  Display& display_, OutputSurface& screen_, const std::string& videoSource,
234  unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
235  : PostProcessor(
236  motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
237  canDoInterlace_)
238  , noiseShift(screen.getHeight())
239  , pixelOps(screen.getSDLFormat())
240 {
241  scaleAlgorithm = RenderSettings::NO_SCALER;
242  scaleFactor = unsigned(-1);
243 
244  auto& noiseSetting = renderSettings.getNoiseSetting();
245  noiseSetting.attach(*this);
246  preCalcNoise(noiseSetting.getDouble());
247  assert((screen.getWidth() * sizeof(Pixel)) < NOISE_SHIFT);
248 }
249 
250 template <class Pixel>
252 {
254 }
255 
256 template <class Pixel>
258 {
260  interleaveCount ^= 1;
261  if (interleaveCount) {
262  output.clearScreen();
263  return;
264  }
265  }
266 
267  if (!paintFrame) return;
268 
269  // New scaler algorithm selected?
270  auto algo = renderSettings.getScaleAlgorithm();
271  unsigned factor = renderSettings.getScaleFactor();
272  if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
273  scaleAlgorithm = algo;
274  scaleFactor = factor;
278  }
279 
280  // Scale image.
281  const unsigned srcHeight = paintFrame->getHeight();
282  const unsigned dstHeight = output.getHeight();
283 
284  unsigned g = Math::gcd(srcHeight, dstHeight);
285  unsigned srcStep = srcHeight / g;
286  unsigned dstStep = dstHeight / g;
287 
288  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
289  // on the PC screen, as a preparation for resizable output window.
290  unsigned srcStartY = 0;
291  unsigned dstStartY = 0;
292  while (dstStartY < dstHeight) {
293  // Currently this is true because the source frame height
294  // is always >= dstHeight/(dstStep/srcStep).
295  assert(srcStartY < srcHeight);
296 
297  // get region with equal lineWidth
298  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
299  unsigned srcEndY = srcStartY + srcStep;
300  unsigned dstEndY = dstStartY + dstStep;
301  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
302  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
303  srcEndY += srcStep;
304  dstEndY += dstStep;
305  }
306 
307  // fill region
308  //fprintf(stderr, "post processing lines %d-%d: %d\n",
309  // srcStartY, srcEndY, lineWidth );
310  output.lock();
311  float horStretch = renderSettings.getHorizontalStretch();
312  unsigned inWidth = lrintf(horStretch);
313  std::unique_ptr<ScalerOutput<Pixel>> dst(
315  output, pixelOps, inWidth));
316  currScaler->scaleImage(
318  srcStartY, srcEndY, lineWidth, // source
319  *dst, dstStartY, dstEndY); // dest
320 
321  // next region
322  srcStartY = srcEndY;
323  dstStartY = dstEndY;
324  }
325 
326  drawNoise(output);
327 
328  output.flushFrameBuffer();
329 }
330 
331 template <class Pixel>
332 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
333  std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
334 {
335  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
336  std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
337  for (auto y : xrange(screen.getHeight())) {
338  noiseShift[y] = distribution(generator) * 16;
339  }
340 
341  return PostProcessor::rotateFrames(std::move(finishedFrame), time);
342 }
343 
344 
// Explicitly instantiate FBPostProcessor for every pixel format that is
// enabled in this build; the member functions above are defined in this
// source file rather than in a header.
#if HAVE_16BPP
template class FBPostProcessor<uint16_t>;
#endif
#if HAVE_32BPP
template class FBPostProcessor<uint32_t>;
#endif
352 
353 } // namespace openmsx
void lock()
Lock this OutputSurface.
Definition: OutputSurface.cc:7
Represents the output window/screen of openMSX.
Definition: Display.hh:31
static unsigned getLineWidth(FrameSource *frame, unsigned y, unsigned step)
Returns the maximum width for lines [y..y+step).
auto xrange(T e)
Definition: xrange.hh:170
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
vecN< N, T > min(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:274
int clip(int x)
Clips x to the range [LO,HI].
Definition: Math.hh:101
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
A frame buffer where pixels can be written to.
uint32_t Pixel
vecN< N, T > max(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:292
void paint(OutputSurface &output) override
Paint this layer.
mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
constexpr unsigned NOISE_SHIFT
void attach(Observer< T > &observer)
Definition: Subject.hh:50
RenderSettings & renderSettings
Render settings.
void update(const Setting &setting) override
Definition: VideoLayer.cc:49
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
virtual void flushFrameBuffer()
Copy frame buffer to display buffer.
ScaleAlgorithm getScaleAlgorithm() const
The current scaling algorithm.
unsigned gcd(unsigned a, unsigned b)
Calculate greatest common divider of two strictly positive integers.
Definition: Math.hh:134
constexpr auto SSE_ALIGNMENT
Definition: aligned.hh:13
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
OutputSurface & screen
The surface which is visible to the user.
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
int g
virtual void clearScreen()
Clear frame buffer (paint it black).
Pixel * getLinePtrDirect(unsigned y)
Returns a pointer to the requested line in the pixel buffer.
Abstract base class for post processors.
FrameSource * paintFrame
Represents a frame as it should be displayed.
float getHorizontalStretch() const
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
const RawFrame * superImposeVideoFrame
Video frame on which to superimpose the (VDP) output.
constexpr unsigned NOISE_BUF_SIZE
void detach(Observer< T > &observer)
Definition: Subject.hh:56
bool getInterleaveBlackFrame() const
Is black frame interleaving enabled?
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:1377
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
TclObject t
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
unsigned getHeight() const
Gets the number of lines in this frame.
Definition: FrameSource.hh:44
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
Rasterizer using SDL.
const SDL_PixelFormat & getSDLFormat() const