openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1 #include "FBPostProcessor.hh"
2 #include "RawFrame.hh"
3 #include "StretchScalerOutput.hh"
4 #include "ScalerOutput.hh"
5 #include "RenderSettings.hh"
6 #include "Scaler.hh"
7 #include "ScalerFactory.hh"
8 #include "SDLOutputSurface.hh"
9 #include "Math.hh"
10 #include "aligned.hh"
11 #include "checked_cast.hh"
12 #include "random.hh"
13 #include "xrange.hh"
14 #include <algorithm>
15 #include <cassert>
16 #include <cmath>
17 #include <cstdint>
18 #include <cstddef>
19 #ifdef __SSE2__
20 #include <emmintrin.h>
21 #endif
22 
23 namespace openmsx {
24 
25 constexpr unsigned NOISE_SHIFT = 8192;
26 constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
27 alignas(SSE_ALIGNMENT) static signed char noiseBuf[NOISE_BUF_SIZE];
28 
29 template <class Pixel>
30 void FBPostProcessor<Pixel>::preCalcNoise(float factor)
31 {
32  // We skip noise drawing if the factor is 0, so there is no point in
33  // initializing the random data in that case.
34  if (factor == 0.0f) return;
35 
36  // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
37  // 4 element boundaries) must have the same value. Later optimizations
38  // depend on it.
39 
40  float scale[4];
41  if (sizeof(Pixel) == 4) {
42  // 32bpp
43  // TODO ATM we compensate for big endian here. A better
44  // alternative is to turn noiseBuf into an array of ints (it's
45  // now bytes) and in the 16bpp code extract R,G,B components
46  // from those ints
47  const auto p = Pixel(OPENMSX_BIGENDIAN ? 0x00010203
48  : 0x03020100);
49  // TODO we can also fill the array with 'factor' and only set
50  // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
51  // way to get the position of the alpha byte (yet).
52  scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
53  scale[pixelOps.red (p)] = factor;
54  scale[pixelOps.green(p)] = factor;
55  scale[pixelOps.blue (p)] = factor;
56  } else {
57  // 16bpp
58  scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
59  scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
60  scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
61  scale[3] = 0.0f;
62  }
63 
64  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
65  std::normal_distribution<float> distribution(0.0f, 1.0f);
66  for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
67  float r = distribution(generator);
68  noiseBuf[i + 0] = Math::clip<-128, 127>(roundf(r * scale[0]));
69  noiseBuf[i + 1] = Math::clip<-128, 127>(roundf(r * scale[1]));
70  noiseBuf[i + 2] = Math::clip<-128, 127>(roundf(r * scale[2]));
71  noiseBuf[i + 3] = Math::clip<-128, 127>(roundf(r * scale[3]));
72  }
73 }
74 
75 #ifdef __SSE2__
/** Add noise to one line of 32bpp pixels, using SSE2.
  * @param buf_  Pixel data, modified in place. Must be 16-byte aligned.
  * @param noise One signed noise byte per color component (4 per pixel).
  *              Must be 16-byte aligned.
  * @param width Number of pixels. The line size in bytes must be a multiple
  *              of 64. A width of 0 leaves the buffer untouched.
  */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	// To each of the RGBA color components (a value in range [0..255]) we
	// want to add a signed noise value (in range [-128..127]) and also clip
	// the result to the range [0..255]. There is no SSE instruction that
	// directly performs this operation. But we can:
	//   - subtract 128 from the RGBA component to get a signed byte
	//   - perform the addition with signed saturation
	//   - add 128 to the result to get back to the unsigned byte range
	// For 8-bit values the following 3 expressions are equivalent:
	//   x + 128 == x - 128 == x ^ 128
	// So the expression becomes:
	//   signed_add_sat(value ^ 128, noise) ^ 128
	// The following loop does just that, though it processes 64 bytes per
	// iteration.
	const size_t numBytes = width * sizeof(uint32_t);
	assert((numBytes & 63) == 0);
	assert((uintptr_t(buf_) & 15) == 0);
	assert((uintptr_t(noise) & 15) == 0); // aligned loads are used below

	auto* pixels = reinterpret_cast<__m128i*>(buf_);
	const auto* nse = reinterpret_cast<const __m128i*>(noise);
	const __m128i signBit = _mm_set1_epi8(-128); // 0x80

	// The loop condition is checked up front, so an empty line (width == 0)
	// does not access any memory.
	const size_t numVecs = numBytes / sizeof(__m128i);
	for (size_t i = 0; i < numVecs; i += 4) {
		__m128i i0 = _mm_load_si128(pixels + i + 0);
		__m128i i1 = _mm_load_si128(pixels + i + 1);
		__m128i i2 = _mm_load_si128(pixels + i + 2);
		__m128i i3 = _mm_load_si128(pixels + i + 3);
		__m128i n0 = _mm_load_si128(nse + i + 0);
		__m128i n1 = _mm_load_si128(nse + i + 1);
		__m128i n2 = _mm_load_si128(nse + i + 2);
		__m128i n3 = _mm_load_si128(nse + i + 3);
		__m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, signBit), n0), signBit);
		__m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, signBit), n1), signBit);
		__m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, signBit), n2), signBit);
		__m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, signBit), n3), signBit);
		_mm_store_si128(pixels + i + 0, o0);
		_mm_store_si128(pixels + i + 1, o1);
		_mm_store_si128(pixels + i + 2, o2);
		_mm_store_si128(pixels + i + 3, o3);
	}
}
120 #endif
121 
/** Add noise to the 4 components of a packed 32-bit pixel, with clipping.
  * @param p 4 packed color components, each an unsigned value in [0..255].
  * @param n 4 packed noise values, each a signed (two's complement) value
  *          in [-128..127].
  * @return Per component 'p + n', clipped to the range [0..255].
  */
static inline uint32_t addNoise4(uint32_t p, uint32_t n)
{
	// Unclipped result (lower 8 bits of each component).
	// The even and the odd components are summed separately: that way the
	// carry out of a component ends up in a byte that is masked away (or
	// falls off the top of the 32-bit word) and can never influence the
	// neighboring component.
	// Note: computing 'p + n' in one go and afterwards subtracting the
	// observed carry-in bits is NOT equivalent. When a component sums to
	// exactly 0xFF and receives a carry from its lower neighbor, the carry
	// ripples through and the next component ends up one too low.
	const uint32_t s20 = ((p & 0x00FF00FF) + (n & 0x00FF00FF)) & 0x00FF00FF;
	const uint32_t s31 = ((p & 0xFF00FF00) + (n & 0xFF00FF00)) & 0xFF00FF00;
	const uint32_t s = s20 | s31;

	// Underflow of a component happens ONLY
	//   WHEN input  component is in range [0, 127]
	//   AND  noise  component is negative
	//   AND  result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input  component is in range [128, 255]
	//   AND  noise  component is positive
	//   AND  result component is in range [0, 127]
	// Create a mask per component containing 00 for no under/overflow,
	//                                        FF for    under/overflow
	// ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	const uint32_t t = (p ^ n) & (p ^ s) & 0x80808080;
	const uint32_t u1 = t & s; // underflow  (alternative: u1 = t & n)
	// Expand the single bit 7 of each component into a full 0xFF byte.
	// alternative1: uint32_t u2 = u1 | (u1 >> 1);
	//               uint32_t u4 = u2 | (u2 >> 2);
	//               uint32_t u8 = u4 | (u4 >> 4);
	// alternative2: uint32_t u8 = (u1 >> 7) * 0xFF;
	const uint32_t u8 = (u1 << 1) - (u1 >> 7);

	const uint32_t o1 = t & p; // overflow
	const uint32_t o8 = (o1 << 1) - (o1 >> 7);

	// clip result: underflowed components become 0x00, overflowed 0xFF
	return (s & (~u8)) | o8;
}
163 
164 template <class Pixel>
165 void FBPostProcessor<Pixel>::drawNoiseLine(
166  Pixel* buf, signed char* noise, size_t width)
167 {
168 #ifdef __SSE2__
169  if (sizeof(Pixel) == 4) {
170  // cast to avoid compilation error in case of 16bpp (even
171  // though this code is dead in that case).
172  auto* buf32 = reinterpret_cast<uint32_t*>(buf);
173  drawNoiseLineSse2(buf32, noise, width);
174  return;
175  }
176 #endif
177  // c++ version
178  if (sizeof(Pixel) == 4) {
179  // optimized version for 32bpp
180  auto noise4 = reinterpret_cast<uint32_t*>(noise);
181  for (size_t i = 0; i < width; ++i) {
182  buf[i] = addNoise4(buf[i], noise4[i]);
183  }
184  } else {
185  int mr = pixelOps.getMaxRed();
186  int mg = pixelOps.getMaxGreen();
187  int mb = pixelOps.getMaxBlue();
188  for (size_t i = 0; i < width; ++i) {
189  Pixel p = buf[i];
190  int r = pixelOps.red(p);
191  int g = pixelOps.green(p);
192  int b = pixelOps.blue(p);
193 
194  r += noise[4 * i + 0];
195  g += noise[4 * i + 1];
196  b += noise[4 * i + 2];
197 
198  r = std::min(std::max(r, 0), mr);
199  g = std::min(std::max(g, 0), mg);
200  b = std::min(std::max(b, 0), mb);
201 
202  buf[i] = pixelOps.combine(r, g, b);
203  }
204  }
205 }
206 
207 template <class Pixel>
208 void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
209 {
210  if (renderSettings.getNoise() == 0.0f) return;
211 
212  auto& output = checked_cast<SDLOutputSurface&>(output_);
213  auto [w, h] = output.getLogicalSize();
214  auto pixelAccess = output.getDirectPixelAccess();
215  for (int y = 0; y < h; ++y) {
216  auto* buf = pixelAccess.getLinePtr<Pixel>(y);
217  drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
218  }
219 }
220 
221 template <class Pixel>
222 void FBPostProcessor<Pixel>::update(const Setting& setting)
223 {
224  VideoLayer::update(setting);
225  auto& noiseSetting = renderSettings.getNoiseSetting();
226  if (&setting == &noiseSetting) {
227  preCalcNoise(noiseSetting.getDouble());
228  }
229 }
230 
231 
232 template <class Pixel>
234  Display& display_, OutputSurface& screen_, const std::string& videoSource,
235  unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
236  : PostProcessor(
237  motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
238  canDoInterlace_)
239  , noiseShift(screen.getLogicalHeight())
240  , pixelOps(screen.getPixelFormat())
241 {
242  scaleAlgorithm = RenderSettings::NO_SCALER;
243  scaleFactor = unsigned(-1);
244 
245  auto& noiseSetting = renderSettings.getNoiseSetting();
246  noiseSetting.attach(*this);
247  preCalcNoise(noiseSetting.getDouble());
248  assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
249 }
250 
251 template <class Pixel>
253 {
254  renderSettings.getNoiseSetting().detach(*this);
255 }
256 
257 template <class Pixel>
259 {
260  auto& output = checked_cast<SDLOutputSurface&>(output_);
261  if (renderSettings.getInterleaveBlackFrame()) {
262  interleaveCount ^= 1;
263  if (interleaveCount) {
264  output.clearScreen();
265  return;
266  }
267  }
268 
269  if (!paintFrame) return;
270 
271  // New scaler algorithm selected?
272  auto algo = renderSettings.getScaleAlgorithm();
273  unsigned factor = renderSettings.getScaleFactor();
274  if ((scaleAlgorithm != algo) || (scaleFactor != factor)) {
275  scaleAlgorithm = algo;
276  scaleFactor = factor;
278  PixelOperations<Pixel>(output.getPixelFormat()),
279  renderSettings);
280  }
281 
282  // Scale image.
283  const unsigned srcHeight = paintFrame->getHeight();
284  const unsigned dstHeight = output.getLogicalHeight();
285 
286  unsigned g = Math::gcd(srcHeight, dstHeight);
287  unsigned srcStep = srcHeight / g;
288  unsigned dstStep = dstHeight / g;
289 
290  // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
291  // on the PC screen, as a preparation for resizable output window.
292  unsigned srcStartY = 0;
293  unsigned dstStartY = 0;
294  while (dstStartY < dstHeight) {
295  // Currently this is true because the source frame height
296  // is always >= dstHeight/(dstStep/srcStep).
297  assert(srcStartY < srcHeight);
298 
299  // get region with equal lineWidth
300  unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
301  unsigned srcEndY = srcStartY + srcStep;
302  unsigned dstEndY = dstStartY + dstStep;
303  while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
304  (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
305  srcEndY += srcStep;
306  dstEndY += dstStep;
307  }
308 
309  // fill region
310  //fprintf(stderr, "post processing lines %d-%d: %d\n",
311  // srcStartY, srcEndY, lineWidth );
312  float horStretch = renderSettings.getHorizontalStretch();
313  unsigned inWidth = lrintf(horStretch);
314  std::unique_ptr<ScalerOutput<Pixel>> dst(
316  output, pixelOps, inWidth));
317  currScaler->scaleImage(
318  *paintFrame, superImposeVideoFrame,
319  srcStartY, srcEndY, lineWidth, // source
320  *dst, dstStartY, dstEndY); // dest
321 
322  // next region
323  srcStartY = srcEndY;
324  dstStartY = dstEndY;
325  }
326 
327  drawNoise(output);
328 
329  output.flushFrameBuffer();
330 }
331 
332 template <class Pixel>
333 std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
334  std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
335 {
336  auto& generator = global_urng(); // fast (non-cryptographic) random numbers
337  std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
338  for (auto y : xrange(screen.getLogicalHeight())) {
339  noiseShift[y] = distribution(generator) * 16;
340  }
341 
342  return PostProcessor::rotateFrames(std::move(finishedFrame), time);
343 }
344 
345 
346 // Force template instantiation.
347 #if HAVE_16BPP
348 template class FBPostProcessor<uint16_t>;
349 #endif
350 #if HAVE_32BPP
351 template class FBPostProcessor<uint32_t>;
352 #endif
353 
354 } // namespace openmsx
n3
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
xrange
auto xrange(T e)
Definition: xrange.hh:170
SSE_ALIGNMENT
constexpr auto SSE_ALIGNMENT
Definition: aligned.hh:13
openmsx::RenderSettings::NO_SCALER
Definition: RenderSettings.hh:39
gl::min
vecN< N, T > min(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:274
openmsx::PixelOperations
Definition: PixelOperations.hh:142
aligned.hh
random.hh
openmsx::NOISE_SHIFT
constexpr unsigned NOISE_SHIFT
Definition: FBPostProcessor.cc:25
RawFrame.hh
openmsx::StretchScalerOutputFactory
Definition: StretchScalerOutput.hh:13
t
TclObject t
Definition: TclObject_test.cc:264
openmsx::RenderSettings::getNoiseSetting
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
Definition: RenderSettings.hh:91
openmsx::Subject::attach
void attach(Observer< T > &observer)
Definition: Subject.hh:50
ScalerFactory.hh
openmsx::FBPostProcessor::paint
void paint(OutputSurface &output) override
Paint this layer.
Definition: FBPostProcessor.cc:258
openmsx::Pixel
uint32_t Pixel
Definition: GLHQLiteScaler.cc:93
Scaler.hh
openmsx::OutputSurface::getLogicalWidth
int getLogicalWidth() const
Definition: OutputSurface.hh:27
ScalerOutput.hh
SDLOutputSurface.hh
openmsx::MSXMotherBoard
Definition: MSXMotherBoard.hh:59
openmsx::FBPostProcessor
Rasterizer using SDL.
Definition: FBPostProcessor.hh:18
openmsx::PostProcessor
Abstract base class for post processors.
Definition: PostProcessor.hh:29
openmsx::FBPostProcessor::FBPostProcessor
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
Definition: FBPostProcessor.cc:233
RenderSettings.hh
openmsx::ScalerFactory::createScaler
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
Definition: ScalerFactory.cc:28
g
int g
Definition: ScopedAssign_test.cc:20
FBPostProcessor.hh
openmsx::x
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:1377
checked_cast.hh
openmsx::PostProcessor::rotateFrames
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
Definition: PostProcessor.cc:93
gl::scale
mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
openmsx::NOISE_BUF_SIZE
constexpr unsigned NOISE_BUF_SIZE
Definition: FBPostProcessor.cc:26
openmsx::Display
Represents the output window/screen of openMSX.
Definition: Display.hh:31
Math::gcd
unsigned gcd(unsigned a, unsigned b)
Calculate greatest common divider of two strictly positive integers.
Definition: Math.hh:135
openmsx::FBPostProcessor::rotateFrames
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
Definition: FBPostProcessor.cc:333
Math::clip
int clip(int x)
Clips x to the range [LO,HI].
Definition: Math.hh:102
openmsx::OutputSurface
A frame buffer where pixels can be written to.
Definition: OutputSurface.hh:19
gl::max
vecN< N, T > max(const vecN< N, T > &x, const vecN< N, T > &y)
Definition: gl_vec.hh:292
i3
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
Math.hh
global_urng
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
StretchScalerOutput.hh
openmsx::PostProcessor::renderSettings
RenderSettings & renderSettings
Render settings.
Definition: PostProcessor.hh:106
openmsx
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
xrange.hh
openmsx::VideoLayer::update
void update(const Setting &setting) override
Definition: VideoLayer.cc:49
o3
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
openmsx::PostProcessor::screen
OutputSurface & screen
The surface which is visible to the user.
Definition: PostProcessor.hh:109
openmsx::FBPostProcessor::~FBPostProcessor
~FBPostProcessor() override
Definition: FBPostProcessor.cc:252