openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1#include "FBPostProcessor.hh"
2#include "RawFrame.hh"
4#include "ScalerOutput.hh"
5#include "RenderSettings.hh"
6#include "Scaler.hh"
7#include "ScalerFactory.hh"
8#include "SDLOutputSurface.hh"
9#include "aligned.hh"
10#include "checked_cast.hh"
11#include "endian.hh"
12#include "random.hh"
13#include "xrange.hh"
14#include <algorithm>
15#include <cassert>
16#include <cmath>
17#include <cstdint>
18#include <cstddef>
19#include <numeric>
20#ifdef __SSE2__
21#include <emmintrin.h>
22#endif
23
24namespace openmsx {
25
// File-level noise lookup table.
// Each output line reads noise starting at an offset that is a multiple of 16
// and smaller than NOISE_SHIFT (chosen in rotateFrames()), and the constructor
// asserts that one line of pixels occupies fewer than NOISE_SHIFT bytes.
// Making the buffer twice NOISE_SHIFT therefore keeps every per-line read
// inside the buffer.
26constexpr unsigned NOISE_SHIFT = 8192;
27constexpr unsigned NOISE_BUF_SIZE = 2 * NOISE_SHIFT;
// Signed per-byte noise values, written in groups of 4 bytes by the
// precalculation routine below.
28ALIGNAS_SSE static signed char noiseBuf[NOISE_BUF_SIZE];
29
// Precalculates the contents of noiseBuf for a given noise strength.
// NOTE(review): listing line 31, which carried this function's declarator, is
// missing from this extract. Elsewhere in this file the routine is invoked as
// preCalcNoise(noiseSetting.getDouble()), and the body below reads a value
// named 'factor'.
// One normally distributed sample (mean 0, standard deviation 1) is drawn per
// group of 4 bytes; each byte of the group gets that sample multiplied by its
// own scale, rounded, and clamped to the signed char range [-128, 127].
30template<std::unsigned_integral Pixel>
32{
33 // We skip noise drawing if the factor is 0, so there is no point in
34 // initializing the random data in that case.
35 if (factor == 0.0f) return;
36
37 // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
38 // 4 element boundaries) must have the same value. Later optimizations
39 // depend on it.
40
// Multiplier applied to the random sample for each of the 4 bytes in a group.
41 float scale[4];
42 if constexpr (sizeof(Pixel) == 4) {
43 // 32bpp
44 // TODO ATM we compensate for big endian here. A better
45 // alternative is to turn noiseBuf into an array of ints (it's
46 // now bytes) and in the 16bpp code extract R,G,B components
47 // from those ints
48 const auto p = Pixel(Endian::BIG ? 0x00010203 : 0x03020100);
49 // TODO we can also fill the array with 'factor' and only set
50 // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
51 // way to get the position of the alpha byte (yet).
// NOTE(review): presumably red()/green()/blue() applied to the probe pixel
// 'p' yield the byte index of each color channel, so only the three color
// bytes receive noise and the remaining byte keeps scale 0 -- confirm
// against PixelOperations.
52 scale[0] = scale[1] = scale[2] = scale[3] = 0.0f;
53 scale[pixelOps.red (p)] = factor;
54 scale[pixelOps.green(p)] = factor;
55 scale[pixelOps.blue (p)] = factor;
56 } else {
57 // 16bpp
// Scale the noise by each component's maximum value relative to 255;
// the 4th byte of each group is unused and stays 0.
58 scale[0] = (pixelOps.getMaxRed() / 255.0f) * factor;
59 scale[1] = (pixelOps.getMaxGreen() / 255.0f) * factor;
60 scale[2] = (pixelOps.getMaxBlue() / 255.0f) * factor;
61 scale[3] = 0.0f;
62 }
63
64 auto& generator = global_urng(); // fast (non-cryptographic) random numbers
65 std::normal_distribution<float> distribution(0.0f, 1.0f);
// All 4 bytes of a group share one random sample 'r' and differ only by
// their scale.
66 for (unsigned i = 0; i < NOISE_BUF_SIZE; i += 4) {
67 float r = distribution(generator);
68 noiseBuf[i + 0] = std::clamp(int(roundf(r * scale[0])), -128, 127);
69 noiseBuf[i + 1] = std::clamp(int(roundf(r * scale[1])), -128, 127);
70 noiseBuf[i + 2] = std::clamp(int(roundf(r * scale[2])), -128, 127);
71 noiseBuf[i + 3] = std::clamp(int(roundf(r * scale[3])), -128, 127);
72 }
73}
74
75#ifdef __SSE2__
/** Add noise to one line of 32bpp pixels, in place, using SSE2.
  * @param buf_  Start of the pixel line; must be 16-byte aligned.
  * @param noise Start of the noise bytes for this line (one signed byte per
  *              pixel byte); must be 16-byte aligned.
  * @param width Number of pixels in the line. The line length in bytes must
  *              be a multiple of 64. A width of 0 leaves all memory untouched.
  */
static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
{
	// To each of the RGBA color components (a value in range [0..255]) we
	// want to add a signed noise value (in range [-128..127]) and also clip
	// the result to the range [0..255]. There is no SSE instruction that
	// directly performs this operation. But we can:
	//   - subtract 128 from the RGBA component to get a signed byte
	//   - perform the addition with signed saturation
	//   - add 128 to the result to get back to the unsigned byte range
	// For 8-bit values the following 3 expressions are equivalent:
	//   x + 128 == x - 128 == x ^ 128
	// So the expression becomes:
	//   signed_add_sat(value ^ 128, noise) ^ 128
	// The loop below does just that, 64 bytes per iteration.
	const auto numBytes = static_cast<ptrdiff_t>(width * sizeof(uint32_t));
	assert((numBytes & 63) == 0);
	assert((uintptr_t(buf_) & 15) == 0);
	// The noise is read with aligned loads as well, so it has the same
	// alignment requirement as the pixel buffer.
	assert((uintptr_t(noise) & 15) == 0);

	// Point at the END of both arrays and walk a negative offset up to 0;
	// the loop then needs only a single counter and a sign test.
	char* bufEnd = reinterpret_cast<char*>(buf_) + numBytes;
	const char* nseEnd = reinterpret_cast<const char*>(noise) + numBytes;

	const __m128i signBit = _mm_set1_epi8(-128); // 0x80 in every byte
	// The condition is tested BEFORE the first iteration, so an empty line
	// (width == 0) performs no loads or stores at all.
	for (ptrdiff_t x = -numBytes; x < 0; x += 64) {
		auto* pix = reinterpret_cast<__m128i*>(bufEnd + x);
		const auto* nse = reinterpret_cast<const __m128i*>(nseEnd + x);

		__m128i i0 = _mm_load_si128(pix + 0);
		__m128i i1 = _mm_load_si128(pix + 1);
		__m128i i2 = _mm_load_si128(pix + 2);
		__m128i i3 = _mm_load_si128(pix + 3);
		__m128i n0 = _mm_load_si128(nse + 0);
		__m128i n1 = _mm_load_si128(nse + 1);
		__m128i n2 = _mm_load_si128(nse + 2);
		__m128i n3 = _mm_load_si128(nse + 3);

		__m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, signBit), n0), signBit);
		__m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, signBit), n1), signBit);
		__m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, signBit), n2), signBit);
		__m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, signBit), n3), signBit);

		_mm_store_si128(pix + 0, o0);
		_mm_store_si128(pix + 1, o1);
		_mm_store_si128(pix + 2, o2);
		_mm_store_si128(pix + 3, o3);
	}
}
120#endif
121
/** Add signed noise to a pixel, per component, with saturation.
  * @param p Four packed 8-bit unsigned components, each in range [0, 255].
  * @param n Four packed 8-bit signed (two's complement) components, each in
  *          range [-128, 127].
  * @return Per component: clip<0, 255>(p + n).
  */
static constexpr uint32_t addNoise4(uint32_t p, uint32_t n)
{
	// Unclipped result (lower 8 bits of each component).
	// The even and the odd components are added separately, so that the
	// carry out of a component lands in the (masked away) gap above it and
	// can never reach a neighboring component.
	// Note: computing 'p + n' on the whole word and afterwards subtracting
	// the carry-in bits of bits 8/16/24 is NOT equivalent: when a carry
	// ripples through a component whose own sum is exactly 0xFF, the
	// rippled carry is subtracted as well and corrupts the components
	// above it (e.g. p = 0x0000FFFF, n = 0x00000001).
	constexpr uint32_t EVEN = 0x00FF00FF;
	constexpr uint32_t ODD  = 0xFF00FF00;
	const uint32_t s20 = ((p & EVEN) + (n & EVEN)) & EVEN;
	const uint32_t s31 = ((p & ODD ) + (n & ODD )) & ODD;
	const uint32_t s = s20 | s31;

	// Underflow of a component happens ONLY
	//   WHEN input  component is in range [0, 127]
	//   AND  noise  component is negative
	//   AND  result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input  component is in range [128, 255]
	//   AND  noise  component is positive
	//   AND  result component is in range [0, 127]
	// Bit 7 of each component of 't' is set when that component under- or
	// overflowed:
	//   ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	const uint32_t t = (p ^ n) & (p ^ s) & 0x80808080;

	// Expand the per-component flag in bit 7 to a full 00/FF byte mask.
	const uint32_t underflow = ((t & s) >> 7) * 0xFF; // (alternative: t & n)
	const uint32_t overflow  = ((t & p) >> 7) * 0xFF;

	// Clip result: underflowed components become 00, overflowed ones FF.
	return (s & ~underflow) | overflow;
}
163
164template<std::unsigned_integral Pixel>
165void FBPostProcessor<Pixel>::drawNoiseLine(
166 Pixel* buf, signed char* noise, size_t width)
167{
168#ifdef __SSE2__
169 if constexpr (sizeof(Pixel) == 4) {
170 // cast to avoid compilation error in case of 16bpp (even
171 // though this code is dead in that case).
172 auto* buf32 = reinterpret_cast<uint32_t*>(buf);
173 drawNoiseLineSse2(buf32, noise, width);
174 return;
175 }
176#endif
177 // c++ version
178 if constexpr (sizeof(Pixel) == 4) {
179 // optimized version for 32bpp
180 auto* noise4 = reinterpret_cast<uint32_t*>(noise);
181 for (auto i : xrange(width)) {
182 buf[i] = addNoise4(buf[i], noise4[i]);
183 }
184 } else {
185 int mr = pixelOps.getMaxRed();
186 int mg = pixelOps.getMaxGreen();
187 int mb = pixelOps.getMaxBlue();
188 for (auto i : xrange(width)) {
189 Pixel p = buf[i];
190 int r = pixelOps.red(p);
191 int g = pixelOps.green(p);
192 int b = pixelOps.blue(p);
193
194 r += noise[4 * i + 0];
195 g += noise[4 * i + 1];
196 b += noise[4 * i + 2];
197
198 r = std::clamp(r, 0, mr);
199 g = std::clamp(g, 0, mg);
200 b = std::clamp(b, 0, mb);
201
202 buf[i] = pixelOps.combine(r, g, b);
203 }
204 }
205}
206
207template<std::unsigned_integral Pixel>
208void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
209{
210 if (renderSettings.getNoise() == 0.0f) return;
211
212 auto& output = checked_cast<SDLOutputSurface&>(output_);
213 auto [w, h] = output.getLogicalSize();
214 auto pixelAccess = output.getDirectPixelAccess();
215 for (auto y : xrange(h)) {
216 auto* buf = pixelAccess.getLinePtr<Pixel>(y);
217 drawNoiseLine(buf, &noiseBuf[noiseShift[y]], w);
218 }
219}
220
// Setting-change callback: when the changed setting is the noise setting,
// the noise table is recalculated with the setting's new value.
// NOTE(review): listing line 224 (the first statement of the body) is missing
// from this extract.
221template<std::unsigned_integral Pixel>
222void FBPostProcessor<Pixel>::update(const Setting& setting) noexcept
223{
225 auto& noiseSetting = renderSettings.getNoiseSetting();
// Identity comparison: only react to the noise setting object itself.
226 if (&setting == &noiseSetting) {
227 preCalcNoise(noiseSetting.getDouble());
228 }
229}
230
231
// Constructor.
// NOTE(review): listing lines 233 and 236 (presumably the declarator and the
// start of the base-class initializer) are missing from this extract.
// Besides initializing the members, it attaches this object to the noise
// setting and calculates the initial noise table.
232template<std::unsigned_integral Pixel>
234 Display& display_, OutputSurface& screen_, const std::string& videoSource,
235 unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
237 motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
238 canDoInterlace_)
// paint() rebuilds its scalers whenever these cached values differ from the
// current settings; presumably these initial values are chosen so that the
// first paint() always rebuilds -- confirm.
239 , scaleAlgorithm(RenderSettings::NO_SCALER)
240 , scaleFactor(unsigned(-1))
241 , stretchWidth(unsigned(-1))
// One noise offset per output line.
242 , noiseShift(screen.getLogicalHeight())
243 , pixelOps(screen.getPixelFormat())
244{
245 auto& noiseSetting = renderSettings.getNoiseSetting();
246 noiseSetting.attach(*this);
247 preCalcNoise(noiseSetting.getDouble());
// A full line of pixels must fit in the part of noiseBuf that follows the
// largest possible per-line offset.
248 assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
249}
250
// NOTE(review): listing line 252 (the declarator, presumably of the
// destructor) is missing from this extract.
// Detaches this object from the noise setting again; the constructor in this
// file performs the matching attach().
251template<std::unsigned_integral Pixel>
253{
254 renderSettings.getNoiseSetting().detach(*this);
255}
256
// Paints the current frame onto the given output surface: scales the frame,
// adds noise and flushes the frame buffer.
// NOTE(review): listing line 258 (the declarator) is missing from this
// extract; the body refers to its parameter as 'output_'.
257template<std::unsigned_integral Pixel>
259{
260 auto& output = checked_cast<SDLOutputSurface&>(output_);
// With the "interleave black frame" setting enabled, every other call only
// clears the screen.
261 if (renderSettings.getInterleaveBlackFrame()) {
262 interleaveCount ^= 1;
263 if (interleaveCount) {
264 output.clearScreen();
265 return;
266 }
267 }
268
// Nothing to draw when there is no frame to paint.
269 if (!paintFrame) return;
270
271 // New scaler algorithm selected? Or different horizontal stretch?
272 auto algo = renderSettings.getScaleAlgorithm();
273 unsigned factor = renderSettings.getScaleFactor();
274 unsigned inWidth = lrintf(renderSettings.getHorizontalStretch());
275 if ((scaleAlgorithm != algo) || (scaleFactor != factor) ||
276 (inWidth != stretchWidth) || (lastOutput != &output)) {
277 scaleAlgorithm = algo;
278 scaleFactor = factor;
279 stretchWidth = inWidth;
280 lastOutput = &output;
// NOTE(review): listing lines 281 and 284 are missing from this extract;
// presumably they start the two calls that (re)create 'currScaler' and
// 'stretchScaler', of which only the argument lists remain below.
282 PixelOperations<Pixel>(output.getPixelFormat()),
283 renderSettings);
285 output, pixelOps, inWidth);
286 }
287
288 // Scale image.
289 const unsigned srcHeight = paintFrame->getHeight();
290 const unsigned dstHeight = output.getLogicalHeight();
291
// Reduce the height ratio to lowest terms: every 'srcStep' source lines
// correspond to 'dstStep' destination lines.
292 unsigned g = std::gcd(srcHeight, dstHeight);
293 unsigned srcStep = srcHeight / g;
294 unsigned dstStep = dstHeight / g;
295
296 // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
297 // on the PC screen, as a preparation for resizable output window.
298 unsigned srcStartY = 0;
299 unsigned dstStartY = 0;
300 while (dstStartY < dstHeight) {
301 // Currently this is true because the source frame height
302 // is always >= dstHeight/(dstStep/srcStep).
303 assert(srcStartY < srcHeight);
304
305 // get region with equal lineWidth
306 unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
307 unsigned srcEndY = srcStartY + srcStep;
308 unsigned dstEndY = dstStartY + dstStep;
// Grow the region one step at a time for as long as the next step has the
// same line width, so that it can be scaled with a single call.
309 while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
310 (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
311 srcEndY += srcStep;
312 dstEndY += dstStep;
313 }
314
315 // fill region
316 //fprintf(stderr, "post processing lines %d-%d: %d\n",
317 // srcStartY, srcEndY, lineWidth);
318 currScaler->scaleImage(
319 *paintFrame, superImposeVideoFrame,
320 srcStartY, srcEndY, lineWidth, // source
321 *stretchScaler, dstStartY, dstEndY); // dest
322
323 // next region
324 srcStartY = srcEndY;
325 dstStartY = dstEndY;
326 }
327
// Noise is added after scaling, directly on the output pixels.
328 drawNoise(output);
329
330 output.flushFrameBuffer();
331}
332
333template<std::unsigned_integral Pixel>
334std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
335 std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
336{
337 auto& generator = global_urng(); // fast (non-cryptographic) random numbers
338 std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
339 for (auto y : xrange(screen.getLogicalHeight())) {
340 noiseShift[y] = distribution(generator) * 16;
341 }
342
343 return PostProcessor::rotateFrames(std::move(finishedFrame), time);
344}
345
346
// The member functions above are defined in this .cc file, so the class
// template is explicitly instantiated here for each pixel type, guarded by
// the HAVE_16BPP / HAVE_32BPP preprocessor flags.
347// Force template instantiation.
348#if HAVE_16BPP
349template class FBPostProcessor<uint16_t>;
350#endif
351#if HAVE_32BPP
352template class FBPostProcessor<uint32_t>;
353#endif
354
355} // namespace openmsx
BaseSetting * setting
Definition: Interpreter.cc:27
int g
TclObject t
#define ALIGNAS_SSE
Definition: aligned.hh:24
Represents the output window/screen of openMSX.
Definition: Display.hh:33
Rasterizer using SDL.
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
void paint(OutputSurface &output) override
Paint this layer.
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
A frame buffer where pixels can be written to.
int getLogicalWidth() const
Abstract base class for post processors.
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
RenderSettings & renderSettings
Render settings.
OutputSurface & screen
The surface which is visible to the user.
Class containing all settings for renderers.
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
void attach(Observer< T > &observer)
Definition: Subject.hh:50
void update(const Setting &setting) noexcept override
Definition: VideoLayer.cc:50
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
constexpr bool BIG
Definition: endian.hh:13
constexpr vecN< N, T > clamp(const vecN< N, T > &x, const vecN< N, T > &minVal, const vecN< N, T > &maxVal)
Definition: gl_vec.hh:292
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr unsigned NOISE_SHIFT
constexpr unsigned NOISE_BUF_SIZE
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:127
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
static std::unique_ptr< ScalerOutput< Pixel > > create(SDLOutputSurface &output, PixelOperations< Pixel > pixelOps, unsigned inWidth)
constexpr auto xrange(T e)
Definition: xrange.hh:133