openMSX
FBPostProcessor.cc
Go to the documentation of this file.
1#include "FBPostProcessor.hh"
2#include "RawFrame.hh"
4#include "ScalerOutput.hh"
5#include "RenderSettings.hh"
6#include "Scaler.hh"
7#include "ScalerFactory.hh"
8#include "SDLOutputSurface.hh"
9#include "aligned.hh"
10#include "checked_cast.hh"
11#include "endian.hh"
12#include "narrow.hh"
13#include "random.hh"
14#include "xrange.hh"
15#include <algorithm>
16#include <array>
17#include <cassert>
18#include <cmath>
19#include <cstdint>
20#include <cstddef>
21#include <numeric>
22#ifdef __SSE2__
23#include <emmintrin.h>
24#endif
25
26namespace openmsx {
27
// Exclusive upper bound (in bytes) for the per-line start offsets into
// 'noiseBuf' that rotateFrames() picks (multiples of 16 below this value).
28static constexpr unsigned NOISE_SHIFT = 8192;
// Table of signed noise bytes, filled by preCalcNoise() and read by
// drawNoise(). It is twice NOISE_SHIFT bytes long so that a line starting at
// any offset below NOISE_SHIFT can still read a further NOISE_SHIFT bytes.
// ALIGNAS_SSE is a project macro (aligned.hh); presumably it requests the
// 16-byte alignment that the aligned SSE2 loads in drawNoiseLineSse2() need
// -- TODO confirm.
29ALIGNAS_SSE static std::array<int8_t, 2 * NOISE_SHIFT> noiseBuf;
30
31template<std::unsigned_integral Pixel>
32void FBPostProcessor<Pixel>::preCalcNoise(float factor)
33{
34 // We skip noise drawing if the factor is 0, so there is no point in
35 // initializing the random data in that case.
36 if (factor == 0.0f) return;
37
38 // for 32bpp groups of 4 consecutive noiseBuf elements (starting at
39 // 4 element boundaries) must have the same value. Later optimizations
40 // depend on it.
41
42 std::array<float, 4> scale;
43 if constexpr (sizeof(Pixel) == 4) {
44 // 32bpp
45 // TODO ATM we compensate for big endian here. A better
46 // alternative is to turn noiseBuf into an array of ints (it's
47 // now bytes) and in the 16bpp code extract R,G,B components
48 // from those ints
49 const auto p = Pixel(Endian::BIG ? 0x00010203 : 0x03020100);
50 // TODO we can also fill the array with 'factor' and only set
51 // 'alpha' to 0.0. But PixelOperations doesn't offer a simple
52 // way to get the position of the alpha byte (yet).
53 ranges::fill(scale, 0.0f);
54 scale[pixelOps.red (p)] = factor;
55 scale[pixelOps.green(p)] = factor;
56 scale[pixelOps.blue (p)] = factor;
57 } else {
58 // 16bpp
59 scale[0] = (narrow_cast<float>(pixelOps.getMaxRed()) / 255.0f) * factor;
60 scale[1] = (narrow_cast<float>(pixelOps.getMaxGreen()) / 255.0f) * factor;
61 scale[2] = (narrow_cast<float>(pixelOps.getMaxBlue()) / 255.0f) * factor;
62 scale[3] = 0.0f;
63 }
64
65 auto& generator = global_urng(); // fast (non-cryptographic) random numbers
66 std::normal_distribution<float> distribution(0.0f, 1.0f);
67 for (unsigned i = 0; i < noiseBuf.size(); i += 4) {
68 float r = distribution(generator);
69 noiseBuf[i + 0] = narrow<int8_t>(std::clamp(int(roundf(r * scale[0])), -128, 127));
70 noiseBuf[i + 1] = narrow<int8_t>(std::clamp(int(roundf(r * scale[1])), -128, 127));
71 noiseBuf[i + 2] = narrow<int8_t>(std::clamp(int(roundf(r * scale[2])), -128, 127));
72 noiseBuf[i + 3] = narrow<int8_t>(std::clamp(int(roundf(r * scale[3])), -128, 127));
73 }
74}
75
76#ifdef __SSE2__
77static inline void drawNoiseLineSse2(uint32_t* buf_, signed char* noise, size_t width)
78{
79 // To each of the RGBA color components (a value in range [0..255]) we
80 // want to add a signed noise value (in range [-128..127]) and also clip
81 // the result to the range [0..255]. There is no SSE instruction that
82 // directly performs this operation. But we can:
83 // - subtract 128 from the RGBA component to get a signed byte
84 // - perform the addition with signed saturation
85 // - add 128 to the result to get back to the unsigned byte range
86 // For 8-bit values the following 3 expressions are equivalent:
87 // x + 128 == x - 128 == x ^ 128
88 // So the expression becomes:
89 // signed_add_sat(value ^ 128, noise) ^ 128
90 // The following loop does just that, though it processes 64 bytes per
91 // iteration.
92 auto x = narrow<ptrdiff_t>(width * sizeof(uint32_t));
93 assert((x & 63) == 0);
94 assert((uintptr_t(buf_) & 15) == 0);
95
96 char* buf = reinterpret_cast<char*>(buf_) + x;
97 char* nse = reinterpret_cast<char*>(noise) + x;
98 x = -x;
99
100 __m128i b7 = _mm_set1_epi8(-128); // 0x80
101 do {
102 __m128i i0 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 0));
103 __m128i i1 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 16));
104 __m128i i2 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 32));
105 __m128i i3 = _mm_load_si128(reinterpret_cast<__m128i*>(buf + x + 48));
106 __m128i n0 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 0));
107 __m128i n1 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 16));
108 __m128i n2 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 32));
109 __m128i n3 = _mm_load_si128(reinterpret_cast<__m128i*>(nse + x + 48));
110 __m128i o0 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i0, b7), n0), b7);
111 __m128i o1 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i1, b7), n1), b7);
112 __m128i o2 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i2, b7), n2), b7);
113 __m128i o3 = _mm_xor_si128(_mm_adds_epi8(_mm_xor_si128(i3, b7), n3), b7);
114 _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 0), o0);
115 _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 16), o1);
116 _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 32), o2);
117 _mm_store_si128(reinterpret_cast<__m128i*>(buf + x + 48), o3);
118 x += 4 * sizeof(__m128i);
119 } while (x < 0);
120}
121#endif
122
/** Add signed noise to a pixel made of 4 packed 8-bit components.
  * @param p 4 unsigned 8-bit components, each in range [0, 255].
  * @param n 4 signed (two's complement) 8-bit noise components, each in
  *          range [-128, 127], at the same byte positions as in 'p'.
  * @return Per component: clip<0, 255>(p + n).
  */
static constexpr uint32_t addNoise4(uint32_t p, uint32_t n)
{
	constexpr uint32_t MSB  = 0x80808080; // top bit of every component
	constexpr uint32_t LOW7 = 0x7F7F7F7F; // lower 7 bits of every component

	// Unclipped result (lower 8 bits of each component).
	// Add the lower 7 bits of all components at once: the carry of such a
	// sum lands in bit 7 of the same component, it can never spill into a
	// neighbor. The top bits are then added carry-less with xor.
	//
	// (Computing 'p + n' and afterwards subtracting the carry-in bits of
	// bits 8, 16 and 24 does NOT work: when a component sums to exactly
	// 255 and also receives a carry-in, removing that carry borrows from
	// the next component, leaving it one too low.)
	uint32_t s = ((p & LOW7) + (n & LOW7)) ^ ((p ^ n) & MSB);

	// Underflow of a component happens ONLY
	//   WHEN input  component is in range [0, 127]
	//   AND  noise  component is negative
	//   AND  result component is in range [128, 255]
	// Overflow of a component happens ONLY
	//   WHEN input  component is in range [128, 255]
	//   AND  noise  component is positive
	//   AND  result component is in range [0, 127]
	// In both cases the top bit of the input differs from the top bit of
	// the noise as well as from the top bit of the result:
	//   ((~p & n & s) | (p & ~n & ~s)) == ((p ^ n) & (p ^ s))
	uint32_t t = (p ^ n) & (p ^ s) & MSB;

	// Expand the per-component flag in bit 7 into a full 00/FF byte mask.
	//   alternative: mask = (flag >> 7) * 0xFF
	uint32_t u1 = t & s; // underflow (alternative: u1 = t & n)
	uint32_t u8 = (u1 << 1) - (u1 >> 7);

	uint32_t o1 = t & p; // overflow
	uint32_t o8 = (o1 << 1) - (o1 >> 7);

	// clip result: underflowed components become 00, overflowed ones FF
	return (s & (~u8)) | o8;
}
164
165template<std::unsigned_integral Pixel>
166void FBPostProcessor<Pixel>::drawNoiseLine(
167 std::span<Pixel> buf, signed char* noise)
168{
169 auto width = buf.size();
170#ifdef __SSE2__
171 if constexpr (sizeof(Pixel) == 4) {
172 // cast to avoid compilation error in case of 16bpp (even
173 // though this code is dead in that case).
174 auto* buf32 = reinterpret_cast<uint32_t*>(buf.data());
175 drawNoiseLineSse2(buf32, noise, width);
176 return;
177 }
178#endif
179 // c++ version
180 if constexpr (sizeof(Pixel) == 4) {
181 // optimized version for 32bpp
182 auto* noise4 = reinterpret_cast<uint32_t*>(noise);
183 for (auto i : xrange(width)) {
184 buf[i] = addNoise4(buf[i], noise4[i]);
185 }
186 } else {
187 int mr = pixelOps.getMaxRed();
188 int mg = pixelOps.getMaxGreen();
189 int mb = pixelOps.getMaxBlue();
190 for (auto i : xrange(width)) {
191 Pixel p = buf[i];
192 int r = pixelOps.red(p);
193 int g = pixelOps.green(p);
194 int b = pixelOps.blue(p);
195
196 r += noise[4 * i + 0];
197 g += noise[4 * i + 1];
198 b += noise[4 * i + 2];
199
200 r = std::clamp(r, 0, mr);
201 g = std::clamp(g, 0, mg);
202 b = std::clamp(b, 0, mb);
203
204 buf[i] = pixelOps.combine(r, g, b);
205 }
206 }
207}
208
209template<std::unsigned_integral Pixel>
210void FBPostProcessor<Pixel>::drawNoise(OutputSurface& output_)
211{
212 if (renderSettings.getNoise() == 0.0f) return;
213
214 auto& output = checked_cast<SDLOutputSurface&>(output_);
215 auto [w, h] = output.getLogicalSize();
216 auto pixelAccess = output.getDirectPixelAccess();
217 for (auto y : xrange(h)) {
218 auto buf = pixelAccess.getLine<Pixel>(y).subspan(0, w);
219 drawNoiseLine(buf, &noiseBuf[noiseShift[y]]);
220 }
221}
222
// Setting-changed callback: when the changed setting is the noise setting,
// the noise table is regenerated with the new noise factor.
// NOTE(review): the embedded listing numbers jump from 225 to 227 inside this
// body, so one original statement seems to be missing from this copy of the
// file -- confirm against the upstream source before editing this function.
223template<std::unsigned_integral Pixel>
224void FBPostProcessor<Pixel>::update(const Setting& setting) noexcept
225{
227 auto& noiseSetting = renderSettings.getNoiseSetting();
// Settings are compared by address (identity), not by value.
228 if (&setting == &noiseSetting) {
229 preCalcNoise(noiseSetting.getFloat());
230 }
231}
232
233
// Constructor fragment.
// NOTE(review): listing lines 235 and 238 (presumably the start of the
// signature and the start of the base-class initializer) are missing from this
// copy of the file -- confirm against the upstream source.
234template<std::unsigned_integral Pixel>
236 Display& display_, OutputSurface& screen_, const std::string& videoSource,
237 unsigned maxWidth_, unsigned height_, bool canDoInterlace_)
239 motherBoard_, display_, screen_, videoSource, maxWidth_, height_,
240 canDoInterlace_)
// one noise start offset per logical screen line (filled in rotateFrames())
241 , noiseShift(screen.getLogicalHeight())
242 , pixelOps(screen.getPixelFormat())
243{
// Get notified (see update()) when the noise setting changes and build the
// initial noise table for the current value.
244 auto& noiseSetting = renderSettings.getNoiseSetting();
245 noiseSetting.attach(*this);
246 preCalcNoise(noiseSetting.getFloat());
// Line start offsets into noiseBuf stay below NOISE_SHIFT, so a line of fewer
// than NOISE_SHIFT bytes cannot run past the 2 * NOISE_SHIFT byte table.
247 assert((screen.getLogicalWidth() * sizeof(Pixel)) < NOISE_SHIFT);
248}
249
// Fragment that detaches this object from the noise setting again.
// NOTE(review): listing line 251 (presumably the destructor signature) is
// missing from this copy of the file -- confirm against the upstream source.
250template<std::unsigned_integral Pixel>
252{
253 renderSettings.getNoiseSetting().detach(*this);
254}
255
// Scales the current frame ('paintFrame') onto the output surface, adds noise
// and flushes the frame buffer.
// NOTE(review): listing lines 257, 280 and 283 are missing from this copy of
// the file (presumably the function signature and the left-hand sides of the
// two assignments that (re)create the scaler objects) -- confirm against the
// upstream source.
256template<std::unsigned_integral Pixel>
258{
259 auto& output = checked_cast<SDLOutputSurface&>(output_);
// With black-frame interleaving enabled, every other call only clears the
// screen and returns.
260 if (renderSettings.getInterleaveBlackFrame()) {
261 interleaveCount ^= 1;
262 if (interleaveCount) {
263 output.clearScreen();
264 return;
265 }
266 }
267
// Nothing to paint without a frame.
268 if (!paintFrame) return;
269
270 // New scaler algorithm selected? Or different horizontal stretch?
271 auto algo = renderSettings.getScaleAlgorithm();
272 unsigned factor = renderSettings.getScaleFactor();
273 unsigned inWidth = narrow<unsigned>(lrintf(renderSettings.getHorizontalStretch()));
// A different output surface also invalidates the cached scaler objects.
274 if ((scaleAlgorithm != algo) || (scaleFactor != factor) ||
275 (inWidth != stretchWidth) || (lastOutput != &output)) {
276 scaleAlgorithm = algo;
277 scaleFactor = factor;
278 stretchWidth = inWidth;
279 lastOutput = &output;
281 PixelOperations<Pixel>(output.getPixelFormat()),
282 renderSettings);
284 output, pixelOps, inWidth);
285 }
286
287 // Scale image.
288 const unsigned srcHeight = paintFrame->getHeight();
289 const unsigned dstHeight = output.getLogicalHeight();
290
// Reduce the source:destination height ratio to lowest terms. Regions are
// advanced in steps of 'srcStep' source lines and 'dstStep' output lines.
291 unsigned g = std::gcd(srcHeight, dstHeight);
292 unsigned srcStep = srcHeight / g;
293 unsigned dstStep = dstHeight / g;
294
295 // TODO: Store all MSX lines in RawFrame and only scale the ones that fit
296 // on the PC screen, as a preparation for resizable output window.
297 unsigned srcStartY = 0;
298 unsigned dstStartY = 0;
299 stretchScaler->frameStart();
300 while (dstStartY < dstHeight) {
301 // Currently this is true because the source frame height
302 // is always >= dstHeight/(dstStep/srcStep).
303 assert(srcStartY < srcHeight);
304
305 // get region with equal lineWidth
306 unsigned lineWidth = getLineWidth(paintFrame, srcStartY, srcStep);
307 unsigned srcEndY = srcStartY + srcStep;
308 unsigned dstEndY = dstStartY + dstStep;
// Extend the region step by step for as long as the following group of source
// lines has the same line width and neither frame end has been reached.
309 while ((srcEndY < srcHeight) && (dstEndY < dstHeight) &&
310 (getLineWidth(paintFrame, srcEndY, srcStep) == lineWidth)) {
311 srcEndY += srcStep;
312 dstEndY += dstStep;
313 }
314
315 // fill region
316 //fprintf(stderr, "post processing lines %d-%d: %d\n",
317 // srcStartY, srcEndY, lineWidth);
318 currScaler->scaleImage(
319 *paintFrame, superImposeVideoFrame,
320 srcStartY, srcEndY, lineWidth, // source
321 *stretchScaler, dstStartY, dstEndY); // dest
322
323 // next region
324 srcStartY = srcEndY;
325 dstStartY = dstEndY;
326 }
327 stretchScaler->frameStop();
328
// Noise is added to the already scaled output pixels.
329 drawNoise(output);
330
331 output.flushFrameBuffer();
332}
333
334template<std::unsigned_integral Pixel>
335std::unique_ptr<RawFrame> FBPostProcessor<Pixel>::rotateFrames(
336 std::unique_ptr<RawFrame> finishedFrame, EmuTime::param time)
337{
338 auto& generator = global_urng(); // fast (non-cryptographic) random numbers
339 std::uniform_int_distribution<int> distribution(0, NOISE_SHIFT / 16 - 1);
340 for (auto y : xrange(screen.getLogicalHeight())) {
341 noiseShift[y] = narrow<uint16_t>(distribution(generator) * 16);
342 }
343
344 return PostProcessor::rotateFrames(std::move(finishedFrame), time);
345}
346
347
348// Force template instantiation.
// One explicit instantiation per pixel size, each only when the matching
// HAVE_16BPP / HAVE_32BPP build macro evaluates to true. This is needed
// because the member functions are defined in this source file.
349#if HAVE_16BPP
350template class FBPostProcessor<uint16_t>;
351#endif
352#if HAVE_32BPP
353template class FBPostProcessor<uint32_t>;
354#endif
355
356} // namespace openmsx
BaseSetting * setting
Definition: Interpreter.cc:28
int g
TclObject t
#define ALIGNAS_SSE
Definition: aligned.hh:24
Represents the output window/screen of openMSX.
Definition: Display.hh:33
Rasterizer using SDL.
FBPostProcessor(MSXMotherBoard &motherBoard, Display &display, OutputSurface &screen, const std::string &videoSource, unsigned maxWidth, unsigned height, bool canDoInterlace)
void paint(OutputSurface &output) override
Paint this layer.
std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time) override
Sets up the "abcdFrame" variables for a new frame.
A frame buffer where pixels can be written to.
int getLogicalWidth() const
Abstract base class for post processors.
virtual std::unique_ptr< RawFrame > rotateFrames(std::unique_ptr< RawFrame > finishedFrame, EmuTime::param time)
Sets up the "abcdFrame" variables for a new frame.
RenderSettings & renderSettings
Render settings.
OutputSurface & screen
The surface which is visible to the user.
FloatSetting & getNoiseSetting()
The amount of noise to add to the frame.
static std::unique_ptr< Scaler< Pixel > > createScaler(const PixelOperations< Pixel > &pixelOps, RenderSettings &renderSettings)
Instantiates a Scaler.
void attach(Observer< T > &observer)
Definition: Subject.hh:50
unsigned size() const
Definition: TclObject.hh:167
void update(const Setting &setting) noexcept override
Definition: VideoLayer.cc:48
mat3 n3(vec3(1, 0, 3), vec3(4, 5, 6), vec3(7, 8, 9))
mat3 o3(vec3(1, 2, 3), vec3(4, 5, 0), vec3(7, 8, 9))
imat3 i3(ivec3(1, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
constexpr bool BIG
Definition: endian.hh:15
constexpr vecN< N, T > clamp(const vecN< N, T > &x, const vecN< N, T > &minVal, const vecN< N, T > &maxVal)
Definition: gl_vec.hh:294
constexpr mat4 scale(const vec3 &xyz)
Definition: gl_transform.hh:19
This file implemented 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
constexpr void fill(ForwardRange &&range, const T &value)
Definition: ranges.hh:287
auto & global_urng()
Return reference to a (shared) global random number generator.
Definition: random.hh:8
constexpr auto subspan(Range &&range, size_t offset, size_t count=std::dynamic_extent)
Definition: ranges.hh:446
static std::unique_ptr< ScalerOutput< Pixel > > create(SDLOutputSurface &output, PixelOperations< Pixel > pixelOps, unsigned inWidth)
constexpr auto xrange(T e)
Definition: xrange.hh:132