openMSX
Scanline.cc
Go to the documentation of this file.
1#include "Scanline.hh"
2#include "PixelOperations.hh"
3#include "enumerate.hh"
4#include "narrow.hh"
5#include "unreachable.hh"
6#include "xrange.hh"
7#include <cassert>
8#include <cstddef>
9#ifdef __SSE2__
10#include <emmintrin.h>
11#endif
12
13namespace openmsx {
14
15// class Multiply<uint16_t>
16
18 : pixelOps(pixelOps_)
19{
20}
21
23{
24 if (f == factor) return;
25 factor = f;
26
27 for (auto [p, t] : enumerate(tab)) {
28 auto pix = uint32_t(p);
29 t = ((((pix & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask()) |
30 ((((pix & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask()) |
31 ((((pix & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask());
32 }
33}
34
35inline uint16_t Multiply<uint16_t>::multiply(uint16_t p, unsigned f) const
36{
37 unsigned r = (((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask();
38 unsigned g = (((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask();
39 unsigned b = (((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask();
40 return r | g | b;
41}
42
43inline uint16_t Multiply<uint16_t>::multiply(uint16_t p) const
44{
45 return tab[p];
46}
47
48
49// class Multiply<uint32_t>
50
52{
53}
54
56{
57 factor = f;
58}
59
60inline uint32_t Multiply<uint32_t>::multiply(uint32_t p, unsigned f) const
61{
63}
64
65inline uint32_t Multiply<uint32_t>::multiply(uint32_t p) const
66{
67 return multiply(p, factor);
68}
69
70
71#ifdef __SSE2__
72
73// 32bpp
// 32bpp: handle one 16-byte group.
// 'in1', 'in2' and 'out' are accessed with aligned 128-bit loads/stores.
// 'f' holds the 16-bit multiplier used for every byte lane.
static inline void drawSSE2_1(
	const char* __restrict in1, const char* __restrict in2,
	char* __restrict out, __m128i f)
{
	const __m128i zeroes = _mm_setzero_si128();

	// Byte-wise rounded average of the two inputs.
	const __m128i avg = _mm_avg_epu8(
		_mm_load_si128(reinterpret_cast<const __m128i*>(in1)),
		_mm_load_si128(reinterpret_cast<const __m128i*>(in2)));

	// Widen each byte to 16 bits and keep the upper 16 bits of the
	// 16x16-bit product with 'f'.
	const __m128i lowHalf  = _mm_mulhi_epu16(_mm_unpacklo_epi8(avg, zeroes), f);
	const __m128i highHalf = _mm_mulhi_epu16(_mm_unpackhi_epi8(avg, zeroes), f);

	// Pack back to bytes (unsigned saturation) and write the result.
	_mm_store_si128(reinterpret_cast<__m128i*>(out),
	                _mm_packus_epi16(lowHalf, highHalf));
}
89static inline void drawSSE2(
90 const uint32_t* __restrict in1_,
91 const uint32_t* __restrict in2_,
92 uint32_t* __restrict out_,
93 unsigned factor,
94 size_t width,
95 PixelOperations<uint32_t>& /*dummy*/,
96 Multiply<uint32_t>& /*dummy*/)
97{
98 width *= sizeof(uint32_t); // in bytes
99 assert(width >= 64);
100 assert((reinterpret_cast<uintptr_t>(in1_) % sizeof(__m128i)) == 0);
101 assert((reinterpret_cast<uintptr_t>(in2_) % sizeof(__m128i)) == 0);
102 assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
103 const auto* in1 = reinterpret_cast<const char*>(in1_) + width;
104 const auto* in2 = reinterpret_cast<const char*>(in2_) + width;
105 auto* out = reinterpret_cast< char*>(out_) + width;
106
107 __m128i f = _mm_set1_epi16(narrow_cast<int16_t>(factor << 8));
108 ptrdiff_t x = -ptrdiff_t(width);
109 do {
110 drawSSE2_1(in1 + x + 0, in2 + x + 0, out + x + 0, f);
111 drawSSE2_1(in1 + x + 16, in2 + x + 16, out + x + 16, f);
112 drawSSE2_1(in1 + x + 32, in2 + x + 32, out + x + 32, f);
113 drawSSE2_1(in1 + x + 48, in2 + x + 48, out + x + 48, f);
114 x += 64;
115 } while (x < 0);
116}
117
118// 16bpp
119static inline void drawSSE2(
120 const uint16_t* __restrict in1_,
121 const uint16_t* __restrict in2_,
122 uint16_t* __restrict out_,
123 unsigned factor,
124 size_t width,
125 PixelOperations<uint16_t>& pixelOps,
126 Multiply<uint16_t>& darkener)
127{
128 width *= sizeof(uint16_t); // in bytes
129 assert(width >= 16);
130 const auto* in1 = reinterpret_cast<const char*>(in1_) + width;
131 const auto* in2 = reinterpret_cast<const char*>(in2_) + width;
132 auto* out = reinterpret_cast< char*>(out_) + width;
133
134 darkener.setFactor(factor);
135 auto table = darkener.getTable();
136 __m128i mask = _mm_set1_epi16(narrow_cast<int16_t>(pixelOps.getBlendMask()));
137
138 ptrdiff_t x = -ptrdiff_t(width);
139 do {
140 __m128i a = *reinterpret_cast<const __m128i*>(in1 + x);
141 __m128i b = *reinterpret_cast<const __m128i*>(in2 + x);
142 __m128i c = _mm_add_epi16(
143 _mm_and_si128(a, b),
144 _mm_srli_epi16(
145 _mm_and_si128(mask, _mm_xor_si128(a, b)),
146 1));
147 *reinterpret_cast<__m128i*>(out + x) = _mm_set_epi16(
148 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 7)]),
149 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 6)]),
150 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 5)]),
151 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 4)]),
152 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 3)]),
153 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 2)]),
154 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 1)]),
155 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 0)]));
156 // An alternative for the above statement is this block (this
157 // is close to what we has in our old MMX routine). On gcc this
158 // generates significantly shorter (25%) but also significantly
159 // slower (30%) code. On clang both alternatives generate
160 // identical code, comparable in size to the fast gcc version
161 // (but still a bit faster).
162 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 0)], 0);
163 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 1)], 1);
164 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 2)], 2);
165 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 3)], 3);
166 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 4)], 4);
167 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 5)], 5);
168 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 6)], 6);
169 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 7)], 7);
170 //*reinterpret_cast<__m128i*>(out + x) = c;
171
172 x += 16;
173 } while (x < 0);
174}
175
176#endif
177
178
179// class Scanline
180
181template<std::unsigned_integral Pixel>
183 : darkener(pixelOps_)
184 , pixelOps(pixelOps_)
185{
186}
187
188template<std::unsigned_integral Pixel>
190 std::span<const Pixel> src1, std::span<const Pixel> src2,
191 std::span<Pixel> dst, unsigned factor)
192{
193 auto width = src1.size();
194 assert(src1.size() == width);
195 assert(src2.size() == width);
196 assert(dst .size() == width);
197#ifdef __SSE2__
198 drawSSE2(src1.data(), src2.data(), dst.data(), factor, width, pixelOps, darkener);
199#else
200 // non-SSE2 routine, both 16bpp and 32bpp
201 darkener.setFactor(factor);
202 for (auto x : xrange(width)) {
203 dst[x] = darkener.multiply(
204 pixelOps.template blend<1, 1>(src1[x], src2[x]));
205 }
206#endif
207}
208
209template<std::unsigned_integral Pixel>
210Pixel Scanline<Pixel>::darken(Pixel p, unsigned factor) const
211{
212 return darkener.multiply(p, factor);
213}
214
215template<std::unsigned_integral Pixel>
216Pixel Scanline<Pixel>::darken(Pixel p1, Pixel p2, unsigned factor) const
217{
218 return darkener.multiply(pixelOps.template blend<1, 1>(p1, p2), factor);
219}
220
// Explicitly instantiate Scanline for each pixel width that is enabled
// through the HAVE_16BPP / HAVE_32BPP build switches.
#if HAVE_16BPP
template class Scanline<uint16_t>;
#endif
#if HAVE_32BPP
template class Scanline<uint32_t>;
#endif
228
229} // namespace openmsx
int g
TclObject t
Helper class to perform 'pixel x scalar' calculations.
Definition: Scanline.hh:15
static Pixel multiply(Pixel p, unsigned x)
Perform a component wise multiplication of a pixel with an 8-bit fractional value: result = (pixel * ...
Helper class to draw scanlines.
Definition: Scanline.hh:46
void draw(std::span< const Pixel > src1, std::span< const Pixel > src2, std::span< Pixel > dst, unsigned factor)
Draws a scanline.
Definition: Scanline.cc:189
Pixel darken(Pixel p, unsigned factor) const
Darken one pixel.
Definition: Scanline.cc:210
Scanline(const PixelOperations< Pixel > &pixelOps)
Definition: Scanline.cc:182
constexpr auto enumerate(Iterable &&iterable)
Heavily inspired by Nathan Reed's blog post: Python-Like enumerate() In C++17 http://reedbeta....
Definition: enumerate.hh:28
This file implements 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
size_t size(std::string_view utf8)
constexpr auto xrange(T e)
Definition: xrange.hh:133