openMSX
Scanline.cc
Go to the documentation of this file.
1#include "Scanline.hh"
2#include "PixelOperations.hh"
3#include "enumerate.hh"
4#include "narrow.hh"
5#include "unreachable.hh"
6#include "xrange.hh"
7#include <cassert>
8#include <cstddef>
9#ifdef __SSE2__
10#include <emmintrin.h>
11#endif
12
13namespace openmsx {
14
15// class Multiply<uint16_t>
16
18 : pixelOps(pixelOps_)
19{
20}
21
23{
24 if (f == factor) return;
25 factor = f;
26
27 for (auto [p, t] : enumerate(tab)) {
28 auto pix = uint32_t(p);
29 t = narrow_cast<uint16_t>(
30 ((((pix & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask()) |
31 ((((pix & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask()) |
32 ((((pix & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask()));
33 }
34}
35
36inline uint16_t Multiply<uint16_t>::multiply(uint16_t p, unsigned f) const
37{
38 auto r = (((p & pixelOps.getRmask()) * f) >> 8) & pixelOps.getRmask();
39 auto g = (((p & pixelOps.getGmask()) * f) >> 8) & pixelOps.getGmask();
40 auto b = (((p & pixelOps.getBmask()) * f) >> 8) & pixelOps.getBmask();
41 return narrow_cast<uint16_t>(r | g | b);
42}
43
44inline uint16_t Multiply<uint16_t>::multiply(uint16_t p) const
45{
46 return tab[p];
47}
48
49
50// class Multiply<uint32_t>
51
53{
54}
55
57{
58 factor = f;
59}
60
61inline uint32_t Multiply<uint32_t>::multiply(uint32_t p, unsigned f) const
62{
64}
65
66inline uint32_t Multiply<uint32_t>::multiply(uint32_t p) const
67{
68 return multiply(p, factor);
69}
70
71
72#ifdef __SSE2__
73
74// 32bpp
// Process one 16-byte chunk: average the bytes of two input rows and scale
// the result.
//
//  in1, in2  Source addresses; accessed as whole __m128i values, so they
//            must be 16-byte aligned.
//  out       Destination address, same alignment requirement.
//  f         Eight unsigned 16-bit multipliers. Each output byte is the
//            upper 16 bits of (averaged byte * multiplier).
static inline void drawSSE2_1(
	const char* __restrict in1, const char* __restrict in2,
	char* __restrict out, __m128i f)
{
	const __m128i row1 = _mm_load_si128(reinterpret_cast<const __m128i*>(in1));
	const __m128i row2 = _mm_load_si128(reinterpret_cast<const __m128i*>(in2));

	// Rounded per-byte average of both rows.
	const __m128i blended = _mm_avg_epu8(row1, row2);

	// Zero-extend the bytes to 16-bit lanes (lower and upper half
	// separately) and keep the high word of the product with 'f'.
	const __m128i zero = _mm_setzero_si128();
	const __m128i scaledLo = _mm_mulhi_epu16(_mm_unpacklo_epi8(blended, zero), f);
	const __m128i scaledHi = _mm_mulhi_epu16(_mm_unpackhi_epi8(blended, zero), f);

	// Narrow back to bytes (unsigned saturation) and write the chunk.
	_mm_store_si128(reinterpret_cast<__m128i*>(out),
	                _mm_packus_epi16(scaledLo, scaledHi));
}
90static inline void drawSSE2(
91 const uint32_t* __restrict in1_,
92 const uint32_t* __restrict in2_,
93 uint32_t* __restrict out_,
94 unsigned factor,
95 size_t width,
96 PixelOperations<uint32_t>& /*dummy*/,
97 Multiply<uint32_t>& /*dummy*/)
98{
99 width *= sizeof(uint32_t); // in bytes
100 assert(width >= 64);
101 assert((reinterpret_cast<uintptr_t>(in1_) % sizeof(__m128i)) == 0);
102 assert((reinterpret_cast<uintptr_t>(in2_) % sizeof(__m128i)) == 0);
103 assert((reinterpret_cast<uintptr_t>(out_) % sizeof(__m128i)) == 0);
104 const auto* in1 = reinterpret_cast<const char*>(in1_) + width;
105 const auto* in2 = reinterpret_cast<const char*>(in2_) + width;
106 auto* out = reinterpret_cast< char*>(out_) + width;
107
108 __m128i f = _mm_set1_epi16(narrow_cast<int16_t>(factor << 8));
109 ptrdiff_t x = -ptrdiff_t(width);
110 do {
111 drawSSE2_1(in1 + x + 0, in2 + x + 0, out + x + 0, f);
112 drawSSE2_1(in1 + x + 16, in2 + x + 16, out + x + 16, f);
113 drawSSE2_1(in1 + x + 32, in2 + x + 32, out + x + 32, f);
114 drawSSE2_1(in1 + x + 48, in2 + x + 48, out + x + 48, f);
115 x += 64;
116 } while (x < 0);
117}
118
119// 16bpp
120static inline void drawSSE2(
121 const uint16_t* __restrict in1_,
122 const uint16_t* __restrict in2_,
123 uint16_t* __restrict out_,
124 unsigned factor,
125 size_t width,
126 PixelOperations<uint16_t>& pixelOps,
127 Multiply<uint16_t>& darkener)
128{
129 width *= sizeof(uint16_t); // in bytes
130 assert(width >= 16);
131 const auto* in1 = reinterpret_cast<const char*>(in1_) + width;
132 const auto* in2 = reinterpret_cast<const char*>(in2_) + width;
133 auto* out = reinterpret_cast< char*>(out_) + width;
134
135 darkener.setFactor(factor);
136 auto table = darkener.getTable();
137 __m128i mask = _mm_set1_epi16(narrow_cast<int16_t>(pixelOps.getBlendMask()));
138
139 ptrdiff_t x = -ptrdiff_t(width);
140 do {
141 __m128i a = *reinterpret_cast<const __m128i*>(in1 + x);
142 __m128i b = *reinterpret_cast<const __m128i*>(in2 + x);
143 __m128i c = _mm_add_epi16(
144 _mm_and_si128(a, b),
145 _mm_srli_epi16(
146 _mm_and_si128(mask, _mm_xor_si128(a, b)),
147 1));
148 *reinterpret_cast<__m128i*>(out + x) = _mm_set_epi16(
149 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 7)]),
150 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 6)]),
151 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 5)]),
152 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 4)]),
153 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 3)]),
154 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 2)]),
155 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 1)]),
156 narrow_cast<int16_t>(table[_mm_extract_epi16(c, 0)]));
157 // An alternative for the above statement is this block (this
158 // is close to what we has in our old MMX routine). On gcc this
159 // generates significantly shorter (25%) but also significantly
160 // slower (30%) code. On clang both alternatives generate
161 // identical code, comparable in size to the fast gcc version
162 // (but still a bit faster).
163 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 0)], 0);
164 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 1)], 1);
165 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 2)], 2);
166 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 3)], 3);
167 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 4)], 4);
168 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 5)], 5);
169 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 6)], 6);
170 //c = _mm_insert_epi16(c, table[_mm_extract_epi16(c, 7)], 7);
171 //*reinterpret_cast<__m128i*>(out + x) = c;
172
173 x += 16;
174 } while (x < 0);
175}
176
177#endif
178
179
180// class Scanline
181
182template<std::unsigned_integral Pixel>
184 : darkener(pixelOps_)
185 , pixelOps(pixelOps_)
186{
187}
188
189template<std::unsigned_integral Pixel>
191 std::span<const Pixel> src1, std::span<const Pixel> src2,
192 std::span<Pixel> dst, unsigned factor)
193{
194 auto width = src1.size();
195 assert(src1.size() == width);
196 assert(src2.size() == width);
197 assert(dst .size() == width);
198#ifdef __SSE2__
199 drawSSE2(src1.data(), src2.data(), dst.data(), factor, width, pixelOps, darkener);
200#else
201 // non-SSE2 routine, both 16bpp and 32bpp
202 darkener.setFactor(factor);
203 for (auto x : xrange(width)) {
204 dst[x] = darkener.multiply(
205 pixelOps.template blend<1, 1>(src1[x], src2[x]));
206 }
207#endif
208}
209
210template<std::unsigned_integral Pixel>
211Pixel Scanline<Pixel>::darken(Pixel p, unsigned factor) const
212{
213 return darkener.multiply(p, factor);
214}
215
216template<std::unsigned_integral Pixel>
217Pixel Scanline<Pixel>::darken(Pixel p1, Pixel p2, unsigned factor) const
218{
219 return darkener.multiply(pixelOps.template blend<1, 1>(p1, p2), factor);
220}
221
222// Force template instantiation.
223#if HAVE_16BPP
224template class Scanline<uint16_t>;
225#endif
226#if HAVE_32BPP
227template class Scanline<uint32_t>;
228#endif
229
230} // namespace openmsx
int g
TclObject t
Helper class to perform 'pixel x scalar' calculations.
Definition: Scanline.hh:15
static Pixel multiply(Pixel p, unsigned x)
Perform a component wise multiplication of a pixel with an 8-bit fractional value: result = (pixel * ...
Helper class to draw scanlines.
Definition: Scanline.hh:46
void draw(std::span< const Pixel > src1, std::span< const Pixel > src2, std::span< Pixel > dst, unsigned factor)
Draws a scanline.
Definition: Scanline.cc:190
Pixel darken(Pixel p, unsigned factor) const
Darken one pixel.
Definition: Scanline.cc:211
Scanline(const PixelOperations< Pixel > &pixelOps)
Definition: Scanline.cc:183
constexpr auto enumerate(Iterable &&iterable)
Heavily inspired by Nathan Reed's blog post: Python-Like enumerate() In C++17 http://reedbeta....
Definition: enumerate.hh:28
This file implements 3 utility functions:
Definition: Autofire.cc:9
uint32_t Pixel
size_t size(std::string_view utf8)
constexpr auto xrange(T e)
Definition: xrange.hh:132