// Pixel value type used by all line scalers below: one 32-bit unsigned integer per pixel.
21using Pixel = uint32_t;
// Scale_XonY functions: transform an input line of pixels into an output line,
// possibly with a different width.
//
// The definitions further down assert in.size() / X == out.size() / Y, i.e.
// every X input pixels map onto Y output pixels (scale_1on2 doubles the
// width, scale_2on1 halves it, ...).
29void scale_1on3(std::span<const Pixel> in, std::span<Pixel> out);
30void scale_1on4(std::span<const Pixel> in, std::span<Pixel> out);
31void scale_1on6(std::span<const Pixel> in, std::span<Pixel> out);
32void scale_1on2(std::span<const Pixel> in, std::span<Pixel> out);
33void scale_2on1(std::span<const Pixel> in, std::span<Pixel> out);
34void scale_6on1(std::span<const Pixel> in, std::span<Pixel> out);
35void scale_4on1(std::span<const Pixel> in, std::span<Pixel> out);
36void scale_3on1(std::span<const Pixel> in, std::span<Pixel> out);
37void scale_3on2(std::span<const Pixel> in, std::span<Pixel> out);
38void scale_3on4(std::span<const Pixel> in, std::span<Pixel> out);
39void scale_3on8(std::span<const Pixel> in, std::span<Pixel> out);
40void scale_2on3(std::span<const Pixel> in, std::span<Pixel> out);
41void scale_4on3(std::span<const Pixel> in, std::span<Pixel> out);
42void scale_8on3(std::span<const Pixel> in, std::span<Pixel> out);
43void scale_2on9(std::span<const Pixel> in, std::span<Pixel> out);
44void scale_4on9(std::span<const Pixel> in, std::span<Pixel> out);
45void scale_8on9(std::span<const Pixel> in, std::span<Pixel> out);
// Generate an output line that is an interpolation of two input lines.
// w1 and w2 are the blend weights applied to in1 and in2 respectively
// (both default to 1). The definition asserts that in1, in2 and out all
// have the same size.
54template<
unsigned w1 = 1,
unsigned w2 = 1>
55void blendLines(std::span<const Pixel> in1, std::span<const Pixel> in2,
56 std::span<Pixel> out);
// Generate an output line that is a per-pixel alpha-blend of the two input
// lines. The definition asserts that in1, in2 and out all have the same size.
// NOTE(review): which of the two inputs supplies the alpha value is not
// visible from this declaration — confirm against the definition.
65void alphaBlendLines(std::span<const Pixel> in1, std::span<const Pixel> in2,
66 std::span<Pixel> out);
68 std::span<Pixel> out);
// Generic helper behind scale_1on3 / scale_1on4 / scale_1on6, which call it
// as scale_1onN<3>, <4> and <6>.
// NOTE(review): N is presumably a template parameter declared on a line that
// is not part of this listing, and the body of the main loop is not visible
// here either — confirm both against the full file.
74static inline void scale_1onN(
75 std::span<const Pixel> in, std::span<Pixel> out)
77 auto outWidth = out.size();
// The input is expected to hold one pixel per group of N output pixels.
78 assert(in.size() == (outWidth / N));
// Main loop: advances N output pixels (i) per input pixel (j).
// NOTE(review): outWidth is unsigned, so `outWidth - (N - 1)` wraps around
// when outWidth < N - 1 (e.g. an empty line); `i + (N - 1) < outWidth` would
// avoid that.
81 for (; i < (outWidth - (N - 1)); i += N, j += 1) {
// Tail: at most N - 1 output pixels do not form a complete group; those that
// exist are cleared to 0.
87 for (
auto k :
xrange(N - 1)) {
88 if ((i + k) < outWidth) out[i + k] = 0;
92inline void scale_1on3(std::span<const Pixel> in, std::span<Pixel> out)
94 scale_1onN<3>(in, out);
97inline void scale_1on4(std::span<const Pixel> in, std::span<Pixel> out)
99 scale_1onN<4>(in, out);
102inline void scale_1on6(std::span<const Pixel> in, std::span<Pixel> out)
104 scale_1onN<6>(in, out);
// Interleave the two low 32-bit lanes of x and y: {x0, y0, x1, y1}.
// Thin wrapper around _mm_unpacklo_epi32 (one lane per 32-bit pixel).
inline __m128i unpacklo(__m128i x, __m128i y)
{
	const __m128i interleaved = _mm_unpacklo_epi32(x, y);
	return interleaved;
}
// Interleave the two high 32-bit lanes of x and y: {x2, y2, x3, y3}.
// Thin wrapper around _mm_unpackhi_epi32 (one lane per 32-bit pixel).
inline __m128i unpackhi(__m128i x, __m128i y)
{
	const __m128i interleaved = _mm_unpackhi_epi32(x, y);
	return interleaved;
}
// SSE kernel for doubling the width of a line: every 32-bit input pixel is
// written twice to the output.
//
// in_      : source pixels (declared __restrict, so must not alias out_)
// out_     : destination pixels; twice as many bytes are written as are read
// srcWidth : number of source pixels; srcWidth * sizeof(Pixel) must be a
//            multiple of 4 * sizeof(__m128i) (asserted below)
//
// NOTE(review): the looping construct around the load/unpack/store sequence
// is not visible in this listing; the trailing `x += ...` suggests the
// sequence repeats until x reaches 0 — confirm against the full file.
119inline void scale_1on2_SSE(
const Pixel* __restrict in_,
Pixel* __restrict out_,
size_t srcWidth)
121 size_t bytes = srcWidth *
sizeof(
Pixel);
122 assert((bytes % (4 *
sizeof(__m128i))) == 0);
// Both pointers are moved to the END of their regions and indexed with a
// negative byte offset x that counts up from -bytes.
125 const auto* in = std::bit_cast<const char*>(in_) + bytes;
126 auto* out = std::bit_cast< char*>(out_) + 2 * bytes;
128 auto x = -ptrdiff_t(bytes);
// Load 4 x 16 bytes (unaligned) of input.
130 __m128i a0 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + x + 0));
131 __m128i a1 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + x + 16));
132 __m128i a2 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + x + 32));
133 __m128i a3 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + x + 48));
// Unpacking a register against itself duplicates each 32-bit lane:
// unpacklo(a, a) = {a0, a0, a1, a1}, unpackhi(a, a) = {a2, a2, a3, a3}.
134 __m128i l0 = unpacklo(a0, a0);
135 __m128i h0 = unpackhi(a0, a0);
136 __m128i l1 = unpacklo(a1, a1);
137 __m128i h1 = unpackhi(a1, a1);
138 __m128i l2 = unpacklo(a2, a2);
139 __m128i h2 = unpackhi(a2, a2);
140 __m128i
l3 = unpacklo(a3, a3);
141 __m128i h3 = unpackhi(a3, a3);
// Store 8 x 16 bytes of output; the output offset is 2*x because the output
// is twice as wide as the input.
142 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 0), l0);
143 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 16), h0);
144 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 32), l1);
145 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 48), h1);
146 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 64), l2);
147 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 80), h2);
148 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 96),
l3);
149 _mm_storeu_si128(std::bit_cast<__m128i*>(out + 2*x + 112), h3);
// Advance by the 64 input bytes consumed above.
150 x += 4 *
sizeof(__m128i);
155inline void scale_1on2(std::span<const Pixel> in, std::span<Pixel> out)
169 auto srcWidth = in.size();
170 assert((out.size() / 2) == srcWidth);
173 size_t chunk = 4 *
sizeof(__m128i) /
sizeof(
Pixel);
174 size_t srcWidth2 = srcWidth & ~(chunk - 1);
175 scale_1on2_SSE(in.data(), out.data(), srcWidth2);
176 in = in .subspan( srcWidth2);
177 out = out.subspan(2 * srcWidth2);
178 srcWidth -= srcWidth2;
183 for (
auto x :
xrange(srcWidth)) {
184 out[x * 2] = out[x * 2 + 1] = in[x];
// Integer-typed wrapper around _mm_shuffle_ps: the operands are reinterpreted
// as packed floats, shuffled with the compile-time selector IMM8, and the
// result is reinterpreted back. The two low result lanes are picked from x,
// the two high result lanes from y.
template<int IMM8>
static inline __m128i shuffle(__m128i x, __m128i y)
{
	const __m128 xAsFloat = _mm_castsi128_ps(x);
	const __m128 yAsFloat = _mm_castsi128_ps(y);
	const __m128 shuffled = _mm_shuffle_ps(xAsFloat, yAsFloat, IMM8);
	return _mm_castps_si128(shuffled);
}
195inline __m128i blend(__m128i x, __m128i y)
198 __m128i p = shuffle<0x88>(x, y);
199 __m128i q = shuffle<0xDD>(x, y);
200 return _mm_avg_epu8(p, q);
// SSE kernel for halving the width of a line: each output pixel is the
// byte-wise average (see blend()) of two adjacent input pixels.
//
// in_      : source pixels (declared __restrict, so must not alias out_);
//            2 * dstBytes bytes are read
// out_     : destination pixels; dstBytes bytes are written
// dstBytes : number of OUTPUT bytes; must be non-zero and a multiple of
//            4 * sizeof(__m128i) (both asserted below)
//
// NOTE(review): the looping construct around the load/blend/store sequence
// is not visible in this listing; the trailing `x += ...` suggests the
// sequence repeats until x reaches 0 — confirm against the full file.
203inline void scale_2on1_SSE(
204 const Pixel* __restrict in_,
Pixel* __restrict out_,
size_t dstBytes)
206 assert((dstBytes % (4 *
sizeof(__m128i))) == 0);
207 assert(dstBytes != 0);
// Both pointers are moved to the END of their regions and indexed with a
// negative byte offset x that counts up from -dstBytes.
209 const auto* in = std::bit_cast<const char*>(in_) + 2 * dstBytes;
210 auto* out = std::bit_cast< char*>(out_) + dstBytes;
212 auto x = -ptrdiff_t(dstBytes);
// Load 8 x 16 bytes (unaligned) of input; the input offset is 2*x because
// the input is twice as wide as the output.
214 __m128i a0 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 0));
215 __m128i a1 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 16));
216 __m128i a2 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 32));
217 __m128i a3 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 48));
218 __m128i a4 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 64));
219 __m128i a5 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 80));
220 __m128i a6 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 96));
221 __m128i a7 = _mm_loadu_si128(std::bit_cast<const __m128i*>(in + 2*x + 112));
// Each blend() turns 8 input pixels (two registers) into 4 output pixels.
222 __m128i b0 = blend(a0, a1);
223 __m128i b1 = blend(a2, a3);
224 __m128i b2 = blend(a4, a5);
225 __m128i b3 = blend(a6, a7);
// Store 4 x 16 bytes of output.
226 _mm_storeu_si128(std::bit_cast<__m128i*>(out + x + 0), b0);
227 _mm_storeu_si128(std::bit_cast<__m128i*>(out + x + 16), b1);
228 _mm_storeu_si128(std::bit_cast<__m128i*>(out + x + 32), b2);
229 _mm_storeu_si128(std::bit_cast<__m128i*>(out + x + 48), b3);
// Advance by the 64 output bytes produced above.
230 x += 4 *
sizeof(__m128i);
235inline void scale_2on1(std::span<const Pixel> in, std::span<Pixel> out)
237 assert((in.size() / 2) == out.size());
238 auto outWidth = out.size();
240 auto n64 = (outWidth *
sizeof(
Pixel)) & ~63;
241 scale_2on1_SSE(in.data(), out.data(), n64);
242 outWidth &= ((64 /
sizeof(
Pixel)) - 1);
243 if (outWidth == 0) [[likely]]
return;
244 in = in .subspan(2 * n64 /
sizeof(
Pixel));
245 out = out.subspan( n64 /
sizeof(
Pixel));
250 for (
auto i :
xrange(outWidth)) {
251 out[i] = pixelOps.template blend<1, 1>(
252 in[2 * i + 0], in[2 * i + 1]);
256inline void scale_6on1(std::span<const Pixel> in, std::span<Pixel> out)
258 assert((in.size() / 6) == out.size());
260 for (
auto i :
xrange(out.size())) {
261 out[i] = pixelOps.template blend<1, 1, 1, 1, 1, 1>(subspan<6>(in, 6 * i));
265inline void scale_4on1(std::span<const Pixel> in, std::span<Pixel> out)
267 assert((in.size() / 4) == out.size());
269 for (
auto i :
xrange(out.size())) {
270 out[i] = pixelOps.template blend<1, 1, 1, 1>(subspan<4>(in, 4 * i));
274inline void scale_3on1(std::span<const Pixel> in, std::span<Pixel> out)
276 assert((in.size() / 3) == out.size());
278 for (
auto i :
xrange(out.size())) {
279 out[i] = pixelOps.template blend<1, 1, 1>(subspan<3>(in, 3 * i));
283inline void scale_3on2(std::span<const Pixel> in, std::span<Pixel> out)
285 assert((in.size() / 3) == (out.size() / 2));
287 size_t n = out.size();
289 for (; i < (n - 1); i += 2, j += 3) {
290 out[i + 0] = pixelOps.template blend<2, 1>(subspan<2>(in, j + 0));
291 out[i + 1] = pixelOps.template blend<1, 2>(subspan<2>(in, j + 1));
293 if (i < n) out[i] = 0;
296inline void scale_3on4(std::span<const Pixel> in, std::span<Pixel> out)
298 assert((in.size() / 3) == (out.size() / 4));
300 size_t n = out.size();
302 for (; i < (n - 3); i += 4, j += 3) {
303 out[i + 0] = in[j + 0];
304 out[i + 1] = pixelOps.template blend<1, 2>(subspan<2>(in, j + 0));
305 out[i + 2] = pixelOps.template blend<2, 1>(subspan<2>(in, j + 1));
306 out[i + 3] = in[j + 2];
308 for (
auto k :
xrange(4 - 1)) {
309 if ((i + k) < n) out[i + k] = 0;
313inline void scale_3on8(std::span<const Pixel> in, std::span<Pixel> out)
315 assert((in.size() / 3) == (out.size() / 8));
317 size_t n = out.size();
319 for (; i < (n - 7); i += 8, j += 3) {
320 out[i + 0] = in[j + 0];
321 out[i + 1] = in[j + 0];
322 out[i + 2] = pixelOps.template blend<2, 1>(subspan<2>(in, j + 0));
323 out[i + 3] = in[j + 1];
324 out[i + 4] = in[j + 1];
325 out[i + 5] = pixelOps.template blend<1, 2>(subspan<2>(in, j + 1));
326 out[i + 6] = in[j + 2];
327 out[i + 7] = in[j + 2];
329 for (
auto k :
xrange(8 - 1)) {
330 if ((i + k) < n) out[i + k] = 0;
334inline void scale_2on3(std::span<const Pixel> in, std::span<Pixel> out)
336 assert((in.size() / 2) == (out.size() / 3));
338 size_t n = out.size();
340 for (; i < (n - 2); i += 3, j += 2) {
341 out[i + 0] = in[j + 0];
342 out[i + 1] = pixelOps.template blend<1, 1>(subspan<2>(in, j));
343 out[i + 2] = in[j + 1];
345 if ((i + 0) < n) out[i + 0] = 0;
346 if ((i + 1) < n) out[i + 1] = 0;
349inline void scale_4on3(std::span<const Pixel> in, std::span<Pixel> out)
351 assert((in.size() / 4) == (out.size() / 3));
353 size_t n = out.size();
355 for (; i < (n - 2); i += 3, j += 4) {
356 out[i + 0] = pixelOps.template blend<3, 1>(subspan<2>(in, j + 0));
357 out[i + 1] = pixelOps.template blend<1, 1>(subspan<2>(in, j + 1));
358 out[i + 2] = pixelOps.template blend<1, 3>(subspan<2>(in, j + 2));
360 if ((i + 0) < n) out[i + 0] = 0;
361 if ((i + 1) < n) out[i + 1] = 0;
364inline void scale_8on3(std::span<const Pixel> in, std::span<Pixel> out)
366 assert((in.size() / 8) == (out.size() / 3));
368 size_t n = out.size();
370 for (; i < (n - 2); i += 3, j += 8) {
371 out[i + 0] = pixelOps.template blend<3, 3, 2> (subspan<3>(in, j + 0));
372 out[i + 1] = pixelOps.template blend<1, 3, 3, 1>(subspan<4>(in, j + 2));
373 out[i + 2] = pixelOps.template blend<2, 3, 3> (subspan<3>(in, j + 5));
375 if ((i + 0) < n) out[i + 0] = 0;
376 if ((i + 1) < n) out[i + 1] = 0;
379inline void scale_2on9(std::span<const Pixel> in, std::span<Pixel> out)
381 assert((in.size() / 2) == (out.size() / 9));
383 size_t n = out.size();
385 for (; i < (n - 8); i += 9, j += 2) {
386 out[i + 0] = in[j + 0];
387 out[i + 1] = in[j + 0];
388 out[i + 2] = in[j + 0];
389 out[i + 3] = in[j + 0];
390 out[i + 4] = pixelOps.template blend<1, 1>(subspan<2>(in, j));
391 out[i + 5] = in[j + 1];
392 out[i + 6] = in[j + 1];
393 out[i + 7] = in[j + 1];
394 out[i + 8] = in[j + 1];
396 if ((i + 0) < n) out[i + 0] = 0;
397 if ((i + 1) < n) out[i + 1] = 0;
398 if ((i + 2) < n) out[i + 2] = 0;
399 if ((i + 3) < n) out[i + 3] = 0;
400 if ((i + 4) < n) out[i + 4] = 0;
401 if ((i + 5) < n) out[i + 5] = 0;
402 if ((i + 6) < n) out[i + 6] = 0;
403 if ((i + 7) < n) out[i + 7] = 0;
406inline void scale_4on9(std::span<const Pixel> in, std::span<Pixel> out)
408 assert((in.size() / 4) == (out.size() / 9));
410 size_t n = out.size();
412 for (; i < (n - 8); i += 9, j += 4) {
413 out[i + 0] = in[j + 0];
414 out[i + 1] = in[j + 0];
415 out[i + 2] = pixelOps.template blend<1, 3>(subspan<2>(in, j + 0));
416 out[i + 3] = in[j + 1];
417 out[i + 4] = pixelOps.template blend<1, 1>(subspan<2>(in, j + 1));
418 out[i + 5] = in[j + 2];
419 out[i + 6] = pixelOps.template blend<3, 1>(subspan<2>(in, j + 2));
420 out[i + 7] = in[j + 3];
421 out[i + 8] = in[j + 3];
423 if ((i + 0) < n) out[i + 0] = 0;
424 if ((i + 1) < n) out[i + 1] = 0;
425 if ((i + 2) < n) out[i + 2] = 0;
426 if ((i + 3) < n) out[i + 3] = 0;
427 if ((i + 4) < n) out[i + 4] = 0;
428 if ((i + 5) < n) out[i + 5] = 0;
429 if ((i + 6) < n) out[i + 6] = 0;
430 if ((i + 7) < n) out[i + 7] = 0;
433inline void scale_8on9(std::span<const Pixel> in, std::span<Pixel> out)
435 assert((in.size() / 8) == (out.size() / 9));
437 size_t n = out.size();
439 for (; i < (n - 8); i += 9, j += 8) {
440 out[i + 0] = in[j + 0];
441 out[i + 1] = pixelOps.template blend<1, 7>(subspan<2>(in, j + 0));
442 out[i + 2] = pixelOps.template blend<1, 3>(subspan<2>(in, j + 1));
443 out[i + 3] = pixelOps.template blend<3, 5>(subspan<2>(in, j + 2));
444 out[i + 4] = pixelOps.template blend<1, 1>(subspan<2>(in, j + 3));
445 out[i + 5] = pixelOps.template blend<5, 3>(subspan<2>(in, j + 4));
446 out[i + 6] = pixelOps.template blend<3, 1>(subspan<2>(in, j + 5));
447 out[i + 7] = pixelOps.template blend<7, 1>(subspan<2>(in, j + 6));
448 out[i + 8] = in[j + 7];
450 if ((i + 0) < n) out[i + 0] = 0;
451 if ((i + 1) < n) out[i + 1] = 0;
452 if ((i + 2) < n) out[i + 2] = 0;
453 if ((i + 3) < n) out[i + 3] = 0;
454 if ((i + 4) < n) out[i + 4] = 0;
455 if ((i + 5) < n) out[i + 5] = 0;
456 if ((i + 6) < n) out[i + 6] = 0;
457 if ((i + 7) < n) out[i + 7] = 0;
460template<
unsigned w1,
unsigned w2>
461void blendLines(std::span<const Pixel> in1, std::span<const Pixel> in2, std::span<Pixel> out)
466 assert(in1.size() == in2.size());
467 assert(in1.size() == out.size());
469 for (
auto [i1, i2, o] : std::views::zip(in1, in2, out)) {
470 o = pixelOps.template blend<w1, w2>(i1, i2);
475 std::span<const Pixel> in1, std::span<const Pixel> in2, std::span<Pixel> out)
478 assert(in1.size() == in2.size());
479 assert(in1.size() == out.size());
481 for (
auto [i1, i2, o] : std::views::zip(in1, in2, out)) {
487 Pixel in1, std::span<const Pixel> in2, std::span<Pixel> out)
493 assert(in2.size() == out.size());
496 unsigned alpha = pixelOps.
alpha(in1);
505 unsigned alpha2 = 256 - alpha;
506 for (
auto [i2, o] : std::views::zip(in2, out)) {
507 o = in1M + pixelOps.
multiply(i2, alpha2);
unsigned alpha(Pixel p) const
static Pixel multiply(Pixel p, unsigned x)
Perform a component wise multiplication of a pixel with an 8-bit fractional value: result = (pixel * ...
Pixel alphaBlend(Pixel p1, Pixel p2) const
Perform alpha blending of two pixels.
imat3 l3(ivec3(0, 2, 3), ivec3(4, 5, 6), ivec3(7, 8, 9))
This file implements 3 utility functions:
void scale_3on1(std::span< const Pixel > in, std::span< Pixel > out)
void blendLines(std::span< const Pixel > in1, std::span< const Pixel > in2, std::span< Pixel > out)
BlendLines functor Generate an output line that is an interpolation of two input lines.
void scale_2on1(std::span< const Pixel > in, std::span< Pixel > out)
void scale_2on3(std::span< const Pixel > in, std::span< Pixel > out)
void scale_4on9(std::span< const Pixel > in, std::span< Pixel > out)
void scale_6on1(std::span< const Pixel > in, std::span< Pixel > out)
void scale_1on2(std::span< const Pixel > in, std::span< Pixel > out)
void scale_1on3(std::span< const Pixel > in, std::span< Pixel > out)
Scale_XonY functions Transforms an input line of pixels to an output line (possibly) with a different ...
void scale_4on3(std::span< const Pixel > in, std::span< Pixel > out)
void scale_3on2(std::span< const Pixel > in, std::span< Pixel > out)
void scale_3on8(std::span< const Pixel > in, std::span< Pixel > out)
void scale_2on9(std::span< const Pixel > in, std::span< Pixel > out)
void alphaBlendLines(std::span< const Pixel > in1, std::span< const Pixel > in2, std::span< Pixel > out)
AlphaBlendLines functor Generate an output line that is a per-pixel-alpha-blend of the two input line...
void scale_1on4(std::span< const Pixel > in, std::span< Pixel > out)
CharacterConverter::Pixel Pixel
void scale_1on6(std::span< const Pixel > in, std::span< Pixel > out)
void scale_4on1(std::span< const Pixel > in, std::span< Pixel > out)
void scale_3on4(std::span< const Pixel > in, std::span< Pixel > out)
void scale_8on9(std::span< const Pixel > in, std::span< Pixel > out)
void scale_8on3(std::span< const Pixel > in, std::span< Pixel > out)
constexpr auto xrange(T e)