openMSX
yuv2rgb.cc
Go to the documentation of this file.
1#include "yuv2rgb.hh"
2
3#include "RawFrame.hh"
4
5#include "Math.hh"
6#include "xrange.hh"
7
8#include <array>
9#include <bit>
10#include <cassert>
11#include <concepts>
12#include <cstdint>
13#ifdef __SSE2__
14#include <emmintrin.h>
15#endif
16
18
19using Pixel = uint32_t;
20
21#ifdef __SSE2__
22
23/*
24 * This implementation of yuv420 to rgb is based upon the corresponding routine
25 * from Mono. See this blog entry:
26 * http://blog.sublimeintervention.com/archive/2008/Mar-21.html
27 * Source code:
28 * http://anonsvn.mono-project.com/viewvc/trunk/moon/src/yuv-converter.cpp?revision=136072
29 * This code is GPL2 (only)
30 *
31 * Copyright 2008 Novell, Inc. (http://www.novell.com)
32 *
33 * There are other implementations:
34 * - ffmpeg
35 * - mythtv
36 * - pcsx2
37 * I have not done a comparison of these implementations.
38 */
39
40/* R = 1.164 * (Y - 16) + 1.596 * (V - 128)
41 * G = 1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128)
42 * B = 1.164 * (Y - 16) + 2.018 * (U - 128)
43 * OR
44 * R = 1.164 * Y + 1.596 * V - 222.921
45 * G = 1.164 * Y - 0.813 * V - 0.391 * U + 135.576
46 * B = 1.164 * Y + 2.018 * U - 276.836
47 */
48static inline void yuv2rgb_sse2(
49 const uint8_t* u_ , const uint8_t* v_,
50 const uint8_t* y0_, const uint8_t* y1_,
51 Pixel* out0_, Pixel* out1_)
52{
53 // This routine calculates 32x2 RGBA pixels. Each output pixel uses a
54 // unique corresponding input Y value, but a group of 2x2 ouput pixels
55 // shares the same U and V input value.
56 const auto* u = std::bit_cast<const __m128i*>(u_);
57 const auto* v = std::bit_cast<const __m128i*>(v_);
58 const auto* y0 = std::bit_cast<const __m128i*>(y0_);
59 const auto* y1 = std::bit_cast<const __m128i*>(y1_);
60 auto* out0 = std::bit_cast< __m128i*>(out0_);
61 auto* out1 = std::bit_cast< __m128i*>(out1_);
62
63 // constants
64 const __m128i ZERO = _mm_setzero_si128();
65 const __m128i ALPHA = _mm_set1_epi16( -1); // 0xFFFF
66 const __m128i RED_V = _mm_set1_epi16( 102); // 102/64 = 1.59
67 const __m128i GREEN_U = _mm_set1_epi16( -25); // -25/64 = -0.39
68 const __m128i GREEN_V = _mm_set1_epi16( -52); // -52/64 = -0.81
69 const __m128i BLUE_U = _mm_set1_epi16( 129); // 129/64 = 2.02
70 const __m128i COEF_Y = _mm_set1_epi16( 74); // 74/64 = 1.16
71 const __m128i CNST_R = _mm_set1_epi16( -223); // -222.921
72 const __m128i CNST_G = _mm_set1_epi16( 136); // 135.576
73 const __m128i CNST_B = _mm_set1_epi16( -277); // -276.836
74 const __m128i Y_MASK = _mm_set1_epi16(0x00FF);
75
76 // left
77 __m128i u0f = _mm_load_si128(u);
78 __m128i v0f = _mm_load_si128(v);
79 __m128i u07 = _mm_unpacklo_epi8(u0f, ZERO);
80 __m128i v07 = _mm_unpacklo_epi8(v0f, ZERO);
81 __m128i mr07 = _mm_srai_epi16(_mm_mullo_epi16(v07, RED_V), 6);
82 __m128i sg07 = _mm_mullo_epi16(v07, GREEN_V);
83 __m128i tg07 = _mm_mullo_epi16(u07, GREEN_U);
84 __m128i mg07 = _mm_srai_epi16(_mm_adds_epi16(sg07, tg07), 6);
85 __m128i mb07 = _mm_srli_epi16(_mm_mullo_epi16(u07, BLUE_U), 6); // logical shift
86 __m128i dr07 = _mm_adds_epi16(mr07, CNST_R);
87 __m128i dg07 = _mm_adds_epi16(mg07, CNST_G);
88 __m128i db07 = _mm_adds_epi16(mb07, CNST_B);
89
90 // block top,left
91 __m128i y00_0f = _mm_load_si128(y0 + 0);
92 __m128i y00_even = _mm_and_si128(y00_0f, Y_MASK);
93 __m128i y00_odd = _mm_srli_epi16(y00_0f, 8);
94 __m128i dy00_even = _mm_srai_epi16(_mm_mullo_epi16(y00_even, COEF_Y), 6);
95 __m128i dy00_odd = _mm_srai_epi16(_mm_mullo_epi16(y00_odd, COEF_Y), 6);
96 __m128i r00_even = _mm_adds_epi16(dr07, dy00_even);
97 __m128i g00_even = _mm_adds_epi16(dg07, dy00_even);
98 __m128i b00_even = _mm_adds_epi16(db07, dy00_even);
99 __m128i r00_odd = _mm_adds_epi16(dr07, dy00_odd);
100 __m128i g00_odd = _mm_adds_epi16(dg07, dy00_odd);
101 __m128i b00_odd = _mm_adds_epi16(db07, dy00_odd);
102 __m128i r00_0f = _mm_unpackhi_epi8(_mm_packus_epi16(r00_even, r00_even),
103 _mm_packus_epi16(r00_odd, r00_odd));
104 __m128i g00_0f = _mm_unpackhi_epi8(_mm_packus_epi16(g00_even, g00_even),
105 _mm_packus_epi16(g00_odd, g00_odd));
106 __m128i b00_0f = _mm_unpackhi_epi8(_mm_packus_epi16(b00_even, b00_even),
107 _mm_packus_epi16(b00_odd, b00_odd));
108 __m128i rb00_07 = _mm_unpacklo_epi8(r00_0f, b00_0f);
109 __m128i rb00_8f = _mm_unpackhi_epi8(r00_0f, b00_0f);
110 __m128i ga00_07 = _mm_unpacklo_epi8(g00_0f, ALPHA);
111 __m128i ga00_8f = _mm_unpackhi_epi8(g00_0f, ALPHA);
112 __m128i rgba00_03 = _mm_unpacklo_epi8(rb00_07, ga00_07);
113 __m128i rgba00_47 = _mm_unpackhi_epi8(rb00_07, ga00_07);
114 __m128i rgba00_8b = _mm_unpacklo_epi8(rb00_8f, ga00_8f);
115 __m128i rgba00_cf = _mm_unpackhi_epi8(rb00_8f, ga00_8f);
116 _mm_store_si128(out0 + 0, rgba00_03);
117 _mm_store_si128(out0 + 1, rgba00_47);
118 _mm_store_si128(out0 + 2, rgba00_8b);
119 _mm_store_si128(out0 + 3, rgba00_cf);
120
121 // block bottom,left
122 __m128i y10_0f = _mm_load_si128(y1 + 0);
123 __m128i y10_even = _mm_and_si128(y10_0f, Y_MASK);
124 __m128i y10_odd = _mm_srli_epi16(y10_0f, 8);
125 __m128i dy10_even = _mm_srai_epi16(_mm_mullo_epi16(y10_even, COEF_Y), 6);
126 __m128i dy10_odd = _mm_srai_epi16(_mm_mullo_epi16(y10_odd, COEF_Y), 6);
127 __m128i r10_even = _mm_adds_epi16(dr07, dy10_even);
128 __m128i g10_even = _mm_adds_epi16(dg07, dy10_even);
129 __m128i b10_even = _mm_adds_epi16(db07, dy10_even);
130 __m128i r10_odd = _mm_adds_epi16(dr07, dy10_odd);
131 __m128i g10_odd = _mm_adds_epi16(dg07, dy10_odd);
132 __m128i b10_odd = _mm_adds_epi16(db07, dy10_odd);
133 __m128i r10_0f = _mm_unpackhi_epi8(_mm_packus_epi16(r10_even, r10_even),
134 _mm_packus_epi16(r10_odd, r10_odd));
135 __m128i g10_0f = _mm_unpackhi_epi8(_mm_packus_epi16(g10_even, g10_even),
136 _mm_packus_epi16(g10_odd, g10_odd));
137 __m128i b10_0f = _mm_unpackhi_epi8(_mm_packus_epi16(b10_even, b10_even),
138 _mm_packus_epi16(b10_odd, b10_odd));
139 __m128i rb10_07 = _mm_unpacklo_epi8(r10_0f, b10_0f);
140 __m128i rb10_8f = _mm_unpackhi_epi8(r10_0f, b10_0f);
141 __m128i ga10_07 = _mm_unpacklo_epi8(g10_0f, ALPHA);
142 __m128i ga10_8f = _mm_unpackhi_epi8(g10_0f, ALPHA);
143 __m128i rgba10_03 = _mm_unpacklo_epi8(rb10_07, ga10_07);
144 __m128i rgba10_47 = _mm_unpackhi_epi8(rb10_07, ga10_07);
145 __m128i rgba10_8b = _mm_unpacklo_epi8(rb10_8f, ga10_8f);
146 __m128i rgba10_cf = _mm_unpackhi_epi8(rb10_8f, ga10_8f);
147 _mm_store_si128(out1 + 0, rgba10_03);
148 _mm_store_si128(out1 + 1, rgba10_47);
149 _mm_store_si128(out1 + 2, rgba10_8b);
150 _mm_store_si128(out1 + 3, rgba10_cf);
151
152 // right
153 __m128i u8f = _mm_unpackhi_epi8(u0f, ZERO);
154 __m128i v8f = _mm_unpackhi_epi8(v0f, ZERO);
155 __m128i mr8f = _mm_srai_epi16(_mm_mullo_epi16(v8f, RED_V), 6);
156 __m128i sg8f = _mm_mullo_epi16(v8f, GREEN_V);
157 __m128i tg8f = _mm_mullo_epi16(u8f, GREEN_U);
158 __m128i mg8f = _mm_srai_epi16(_mm_adds_epi16(sg8f, tg8f), 6);
159 __m128i mb8f = _mm_srli_epi16(_mm_mullo_epi16(u8f, BLUE_U), 6); // logical shift
160 __m128i dr8f = _mm_adds_epi16(mr8f, CNST_R);
161 __m128i dg8f = _mm_adds_epi16(mg8f, CNST_G);
162 __m128i db8f = _mm_adds_epi16(mb8f, CNST_B);
163
164 // block top,right
165 __m128i y01_0f = _mm_load_si128(y0 + 1);
166 __m128i y01_even = _mm_and_si128(y01_0f, Y_MASK);
167 __m128i y01_odd = _mm_srli_epi16(y01_0f, 8);
168 __m128i dy01_even = _mm_srai_epi16(_mm_mullo_epi16(y01_even, COEF_Y), 6);
169 __m128i dy01_odd = _mm_srai_epi16(_mm_mullo_epi16(y01_odd, COEF_Y), 6);
170 __m128i r01_even = _mm_adds_epi16(dr8f, dy01_even);
171 __m128i g01_even = _mm_adds_epi16(dg8f, dy01_even);
172 __m128i b01_even = _mm_adds_epi16(db8f, dy01_even);
173 __m128i r01_odd = _mm_adds_epi16(dr8f, dy01_odd);
174 __m128i g01_odd = _mm_adds_epi16(dg8f, dy01_odd);
175 __m128i b01_odd = _mm_adds_epi16(db8f, dy01_odd);
176 __m128i r01_0f = _mm_unpackhi_epi8(_mm_packus_epi16(r01_even, r01_even),
177 _mm_packus_epi16(r01_odd, r01_odd));
178 __m128i g01_0f = _mm_unpackhi_epi8(_mm_packus_epi16(g01_even, g01_even),
179 _mm_packus_epi16(g01_odd, g01_odd));
180 __m128i b01_0f = _mm_unpackhi_epi8(_mm_packus_epi16(b01_even, b01_even),
181 _mm_packus_epi16(b01_odd, b01_odd));
182 __m128i rb01_07 = _mm_unpacklo_epi8(r01_0f, b01_0f);
183 __m128i rb01_8f = _mm_unpackhi_epi8(r01_0f, b01_0f);
184 __m128i ga01_07 = _mm_unpacklo_epi8(g01_0f, ALPHA);
185 __m128i ga01_8f = _mm_unpackhi_epi8(g01_0f, ALPHA);
186 __m128i rgba01_03 = _mm_unpacklo_epi8(rb01_07, ga01_07);
187 __m128i rgba01_47 = _mm_unpackhi_epi8(rb01_07, ga01_07);
188 __m128i rgba01_8b = _mm_unpacklo_epi8(rb01_8f, ga01_8f);
189 __m128i rgba01_cf = _mm_unpackhi_epi8(rb01_8f, ga01_8f);
190 _mm_store_si128(out0 + 4, rgba01_03);
191 _mm_store_si128(out0 + 5, rgba01_47);
192 _mm_store_si128(out0 + 6, rgba01_8b);
193 _mm_store_si128(out0 + 7, rgba01_cf);
194
195 // block bottom,right
196 __m128i y11_0f = _mm_load_si128(y1 + 1);
197 __m128i y11_even = _mm_and_si128(y11_0f, Y_MASK);
198 __m128i y11_odd = _mm_srli_epi16(y11_0f, 8);
199 __m128i dy11_even = _mm_srai_epi16(_mm_mullo_epi16(y11_even, COEF_Y), 6);
200 __m128i dy11_odd = _mm_srai_epi16(_mm_mullo_epi16(y11_odd, COEF_Y), 6);
201 __m128i r11_even = _mm_adds_epi16(dr8f, dy11_even);
202 __m128i g11_even = _mm_adds_epi16(dg8f, dy11_even);
203 __m128i b11_even = _mm_adds_epi16(db8f, dy11_even);
204 __m128i r11_odd = _mm_adds_epi16(dr8f, dy11_odd);
205 __m128i g11_odd = _mm_adds_epi16(dg8f, dy11_odd);
206 __m128i b11_odd = _mm_adds_epi16(db8f, dy11_odd);
207 __m128i r11_0f = _mm_unpackhi_epi8(_mm_packus_epi16(r11_even, r11_even),
208 _mm_packus_epi16(r11_odd, r11_odd));
209 __m128i g11_0f = _mm_unpackhi_epi8(_mm_packus_epi16(g11_even, g11_even),
210 _mm_packus_epi16(g11_odd, g11_odd));
211 __m128i b11_0f = _mm_unpackhi_epi8(_mm_packus_epi16(b11_even, b11_even),
212 _mm_packus_epi16(b11_odd, b11_odd));
213 __m128i rb11_07 = _mm_unpacklo_epi8(r11_0f, b11_0f);
214 __m128i rb11_8f = _mm_unpackhi_epi8(r11_0f, b11_0f);
215 __m128i ga11_07 = _mm_unpacklo_epi8(g11_0f, ALPHA);
216 __m128i ga11_8f = _mm_unpackhi_epi8(g11_0f, ALPHA);
217 __m128i rgba11_03 = _mm_unpacklo_epi8(rb11_07, ga11_07);
218 __m128i rgba11_47 = _mm_unpackhi_epi8(rb11_07, ga11_07);
219 __m128i rgba11_8b = _mm_unpacklo_epi8(rb11_8f, ga11_8f);
220 __m128i rgba11_cf = _mm_unpackhi_epi8(rb11_8f, ga11_8f);
221 _mm_store_si128(out1 + 4, rgba11_03);
222 _mm_store_si128(out1 + 5, rgba11_47);
223 _mm_store_si128(out1 + 6, rgba11_8b);
224 _mm_store_si128(out1 + 7, rgba11_cf);
225}
226
227static inline void convertHelperSSE2(
228 const th_ycbcr_buffer& buffer, RawFrame& output)
229{
230 const int width = buffer[0].width;
231 const size_t y_stride = buffer[0].stride;
232 const size_t uv_stride2 = buffer[1].stride / 2;
233
234 assert((width % 32) == 0);
235 assert((buffer[0].height % 2) == 0);
236
237 for (int y = 0; y < buffer[0].height; y += 2) {
238 const uint8_t* pY1 = buffer[0].data + (y + 0) * y_stride;
239 const uint8_t* pY2 = buffer[0].data + (y + 1) * y_stride;
240 const uint8_t* pCb = buffer[1].data + (y + 0) * uv_stride2;
241 const uint8_t* pCr = buffer[2].data + (y + 0) * uv_stride2;
242 auto out0 = output.getLineDirect(y + 0);
243 auto out1 = output.getLineDirect(y + 1);
244
245 for (int x = 0; x < width; x += 32) {
246 // convert a block of (32 x 2) pixels
247 yuv2rgb_sse2(pCb, pCr, pY1, pY2, &out0[x], &out1[x]);
248 pCb += 16;
249 pCr += 16;
250 pY1 += 32;
251 pY2 += 32;
252 }
253
254 output.setLineWidth(y + 0, width);
255 output.setLineWidth(y + 1, width);
256 }
257}
258
259#endif // __SSE2__
260
// Number of fractional bits of the fixed point coefficients below.
static constexpr int PREC = 15;

// Convert a (positive) floating point coefficient to fixed point with PREC
// fractional bits. We'd prefer to use lrint() to round, but that's not (yet)
// constexpr in current versions of c++.
[[nodiscard]] static constexpr int toFixedPoint(double coef)
{
	return int(coef * (1 << PREC) + 0.5);
}

static constexpr int COEF_Y  = toFixedPoint(1.164);
static constexpr int COEF_RV = toFixedPoint(1.596);
static constexpr int COEF_GU = toFixedPoint(0.391);
static constexpr int COEF_GV = toFixedPoint(0.813);
static constexpr int COEF_BU = toFixedPoint(2.018);
267
// Lookup tables for the scalar conversion routine. Each table is indexed by
// an 8-bit sample value and contains the fixed point contribution of that
// sample to one of the color channels.
struct Coefs {
	using Table = std::array<int, 256>;

	Table gu; // U  -> green
	Table gv; // V  -> green
	Table bu; // U  -> blue
	Table rv; // V  -> red
	Table y;  // Y  -> red, green and blue
};
275
276[[nodiscard]] static constexpr Coefs getCoefs()
277{
278 Coefs coefs = {};
279 for (auto i : xrange(256)) {
280 coefs.gu[i] = -COEF_GU * (i - 128);
281 coefs.gv[i] = -COEF_GV * (i - 128);
282 coefs.bu[i] = COEF_BU * (i - 128);
283 coefs.rv[i] = COEF_RV * (i - 128);
284 coefs.y[i] = COEF_Y * (i - 16) + (PREC / 2);
285 }
286 return coefs;
287}
288
289[[nodiscard]] static inline Pixel calc(
290 int y, int ruv, int guv, int buv)
291{
292 uint8_t r = Math::clipIntToByte((y + ruv) >> PREC);
293 uint8_t g = Math::clipIntToByte((y + guv) >> PREC);
294 uint8_t b = Math::clipIntToByte((y + buv) >> PREC);
295 return (r << 0) | (g << 8) | (b << 16);
296}
297
298static void convertHelper(const th_ycbcr_buffer& buffer, RawFrame& output)
299{
300 assert(buffer[1].width * 2 == buffer[0].width);
301 assert(buffer[1].height * 2 == buffer[0].height);
302
303 static constexpr Coefs coefs = getCoefs();
304
305 const int width = buffer[0].width;
306 const size_t y_stride = buffer[0].stride;
307 const size_t uv_stride2 = buffer[1].stride / 2;
308
309 for (int y = 0; y < buffer[0].height; y += 2) {
310 const uint8_t* pY = buffer[0].data + y * y_stride;
311 const uint8_t* pCb = buffer[1].data + y * uv_stride2;
312 const uint8_t* pCr = buffer[2].data + y * uv_stride2;
313 auto out0 = output.getLineDirect(y + 0);
314 auto out1 = output.getLineDirect(y + 1);
315
316 for (int x = 0; x < width; x += 2, pY += 2, ++pCr, ++pCb) {
317 int ruv = coefs.rv[*pCr];
318 int guv = coefs.gu[*pCb] + coefs.gv[*pCr];
319 int buv = coefs.bu[*pCb];
320
321 int Y00 = coefs.y[pY[0]];
322 out0[x + 0] = calc(Y00, ruv, guv, buv);
323
324 int Y01 = coefs.y[pY[1]];
325 out0[x + 1] = calc(Y01, ruv, guv, buv);
326
327 int Y10 = coefs.y[pY[y_stride + 0]];
328 out1[x + 0] = calc(Y10, ruv, guv, buv);
329
330 int Y11 = coefs.y[pY[y_stride + 1]];
331 out1[x + 1] = calc(Y11, ruv, guv, buv);
332 }
333
334 output.setLineWidth(y + 0, width);
335 output.setLineWidth(y + 1, width);
336 }
337}
338
339void convert(const th_ycbcr_buffer& input, RawFrame& output)
340{
341#ifdef __SSE2__
342 convertHelperSSE2(input, output);
343 return;
344#endif
345 convertHelper(input, output);
346}
347
348} // namespace openmsx::yuv2rgb
int g
A video frame as output by the VDP scanline conversion unit, before any postprocessing filters are ap...
Definition RawFrame.hh:16
std::span< Pixel > getLineDirect(unsigned y)
Definition RawFrame.hh:20
void setLineWidth(unsigned line, unsigned width)
Definition RawFrame.hh:33
uint8_t clipIntToByte(int x)
Clip x to range [0,255].
Definition Math.hh:61
uint32_t Pixel
Definition yuv2rgb.cc:19
void convert(const th_ycbcr_buffer &input, RawFrame &output)
Definition yuv2rgb.cc:339
std::array< int, 256 > gu
Definition yuv2rgb.cc:269
std::array< int, 256 > gv
Definition yuv2rgb.cc:270
std::array< int, 256 > y
Definition yuv2rgb.cc:273
std::array< int, 256 > rv
Definition yuv2rgb.cc:272
std::array< int, 256 > bu
Definition yuv2rgb.cc:271
constexpr auto xrange(T e)
Definition xrange.hh:132