openMSX
yuv2rgb.cc
Go to the documentation of this file.
1 #include "yuv2rgb.hh"
2 #include "RawFrame.hh"
3 #include "Math.hh"
4 #include <cassert>
5 #include <cstdint>
6 #include <SDL.h>
7 #ifdef __SSE2__
8 #include <emmintrin.h>
9 #endif
10 
11 namespace openmsx::yuv2rgb {
12 
13 #ifdef __SSE2__
14 
15 /*
16  * This implementation of yuv420 to rgb is based upon the corresponding routine
17  * from Mono. See this blog entry:
18  * http://blog.sublimeintervention.com/archive/2008/Mar-21.html
19  * Source code:
20  * http://anonsvn.mono-project.com/viewvc/trunk/moon/src/yuv-converter.cpp?revision=136072
21  * This code is GPL2 (only)
22  *
23  * Copyright 2008 Novell, Inc. (http://www.novell.com)
24  *
25  * There are other implementations:
26  * - ffmpeg
27  * - mythtv
28  * - pcsx2
29  * I have not done a comparison of these implementations.
30  */
31 
32 /* R = 1.164 * (Y - 16) + 1.596 * (V - 128)
33  * G = 1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128)
34  * B = 1.164 * (Y - 16) + 2.018 * (U - 128)
35  * OR
36  * R = 1.164 * Y + 1.596 * V - 222.921
37  * G = 1.164 * Y - 0.813 * V - 0.391 * U + 135.576
38  * B = 1.164 * Y + 2.018 * U - 276.836
39  */
// Chroma-only part of the conversion for 8 chroma samples, one signed
// 16-bit lane per sample. These terms are shared by the 2x2 luma pixels
// that belong to each chroma sample.
struct Yuv2RgbChromaTerms {
	__m128i r; // ( 102 * V) / 64 - 223
	__m128i g; // (-52 * V - 25 * U) / 64 + 136
	__m128i b; // ( 129 * U) / 64 - 277
};

// Calculate the chroma terms for 8 (U,V) pairs.
// 'u16' and 'v16' must hold the chroma bytes zero-extended to 16-bit lanes.
// All coefficients are fixed-point values with 6 fractional bits.
static inline Yuv2RgbChromaTerms yuv2rgb_sse2_chroma(__m128i u16, __m128i v16)
{
	const __m128i RED_V   = _mm_set1_epi16( 102); // 102/64 =  1.59
	const __m128i GREEN_U = _mm_set1_epi16( -25); // -25/64 = -0.39
	const __m128i GREEN_V = _mm_set1_epi16( -52); // -52/64 = -0.81
	const __m128i BLUE_U  = _mm_set1_epi16( 129); // 129/64 =  2.02
	const __m128i CNST_R  = _mm_set1_epi16(-223); // -222.921
	const __m128i CNST_G  = _mm_set1_epi16( 136); //  135.576
	const __m128i CNST_B  = _mm_set1_epi16(-277); // -276.836

	const __m128i mr = _mm_srai_epi16(_mm_mullo_epi16(v16, RED_V), 6);
	const __m128i sg = _mm_mullo_epi16(v16, GREEN_V);
	const __m128i tg = _mm_mullo_epi16(u16, GREEN_U);
	const __m128i mg = _mm_srai_epi16(_mm_adds_epi16(sg, tg), 6);
	// 255 * 129 = 32895 does not fit in a signed 16-bit lane, so this
	// product must be treated as unsigned: use a logical shift here.
	const __m128i mb = _mm_srli_epi16(_mm_mullo_epi16(u16, BLUE_U), 6);

	Yuv2RgbChromaTerms terms;
	terms.r = _mm_adds_epi16(mr, CNST_R);
	terms.g = _mm_adds_epi16(mg, CNST_G);
	terms.b = _mm_adds_epi16(mb, CNST_B);
	return terms;
}

// Convert 16 horizontally adjacent luma bytes ('y16') to 16 pixels and store
// them in 'out[0..3]' (4 pixels per store). Lane k of 'chroma' is applied to
// luma bytes 2k and 2k+1. In memory each pixel is stored as the bytes
// B, G, R, 0xFF.
static inline void yuv2rgb_sse2_store16(
	__m128i y16, const Yuv2RgbChromaTerms& chroma, __m128i* out)
{
	const __m128i ALPHA  = _mm_set1_epi16(    -1); // 0xFFFF
	const __m128i COEF_Y = _mm_set1_epi16(    74); // 74/64 = 1.16
	const __m128i Y_MASK = _mm_set1_epi16(0x00FF);

	// Split the bytes in even/odd positions into separate 16-bit lanes,
	// that way lane k of both vectors uses lane k of the chroma terms.
	const __m128i y_even  = _mm_and_si128(y16, Y_MASK);
	const __m128i y_odd   = _mm_srli_epi16(y16, 8);
	const __m128i dy_even = _mm_srai_epi16(_mm_mullo_epi16(y_even, COEF_Y), 6);
	const __m128i dy_odd  = _mm_srai_epi16(_mm_mullo_epi16(y_odd,  COEF_Y), 6);

	const __m128i r_even = _mm_adds_epi16(chroma.r, dy_even);
	const __m128i g_even = _mm_adds_epi16(chroma.g, dy_even);
	const __m128i b_even = _mm_adds_epi16(chroma.b, dy_even);
	const __m128i r_odd  = _mm_adds_epi16(chroma.r, dy_odd);
	const __m128i g_odd  = _mm_adds_epi16(chroma.g, dy_odd);
	const __m128i b_odd  = _mm_adds_epi16(chroma.b, dy_odd);

	// Clamp to [0..255] and interleave even/odd again: 16 bytes per
	// channel, in pixel order. Both halves of each packed vector are
	// identical, so taking the high half gives all 16 values.
	const __m128i r_0f = _mm_unpackhi_epi8(_mm_packus_epi16(r_even, r_even),
	                                       _mm_packus_epi16(r_odd,  r_odd));
	const __m128i g_0f = _mm_unpackhi_epi8(_mm_packus_epi16(g_even, g_even),
	                                       _mm_packus_epi16(g_odd,  g_odd));
	const __m128i b_0f = _mm_unpackhi_epi8(_mm_packus_epi16(b_even, b_even),
	                                       _mm_packus_epi16(b_odd,  b_odd));

	// Interleave the 4 channels.
	const __m128i br_07 = _mm_unpacklo_epi8(b_0f, r_0f);
	const __m128i br_8f = _mm_unpackhi_epi8(b_0f, r_0f);
	const __m128i ga_07 = _mm_unpacklo_epi8(g_0f, ALPHA);
	const __m128i ga_8f = _mm_unpackhi_epi8(g_0f, ALPHA);
	_mm_store_si128(out + 0, _mm_unpacklo_epi8(br_07, ga_07)); // pixels 0-3
	_mm_store_si128(out + 1, _mm_unpackhi_epi8(br_07, ga_07)); // pixels 4-7
	_mm_store_si128(out + 2, _mm_unpacklo_epi8(br_8f, ga_8f)); // pixels 8-b
	_mm_store_si128(out + 3, _mm_unpackhi_epi8(br_8f, ga_8f)); // pixels c-f
}

// Convert a block of 32x2 pixels from YUV420 to 32-bit BGRA.
//   u_, v_     : 16 chroma bytes each
//   y0_, y1_   : 32 luma bytes for the top, respectively bottom line
//   out0_,out1_: 32 output pixels for the top, respectively bottom line
// All six pointers are accessed with aligned SSE2 loads/stores, so they must
// be 16-byte aligned.
static inline void yuv2rgb_sse2(
	const uint8_t* u_ , const uint8_t* v_,
	const uint8_t* y0_, const uint8_t* y1_,
	uint32_t* out0_, uint32_t* out1_)
{
	// This routine calculates 32x2 RGBA pixels. Each output pixel uses a
	// unique corresponding input Y value, but a group of 2x2 output pixels
	// shares the same U and V input value.
	auto* u    = reinterpret_cast<const __m128i*>(u_);
	auto* v    = reinterpret_cast<const __m128i*>(v_);
	auto* y0   = reinterpret_cast<const __m128i*>(y0_);
	auto* y1   = reinterpret_cast<const __m128i*>(y1_);
	auto* out0 = reinterpret_cast<      __m128i*>(out0_);
	auto* out1 = reinterpret_cast<      __m128i*>(out1_);

	const __m128i ZERO = _mm_setzero_si128();
	const __m128i u0f = _mm_load_si128(u);
	const __m128i v0f = _mm_load_si128(v);

	// left: chroma samples 0-7, pixels 0-15 of both lines
	const Yuv2RgbChromaTerms left = yuv2rgb_sse2_chroma(
		_mm_unpacklo_epi8(u0f, ZERO), _mm_unpacklo_epi8(v0f, ZERO));
	yuv2rgb_sse2_store16(_mm_load_si128(y0 + 0), left, out0 + 0); // top,left
	yuv2rgb_sse2_store16(_mm_load_si128(y1 + 0), left, out1 + 0); // bottom,left

	// right: chroma samples 8-15, pixels 16-31 of both lines
	const Yuv2RgbChromaTerms right = yuv2rgb_sse2_chroma(
		_mm_unpackhi_epi8(u0f, ZERO), _mm_unpackhi_epi8(v0f, ZERO));
	yuv2rgb_sse2_store16(_mm_load_si128(y0 + 1), right, out0 + 4); // top,right
	yuv2rgb_sse2_store16(_mm_load_si128(y1 + 1), right, out1 + 4); // bottom,right
}
218 
219 static inline void convertHelperSSE2(
220  const th_ycbcr_buffer& buffer, RawFrame& output)
221 {
222  const int width = buffer[0].width;
223  const int y_stride = buffer[0].stride;
224  const int uv_stride2 = buffer[1].stride / 2;
225 
226  assert((width % 32) == 0);
227  assert((buffer[0].height % 2) == 0);
228 
229  for (int y = 0; y < buffer[0].height; y += 2) {
230  const uint8_t* pY1 = buffer[0].data + y * y_stride;
231  const uint8_t* pY2 = buffer[0].data + (y + 1) * y_stride;
232  const uint8_t* pCb = buffer[1].data + y * uv_stride2;
233  const uint8_t* pCr = buffer[2].data + y * uv_stride2;
234  auto* out0 = output.getLinePtrDirect<uint32_t>(y + 0);
235  auto* out1 = output.getLinePtrDirect<uint32_t>(y + 1);
236 
237  for (int x = 0; x < width; x += 32) {
238  // convert a block of (32 x 2) pixels
239  yuv2rgb_sse2(pCb, pCr, pY1, pY2, out0, out1);
240  pCb += 16;
241  pCr += 16;
242  pY1 += 32;
243  pY2 += 32;
244  out0 += 32;
245  out1 += 32;
246  }
247 
248  output.setLineWidth(y + 0, width);
249  output.setLineWidth(y + 1, width);
250  }
251 }
252 
253 #endif // __SSE2__
254 
// Number of fractional bits in the fixed-point coefficients below.
constexpr int PREC = 15;
constexpr int COEF_Y  = int(1.164 * (1 << PREC) + 0.5); // prefer to use lrint() to round
constexpr int COEF_RV = int(1.596 * (1 << PREC) + 0.5); // but that's not (yet) constexpr
constexpr int COEF_GU = int(0.391 * (1 << PREC) + 0.5); // in current versions of c++
constexpr int COEF_GV = int(0.813 * (1 << PREC) + 0.5);
constexpr int COEF_BU = int(2.018 * (1 << PREC) + 0.5);

// Lookup tables, indexed by an 8-bit sample value, holding that sample's
// fixed-point (PREC fractional bits) contribution to a color channel.
struct Coefs {
	int gu[256]; // contribution of U to green
	int gv[256]; // contribution of V to green
	int bu[256]; // contribution of U to blue
	int rv[256]; // contribution of V to red
	int y [256]; // contribution of Y to all channels (includes rounding term)
};

// Build the lookup tables (at compile time).
static constexpr Coefs getCoefs()
{
	// The sum 'y + chroma' is later converted to an integer via '>> PREC',
	// which truncates. Adding half a unit (0.5 in fixed-point) beforehand
	// turns that truncation into round-to-nearest. It is folded into the
	// 'y' table because every channel adds exactly one 'y' entry.
	constexpr int ROUND = 1 << (PREC - 1);

	Coefs coefs = {};
	for (int i = 0; i < 256; ++i) {
		coefs.gu[i] = -COEF_GU * (i - 128);
		coefs.gv[i] = -COEF_GV * (i - 128);
		coefs.bu[i] =  COEF_BU * (i - 128);
		coefs.rv[i] =  COEF_RV * (i - 128);
		coefs.y[i]  =  COEF_Y  * (i -  16) + ROUND;
	}
	return coefs;
}
282 
283 template<typename Pixel>
284 static inline Pixel calc(const SDL_PixelFormat& format,
285  int y, int ruv, int guv, int buv)
286 {
287  uint8_t r = Math::clipIntToByte((y + ruv) >> PREC);
288  uint8_t g = Math::clipIntToByte((y + guv) >> PREC);
289  uint8_t b = Math::clipIntToByte((y + buv) >> PREC);
290  if (sizeof(Pixel) == 4) {
291  return (r << 16) | (g << 8) | (b << 0);
292  } else {
293  return static_cast<Pixel>(SDL_MapRGB(&format, r, g, b));
294  }
295 }
296 
297 template<typename Pixel>
298 static void convertHelper(const th_ycbcr_buffer& buffer, RawFrame& output,
299  const SDL_PixelFormat& format)
300 {
301  assert(buffer[1].width * 2 == buffer[0].width);
302  assert(buffer[1].height * 2 == buffer[0].height);
303 
304  static constexpr Coefs coefs = getCoefs();
305 
306  const int width = buffer[0].width;
307  const int y_stride = buffer[0].stride;
308  const int uv_stride2 = buffer[1].stride / 2;
309 
310  for (int y = 0; y < buffer[0].height; y += 2) {
311  const uint8_t* pY = buffer[0].data + y * y_stride;
312  const uint8_t* pCb = buffer[1].data + y * uv_stride2;
313  const uint8_t* pCr = buffer[2].data + y * uv_stride2;
314  auto* out0 = output.getLinePtrDirect<Pixel>(y + 0);
315  auto* out1 = output.getLinePtrDirect<Pixel>(y + 1);
316 
317  for (int x = 0; x < width;
318  x += 2, pY += 2, ++pCr, ++pCb, out0 += 2, out1 += 2) {
319  int ruv = coefs.rv[*pCr];
320  int guv = coefs.gu[*pCb] + coefs.gv[*pCr];
321  int buv = coefs.bu[*pCb];
322 
323  int Y00 = coefs.y[pY[0]];
324  out0[0] = calc<Pixel>(format, Y00, ruv, guv, buv);
325 
326  int Y01 = coefs.y[pY[1]];
327  out0[1] = calc<Pixel>(format, Y01, ruv, guv, buv);
328 
329  int Y10 = coefs.y[pY[y_stride + 0]];
330  out1[0] = calc<Pixel>(format, Y10, ruv, guv, buv);
331 
332  int Y11 = coefs.y[pY[y_stride + 1]];
333  out1[1] = calc<Pixel>(format, Y11, ruv, guv, buv);
334  }
335 
336  output.setLineWidth(y + 0, width);
337  output.setLineWidth(y + 1, width);
338  }
339 }
340 
341 void convert(const th_ycbcr_buffer& input, RawFrame& output)
342 {
343  const SDL_PixelFormat& format = output.getSDLPixelFormat();
344  if (format.BytesPerPixel == 4) {
345 #ifdef __SSE2__
346  convertHelperSSE2(input, output);
347 #else
348  convertHelper<uint32_t>(input, output, format);
349 #endif
350  } else {
351  assert(format.BytesPerPixel == 2);
352  convertHelper<uint16_t>(input, output, format);
353  }
354 }
355 
356 } // namespace openmsx::yuv2rgb
constexpr int COEF_GU
Definition: yuv2rgb.cc:258
constexpr int PREC
Definition: yuv2rgb.cc:255
uint32_t Pixel
A video frame as output by the VDP scanline conversion unit, before any postprocessing filters are ap...
Definition: RawFrame.hh:25
constexpr int COEF_GV
Definition: yuv2rgb.cc:259
void convert(const th_ycbcr_buffer &input, RawFrame &output)
Definition: yuv2rgb.cc:341
int g
constexpr int COEF_BU
Definition: yuv2rgb.cc:260
uint8_t clipIntToByte(int x)
Clip x to range [0,255].
Definition: Math.hh:119
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:1377
constexpr int COEF_Y
Definition: yuv2rgb.cc:256
Pixel * getLinePtrDirect(unsigned y)
Definition: RawFrame.hh:31
void format(SectorAccessibleDisk &disk, bool dos1)
Format the given disk (= a single partition).
void setLineWidth(unsigned line, unsigned width)
Definition: RawFrame.hh:39
constexpr int COEF_RV
Definition: yuv2rgb.cc:257
const SDL_PixelFormat & getSDLPixelFormat() const
Definition: FrameSource.hh:190