openMSX
ResampleHQ.cc
Go to the documentation of this file.
1 // Based on libsamplerate-0.1.2 (aka Secret Rabit Code)
2 //
3 // simplified code in several ways:
4 // - resample algorithm is no longer switchable, we took this variant:
5 // Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6 // - don't allow to change sample rate on-the-fly
7 // - assume input (and thus also output) signals have infinte length, so
8 // there is no special code to handle the ending of the signal
9 // - changed/simplified API to better match openmsx use model
10 // (e.g. remove all error checking)
11 
12 #include "ResampleHQ.hh"
13 #include "ResampledSoundDevice.hh"
14 #include "FixedPoint.hh"
15 #include "MemBuffer.hh"
16 #include "countof.hh"
17 #include "likely.hh"
18 #include "noncopyable.hh"
19 #include "stl.hh"
20 #include "vla.hh"
21 #include "build-info.hh"
22 #include <algorithm>
23 #include <vector>
24 #include <cmath>
25 #include <cstring>
26 #include <cassert>
27 #ifdef __SSE2__
28 #include <emmintrin.h>
29 #endif
30 
31 namespace openmsx {
32 
33 // Note: without appending 'f' to the values in ResampleCoeffs.ii,
34 // this will generate thousands of C4305 warnings in VC++
35 // E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
36 static const float coeffs[] = {
37  #include "ResampleCoeffs.ii"
38 };
39 
40 static const int INDEX_INC = 128;
41 static const int COEFF_LEN = countof(coeffs);
42 static const int COEFF_HALF_LEN = COEFF_LEN - 1;
43 static const unsigned TAB_LEN = 4096;
44 
45 class ResampleCoeffs : private noncopyable
46 {
47 public:
48  static ResampleCoeffs& instance();
49  void getCoeffs(double ratio, float*& table, unsigned& filterLen);
50  void releaseCoeffs(double ratio);
51 
52 private:
55 
57  ~ResampleCoeffs();
58 
59  double getCoeff(FilterIndex index);
60  Table calcTable(double ratio, unsigned& filterLen);
61 
62  struct Element {
63  double ratio;
64  Table table;
65  unsigned filterLen;
66  unsigned count;
67 
68  // workaround for vs013: normally these are auto-generated
69  Element(double ratio_, Table&& table_, unsigned filterLen_, unsigned count_)
70  : ratio(ratio_), table(std::move(table_))
71  , filterLen(filterLen_), count(count_) {}
72  Element(Element&& e)
73  : ratio (std::move(e.ratio))
74  , table (std::move(e.table))
75  , filterLen(std::move(e.filterLen))
76  , count (std::move(e.count)) {}
77  Element& operator=(Element&& e) {
78  ratio = std::move(e.ratio);
79  table = std::move(e.table);
80  filterLen = std::move(e.filterLen);
81  count = std::move(e.count);
82  return *this;
83  }
84  };
85  std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
86 };
87 
88 ResampleCoeffs::ResampleCoeffs()
89 {
90 }
91 
92 ResampleCoeffs::~ResampleCoeffs()
93 {
94  assert(cache.empty());
95 }
96 
98 {
99  static ResampleCoeffs resampleCoeffs;
100  return resampleCoeffs;
101 }
102 
104  double ratio, float*& table, unsigned& filterLen)
105 {
106  auto it = find_if(begin(cache), end(cache),
107  [=](const Element& e) { return e.ratio == ratio; });
108  if (it != end(cache)) {
109  table = it->table.data();
110  filterLen = it->filterLen;
111  it->count++;
112  return;
113  }
114  Table tab = calcTable(ratio, filterLen);
115  table = tab.data();
116  cache.push_back({ratio, std::move(tab), filterLen, 1});
117 }
118 
120 {
121  auto it = find_if_unguarded(cache,
122  [=](const Element& e) { return e.ratio == ratio; });
123  it->count--;
124  if (it->count == 0) {
125  if (it != (end(cache) - 1)) {
126  *it = std::move(cache.back()); // move last element here
127  }
128  cache.pop_back(); // and erase last
129  }
130 }
131 
132 double ResampleCoeffs::getCoeff(FilterIndex index)
133 {
134  double fraction = index.fractionAsDouble();
135  int indx = index.toInt();
136  return double(coeffs[indx]) +
137  fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
138 }
139 
140 ResampleCoeffs::Table ResampleCoeffs::calcTable(double ratio, unsigned& filterLen)
141 {
142  double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
143  double normFactor = floatIncr / INDEX_INC;
144  FilterIndex increment = FilterIndex(floatIncr);
145  FilterIndex maxFilterIndex(COEFF_HALF_LEN);
146 
147  int min_idx = -maxFilterIndex.divAsInt(increment);
148  int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
149  int idx_cnt = max_idx - min_idx + 1;
150  filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
151  min_idx -= (filterLen - idx_cnt);
152  Table table(TAB_LEN * filterLen);
153  memset(table.data(), 0, TAB_LEN * filterLen * sizeof(float));
154 
155  for (unsigned t = 0; t < TAB_LEN; ++t) {
156  double lastPos = double(t) / TAB_LEN;
157  FilterIndex startFilterIndex(lastPos * floatIncr);
158 
159  FilterIndex filterIndex(startFilterIndex);
160  int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
161  filterIndex += increment * coeffCount;
162  int bufIndex = -coeffCount;
163  do {
164  table[t * filterLen + bufIndex - min_idx] =
165  float(getCoeff(filterIndex) * normFactor);
166  filterIndex -= increment;
167  bufIndex += 1;
168  } while (filterIndex >= FilterIndex(0));
169 
170  filterIndex = increment - startFilterIndex;
171  coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
172  filterIndex += increment * coeffCount;
173  bufIndex = 1 + coeffCount;
174  do {
175  table[t * filterLen + bufIndex - min_idx] =
176  float(getCoeff(filterIndex) * normFactor);
177  filterIndex -= increment;
178  bufIndex -= 1;
179  } while (filterIndex > FilterIndex(0));
180  }
181  return table;
182 }
183 
184 
185 template <unsigned CHANNELS>
187  ResampledSoundDevice& input_,
188  const DynamicClock& hostClock_, unsigned emuSampleRate)
189  : input(input_)
190  , hostClock(hostClock_)
191  , emuClock(hostClock.getTime(), emuSampleRate)
192  , ratio(float(emuSampleRate) / hostClock.getFreq())
193 {
194  ResampleCoeffs::instance().getCoeffs(ratio, table, filterLen);
195 
196  // fill buffer with 'enough' zero's
197  unsigned extra = int(filterLen + 1 + ratio + 1);
198  bufStart = 0;
199  bufEnd = extra;
200  nonzeroSamples = 0;
201  unsigned initialSize = 4000; // buffer grows dynamically if this is too small
202  buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
203 }
204 
205 template <unsigned CHANNELS>
207 {
209 }
210 
211 #ifdef __SSE2__
212 static inline void calcSseMono(const float* buf_, const float* tab_, long len, int* out)
213 {
214  assert((len % 4) == 0);
215  assert((uintptr_t(tab_) % 16) == 0);
216 
217  long x = (len & ~7) * sizeof(float);
218  assert((x % 32) == 0);
219  const char* buf = reinterpret_cast<const char*>(buf_) + x;
220  const char* tab = reinterpret_cast<const char*>(tab_) + x;
221  x = -x;
222 
223  __m128 a0 = _mm_setzero_ps();
224  __m128 a1 = _mm_setzero_ps();
225  do {
226  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
227  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
228  __m128 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
229  __m128 t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
230  __m128 m0 = _mm_mul_ps(b0, t0);
231  __m128 m1 = _mm_mul_ps(b1, t1);
232  a0 = _mm_add_ps(a0, m0);
233  a1 = _mm_add_ps(a1, m1);
234  x += 2 * sizeof(__m128);
235  } while (x < 0);
236  if (len & 4) {
237  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
238  __m128 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
239  __m128 m0 = _mm_mul_ps(b0, t0);
240  a0 = _mm_add_ps(a0, m0);
241  }
242 
243  __m128 a = _mm_add_ps(a0, a1);
244  // The following can be _slighly_ faster by using the SSE3 _mm_hadd_ps()
245  // intrinsic, but not worth the trouble.
246  __m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
247  __m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
248 
249  *out = _mm_cvtss_si32(s);
250 }
251 
252 template<int N> static inline __m128 shuffle(__m128 x)
253 {
254  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
255 }
256 static inline void calcSseStereo(const float* buf_, const float* tab_, long len, int* out)
257 {
258  assert((len % 4) == 0);
259  assert((uintptr_t(tab_) % 16) == 0);
260 
261  long x = (len & ~7) * sizeof(float);
262  const char* buf = reinterpret_cast<const char*>(buf_) + 2*x;
263  const char* tab = reinterpret_cast<const char*>(tab_) + x;
264  x = -x;
265 
266  __m128 a0 = _mm_setzero_ps();
267  __m128 a1 = _mm_setzero_ps();
268  __m128 a2 = _mm_setzero_ps();
269  __m128 a3 = _mm_setzero_ps();
270  do {
271  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 0));
272  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 16));
273  __m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 32));
274  __m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 48));
275  __m128 ta = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
276  __m128 tb = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
277  __m128 t0 = shuffle<0x50>(ta);
278  __m128 t1 = shuffle<0xFA>(ta);
279  __m128 t2 = shuffle<0x50>(tb);
280  __m128 t3 = shuffle<0xFA>(tb);
281  __m128 m0 = _mm_mul_ps(b0, t0);
282  __m128 m1 = _mm_mul_ps(b1, t1);
283  __m128 m2 = _mm_mul_ps(b2, t2);
284  __m128 m3 = _mm_mul_ps(b3, t3);
285  a0 = _mm_add_ps(a0, m0);
286  a1 = _mm_add_ps(a1, m1);
287  a2 = _mm_add_ps(a2, m2);
288  a3 = _mm_add_ps(a3, m3);
289  x += 2 * sizeof(__m128);
290  } while (x < 0);
291  if (len & 4) {
292  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 0));
293  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
294  __m128 ta = _mm_load_ps (reinterpret_cast<const float*>(tab));
295  __m128 t0 = shuffle<0x50>(ta);
296  __m128 t1 = shuffle<0xFA>(ta);
297  __m128 m0 = _mm_mul_ps(b0, t0);
298  __m128 m1 = _mm_mul_ps(b1, t1);
299  a0 = _mm_add_ps(a0, m0);
300  a1 = _mm_add_ps(a1, m1);
301  }
302 
303  __m128 a01 = _mm_add_ps(a0, a1);
304  __m128 a23 = _mm_add_ps(a2, a3);
305  __m128 a = _mm_add_ps(a01, a23);
306  // Can faster with SSE3, but (like above) not worth the trouble.
307  __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
308  __m128i si = _mm_cvtps_epi32(s);
309 #if ASM_X86_64
310  *reinterpret_cast<int64_t*>(out) = _mm_cvtsi128_si64(si);
311 #else
312  out[0] = _mm_cvtsi128_si32(si);
313  out[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(si, 0x55));
314 #endif
315 }
316 #endif
317 
318 template <unsigned CHANNELS>
319 void ResampleHQ<CHANNELS>::calcOutput(
320  float pos, int* __restrict output)
321 {
322  assert((filterLen & 3) == 0);
323 
324  int t = int(pos * TAB_LEN + 0.5f) % TAB_LEN;
325  const float* tab = &table[t * filterLen];
326 
327  int bufIdx = int(pos) + bufStart;
328  assert((bufIdx + filterLen) <= bufEnd);
329  bufIdx *= CHANNELS;
330  const float* buf = &buffer[bufIdx];
331 
332 #ifdef __SSE2__
333  if (CHANNELS == 1) {
334  calcSseMono (buf, tab, filterLen, output);
335  } else {
336  calcSseStereo(buf, tab, filterLen, output);
337  }
338  return;
339 #endif
340 
341  // c++ version, both mono and stereo
342  for (unsigned ch = 0; ch < CHANNELS; ++ch) {
343  float r0 = 0.0f;
344  float r1 = 0.0f;
345  float r2 = 0.0f;
346  float r3 = 0.0f;
347  for (unsigned i = 0; i < filterLen; i += 4) {
348  r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
349  r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
350  r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
351  r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
352  }
353  output[ch] = lrint(r0 + r1 + r2 + r3);
354  ++buf;
355  }
356 }
357 
358 template <unsigned CHANNELS>
359 void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
360 {
361  // Still enough free space at end of buffer?
362  unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
363  if (free < emuNum) {
364  // No, then move everything to the start
365  // (data needs to be in a contiguous memory block)
366  unsigned available = bufEnd - bufStart;
367  memmove(&buffer[0], &buffer[bufStart * CHANNELS],
368  available * CHANNELS * sizeof(float));
369  bufStart = 0;
370  bufEnd = available;
371 
372  free = unsigned(buffer.size() / CHANNELS) - bufEnd;
373  int missing = emuNum - free;
374  if (unlikely(missing > 0)) {
375  // Still not enough room: grow the buffer.
376  // TODO an alternative is to instead of using a large
377  // buffer, chop the work in multiple smaller pieces.
378  // That may have the advantage that the data fits in
379  // the CPU's data cache. OTOH too small chunks have
380  // more overhead. (Not yet implemented because it's
381  // more complex).
382  buffer.resize(buffer.size() + missing * CHANNELS);
383  }
384  }
385  VLA_SSE_ALIGNED(int, tmpBuf, emuNum * CHANNELS + 3);
386  if (input.generateInput(tmpBuf, emuNum)) {
387  for (unsigned i = 0; i < emuNum * CHANNELS; ++i) {
388  buffer[bufEnd * CHANNELS + i] = float(tmpBuf[i]);
389  }
390  bufEnd += emuNum;
391  nonzeroSamples = bufEnd - bufStart;
392  } else {
393  memset(&buffer[bufEnd * CHANNELS], 0,
394  emuNum * CHANNELS * sizeof(float));
395  bufEnd += emuNum;
396  }
397 
398  assert(bufStart <= bufEnd);
399  assert(bufEnd <= (buffer.size() / CHANNELS));
400 }
401 
402 template <unsigned CHANNELS>
404  int* __restrict dataOut, unsigned hostNum, EmuTime::param time)
405 {
406  unsigned emuNum = emuClock.getTicksTill(time);
407  if (emuNum > 0) {
408  prepareData(emuNum);
409  }
410 
411  bool notMuted = nonzeroSamples > 0;
412  if (notMuted) {
413  // main processing loop
414  EmuTime host1 = hostClock.getFastAdd(1);
415  assert(host1 > emuClock.getTime());
416  float pos = emuClock.getTicksTillDouble(host1);
417  assert(pos <= (ratio + 2));
418  for (unsigned i = 0; i < hostNum; ++i) {
419  calcOutput(pos, &dataOut[i * CHANNELS]);
420  pos += ratio;
421  }
422  }
423  emuClock += emuNum;
424  bufStart += emuNum;
425  nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
426 
427  assert(bufStart <= bufEnd);
428  unsigned available = bufEnd - bufStart;
429  unsigned extra = int(filterLen + 1 + ratio + 1);
430  assert(available == extra); (void)available; (void)extra;
431 
432  return notMuted;
433 }
434 
435 // Force template instantiation.
436 template class ResampleHQ<1>;
437 template class ResampleHQ<2>;
438 
439 } // namespace openmsx
string_ref::const_iterator end(const string_ref &x)
Definition: string_ref.hh:167
#define unlikely(x)
Definition: likely.hh:15
void getCoeffs(double ratio, float *&table, unsigned &filterLen)
Definition: ResampleHQ.cc:103
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock, unsigned emuSampleRate)
Definition: ResampleHQ.cc:186
bool generateOutput(int *dataOut, unsigned num, EmuTime::param time) override
Definition: ResampleHQ.cc:403
Represents a clock with a variable frequency.
Definition: DynamicClock.hh:15
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
#define countof(array)
Definition: countof.hh:48
static ResampleCoeffs & instance()
Definition: ResampleHQ.cc:97
ITER find_if_unguarded(ITER first, ITER last, PRED pred)
Faster alternative to 'find_if' when it's guaranteed that the predicate will be true for at least one...
Definition: stl.hh:143
Based on boost::noncopyable, see boost documentation: http://www.boost.org/libs/utility.
Definition: noncopyable.hh:12
string_ref::const_iterator begin(const string_ref &x)
Definition: string_ref.hh:166
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:91
void releaseCoeffs(double ratio)
Definition: ResampleHQ.cc:119
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44