openMSX
ResampleHQ.cc
Go to the documentation of this file.
1 // Based on libsamplerate-0.1.2 (aka Secret Rabit Code)
2 //
3 // simplified code in several ways:
4 // - resample algorithm is no longer switchable, we took this variant:
5 // Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6 // - don't allow to change sample rate on-the-fly
7 // - assume input (and thus also output) signals have infinte length, so
8 // there is no special code to handle the ending of the signal
9 // - changed/simplified API to better match openmsx use model
10 // (e.g. remove all error checking)
11 
12 #include "ResampleHQ.hh"
13 #include "ResampledSoundDevice.hh"
14 #include "FixedPoint.hh"
15 #include "MemBuffer.hh"
16 #include "countof.hh"
17 #include "likely.hh"
18 #include "stl.hh"
19 #include "vla.hh"
20 #include "build-info.hh"
21 #include <algorithm>
22 #include <vector>
23 #include <cmath>
24 #include <cstring>
25 #include <cassert>
26 #ifdef __SSE2__
27 #include <emmintrin.h>
28 #endif
29 
30 namespace openmsx {
31 
32 // Note: without appending 'f' to the values in ResampleCoeffs.ii,
33 // this will generate thousands of C4305 warnings in VC++
34 // E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
35 static const float coeffs[] = {
36  #include "ResampleCoeffs.ii"
37 };
38 
39 static const int INDEX_INC = 128;
40 static const int COEFF_LEN = countof(coeffs);
41 static const int COEFF_HALF_LEN = COEFF_LEN - 1;
42 static const unsigned TAB_LEN = 4096;
43 
45 {
46 public:
47  static ResampleCoeffs& instance();
48  void getCoeffs(double ratio, float*& table, unsigned& filterLen);
49  void releaseCoeffs(double ratio);
50 
51 private:
54 
56  ~ResampleCoeffs();
57 
58  double getCoeff(FilterIndex index);
59  Table calcTable(double ratio, unsigned& filterLen);
60 
61  struct Element {
62  double ratio;
63  Table table;
64  unsigned filterLen;
65  unsigned count;
66  };
67  std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
68 };
69 
70 ResampleCoeffs::ResampleCoeffs()
71 {
72 }
73 
74 ResampleCoeffs::~ResampleCoeffs()
75 {
76  assert(cache.empty());
77 }
78 
80 {
81  static ResampleCoeffs resampleCoeffs;
82  return resampleCoeffs;
83 }
84 
86  double ratio, float*& table, unsigned& filterLen)
87 {
88  auto it = find_if(begin(cache), end(cache),
89  [=](const Element& e) { return e.ratio == ratio; });
90  if (it != end(cache)) {
91  table = it->table.data();
92  filterLen = it->filterLen;
93  it->count++;
94  return;
95  }
96  Table tab = calcTable(ratio, filterLen);
97  table = tab.data();
98  cache.push_back({ratio, std::move(tab), filterLen, 1});
99 }
100 
102 {
103  auto it = rfind_if_unguarded(cache,
104  [=](const Element& e) { return e.ratio == ratio; });
105  it->count--;
106  if (it->count == 0) {
107  move_pop_back(cache, it);
108  }
109 }
110 
111 double ResampleCoeffs::getCoeff(FilterIndex index)
112 {
113  double fraction = index.fractionAsDouble();
114  int indx = index.toInt();
115  return double(coeffs[indx]) +
116  fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
117 }
118 
119 ResampleCoeffs::Table ResampleCoeffs::calcTable(double ratio, unsigned& filterLen)
120 {
121  double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
122  double normFactor = floatIncr / INDEX_INC;
123  FilterIndex increment = FilterIndex(floatIncr);
124  FilterIndex maxFilterIndex(COEFF_HALF_LEN);
125 
126  int min_idx = -maxFilterIndex.divAsInt(increment);
127  int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
128  int idx_cnt = max_idx - min_idx + 1;
129  filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
130  min_idx -= (filterLen - idx_cnt);
131  Table table(TAB_LEN * filterLen);
132  memset(table.data(), 0, TAB_LEN * filterLen * sizeof(float));
133 
134  for (unsigned t = 0; t < TAB_LEN; ++t) {
135  double lastPos = double(t) / TAB_LEN;
136  FilterIndex startFilterIndex(lastPos * floatIncr);
137 
138  FilterIndex filterIndex(startFilterIndex);
139  int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
140  filterIndex += increment * coeffCount;
141  int bufIndex = -coeffCount;
142  do {
143  table[t * filterLen + bufIndex - min_idx] =
144  float(getCoeff(filterIndex) * normFactor);
145  filterIndex -= increment;
146  bufIndex += 1;
147  } while (filterIndex >= FilterIndex(0));
148 
149  filterIndex = increment - startFilterIndex;
150  coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
151  filterIndex += increment * coeffCount;
152  bufIndex = 1 + coeffCount;
153  do {
154  table[t * filterLen + bufIndex - min_idx] =
155  float(getCoeff(filterIndex) * normFactor);
156  filterIndex -= increment;
157  bufIndex -= 1;
158  } while (filterIndex > FilterIndex(0));
159  }
160  return table;
161 }
162 
163 
164 template <unsigned CHANNELS>
166  ResampledSoundDevice& input_,
167  const DynamicClock& hostClock_, unsigned emuSampleRate)
168  : input(input_)
169  , hostClock(hostClock_)
170  , emuClock(hostClock.getTime(), emuSampleRate)
171  , ratio(float(emuSampleRate) / hostClock.getFreq())
172 {
173  ResampleCoeffs::instance().getCoeffs(ratio, table, filterLen);
174 
175  // fill buffer with 'enough' zero's
176  unsigned extra = int(filterLen + 1 + ratio + 1);
177  bufStart = 0;
178  bufEnd = extra;
179  nonzeroSamples = 0;
180  unsigned initialSize = 4000; // buffer grows dynamically if this is too small
181  buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
182 }
183 
184 template <unsigned CHANNELS>
186 {
188 }
189 
190 #ifdef __SSE2__
191 static inline void calcSseMono(const float* buf_, const float* tab_, long len, int* out)
192 {
193  assert((len % 4) == 0);
194  assert((uintptr_t(tab_) % 16) == 0);
195 
196  long x = (len & ~7) * sizeof(float);
197  assert((x % 32) == 0);
198  const char* buf = reinterpret_cast<const char*>(buf_) + x;
199  const char* tab = reinterpret_cast<const char*>(tab_) + x;
200  x = -x;
201 
202  __m128 a0 = _mm_setzero_ps();
203  __m128 a1 = _mm_setzero_ps();
204  do {
205  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
206  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
207  __m128 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
208  __m128 t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
209  __m128 m0 = _mm_mul_ps(b0, t0);
210  __m128 m1 = _mm_mul_ps(b1, t1);
211  a0 = _mm_add_ps(a0, m0);
212  a1 = _mm_add_ps(a1, m1);
213  x += 2 * sizeof(__m128);
214  } while (x < 0);
215  if (len & 4) {
216  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
217  __m128 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
218  __m128 m0 = _mm_mul_ps(b0, t0);
219  a0 = _mm_add_ps(a0, m0);
220  }
221 
222  __m128 a = _mm_add_ps(a0, a1);
223  // The following can be _slighly_ faster by using the SSE3 _mm_hadd_ps()
224  // intrinsic, but not worth the trouble.
225  __m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
226  __m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
227 
228  *out = _mm_cvtss_si32(s);
229 }
230 
231 template<int N> static inline __m128 shuffle(__m128 x)
232 {
233  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
234 }
235 static inline void calcSseStereo(const float* buf_, const float* tab_, long len, int* out)
236 {
237  assert((len % 4) == 0);
238  assert((uintptr_t(tab_) % 16) == 0);
239 
240  long x = (len & ~7) * sizeof(float);
241  const char* buf = reinterpret_cast<const char*>(buf_) + 2*x;
242  const char* tab = reinterpret_cast<const char*>(tab_) + x;
243  x = -x;
244 
245  __m128 a0 = _mm_setzero_ps();
246  __m128 a1 = _mm_setzero_ps();
247  __m128 a2 = _mm_setzero_ps();
248  __m128 a3 = _mm_setzero_ps();
249  do {
250  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 0));
251  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 16));
252  __m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 32));
253  __m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 2*x + 48));
254  __m128 ta = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
255  __m128 tb = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
256  __m128 t0 = shuffle<0x50>(ta);
257  __m128 t1 = shuffle<0xFA>(ta);
258  __m128 t2 = shuffle<0x50>(tb);
259  __m128 t3 = shuffle<0xFA>(tb);
260  __m128 m0 = _mm_mul_ps(b0, t0);
261  __m128 m1 = _mm_mul_ps(b1, t1);
262  __m128 m2 = _mm_mul_ps(b2, t2);
263  __m128 m3 = _mm_mul_ps(b3, t3);
264  a0 = _mm_add_ps(a0, m0);
265  a1 = _mm_add_ps(a1, m1);
266  a2 = _mm_add_ps(a2, m2);
267  a3 = _mm_add_ps(a3, m3);
268  x += 2 * sizeof(__m128);
269  } while (x < 0);
270  if (len & 4) {
271  __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 0));
272  __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
273  __m128 ta = _mm_load_ps (reinterpret_cast<const float*>(tab));
274  __m128 t0 = shuffle<0x50>(ta);
275  __m128 t1 = shuffle<0xFA>(ta);
276  __m128 m0 = _mm_mul_ps(b0, t0);
277  __m128 m1 = _mm_mul_ps(b1, t1);
278  a0 = _mm_add_ps(a0, m0);
279  a1 = _mm_add_ps(a1, m1);
280  }
281 
282  __m128 a01 = _mm_add_ps(a0, a1);
283  __m128 a23 = _mm_add_ps(a2, a3);
284  __m128 a = _mm_add_ps(a01, a23);
285  // Can faster with SSE3, but (like above) not worth the trouble.
286  __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
287  __m128i si = _mm_cvtps_epi32(s);
288 #if ASM_X86_64
289  *reinterpret_cast<int64_t*>(out) = _mm_cvtsi128_si64(si);
290 #else
291  out[0] = _mm_cvtsi128_si32(si);
292  out[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(si, 0x55));
293 #endif
294 }
295 #endif
296 
297 template <unsigned CHANNELS>
299  float pos, int* __restrict output)
300 {
301  assert((filterLen & 3) == 0);
302 
303  int t = int(pos * TAB_LEN + 0.5f) % TAB_LEN;
304  const float* tab = &table[t * filterLen];
305 
306  int bufIdx = int(pos) + bufStart;
307  assert((bufIdx + filterLen) <= bufEnd);
308  bufIdx *= CHANNELS;
309  const float* buf = &buffer[bufIdx];
310 
311 #ifdef __SSE2__
312  if (CHANNELS == 1) {
313  calcSseMono (buf, tab, filterLen, output);
314  } else {
315  calcSseStereo(buf, tab, filterLen, output);
316  }
317  return;
318 #endif
319 
320  // c++ version, both mono and stereo
321  for (unsigned ch = 0; ch < CHANNELS; ++ch) {
322  float r0 = 0.0f;
323  float r1 = 0.0f;
324  float r2 = 0.0f;
325  float r3 = 0.0f;
326  for (unsigned i = 0; i < filterLen; i += 4) {
327  r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
328  r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
329  r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
330  r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
331  }
332  output[ch] = lrint(r0 + r1 + r2 + r3);
333  ++buf;
334  }
335 }
336 
337 template <unsigned CHANNELS>
338 void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
339 {
340  // Still enough free space at end of buffer?
341  unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
342  if (free < emuNum) {
343  // No, then move everything to the start
344  // (data needs to be in a contiguous memory block)
345  unsigned available = bufEnd - bufStart;
346  memmove(&buffer[0], &buffer[bufStart * CHANNELS],
347  available * CHANNELS * sizeof(float));
348  bufStart = 0;
349  bufEnd = available;
350 
351  free = unsigned(buffer.size() / CHANNELS) - bufEnd;
352  int missing = emuNum - free;
353  if (unlikely(missing > 0)) {
354  // Still not enough room: grow the buffer.
355  // TODO an alternative is to instead of using a large
356  // buffer, chop the work in multiple smaller pieces.
357  // That may have the advantage that the data fits in
358  // the CPU's data cache. OTOH too small chunks have
359  // more overhead. (Not yet implemented because it's
360  // more complex).
361  buffer.resize(buffer.size() + missing * CHANNELS);
362  }
363  }
364  VLA_SSE_ALIGNED(int, tmpBuf, emuNum * CHANNELS + 3);
365  if (input.generateInput(tmpBuf, emuNum)) {
366  for (unsigned i = 0; i < emuNum * CHANNELS; ++i) {
367  buffer[bufEnd * CHANNELS + i] = float(tmpBuf[i]);
368  }
369  bufEnd += emuNum;
370  nonzeroSamples = bufEnd - bufStart;
371  } else {
372  memset(&buffer[bufEnd * CHANNELS], 0,
373  emuNum * CHANNELS * sizeof(float));
374  bufEnd += emuNum;
375  }
376 
377  assert(bufStart <= bufEnd);
378  assert(bufEnd <= (buffer.size() / CHANNELS));
379 }
380 
381 template <unsigned CHANNELS>
383  int* __restrict dataOut, unsigned hostNum, EmuTime::param time)
384 {
385  unsigned emuNum = emuClock.getTicksTill(time);
386  if (emuNum > 0) {
387  prepareData(emuNum);
388  }
389 
390  bool notMuted = nonzeroSamples > 0;
391  if (notMuted) {
392  // main processing loop
393  EmuTime host1 = hostClock.getFastAdd(1);
394  assert(host1 > emuClock.getTime());
395  float pos = emuClock.getTicksTillDouble(host1);
396  assert(pos <= (ratio + 2));
397  for (unsigned i = 0; i < hostNum; ++i) {
398  calcOutput(pos, &dataOut[i * CHANNELS]);
399  pos += ratio;
400  }
401  }
402  emuClock += emuNum;
403  bufStart += emuNum;
404  nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
405 
406  assert(bufStart <= bufEnd);
407  unsigned available = bufEnd - bufStart;
408  unsigned extra = int(filterLen + 1 + ratio + 1);
409  assert(available == extra); (void)available; (void)extra;
410 
411  return notMuted;
412 }
413 
414 // Force template instantiation.
415 template class ResampleHQ<1>;
416 template class ResampleHQ<2>;
417 
418 } // namespace openmsx
EmuTime::param getTime() const
Gets the time at which the last clock tick occurred.
Definition: DynamicClock.hh:37
int divAsInt(const FixedPoint other) const
Returns the result of a division between this fixed point number and another, rounded towards zero...
Definition: FixedPoint.hh:122
string_ref::const_iterator end(const string_ref &x)
Definition: string_ref.hh:167
#define unlikely(x)
Definition: likely.hh:15
void getCoeffs(double ratio, float *&table, unsigned &filterLen)
Definition: ResampleHQ.cc:85
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock, unsigned emuSampleRate)
Definition: ResampleHQ.cc:165
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:192
bool generateOutput(int *dataOut, unsigned num, EmuTime::param time) override
Definition: ResampleHQ.cc:382
bool generateInput(int *buffer, unsigned num)
Note: To enable various optimizations (like SSE), this method is allowed to generate up to 3 extra sa...
unsigned getTicksTill(EmuTime::param e) const
Calculate the number of ticks for this clock until the given time.
Definition: DynamicClock.hh:51
Represents a clock with a variable frequency.
Definition: DynamicClock.hh:15
double fractionAsDouble() const
Returns the fractional part of this fixed point number as a double.
Definition: FixedPoint.hh:112
int toInt() const
Returns the integer part (rounded down) of this fixed point number.
Definition: FixedPoint.hh:78
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
#define countof(array)
Definition: countof.hh:48
double getTicksTillDouble(EmuTime::param e) const
Definition: DynamicClock.hh:72
static ResampleCoeffs & instance()
Definition: ResampleHQ.cc:79
string_ref::const_iterator begin(const string_ref &x)
Definition: string_ref.hh:166
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:90
EmuTime getFastAdd(unsigned n) const
auto rfind_if_unguarded(RANGE &range, PRED pred) -> decltype(std::begin(range))
Definition: stl.hh:174
void releaseCoeffs(double ratio)
Definition: ResampleHQ.cc:101
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44