openMSX
ResampleHQ.cc
Go to the documentation of this file.
// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
2 //
3 // simplified code in several ways:
4 // - resample algorithm is no longer switchable, we took this variant:
5 // Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6 // - don't allow to change sample rate on-the-fly
// - assume input (and thus also output) signals have infinite length, so
8 // there is no special code to handle the ending of the signal
9 // - changed/simplified API to better match openmsx use model
10 // (e.g. remove all error checking)
11 
12 #include "ResampleHQ.hh"
13 #include "ResampledSoundDevice.hh"
14 #include "FixedPoint.hh"
15 #include "MemBuffer.hh"
16 #include "countof.hh"
17 #include "likely.hh"
18 #include "stl.hh"
19 #include "vla.hh"
20 #include "build-info.hh"
21 #include <algorithm>
22 #include <vector>
23 #include <cmath>
24 #include <cstddef>
25 #include <cstring>
26 #include <cassert>
27 #ifdef __SSE2__
28 #include <emmintrin.h>
29 #endif
30 
31 namespace openmsx {
32 
33 // Note: without appending 'f' to the values in ResampleCoeffs.ii,
34 // this will generate thousands of C4305 warnings in VC++
35 // E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
36 static const float coeffs[] = {
37  #include "ResampleCoeffs.ii"
38 };
39 
41 
42 static const int INDEX_INC = 128;
43 static const int COEFF_LEN = countof(coeffs);
44 static const int COEFF_HALF_LEN = COEFF_LEN - 1;
45 static const unsigned TAB_LEN = 4096;
46 static const unsigned HALF_TAB_LEN = TAB_LEN / 2;
47 
{
public:
	// Singleton accessor.
	static ResampleCoeffs& instance();
	// Obtain the (shared, reference-counted) permutation + coefficient
	// tables for 'ratio'; computes and caches them on first request.
	void getCoeffs(double ratio, int16_t*& permute, float*& table, unsigned& filterLen);
	// Drop one reference on the tables for 'ratio'; the cache entry is
	// freed when the count reaches zero.
	void releaseCoeffs(double ratio);

private:

	ResampleCoeffs() = default;
	~ResampleCoeffs();

	// Build the permuted coefficient table for 'ratio', also fills in the
	// row permutation and the per-row filter length.
	Table calcTable(double ratio, int16_t* permute, unsigned& filterLen);

	// One cached entry per distinct resample ratio.
	struct Element {
		double ratio;         // cache key
		PermuteTable permute;
		Table table;
		unsigned filterLen;   // #coefficients per table row
		unsigned count;       // reference count
	};
	std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
};
73 
ResampleCoeffs::~ResampleCoeffs()
{
	// Every getCoeffs() call must have been paired with releaseCoeffs().
	assert(cache.empty());
}
78 
{
	// Meyers singleton: constructed on first use, destroyed at program exit.
	static ResampleCoeffs resampleCoeffs;
	return resampleCoeffs;
}
84 
	double ratio, int16_t*& permute, float*& table, unsigned& filterLen)
{
	// Look for an already-computed entry for this exact ratio.
	auto it = find_if(begin(cache), end(cache),
		[=](const Element& e) { return e.ratio == ratio; });
	if (it != end(cache)) {
		// Cache hit: hand out the shared tables and bump the refcount.
		permute = it->permute.data();
		table = it->table.data();
		filterLen = it->filterLen;
		it->count++;
		return;
	}
	// Cache miss: compute new tables and remember them (count = 1).
	Element elem;
	elem.ratio = ratio;
	elem.count = 1;
	elem.permute = PermuteTable(HALF_TAB_LEN);
	elem.table = calcTable(ratio, elem.permute.data(), elem.filterLen);
	permute = elem.permute.data();
	table = elem.table.data();
	filterLen = elem.filterLen;
	cache.push_back(std::move(elem));
}
107 
{
	// The entry must exist (hence the 'unguarded' search).
	auto it = rfind_if_unguarded(cache,
		[=](const Element& e) { return e.ratio == ratio; });
	it->count--;
	if (it->count == 0) {
		// Last user gone: drop the cached tables.
		move_pop_back(cache, it);
	}
}
117 
118 // -- Permutation stuff --
119 //
120 // The rows in the resample coefficient table are not visited sequentially.
121 // Instead, depending on the resample-ratio, we take fixed non-integer jumps
122 // from one row to the next.
123 //
124 // In reality the table has 4096 rows (of which only 2048 are actually stored).
125 // But for simplicity I'll here work out examples for a table with only 16 rows
126 // (of which 8 are stored).
127 //
128 // Let's first assume a jump of '5.2'. This means that after we've used row
129 // 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
130 // integers, so a jump of 5.2 actually means that 80% of the time we advance 5
131 // rows and 20% of the time we advance 6 rows.
132 //
133 // The rows in the (full) table are circular. This means that once we're past
134 // row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
135 // (modulo arithmetic). We also only store the 1st half of the table, the
136 // entries for the 2nd half are 'folded' back to the 1st half according to the
137 // formula: y = 15 - x.
138 //
139 // Let's now calculate the possible transitions. If we're currently on row '0',
140 // the next row will be either '5' (80% chance) or row '6' (20% chance). When
141 // we're on row '5' the next most likely row will be '10', but after folding
142 // '10' becomes '15-10 = 5' (so 5 goes to itself (80% chance)). Row '10' most
143 // likely goes to '15', after folding we get that '5' goes to '0'. Row '15'
144 // most likely goes to '20', and after wrapping and folding that becomes '0'
145 // goes to '4'. Calculating this for all rows gives:
146 // 0 -> 5 or 4 (80%) 0 -> 6 or 5 (20%)
147 // 1 -> 6 or 3 1 -> 7 or 4
148 // 2 -> 7 or 2 2 -> 7 or 3
149 // 3 -> 7 or 1 3 -> 6 or 2
150 // 4 -> 6 or 0 4 -> 5 or 1
151 // 5 -> 5 or 0 5 -> 4 or 0
152 // 6 -> 4 or 1 6 -> 3 or 0
153 // 7 -> 3 or 2 7 -> 2 or 1
154 // So every row has 4 possible successors (2 more and 2 less likely). Possibly
155 // some of these 4 are the same, or even the same as the starting row. Note
156 // that if row x goes to row y (x->y) then also y->x, this turns out to be true
157 // in general.
158 //
159 // For cache efficiency it's best if rows that are needed after each other in
160 // time are also stored sequentially in memory (both before or after is fine).
161 // Clearly storing the rows in numeric order will not read the memory
// sequentially. For this specific example we could store the rows in the
163 // order:
164 // 2, 7, 3, 1, 6, 4, 0, 5
165 // With this order all likely transitions are sequential. The less likely
166 // transitions are not. But I don't believe there exists an order that's good
167 // for both the likely and the unlikely transitions. Do let me know if I'm
168 // wrong.
169 //
170 // In this example the transitions form a single chain (it turns out this is
171 // often the case). But for example for a step-size of 4.3 we get
172 // 0 -> 4 or 3 (70%) 0 -> 5 or 4 (30%)
173 // 1 -> 5 or 2 1 -> 6 or 3
174 // 2 -> 6 or 1 2 -> 7 or 2
175 // 3 -> 7 or 0 3 -> 7 or 1
176 // 4 -> 7 or 0 4 -> 6 or 0
177 // 5 -> 6 or 1 5 -> 5 or 0
178 // 6 -> 5 or 2 6 -> 4 or 1
179 // 7 -> 4 or 3 7 -> 3 or 2
180 // Only looking at the more likely transitions, we get 2 cycles of length 4:
181 // 0, 4, 7, 3
182 // 1, 5, 6, 2
183 //
184 // So the previous example gave a single chain with 2 clear end-points. Now we
185 // have 2 separate cycles. It turns out that for any possible step-size we
186 // either get a single chain or k cycles of size N/k. (So e.g. a chain of
187 // length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
188 // plus 2 cycles of length 2 is impossible). To be honest I've only partially
189 // mathematically proven this, but at least I've verified it for N=16 and
190 // N=4096 for all possible step-sizes.
191 //
192 // To linearise a chain in memory there are only 2 (good) possibilities: start
193 // at either end-point. But to store a cycle any point is as good as any other.
194 // Also the order in which to store the cycles themselves can still be chosen.
195 //
196 // Let's come back to the example with step-size 4.3. If we linearise this as
197 // | 0, 4, 7, 3 | 1, 5, 6, 2 |
198 // then most of the more likely transitions are sequential. The exceptions are
199 // 0 <-> 3 and 1 <-> 2
200 // but those are unavoidable with cycles. In return 2 of the less likely
201 // transitions '3 <-> 1' are now sequential. I believe this is the best
202 // possible linearization (better said: there are other linearizations that are
203 // equally good, but none is better). But do let me know if you find a better
204 // one!
205 //
206 // For step-size '8.4' an optimal(?) linearization seems to be
207 // | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
208 // For step-size '7.9' the order is:
209 // | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
210 // And for step-size '3.8':
211 // | 7, 4, 0, 3 | 6, 5, 1, 2 |
212 //
213 // I've again not (fully) mathematically proven it, but it seems we can
214 // optimally(?) linearise cycles by:
215 // * if likely step < unlikely step:
216 // pick unassigned rows from 0 to N/2-1, and complete each cycle
217 // * if likely step > unlikely step:
218 // pick unassigned rows from N/2-1 to 0, and complete each cycle
219 //
220 // The routine calcPermute() below calculates these optimal(?) linearizations.
221 // More in detail it calculates a permutation table: the i-th element in this
222 // table tells where in memory the i-th logical row of the original (half)
223 // resample coefficient table is physically stored.
224 
// Shorthands used by the permutation code below.
static const unsigned N = TAB_LEN; // number of logical table rows (power of 2)
static const unsigned N1 = N - 1;  // wrap mask / fold pivot
static const unsigned N2 = N / 2;  // number of physically stored rows
228 
229 static unsigned mapIdx(unsigned x)
230 {
231  unsigned t = x & N1; // first wrap
232  return (t < N2) ? t : N1 - t; // then fold
233 }
234 
235 static std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
236 {
237  return {mapIdx(x + step), mapIdx(N1 - x + step)};
238 }
239 
// Calculate the permutation table for the given resample ratio (see the
// long explanation above): the i-th element tells where in memory the
// i-th logical row of the (half) coefficient table is physically stored.
static void calcPermute(double ratio, int16_t* permute)
{
	double r2 = ratio * N;
	double fract = r2 - floor(r2);
	unsigned step = floor(r2);
	bool incr;
	if (fract > 0.5) {
		// mostly (> 50%) take steps of 'floor(r2) + 1'
		step += 1;
		incr = false; // assign from high to low
	} else {
		// mostly take steps of 'floor(r2)'
		incr = true; // assign from low to high
	}

	// initially set all as unassigned
	for (unsigned i = 0; i < N2; ++i) {
		permute[i] = -1;
	}

	unsigned nxt1, nxt2;
	unsigned restart = incr ? 0 : N2 - 1;
	unsigned curr = restart;
	// Check for a chain (instead of cycles): a chain has an end-point,
	// i.e. a row whose successor is itself; if found, start assigning there.
	if (incr) {
		for (unsigned i = 0; i < N2; ++i) {
			std::tie(nxt1, nxt2) = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	} else {
		for (unsigned i = N2 - 1; int(i) >= 0; --i) {
			std::tie(nxt1, nxt2) = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	}

	// assign all rows (in chain or cycle(s))
	unsigned cnt = 0;
	while (true) {
		assert(permute[curr] == -1);
		assert(cnt < N2);
		permute[curr] = cnt++;

		// Prefer whichever successor is still unassigned.
		std::tie(nxt1, nxt2) = next(curr, step);
		if (permute[nxt1] == -1) {
			curr = nxt1;
			continue;
		} else if (permute[nxt2] == -1) {
			curr = nxt2;
			continue;
		}

		// finished chain or cycle
		if (cnt == N2) break; // done

		// Continue with the next cycle: scan (in the chosen direction)
		// for the first still-unassigned row.
		while (permute[restart] != -1) {
			if (incr) {
				++restart;
				assert(restart != N2);
			} else {
				assert(restart != 0);
				--restart;
			}
		}
		curr = restart;
	}

#ifdef DEBUG
	// Sanity check: the result must be a permutation of [0, N2).
	int16_t testPerm[N2];
	for (unsigned i = 0; i < N2; ++i) testPerm[i] = i;
	assert(std::is_permutation(permute, permute + N2, testPerm));
#endif
}
314 
315 static double getCoeff(FilterIndex index)
316 {
317  double fraction = index.fractionAsDouble();
318  int indx = index.toInt();
319  return double(coeffs[indx]) +
320  fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
321 }
322 
// Build the permuted, zero-padded coefficient table for the given ratio.
// Returns the table; also fills in 'permute' (row permutation, see above)
// and 'filterLen' (number of coefficients per row, a multiple of 4).
ResampleCoeffs::Table ResampleCoeffs::calcTable(
	double ratio, int16_t* permute, unsigned& filterLen)
{
	calcPermute(ratio, permute);

	// When downsampling (ratio > 1) the filter index advances more slowly
	// per input sample; the coefficients are rescaled accordingly.
	double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
	double normFactor = floatIncr / INDEX_INC;
	FilterIndex increment = FilterIndex(floatIncr);
	FilterIndex maxFilterIndex(COEFF_HALF_LEN);

	// Determine how many buffer samples one filter application spans.
	int min_idx = -maxFilterIndex.divAsInt(increment);
	int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
	int idx_cnt = max_idx - min_idx + 1;
	filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4 (needed by the SSE code)
	min_idx -= (filterLen - idx_cnt) / 2; // distribute the padding on both sides
	Table table(HALF_TAB_LEN * filterLen);
	memset(table.data(), 0, HALF_TAB_LEN * filterLen * sizeof(float));

	for (unsigned t = 0; t < HALF_TAB_LEN; ++t) {
		// Store logical row 't' at its permuted physical location.
		float* tab = &table[permute[t] * filterLen];
		double lastPos = (double(t) + 0.5) / TAB_LEN;
		FilterIndex startFilterIndex(lastPos * floatIncr);

		// Fill one side of the row: start at the outermost coefficient
		// and walk inwards until the filter index drops below zero.
		FilterIndex filterIndex(startFilterIndex);
		int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		int bufIndex = -coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex += 1;
		} while (filterIndex >= FilterIndex(0));

		// Fill the other side of the row (mirrored walk, note the
		// strict '>' to avoid writing the center entry twice).
		filterIndex = increment - startFilterIndex;
		coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		bufIndex = 1 + coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex -= 1;
		} while (filterIndex > FilterIndex(0));
	}
	return table;
}
370 
371 
// Construct a HQ resampler converting from 'emuSampleRate' to the host
// clock's frequency, for a sound device with CHANNELS channels.
template <unsigned CHANNELS>
	ResampledSoundDevice& input_,
	const DynamicClock& hostClock_, unsigned emuSampleRate)
	: input(input_)
	, hostClock(hostClock_)
	, emuClock(hostClock.getTime(), emuSampleRate)
	, ratio(float(emuSampleRate) / hostClock.getFreq())
{
	// Tables are shared between all resamplers with the same ratio.
	ResampleCoeffs::instance().getCoeffs(ratio, permute, table, filterLen);

	// fill buffer with 'enough' zero's
	unsigned extra = int(filterLen + 1 + ratio + 1);
	bufStart = 0;
	bufEnd = extra;
	nonzeroSamples = 0;
	unsigned initialSize = 4000; // buffer grows dynamically if this is too small
	buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
}
391 
392 template <unsigned CHANNELS>
394 {
396 }
397 
398 #ifdef __SSE2__
// SSE2 version of the mono convolution: computes the dot product of 'len'
// samples from 'buf_' with 'len' filter coefficients from 'tab_' and
// stores the rounded result in '*out'. When REVERSE is true the
// coefficients are read in reversed order (used for the mirrored 2nd
// half of the coefficient table).
template<bool REVERSE>
static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, int* out)
{
	assert((len % 4) == 0);              // guaranteed by calcTable()
	assert((uintptr_t(tab_) % 16) == 0); // needed for the aligned loads below

	// Process the bulk in chunks of 8 floats; the negative byte offset
	// 'x' walks upwards towards 0.
	ptrdiff_t x = (len & ~7) * sizeof(float);
	assert((x % 32) == 0);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_) + (REVERSE ? -x : x);
	x = -x;

	// Two accumulators, combined only at the end.
	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 t0, t1;
		if (REVERSE) {
			// aligned load with the 4 elements reversed
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 16));
			t1 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 32));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
			t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		x += 2 * sizeof(__m128);
	} while (x < 0);
	// Handle the remaining 4 floats (when len is not a multiple of 8).
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
		__m128 t0;
		if (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		a0 = _mm_add_ps(a0, m0);
	}

	// Horizontal sum of the 4 lanes, then round-convert to int.
	__m128 a = _mm_add_ps(a0, a1);
	// The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
	// intrinsic, but not worth the trouble.
	__m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
	__m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));

	*out = _mm_cvtss_si32(s);
}
450 
// Shuffle the four 32-bit lanes of 'x' according to the immediate IMM8,
// by reinterpreting the vector as integers so _mm_shuffle_epi32 can be
// used (a single-operand shuffle).
template<int IMM8> static inline __m128 shuffle(__m128 x)
{
	__m128i bits = _mm_castps_si128(x);
	__m128i mixed = _mm_shuffle_epi32(bits, IMM8);
	return _mm_castsi128_ps(mixed);
}
// SSE2 version of the stereo convolution: computes, for both channels at
// once, the dot product of 'len' interleaved L/R sample pairs from 'buf_'
// with 'len' (shared) filter coefficients from 'tab_' and stores the two
// rounded results in out[0]/out[1]. When REVERSE is true the coefficients
// are read in reversed order.
template<bool REVERSE>
static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, int* out)
{
	assert((len % 4) == 0);              // guaranteed by calcTable()
	assert((uintptr_t(tab_) % 16) == 0); // needed for the aligned loads below

	// The buffer holds 2 floats (L+R) per sample, hence the factor 2.
	// 'x' is a negative byte offset walking upwards towards 0.
	ptrdiff_t x = 2 * (len & ~7) * sizeof(float);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_);
	x = -x;

	// Four accumulators (2 per channel pair), combined only at the end.
	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	__m128 a2 = _mm_setzero_ps();
	__m128 a3 = _mm_setzero_ps();
	do {
		// 8 stereo samples = 16 floats per iteration.
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 32));
		__m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 48));
		__m128 ta, tb;
		if (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
			tb = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 32));
			tab -= 2 * sizeof(__m128);
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
			tb = _mm_load_ps (reinterpret_cast<const float*>(tab + 16));
			tab += 2 * sizeof(__m128);
		}
		// Duplicate each (mono) coefficient over an L/R lane pair.
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 t2 = shuffle<0x50>(tb);
		__m128 t3 = shuffle<0xFA>(tb);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		__m128 m2 = _mm_mul_ps(b2, t2);
		__m128 m3 = _mm_mul_ps(b3, t3);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		a2 = _mm_add_ps(a2, m2);
		a3 = _mm_add_ps(a3, m3);
		x += 4 * sizeof(__m128);
	} while (x < 0);
	// Handle the remaining 4 samples (when len is not a multiple of 8).
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
		__m128 ta;
		if (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
	}

	// Sum the accumulators; the two resulting L/R lane pairs are added
	// together, then round-converted to two ints.
	__m128 a01 = _mm_add_ps(a0, a1);
	__m128 a23 = _mm_add_ps(a2, a3);
	__m128 a = _mm_add_ps(a01, a23);
	// Can be faster with SSE3, but (like above) not worth the trouble.
	__m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
	__m128i si = _mm_cvtps_epi32(s);
#if ASM_X86_64
	// Store both 32-bit results with a single 64-bit move.
	*reinterpret_cast<int64_t*>(out) = _mm_cvtsi128_si64(si);
#else
	out[0] = _mm_cvtsi128_si32(si);
	out[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(si, 0x55));
#endif
}
529 
530 #endif
531 
// Compute one output sample (CHANNELS values) at fractional input
// position 'pos': convolve the buffered input with the coefficient-table
// row selected by the fractional part of 'pos'.
template <unsigned CHANNELS>
	float pos, int* __restrict output)
{
	assert((filterLen & 3) == 0); // rows are padded to a multiple of 4

	// Integer part of 'pos' selects where in the buffer to start.
	int bufIdx = int(pos) + bufStart;
	assert((bufIdx + filterLen) <= bufEnd);
	bufIdx *= CHANNELS;
	const float* buf = &buffer[bufIdx];

	// Fractional part of 'pos' selects the table row.
	int t = unsigned(int(pos * TAB_LEN + 0.5f)) % TAB_LEN;
	if (!(t & HALF_TAB_LEN)) {
		// first half, begin of row 't'
		t = permute[t];
		const float* tab = &table[t * filterLen];

#ifdef __SSE2__
		if (CHANNELS == 1) {
			calcSseMono <false>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<false>(buf, tab, filterLen, output);
		}
		return;
#endif

		// c++ version, both mono and stereo
		// (4 partial sums to mirror the 4-wide unrolling of the SSE path)
		for (unsigned ch = 0; ch < CHANNELS; ++ch) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (unsigned i = 0; i < filterLen; i += 4) {
				r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
				r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
				r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
				r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = lrint(r0 + r1 + r2 + r3);
			++buf;
		}
	} else {
		// 2nd half, end of row 'TAB_LEN - 1 - t' (coefficients are read
		// backwards, starting one past the end of that row)
		t = permute[TAB_LEN - 1 - t];
		const float* tab = &table[(t + 1) * filterLen];

#ifdef __SSE2__
		if (CHANNELS == 1) {
			calcSseMono <true>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<true>(buf, tab, filterLen, output);
		}
		return;
#endif

		// c++ version, both mono and stereo
		for (unsigned ch = 0; ch < CHANNELS; ++ch) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (int i = 0; i < int(filterLen); i += 4) {
				r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
				r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
				r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
				r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = lrint(r0 + r1 + r2 + r3);
			++buf;
		}
	}
}
604 
// Append 'emuNum' freshly generated emu-rate samples (as floats) to the
// internal buffer, compacting or growing the buffer when needed so the
// data stays in one contiguous block.
template <unsigned CHANNELS>
void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
{
	// Still enough free space at end of buffer?
	unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
	if (free < emuNum) {
		// No, then move everything to the start
		// (data needs to be in a contiguous memory block)
		unsigned available = bufEnd - bufStart;
		memmove(&buffer[0], &buffer[bufStart * CHANNELS],
			available * CHANNELS * sizeof(float));
		bufStart = 0;
		bufEnd = available;

		free = unsigned(buffer.size() / CHANNELS) - bufEnd;
		int missing = emuNum - free;
		if (unlikely(missing > 0)) {
			// Still not enough room: grow the buffer.
			// TODO an alternative is to instead of using a large
			// buffer, chop the work in multiple smaller pieces.
			// That may have the advantage that the data fits in
			// the CPU's data cache. OTOH too small chunks have
			// more overhead. (Not yet implemented because it's
			// more complex).
			buffer.resize(buffer.size() + missing * CHANNELS);
		}
	}
	// '+ 3' because generateInput() may write up to 3 extra samples.
	VLA_SSE_ALIGNED(int, tmpBuf, emuNum * CHANNELS + 3);
	if (input.generateInput(tmpBuf, emuNum)) {
		// Device produced (possibly) non-silent data: convert to float.
		for (unsigned i = 0; i < emuNum * CHANNELS; ++i) {
			buffer[bufEnd * CHANNELS + i] = float(tmpBuf[i]);
		}
		bufEnd += emuNum;
		nonzeroSamples = bufEnd - bufStart;
	} else {
		// Device is silent: append zeros directly.
		memset(&buffer[bufEnd * CHANNELS], 0,
			emuNum * CHANNELS * sizeof(float));
		bufEnd += emuNum;
	}

	assert(bufStart <= bufEnd);
	assert(bufEnd <= (buffer.size() / CHANNELS));
}
648 
// Produce 'hostNum' host-rate output samples covering the period up to
// 'time'. Returns whether the output is (possibly) non-zero; when false,
// 'dataOut' is not written.
template <unsigned CHANNELS>
	int* __restrict dataOut, unsigned hostNum, EmuTime::param time)
{
	// First generate the emu-rate input samples for this period.
	unsigned emuNum = emuClock.getTicksTill(time);
	if (emuNum > 0) {
		prepareData(emuNum);
	}

	// Only resample while the buffer still contains non-zero samples.
	bool notMuted = nonzeroSamples > 0;
	if (notMuted) {
		// main processing loop
		EmuTime host1 = hostClock.getFastAdd(1);
		assert(host1 > emuClock.getTime());
		float pos = emuClock.getTicksTillDouble(host1);
		assert(pos <= (ratio + 2));
		for (unsigned i = 0; i < hostNum; ++i) {
			calcOutput(pos, &dataOut[i * CHANNELS]);
			pos += ratio; // advance one host sample in emu-sample units
		}
	}
	// Consume the processed input samples.
	emuClock += emuNum;
	bufStart += emuNum;
	nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);

	assert(bufStart <= bufEnd);
	// Invariant: exactly 'extra' samples (see constructor) remain buffered.
	unsigned available = bufEnd - bufStart;
	unsigned extra = int(filterLen + 1 + ratio + 1);
	assert(available == extra); (void)available; (void)extra;

	return notMuted;
}
681 
682 // Force template instantiation.
683 template class ResampleHQ<1>;
684 template class ResampleHQ<2>;
685 
686 } // namespace openmsx
string_ref::const_iterator end(const string_ref &x)
Definition: string_ref.hh:167
double getTicksTillDouble(EmuTime::param e) const
Definition: DynamicClock.hh:72
#define unlikely(x)
Definition: likely.hh:15
int divAsInt(const FixedPoint other) const
Returns the result of a division between this fixed point number and another, rounded towards zero...
Definition: FixedPoint.hh:122
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock, unsigned emuSampleRate)
Definition: ResampleHQ.cc:373
FixedPoint< 16 > FilterIndex
Definition: ResampleHQ.cc:40
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:192
EmuTime::param getTime() const
Gets the time at which the last clock tick occurred.
Definition: DynamicClock.hh:37
bool generateOutput(int *dataOut, unsigned num, EmuTime::param time) override
Definition: ResampleHQ.cc:650
bool generateInput(int *buffer, unsigned num)
Note: To enable various optimizations (like SSE), this method is allowed to generate up to 3 extra sa...
void getCoeffs(double ratio, int16_t *&permute, float *&table, unsigned &filterLen)
Definition: ResampleHQ.cc:85
Represents a clock with a variable frequency.
Definition: DynamicClock.hh:15
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:90
double fractionAsDouble() const
Returns the fractional part of this fixed point number as a double.
Definition: FixedPoint.hh:112
Thanks to enen for testing this on a real cartridge:
Definition: Autofire.cc:5
EmuTime getFastAdd(unsigned n) const
int toInt() const
Returns the integer part (rounded down) of this fixed point number.
Definition: FixedPoint.hh:78
#define countof(array)
Definition: countof.hh:48
static ResampleCoeffs & instance()
Definition: ResampleHQ.cc:79
string_ref::const_iterator begin(const string_ref &x)
Definition: string_ref.hh:166
unsigned getTicksTill(EmuTime::param e) const
Calculate the number of ticks for this clock until the given time.
Definition: DynamicClock.hh:51
auto rfind_if_unguarded(RANGE &range, PRED pred) -> decltype(std::begin(range))
Definition: stl.hh:174
void releaseCoeffs(double ratio)
Definition: ResampleHQ.cc:108
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44