openMSX
ResampleHQ.cc
// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
//
// The code was simplified in several ways:
//  - the resample algorithm is no longer switchable, we took this variant:
//      Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
//  - changing the sample rate on-the-fly is not supported
//  - input (and thus also output) signals are assumed to have infinite length,
//    so there is no special code to handle the end of the signal
//  - the API was changed/simplified to better match the openMSX use model
//    (e.g. all error checking was removed)

#include "ResampleHQ.hh"
#include "ResampledSoundDevice.hh"
#include "FixedPoint.hh"
#include "MemBuffer.hh"
#include "aligned.hh"
#include "likely.hh"
#include "ranges.hh"
#include "stl.hh"
#include "vla.hh"
#include "xrange.hh"
#include "build-info.hh"
#include <algorithm> // std::fill_n, std::is_permutation, std::max
#include <tuple>     // std::tie
#include <utility>   // std::pair
#include <vector>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <cassert>
#include <iterator>
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

// Note: without appending 'f' to the values in ResampleCoeffs.ii,
// this will generate thousands of C4305 warnings in VC++
//   E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
constexpr float coeffs[] = {
	#include "ResampleCoeffs.ii"
};

using FilterIndex = FixedPoint<16>;

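// Layout of the precomputed data (as used by calcTable() / calcOutput() below):
//  - 'coeffs' stores one half of a symmetric windowed-sinc kernel, sampled at
//    INDEX_INC points per input-sample spacing; COEFF_HALF_LEN is the largest
//    valid filter index.
//  - TAB_LEN is the number of fractional positions ("phases") per input sample
//    for which a filter row is precomputed; due to symmetry only HALF_TAB_LEN
//    rows are actually stored.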
constexpr int INDEX_INC = 128;
constexpr int COEFF_LEN = int(std::size(coeffs));
constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
constexpr unsigned TAB_LEN = 4096;
constexpr unsigned HALF_TAB_LEN = TAB_LEN / 2;

class ResampleCoeffs
{
public:
	ResampleCoeffs(const ResampleCoeffs&) = delete;
	ResampleCoeffs& operator=(const ResampleCoeffs&) = delete;

	static ResampleCoeffs& instance();
	void getCoeffs(double ratio, int16_t*& permute, float*& table, unsigned& filterLen);
	void releaseCoeffs(double ratio);

private:
	using Table = MemBuffer<float, SSE2_ALIGNMENT>;
	using PermuteTable = MemBuffer<int16_t>;

	ResampleCoeffs() = default;
	~ResampleCoeffs();

	static Table calcTable(double ratio, int16_t* permute, unsigned& filterLen);

	struct Element {
		double ratio;
		PermuteTable permute;
		Table table;
		unsigned filterLen;
		unsigned count;
	};
	std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
};

ResampleCoeffs::~ResampleCoeffs()
{
	assert(cache.empty());
}

ResampleCoeffs& ResampleCoeffs::instance()
{
	static ResampleCoeffs resampleCoeffs;
	return resampleCoeffs;
}

void ResampleCoeffs::getCoeffs(
	double ratio, int16_t*& permute, float*& table, unsigned& filterLen)
{
	if (auto it = ranges::find_if(cache, [=](auto& e) { return e.ratio == ratio; });
	    it != end(cache)) {
		permute   = it->permute.data();
		table     = it->table.data();
		filterLen = it->filterLen;
		it->count++;
		return;
	}
	Element elem;
	elem.ratio = ratio;
	elem.count = 1;
	elem.permute = PermuteTable(HALF_TAB_LEN);
	elem.table = calcTable(ratio, elem.permute.data(), elem.filterLen);
	permute   = elem.permute.data();
	table     = elem.table.data();
	filterLen = elem.filterLen;
	cache.push_back(std::move(elem));
}

void ResampleCoeffs::releaseCoeffs(double ratio)
{
	auto it = rfind_if_unguarded(cache,
		[=](const Element& e) { return e.ratio == ratio; });
	it->count--;
	if (it->count == 0) {
		move_pop_back(cache, it);
	}
}
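
// Usage sketch (illustrative only; this is exactly the pattern that the
// ResampleHQ constructor/destructor follow further below):
//
//	int16_t* permute; float* table; unsigned filterLen;
//	ResampleCoeffs::instance().getCoeffs(ratio, permute, table, filterLen);
//	// ... use 'permute' / 'table' / 'filterLen' while resampling ...
//	ResampleCoeffs::instance().releaseCoeffs(ratio); // same 'ratio' as acquired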

// -- Permutation stuff --
//
// The rows in the resample coefficient table are not visited sequentially.
// Instead, depending on the resample-ratio, we take fixed non-integer jumps
// from one row to the next.
//
// In reality the table has 4096 rows (of which only 2048 are actually stored).
// But for simplicity I'll here work out examples for a table with only 16 rows
// (of which 8 are stored).
//
// Let's first assume a jump of '5.2'. This means that after we've used row
// 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
// integers, so a jump of 5.2 actually means that 80% of the time we advance 5
// rows and 20% of the time we advance 6 rows.
//
// The rows in the (full) table are circular. This means that once we're past
// row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
// (modulo arithmetic). We also only store the 1st half of the table, the
// entries for the 2nd half are 'folded' back to the 1st half according to the
// formula: y = 15 - x.
//
// Let's now calculate the possible transitions. If we're currently on row '0',
// the next row will be either '5' (80% chance) or row '6' (20% chance). When
// we're on row '5' the next most likely row will be '10', but after folding
// '10' becomes '15 - 10 = 5' (so 5 goes to itself (80% chance)). Row '10' most
// likely goes to '15', after folding we get that '5' goes to '0'. Row '15'
// most likely goes to '20', and after wrapping and folding that becomes '0'
// goes to '4'. Calculating this for all rows gives:
//   0 -> 5 or 4 (80%)      0 -> 6 or 5 (20%)
//   1 -> 6 or 3            1 -> 7 or 4
//   2 -> 7 or 2            2 -> 7 or 3
//   3 -> 7 or 1            3 -> 6 or 2
//   4 -> 6 or 0            4 -> 5 or 1
//   5 -> 5 or 0            5 -> 4 or 0
//   6 -> 4 or 1            6 -> 3 or 0
//   7 -> 3 or 2            7 -> 2 or 1
// So every row has 4 possible successors (2 more and 2 less likely). Possibly
// some of these 4 are the same, or even the same as the starting row. Note
// that if row x goes to row y (x->y) then also y->x; this turns out to be true
// in general.
//
// For cache efficiency it's best if rows that are needed one after another in
// time are also stored sequentially in memory (either before or after is fine).
// Clearly storing the rows in numeric order will not read the memory
// sequentially. For this specific example we could store the rows in the
// order:
//   2, 7, 3, 1, 6, 4, 0, 5
// With this order all likely transitions are sequential. The less likely
// transitions are not. But I don't believe there exists an order that's good
// for both the likely and the unlikely transitions. Do let me know if I'm
// wrong.
//
// In this example the transitions form a single chain (it turns out this is
// often the case). But for example for a step-size of 4.3 we get
//   0 -> 4 or 3 (70%)      0 -> 5 or 4 (30%)
//   1 -> 5 or 2            1 -> 6 or 3
//   2 -> 6 or 1            2 -> 7 or 2
//   3 -> 7 or 0            3 -> 7 or 1
//   4 -> 7 or 0            4 -> 6 or 0
//   5 -> 6 or 1            5 -> 5 or 0
//   6 -> 5 or 2            6 -> 4 or 1
//   7 -> 4 or 3            7 -> 3 or 2
// Only looking at the more likely transitions, we get 2 cycles of length 4:
//   0, 4, 7, 3
//   1, 5, 6, 2
//
// So the previous example gave a single chain with 2 clear end-points. Now we
// have 2 separate cycles. It turns out that for any possible step-size we
// either get a single chain or k cycles of size N/k. (So e.g. a chain of
// length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
// plus 2 cycles of length 2 is impossible.) To be honest I've only partially
// mathematically proven this, but at least I've verified it for N=16 and
// N=4096 for all possible step-sizes.
//
// To linearize a chain in memory there are only 2 (good) possibilities: start
// at either end-point. But to store a cycle any point is as good as any other.
// Also the order in which to store the cycles themselves can still be chosen.
//
// Let's come back to the example with step-size 4.3. If we linearize this as
//   | 0, 4, 7, 3 | 1, 5, 6, 2 |
// then most of the more likely transitions are sequential. The exceptions are
//   0 <-> 3  and  1 <-> 2
// but those are unavoidable with cycles. In return 2 of the less likely
// transitions '3 <-> 1' are now sequential. I believe this is the best
// possible linearization (better said: there are other linearizations that are
// equally good, but none is better). But do let me know if you find a better
// one!
//
// For step-size '8.4' an optimal(?) linearization seems to be
//   | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
// For step-size '7.9' the order is:
//   | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
// And for step-size '3.8':
//   | 7, 4, 0, 3 | 6, 5, 1, 2 |
//
// I've again not (fully) mathematically proven it, but it seems we can
// optimally(?) linearize cycles by:
//  * if likely step < unlikely step:
//      pick unassigned rows from 0 to N/2-1, and complete each cycle
//  * if likely step > unlikely step:
//      pick unassigned rows from N/2-1 to 0, and complete each cycle
//
// The routine calcPermute() below calculates these optimal(?) linearizations.
// More in detail it calculates a permutation table: the i-th element in this
// table tells where in memory the i-th logical row of the original (half)
// resample coefficient table is physically stored.

constexpr unsigned N  = TAB_LEN;
constexpr unsigned N1 = N - 1;
constexpr unsigned N2 = N / 2;

static constexpr unsigned mapIdx(unsigned x)
{
	unsigned t = x & N1;          // first wrap
	return (t < N2) ? t : N1 - t; // then fold
}

static constexpr std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
{
	return {mapIdx(x + step), mapIdx(N1 - x + step)};
}
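
// Illustration of the wrap + fold mapping (with the real table size N = 4096,
// N1 = 4095, N2 = 2048); these are purely informative sanity checks:
static_assert(mapIdx(5000) ==  904); // 5000 & 4095 = 904, already in the 1st half
static_assert(mapIdx(3000) == 1095); // 3000 is in the 2nd half -> folded to 4095 - 3000
// next(x, step) returns the two possible successor rows of stored row 'x': the
// first element is the successor when the current logical row is 'x' itself,
// the second when it is the folded row 'N1 - x'.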

static void calcPermute(double ratio, int16_t* permute)
{
	double r2 = ratio * N;
	double fract = r2 - floor(r2);
	unsigned step = floor(r2);
	bool incr = [&] {
		if (fract > 0.5) {
			// mostly (> 50%) take steps of 'floor(r2) + 1'
			step += 1;
			return false; // assign from high to low
		} else {
			// mostly take steps of 'floor(r2)'
			return true; // assign from low to high
		}
	}();

	// initially mark all rows as unassigned
	std::fill_n(permute, N2, -1);

	unsigned nxt1, nxt2;
	unsigned restart = incr ? 0 : N2 - 1;
	unsigned curr = restart;
	// check for a chain (instead of cycles)
	if (incr) {
		for (auto i : xrange(N2)) {
			std::tie(nxt1, nxt2) = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	} else {
		for (unsigned i = N2 - 1; int(i) >= 0; --i) {
			std::tie(nxt1, nxt2) = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	}

	// assign all rows (in the chain or cycle(s))
	unsigned cnt = 0;
	while (true) {
		assert(permute[curr] == -1);
		assert(cnt < N2);
		permute[curr] = cnt++;

		std::tie(nxt1, nxt2) = next(curr, step);
		if (permute[nxt1] == -1) {
			curr = nxt1;
			continue;
		} else if (permute[nxt2] == -1) {
			curr = nxt2;
			continue;
		}

		// finished the chain or current cycle
		if (cnt == N2) break; // done

		// continue with the next cycle
		while (permute[restart] != -1) {
			if (incr) {
				++restart;
				assert(restart != N2);
			} else {
				assert(restart != 0);
				--restart;
			}
		}
		curr = restart;
	}

#ifdef DEBUG
	int16_t testPerm[N2];
	ranges::iota(testPerm, 0);
	assert(std::is_permutation(permute, permute + N2, testPerm));
#endif
}

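// Linearly interpolate between two adjacent entries of 'coeffs', using the
// fractional part of the fixed-point filter index.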
static constexpr double getCoeff(FilterIndex index)
{
	double fraction = index.fractionAsDouble();
	int indx = index.toInt();
	return double(coeffs[indx]) +
	       fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
}

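// Build the permuted coefficient table for the given resample ratio:
// HALF_TAB_LEN rows of 'filterLen' coefficients each. Row 't' contains the
// filter taps for fractional position (t + 0.5) / TAB_LEN and is stored at
// physical row permute[t]. For downsampling (ratio > 1) the filter is
// stretched (floatIncr < INDEX_INC) and amplitude-scaled by 'normFactor'
// accordingly.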
ResampleCoeffs::Table ResampleCoeffs::calcTable(
	double ratio, int16_t* permute, unsigned& filterLen)
{
	calcPermute(ratio, permute);

	double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
	double normFactor = floatIncr / INDEX_INC;
	auto increment = FilterIndex(floatIncr);
	FilterIndex maxFilterIndex(COEFF_HALF_LEN);

	int min_idx = -maxFilterIndex.divAsInt(increment);
	int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
	int idx_cnt = max_idx - min_idx + 1;
	filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
	min_idx -= (filterLen - idx_cnt) / 2;
	Table table(HALF_TAB_LEN * filterLen);
	memset(table.data(), 0, HALF_TAB_LEN * filterLen * sizeof(float));

	for (auto t : xrange(HALF_TAB_LEN)) {
		float* tab = &table[permute[t] * filterLen];
		double lastPos = (double(t) + 0.5) / TAB_LEN;
		FilterIndex startFilterIndex(lastPos * floatIncr);

		FilterIndex filterIndex(startFilterIndex);
		int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		int bufIndex = -coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex += 1;
		} while (filterIndex >= FilterIndex(0));

		filterIndex = increment - startFilterIndex;
		coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		bufIndex = 1 + coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex -= 1;
		} while (filterIndex > FilterIndex(0));
	}
	return table;
}


template<unsigned CHANNELS>
ResampleHQ<CHANNELS>::ResampleHQ(
	ResampledSoundDevice& input_, const DynamicClock& hostClock_)
	: ResampleAlgo(input_)
	, hostClock(hostClock_)
	, ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
{
	ResampleCoeffs::instance().getCoeffs(double(ratio), permute, table, filterLen);

	// fill the buffer with 'enough' zeros
	unsigned extra = int(filterLen + 1 + ratio + 1);
	bufStart = 0;
	bufEnd   = extra;
	nonzeroSamples = 0;
	unsigned initialSize = 4000; // buffer grows dynamically if this is too small
	buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
}

template<unsigned CHANNELS>
ResampleHQ<CHANNELS>::~ResampleHQ()
{
	ResampleCoeffs::instance().releaseCoeffs(double(ratio));
}

#ifdef __SSE2__
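// Compute the convolution of 'len' input samples with 'len' filter
// coefficients (a single dot product) and store the result in '*out'.
// With REVERSE == true the coefficient table is read backwards; this is used
// for the mirrored second half of the phase range (see calcOutput() below).
// 'tab_' must be 16-byte aligned and 'len' a multiple of 4.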
template<bool REVERSE>
static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	ptrdiff_t x = (len & ~7) * sizeof(float);
	assert((x % 32) == 0);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_) + (REVERSE ? -x : x);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 t0, t1;
		if constexpr (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 16));
			t1 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 32));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x +  0));
			t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		x += 2 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
		__m128 t0;
		if constexpr (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		a0 = _mm_add_ps(a0, m0);
	}

	__m128 a = _mm_add_ps(a0, a1);
	// The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
	// intrinsic, but not worth the trouble.
	__m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
	__m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));

	_mm_store_ss(out, s);
}

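// Reorder the 4 floats in an SSE register according to the immediate 'N'
// (same encoding as _MM_SHUFFLE). calcSseStereo() uses this to duplicate each
// filter tap for both stereo channels: shuffle<0x50> gives [x0,x0,x1,x1] and
// shuffle<0xFA> gives [x2,x2,x3,x3].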
template<int N> static inline __m128 shuffle(__m128 x)
{
	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
}

template<bool REVERSE>
static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	ptrdiff_t x = 2 * (len & ~7) * sizeof(float);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	__m128 a2 = _mm_setzero_ps();
	__m128 a3 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 32));
		__m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 48));
		__m128 ta, tb;
		if constexpr (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
			tb = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 32));
			tab -= 2 * sizeof(__m128);
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab +  0));
			tb = _mm_load_ps (reinterpret_cast<const float*>(tab + 16));
			tab += 2 * sizeof(__m128);
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 t2 = shuffle<0x50>(tb);
		__m128 t3 = shuffle<0xFA>(tb);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		__m128 m2 = _mm_mul_ps(b2, t2);
		__m128 m3 = _mm_mul_ps(b3, t3);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		a2 = _mm_add_ps(a2, m2);
		a3 = _mm_add_ps(a3, m3);
		x += 4 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
		__m128 ta;
		if constexpr (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab +  0));
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
	}

	__m128 a01 = _mm_add_ps(a0, a1);
	__m128 a23 = _mm_add_ps(a2, a3);
	__m128 a   = _mm_add_ps(a01, a23);
	// This can be slightly faster with SSE3, but (like above) not worth the trouble.
	__m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
	_mm_store_ss(&out[0], s);
	_mm_store_ss(&out[1], shuffle<0x55>(s));
}

#endif

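// Compute one output sample (one float per channel) for fractional input
// position 'pos': convolve 'filterLen' buffered input samples with the
// coefficient row that corresponds to the fractional part of 'pos'. The first
// half of the phase range reads a (permuted) row forwards, the second half
// reuses the mirrored row and reads it backwards.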
template<unsigned CHANNELS>
void ResampleHQ<CHANNELS>::calcOutput(
	float pos, float* __restrict output)
{
	assert((filterLen & 3) == 0);

	int bufIdx = int(pos) + bufStart;
	assert((bufIdx + filterLen) <= bufEnd);
	bufIdx *= CHANNELS;
	const float* buf = &buffer[bufIdx];

	int t = unsigned(lrintf(pos * TAB_LEN)) % TAB_LEN;
	if (!(t & HALF_TAB_LEN)) {
		// first half, begin of row 't'
		t = permute[t];
		const float* tab = &table[t * filterLen];

#ifdef __SSE2__
		if constexpr (CHANNELS == 1) {
			calcSseMono  <false>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<false>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (auto ch : xrange(CHANNELS)) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (unsigned i = 0; i < filterLen; i += 4) {
				r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
				r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
				r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
				r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	} else {
		// 2nd half, end of row 'TAB_LEN - 1 - t'
		t = permute[TAB_LEN - 1 - t];
		const float* tab = &table[(t + 1) * filterLen];

#ifdef __SSE2__
		if constexpr (CHANNELS == 1) {
			calcSseMono  <true>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<true>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (auto ch : xrange(CHANNELS)) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (int i = 0; i < int(filterLen); i += 4) {
				r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
				r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
				r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
				r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	}
}

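// Fetch 'emuNum' new input samples from the sound device and append them to
// the internal buffer, compacting (or growing) the buffer first when there is
// not enough room at the end. 'nonzeroSamples' is updated so that completely
// silent stretches can later skip the convolution entirely.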
template<unsigned CHANNELS>
void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
{
	// Still enough free space at the end of the buffer?
	unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
	if (free < emuNum) {
		// No, then move everything to the start
		// (data needs to be in a contiguous memory block)
		unsigned available = bufEnd - bufStart;
		memmove(&buffer[0], &buffer[bufStart * CHANNELS],
		        available * CHANNELS * sizeof(float));
		bufStart = 0;
		bufEnd = available;

		free = unsigned(buffer.size() / CHANNELS) - bufEnd;
		int missing = emuNum - free;
		if (unlikely(missing > 0)) {
			// Still not enough room: grow the buffer.
			// TODO An alternative is, instead of using one large
			// buffer, to chop the work into multiple smaller
			// pieces. That may have the advantage that the data
			// fits in the CPU's data cache. OTOH too small chunks
			// have more overhead. (Not yet implemented because
			// it's more complex.)
			buffer.resize(buffer.size() + missing * CHANNELS);
		}
	}
	VLA_SSE_ALIGNED(float, tmpBuf, emuNum * CHANNELS + 3);
	if (input.generateInput(tmpBuf, emuNum)) {
		memcpy(&buffer[bufEnd * CHANNELS], tmpBuf,
		       emuNum * CHANNELS * sizeof(float));
		bufEnd += emuNum;
		nonzeroSamples = bufEnd - bufStart;
	} else {
		memset(&buffer[bufEnd * CHANNELS], 0,
		       emuNum * CHANNELS * sizeof(float));
		bufEnd += emuNum;
	}

	assert(bufStart <= bufEnd);
	assert(bufEnd <= (buffer.size() / CHANNELS));
}

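// Generate 'hostNum' output samples up to 'time'. First feed the required
// emulated-rate input samples into the buffer, then (unless everything is
// silent) compute each output sample at fractional input position 'pos',
// advancing 'pos' by 'ratio' for every host sample.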
template<unsigned CHANNELS>
bool ResampleHQ<CHANNELS>::generateOutputImpl(
	float* __restrict dataOut, unsigned hostNum, EmuTime::param time)
{
	auto& emuClk = getEmuClock();
	unsigned emuNum = emuClk.getTicksTill(time);
	if (emuNum > 0) {
		prepareData(emuNum);
	}

	bool notMuted = nonzeroSamples > 0;
	if (notMuted) {
		// main processing loop
		EmuTime host1 = hostClock.getFastAdd(1);
		assert(host1 > emuClk.getTime());
		float pos = emuClk.getTicksTillDouble(host1);
		assert(pos <= (ratio + 2));
		for (auto i : xrange(hostNum)) {
			calcOutput(pos, &dataOut[i * CHANNELS]);
			pos += ratio;
		}
	}
	emuClk += emuNum;
	bufStart += emuNum;
	nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);

	assert(bufStart <= bufEnd);
	unsigned available = bufEnd - bufStart;
	unsigned extra = int(filterLen + 1 + ratio + 1);
	assert(available == extra); (void)available; (void)extra;

	return notMuted;
}

// Force template instantiation.
template class ResampleHQ<1>;
template class ResampleHQ<2>;

} // namespace openmsx
Definition: zstring_view.hh:83