openMSX
ResampleHQ.cc
// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
//
// simplified code in several ways:
//  - resample algorithm is no longer switchable, we took this variant:
//    'Band limited sinc interpolation, fastest, 97dB SNR, 80% BW'
//  - don't allow changing the sample rate on-the-fly
//  - assume input (and thus also output) signals have infinite length, so
//    there is no special code to handle the ending of the signal
//  - changed/simplified API to better match the openMSX use model
//    (e.g. removed all error checking)

#include "ResampleHQ.hh"
#include "ResampledSoundDevice.hh"
#include "FixedPoint.hh"
#include "MemBuffer.hh"
#include "likely.hh"
#include "ranges.hh"
#include "stl.hh"
#include "vla.hh"
#include "build-info.hh"
#include <algorithm> // std::max, std::is_permutation
#include <tuple>     // std::tie
#include <utility>   // std::pair, std::move
#include <vector>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <cassert>
#include <iterator>
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

// Note: without appending 'f' to the values in ResampleCoeffs.ii,
// this will generate thousands of C4305 warnings in VC++
// E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
constexpr float coeffs[] = {
	#include "ResampleCoeffs.ii"
};

using FilterIndex = FixedPoint<16>;

constexpr int INDEX_INC = 128;
constexpr int COEFF_LEN = std::size(coeffs);
constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
constexpr unsigned TAB_LEN = 4096;
constexpr unsigned HALF_TAB_LEN = TAB_LEN / 2;

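// ResampleCoeffs is a ref-counted, process-wide cache of coefficient tables,
// shared between all ResampleHQ instances that resample with the same ratio.
// Rough usage sketch (this mirrors what the ResampleHQ constructor/destructor
// below actually do; it is an illustration, not extra API):
//
//     int16_t* permute; float* table; unsigned filterLen;
//     ResampleCoeffs::instance().getCoeffs(ratio, permute, table, filterLen);
//     // ... use 'table' (rows of 'filterLen' floats) and 'permute' ...
//     ResampleCoeffs::instance().releaseCoeffs(ratio); // drop the reference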
class ResampleCoeffs
{
public:
	static ResampleCoeffs& instance();
	void getCoeffs(double ratio, int16_t*& permute, float*& table, unsigned& filterLen);
	void releaseCoeffs(double ratio);

private:
	using Table = MemBuffer<float, SSE2_ALIGNMENT>;
	using PermuteTable = MemBuffer<int16_t>;

	ResampleCoeffs() = default;
	~ResampleCoeffs();

	Table calcTable(double ratio, int16_t* permute, unsigned& filterLen);

	struct Element {
		double ratio;
		PermuteTable permute;
		Table table;
		unsigned filterLen;
		unsigned count;
	};
	std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
};

ResampleCoeffs::~ResampleCoeffs()
{
	assert(cache.empty());
}

ResampleCoeffs& ResampleCoeffs::instance()
{
	static ResampleCoeffs resampleCoeffs;
	return resampleCoeffs;
}

void ResampleCoeffs::getCoeffs(
	double ratio, int16_t*& permute, float*& table, unsigned& filterLen)
{
	auto it = ranges::find_if(cache, [=](auto& e) { return e.ratio == ratio; });
	if (it != end(cache)) {
		permute   = it->permute.data();
		table     = it->table.data();
		filterLen = it->filterLen;
		it->count++;
		return;
	}
	Element elem;
	elem.ratio = ratio;
	elem.count = 1;
	elem.permute = PermuteTable(HALF_TAB_LEN);
	elem.table = calcTable(ratio, elem.permute.data(), elem.filterLen);
	permute   = elem.permute.data();
	table     = elem.table.data();
	filterLen = elem.filterLen;
	cache.push_back(std::move(elem));
}

void ResampleCoeffs::releaseCoeffs(double ratio)
{
	auto it = rfind_if_unguarded(cache,
		[=](const Element& e) { return e.ratio == ratio; });
	it->count--;
	if (it->count == 0) {
		move_pop_back(cache, it);
	}
}

// -- Permutation stuff --
//
// The rows in the resample coefficient table are not visited sequentially.
// Instead, depending on the resample-ratio, we take fixed non-integer jumps
// from one row to the next.
//
// In reality the table has 4096 rows (of which only 2048 are actually stored).
// But for simplicity I'll here work out examples for a table with only 16 rows
// (of which 8 are stored).
//
// Let's first assume a jump of '5.2'. This means that after we've used row
// 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
// integers, so a jump of 5.2 actually means that 80% of the time we advance 5
// rows and 20% of the time we advance 6 rows.
//
// The rows in the (full) table are circular. This means that once we're past
// row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
// (modulo arithmetic). We also only store the 1st half of the table, the
// entries for the 2nd half are 'folded' back to the 1st half according to the
// formula: y = 15 - x.
//
// Let's now calculate the possible transitions. If we're currently on row '0',
// the next row will be either '5' (80% chance) or row '6' (20% chance). When
// we're on row '5' the next most likely row will be '10', but after folding
// '10' becomes '15-10 = 5' (so 5 goes to itself (80% chance)). Row '10' most
// likely goes to '15', after folding we get that '5' goes to '0'. Row '15'
// most likely goes to '20', and after wrapping and folding that becomes '0'
// goes to '4'. Calculating this for all rows gives:
//   0 -> 5 or 4  (80%)      0 -> 6 or 5  (20%)
//   1 -> 6 or 3             1 -> 7 or 4
//   2 -> 7 or 2             2 -> 7 or 3
//   3 -> 7 or 1             3 -> 6 or 2
//   4 -> 6 or 0             4 -> 5 or 1
//   5 -> 5 or 0             5 -> 4 or 0
//   6 -> 4 or 1             6 -> 3 or 0
//   7 -> 3 or 2             7 -> 2 or 1
// So every row has 4 possible successors (2 more and 2 less likely). Possibly
// some of these 4 are the same, or even the same as the starting row. Note
// that if row x goes to row y (x->y) then also y->x, this turns out to be true
// in general.
//
// For cache efficiency it's best if rows that are needed after each other in
// time are also stored sequentially in memory (both before or after is fine).
// Clearly storing the rows in numeric order will not read the memory
// sequentially. For this specific example we could store the rows in the
// order:
//   2, 7, 3, 1, 6, 4, 0, 5
// With this order all likely transitions are sequential. The less likely
// transitions are not. But I don't believe there exists an order that's good
// for both the likely and the unlikely transitions. Do let me know if I'm
// wrong.
//
// In this example the transitions form a single chain (it turns out this is
// often the case). But for example for a step-size of 4.3 we get
//   0 -> 4 or 3  (70%)      0 -> 5 or 4  (30%)
//   1 -> 5 or 2             1 -> 6 or 3
//   2 -> 6 or 1             2 -> 7 or 2
//   3 -> 7 or 0             3 -> 7 or 1
//   4 -> 7 or 0             4 -> 6 or 0
//   5 -> 6 or 1             5 -> 5 or 0
//   6 -> 5 or 2             6 -> 4 or 1
//   7 -> 4 or 3             7 -> 3 or 2
// Only looking at the more likely transitions, we get 2 cycles of length 4:
//   0, 4, 7, 3
//   1, 5, 6, 2
//
// So the previous example gave a single chain with 2 clear end-points. Now we
// have 2 separate cycles. It turns out that for any possible step-size we
// either get a single chain or k cycles of size N/k. (So e.g. a chain of
// length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
// plus 2 cycles of length 2 is impossible). To be honest I've only partially
// mathematically proven this, but at least I've verified it for N=16 and
// N=4096 for all possible step-sizes.
//
// To linearise a chain in memory there are only 2 (good) possibilities: start
// at either end-point. But to store a cycle any point is as good as any other.
// Also the order in which to store the cycles themselves can still be chosen.
//
// Let's come back to the example with step-size 4.3. If we linearise this as
//   | 0, 4, 7, 3 | 1, 5, 6, 2 |
// then most of the more likely transitions are sequential. The exceptions are
//   0 <-> 3  and  1 <-> 2
// but those are unavoidable with cycles. In return 2 of the less likely
// transitions '3 <-> 1' are now sequential. I believe this is the best
// possible linearization (better said: there are other linearizations that are
// equally good, but none is better). But do let me know if you find a better
// one!
//
// For step-size '8.4' an optimal(?) linearization seems to be
//   | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
// For step-size '7.9' the order is:
//   | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
// And for step-size '3.8':
//   | 7, 4, 0, 3 | 6, 5, 1, 2 |
//
// I've again not (fully) mathematically proven it, but it seems we can
// optimally(?) linearise cycles by:
//  * if likely step < unlikely step:
//      pick unassigned rows from 0 to N/2-1, and complete each cycle
//  * if likely step > unlikely step:
//      pick unassigned rows from N/2-1 to 0, and complete each cycle
//
// The routine calcPermute() below calculates these optimal(?) linearizations.
// More in detail it calculates a permutation table: the i-th element in this
// table tells where in memory the i-th logical row of the original (half)
// resample coefficient table is physically stored. (A small worked example of
// such a permutation table follows below.)

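// Worked example of such a permutation table (illustration only, using the
// 16-row table with step-size 5.2 from above, stored in the order
// 2, 7, 3, 1, 6, 4, 0, 5):
//     permute[0]=6  permute[1]=3  permute[2]=0  permute[3]=2
//     permute[4]=5  permute[5]=7  permute[6]=4  permute[7]=1
// so code that needs logical row 'r' of the (half) table reads it at
//     const float* row = &table[permute[r] * filterLen];
// exactly like calcTable() and calcOutput() below do. Note that calcPermute()
// may start the chain at a different (equally good) point, so the concrete
// numbers can differ.
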
constexpr unsigned N  = TAB_LEN;
constexpr unsigned N1 = N - 1;
constexpr unsigned N2 = N / 2;

static unsigned mapIdx(unsigned x)
{
	unsigned t = x & N1; // first wrap
	return (t < N2) ? t : N1 - t; // then fold
}

static std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
{
	return {mapIdx(x + step), mapIdx(N1 - x + step)};
}

static void calcPermute(double ratio, int16_t* permute)
{
	double r2 = ratio * N;
	double fract = r2 - floor(r2);
	unsigned step = floor(r2);
	bool incr;
	if (fract > 0.5) {
		// mostly (> 50%) take steps of 'floor(r2) + 1'
		step += 1;
		incr = false; // assign from high to low
	} else {
		// mostly take steps of 'floor(r2)'
		incr = true; // assign from low to high
	}

	// initially set all as unassigned
	for (unsigned i = 0; i < N2; ++i) {
		permute[i] = -1;
	}

	unsigned nxt1, nxt2;
	unsigned restart = incr ? 0 : N2 - 1;
	unsigned curr = restart;
	// check for chain (instead of cycles)
	if (incr) {
		for (unsigned i = 0; i < N2; ++i) {
			std::tie(nxt1, nxt2) = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	} else {
		for (unsigned i = N2 - 1; int(i) >= 0; --i) {
			std::tie(nxt1, nxt2) = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	}

	// assign all rows (in the chain or cycle(s))
	unsigned cnt = 0;
	while (true) {
		assert(permute[curr] == -1);
		assert(cnt < N2);
		permute[curr] = cnt++;

		std::tie(nxt1, nxt2) = next(curr, step);
		if (permute[nxt1] == -1) {
			curr = nxt1;
			continue;
		} else if (permute[nxt2] == -1) {
			curr = nxt2;
			continue;
		}

		// finished chain or cycle
		if (cnt == N2) break; // done

		// continue with next cycle
		while (permute[restart] != -1) {
			if (incr) {
				++restart;
				assert(restart != N2);
			} else {
				assert(restart != 0);
				--restart;
			}
		}
		curr = restart;
	}

#ifdef DEBUG
	int16_t testPerm[N2];
	for (unsigned i = 0; i < N2; ++i) testPerm[i] = i;
	assert(std::is_permutation(permute, permute + N2, testPerm));
#endif
}

// Linearly interpolate between two adjacent entries of the raw coefficient
// array.
static double getCoeff(FilterIndex index)
{
	double fraction = index.fractionAsDouble();
	int indx = index.toInt();
	return double(coeffs[indx]) +
	       fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
}

// Pre-compute the coefficient table for the given ratio: for each of the
// HALF_TAB_LEN fractional positions one row of 'filterLen' coefficients,
// stored at the location given by the permutation table.
ResampleCoeffs::Table ResampleCoeffs::calcTable(
	double ratio, int16_t* permute, unsigned& filterLen)
{
	calcPermute(ratio, permute);

	double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
	double normFactor = floatIncr / INDEX_INC;
	FilterIndex increment = FilterIndex(floatIncr);
	FilterIndex maxFilterIndex(COEFF_HALF_LEN);

	int min_idx = -maxFilterIndex.divAsInt(increment);
	int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
	int idx_cnt = max_idx - min_idx + 1;
	filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
	min_idx -= (filterLen - idx_cnt) / 2;
	Table table(HALF_TAB_LEN * filterLen);
	memset(table.data(), 0, HALF_TAB_LEN * filterLen * sizeof(float));

	for (unsigned t = 0; t < HALF_TAB_LEN; ++t) {
		float* tab = &table[permute[t] * filterLen];
		double lastPos = (double(t) + 0.5) / TAB_LEN;
		FilterIndex startFilterIndex(lastPos * floatIncr);

		FilterIndex filterIndex(startFilterIndex);
		int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		int bufIndex = -coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex += 1;
		} while (filterIndex >= FilterIndex(0));

		filterIndex = increment - startFilterIndex;
		coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		bufIndex = 1 + coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex -= 1;
		} while (filterIndex > FilterIndex(0));
	}
	return table;
}


template <unsigned CHANNELS>
ResampleHQ<CHANNELS>::ResampleHQ(
		ResampledSoundDevice& input_, const DynamicClock& hostClock_)
	: ResampleAlgo(input_)
	, hostClock(hostClock_)
	, ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
{
	ResampleCoeffs::instance().getCoeffs(double(ratio), permute, table, filterLen);

	// fill buffer with 'enough' zeros
	unsigned extra = int(filterLen + 1 + ratio + 1);
	bufStart = 0;
	bufEnd = extra;
	nonzeroSamples = 0;
	unsigned initialSize = 4000; // buffer grows dynamically if this is too small
	buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
}

template <unsigned CHANNELS>
ResampleHQ<CHANNELS>::~ResampleHQ()
{
	ResampleCoeffs::instance().releaseCoeffs(double(ratio));
}

#ifdef __SSE2__
// Dot product of 'len' input samples with 'len' filter coefficients, using
// SSE2. The coefficient table must be 16-byte aligned; REVERSE=true reads the
// coefficients backwards (used for the mirrored 2nd half of the table).
template<bool REVERSE>
static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	ptrdiff_t x = (len & ~7) * sizeof(float);
	assert((x % 32) == 0);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_) + (REVERSE ? -x : x);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 t0, t1;
		if (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 16));
			t1 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 32));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x +  0));
			t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		x += 2 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
		__m128 t0;
		if (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		a0 = _mm_add_ps(a0, m0);
	}

	__m128 a = _mm_add_ps(a0, a1);
	// The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
	// intrinsic, but it's not worth the trouble.
	__m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
	__m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));

	_mm_store_ss(out, s);
}

template<int N> static inline __m128 shuffle(__m128 x)
{
	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
}

// Stereo variant: 'buf_' contains interleaved L/R samples, 'out' receives the
// two accumulated channel values.
template<bool REVERSE>
static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	ptrdiff_t x = 2 * (len & ~7) * sizeof(float);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	__m128 a2 = _mm_setzero_ps();
	__m128 a3 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 32));
		__m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 48));
		__m128 ta, tb;
		if (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
			tb = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 32));
			tab -= 2 * sizeof(__m128);
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab +  0));
			tb = _mm_load_ps (reinterpret_cast<const float*>(tab + 16));
			tab += 2 * sizeof(__m128);
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 t2 = shuffle<0x50>(tb);
		__m128 t3 = shuffle<0xFA>(tb);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		__m128 m2 = _mm_mul_ps(b2, t2);
		__m128 m3 = _mm_mul_ps(b3, t3);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		a2 = _mm_add_ps(a2, m2);
		a3 = _mm_add_ps(a3, m3);
		x += 4 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
		__m128 ta;
		if (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab +  0));
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
	}

	__m128 a01 = _mm_add_ps(a0, a1);
	__m128 a23 = _mm_add_ps(a2, a3);
	__m128 a   = _mm_add_ps(a01, a23);
	// Can be faster with SSE3, but (like above) not worth the trouble.
	__m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
	_mm_store_ss(&out[0], s);
	_mm_store_ss(&out[1], shuffle<0x55>(s));
}

#endif

// Calculate one output frame (CHANNELS samples) for fractional input
// position 'pos'.
template <unsigned CHANNELS>
void ResampleHQ<CHANNELS>::calcOutput(
	float pos, float* __restrict output)
{
	assert((filterLen & 3) == 0);

	int bufIdx = int(pos) + bufStart;
	assert((bufIdx + filterLen) <= bufEnd);
	bufIdx *= CHANNELS;
	const float* buf = &buffer[bufIdx];

	int t = unsigned(lrintf(pos * TAB_LEN)) % TAB_LEN;
	if (!(t & HALF_TAB_LEN)) {
		// first half, begin of row 't'
		t = permute[t];
		const float* tab = &table[t * filterLen];

#ifdef __SSE2__
		if (CHANNELS == 1) {
			calcSseMono  <false>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<false>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (unsigned ch = 0; ch < CHANNELS; ++ch) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (unsigned i = 0; i < filterLen; i += 4) {
				r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
				r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
				r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
				r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	} else {
		// 2nd half, end of row 'TAB_LEN - 1 - t'
		t = permute[TAB_LEN - 1 - t];
		const float* tab = &table[(t + 1) * filterLen];

#ifdef __SSE2__
		if (CHANNELS == 1) {
			calcSseMono  <true>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<true>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (unsigned ch = 0; ch < CHANNELS; ++ch) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (int i = 0; i < int(filterLen); i += 4) {
				r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
				r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
				r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
				r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	}
}

template <unsigned CHANNELS>
void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
{
	// Still enough free space at end of buffer?
	unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
	if (free < emuNum) {
		// No, then move everything to the start
		// (data needs to be in a contiguous memory block)
		unsigned available = bufEnd - bufStart;
		memmove(&buffer[0], &buffer[bufStart * CHANNELS],
		        available * CHANNELS * sizeof(float));
		bufStart = 0;
		bufEnd = available;

		free = unsigned(buffer.size() / CHANNELS) - bufEnd;
		int missing = emuNum - free;
		if (unlikely(missing > 0)) {
			// Still not enough room: grow the buffer.
			// TODO an alternative is, instead of using a large
			// buffer, to chop the work into multiple smaller
			// pieces. That may have the advantage that the data
			// fits in the CPU's data cache. OTOH too small chunks
			// have more overhead. (Not yet implemented because
			// it's more complex.)
			buffer.resize(buffer.size() + missing * CHANNELS);
		}
	}
	VLA_SSE_ALIGNED(float, tmpBuf, emuNum * CHANNELS + 3);
	if (input.generateInput(tmpBuf, emuNum)) {
		memcpy(&buffer[bufEnd * CHANNELS], tmpBuf,
		       emuNum * CHANNELS * sizeof(float));
		bufEnd += emuNum;
		nonzeroSamples = bufEnd - bufStart;
	} else {
		memset(&buffer[bufEnd * CHANNELS], 0,
		       emuNum * CHANNELS * sizeof(float));
		bufEnd += emuNum;
	}

	assert(bufStart <= bufEnd);
	assert(bufEnd <= (buffer.size() / CHANNELS));
}

template <unsigned CHANNELS>
bool ResampleHQ<CHANNELS>::generateOutputImpl(
	float* __restrict dataOut, unsigned hostNum, EmuTime::param time)
{
	auto& emuClk = getEmuClock();
	unsigned emuNum = emuClk.getTicksTill(time);
	if (emuNum > 0) {
		prepareData(emuNum);
	}

	bool notMuted = nonzeroSamples > 0;
	if (notMuted) {
		// main processing loop
		EmuTime host1 = hostClock.getFastAdd(1);
		assert(host1 > emuClk.getTime());
		float pos = emuClk.getTicksTillDouble(host1);
		assert(pos <= (ratio + 2));
		for (unsigned i = 0; i < hostNum; ++i) {
			calcOutput(pos, &dataOut[i * CHANNELS]);
			pos += ratio;
		}
	}
	emuClk += emuNum;
	bufStart += emuNum;
	nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);

	assert(bufStart <= bufEnd);
	unsigned available = bufEnd - bufStart;
	unsigned extra = int(filterLen + 1 + ratio + 1);
	assert(available == extra); (void)available; (void)extra;

	return notMuted;
}

// Force template instantiation.
template class ResampleHQ<1>;
template class ResampleHQ<2>;

} // namespace openmsx