openMSX
ResampleHQ.cc
Go to the documentation of this file.
1// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
2//
3// simplified code in several ways:
4// - resample algorithm is no longer switchable, we took this variant:
5// Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6// - don't allow to change sample rate on-the-fly
7// - assume input (and thus also output) signals have infinite length, so
8// there is no special code to handle the ending of the signal
9// - changed/simplified API to better match openmsx use model
10// (e.g. remove all error checking)
11
12#include "ResampleHQ.hh"
14#include "FixedPoint.hh"
15#include "MemBuffer.hh"
16#include "aligned.hh"
17#include "ranges.hh"
18#include "stl.hh"
19#include "vla.hh"
20#include "xrange.hh"
21#include <vector>
22#include <cmath>
23#include <cstddef>
24#include <cstring>
25#include <cassert>
26#include <iterator>
27#ifdef __SSE2__
28#include <emmintrin.h>
29#endif
30
31namespace openmsx {
32
33// Note: without appending 'f' to the values in ResampleCoeffs.ii,
34// this will generate thousands of C4305 warnings in VC++
35// E.g. warning C4305: 'initializing' : truncation from 'double' to 'const float'
36constexpr float coeffs[] = {
37 #include "ResampleCoeffs.ii"
38};
39
41
42constexpr int INDEX_INC = 128;
43constexpr int COEFF_LEN = int(std::size(coeffs));
44constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
45constexpr unsigned TAB_LEN = 4096;
46constexpr unsigned HALF_TAB_LEN = TAB_LEN / 2;
47
49{
50public:
53
54 static ResampleCoeffs& instance();
55 void getCoeffs(double ratio, int16_t*& permute, float*& table, unsigned& filterLen);
56 void releaseCoeffs(double ratio);
57
58private:
61
62 ResampleCoeffs() = default;
64
65 static Table calcTable(double ratio, int16_t* permute, unsigned& filterLen);
66
67 struct Element {
68 double ratio;
69 PermuteTable permute;
71 unsigned filterLen;
72 unsigned count;
73 };
74 std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
75};
76
77ResampleCoeffs::~ResampleCoeffs()
78{
79 assert(cache.empty());
80}
81
83{
84 static ResampleCoeffs resampleCoeffs;
85 return resampleCoeffs;
86}
87
89 double ratio, int16_t*& permute, float*& table, unsigned& filterLen)
90{
91 if (auto it = ranges::find(cache, ratio, &Element::ratio);
92 it != end(cache)) {
93 permute = it->permute.data();
94 table = it->table.data();
95 filterLen = it->filterLen;
96 it->count++;
97 return;
98 }
99 Element elem;
100 elem.ratio = ratio;
101 elem.count = 1;
102 elem.permute = PermuteTable(HALF_TAB_LEN);
103 elem.table = calcTable(ratio, elem.permute.data(), elem.filterLen);
104 permute = elem.permute.data();
105 table = elem.table.data();
106 filterLen = elem.filterLen;
107 cache.push_back(std::move(elem));
108}
109
111{
112 auto it = rfind_unguarded(cache, ratio, &Element::ratio);
113 it->count--;
114 if (it->count == 0) {
115 move_pop_back(cache, it);
116 }
117}
118
119// -- Permutation stuff --
120//
121// The rows in the resample coefficient table are not visited sequentially.
122// Instead, depending on the resample-ratio, we take fixed non-integer jumps
123// from one row to the next.
124//
125// In reality the table has 4096 rows (of which only 2048 are actually stored).
126// But for simplicity I'll here work out examples for a table with only 16 rows
127// (of which 8 are stored).
128//
129// Let's first assume a jump of '5.2'. This means that after we've used row
130// 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
131// integers, so a jump of 5.2 actually means that 80% of the time we advance 5
132// rows and 20% of the time we advance 6 rows.
133//
134// The rows in the (full) table are circular. This means that once we're past
135// row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
136// (modulo arithmetic). We also only store the 1st half of the table, the
137// entries for the 2nd half are 'folded' back to the 1st half according to the
138// formula: y = 15 - x.
139//
140// Let's now calculate the possible transitions. If we're currently on row '0',
141// the next row will be either '5' (80% chance) or row '6' (20% chance). When
142// we're on row '5' the next most likely row will be '10', but after folding
143// '10' becomes '15-10 = 5' (so 5 goes to itself (80% chance)). Row '10' most
144// likely goes to '15', after folding we get that '5' goes to '0'. Row '15'
145// most likely goes to '20', and after wrapping and folding that becomes '0'
146// goes to '4'. Calculating this for all rows gives:
147// 0 -> 5 or 4 (80%) 0 -> 6 or 5 (20%)
148// 1 -> 6 or 3 1 -> 7 or 4
149// 2 -> 7 or 2 2 -> 7 or 3
150// 3 -> 7 or 1 3 -> 6 or 2
151// 4 -> 6 or 0 4 -> 5 or 1
152// 5 -> 5 or 0 5 -> 4 or 0
153// 6 -> 4 or 1 6 -> 3 or 0
154// 7 -> 3 or 2 7 -> 2 or 1
155// So every row has 4 possible successors (2 more and 2 less likely). Possibly
156// some of these 4 are the same, or even the same as the starting row. Note
157// that if row x goes to row y (x->y) then also y->x, this turns out to be true
158// in general.
159//
160// For cache efficiency it's best if rows that are needed after each other in
161// time are also stored sequentially in memory (both before or after is fine).
162// Clearly storing the rows in numeric order will not read the memory
163// sequentially. For this specific example we could stores the rows in the
164// order:
165// 2, 7, 3, 1, 6, 4, 0, 5
166// With this order all likely transitions are sequential. The less likely
167// transitions are not. But I don't believe there exists an order that's good
168// for both the likely and the unlikely transitions. Do let me know if I'm
169// wrong.
170//
171// In this example the transitions form a single chain (it turns out this is
172// often the case). But for example for a step-size of 4.3 we get
173// 0 -> 4 or 3 (70%) 0 -> 5 or 4 (30%)
174// 1 -> 5 or 2 1 -> 6 or 3
175// 2 -> 6 or 1 2 -> 7 or 2
176// 3 -> 7 or 0 3 -> 7 or 1
177// 4 -> 7 or 0 4 -> 6 or 0
178// 5 -> 6 or 1 5 -> 5 or 0
179// 6 -> 5 or 2 6 -> 4 or 1
180// 7 -> 4 or 3 7 -> 3 or 2
181// Only looking at the more likely transitions, we get 2 cycles of length 4:
182// 0, 4, 7, 3
183// 1, 5, 6, 2
184//
185// So the previous example gave a single chain with 2 clear end-points. Now we
186// have 2 separate cycles. It turns out that for any possible step-size we
187// either get a single chain or k cycles of size N/k. (So e.g. a chain of
188// length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
189// plus 2 cycles of length 2 is impossible). To be honest I've only partially
190// mathematically proven this, but at least I've verified it for N=16 and
191// N=4096 for all possible step-sizes.
192//
193// To linearise a chain in memory there are only 2 (good) possibilities: start
194// at either end-point. But to store a cycle any point is as good as any other.
195// Also the order in which to store the cycles themselves can still be chosen.
196//
197// Let's come back to the example with step-size 4.3. If we linearise this as
198// | 0, 4, 7, 3 | 1, 5, 6, 2 |
199// then most of the more likely transitions are sequential. The exceptions are
200// 0 <-> 3 and 1 <-> 2
201// but those are unavoidable with cycles. In return 2 of the less likely
202// transitions '3 <-> 1' are now sequential. I believe this is the best
203// possible linearization (better said: there are other linearizations that are
204// equally good, but none is better). But do let me know if you find a better
205// one!
206//
207// For step-size '8.4' an optimal(?) linearization seems to be
208// | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
209// For step-size '7.9' the order is:
210// | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
211// And for step-size '3.8':
212// | 7, 4, 0, 3 | 6, 5, 1, 2 |
213//
214// I've again not (fully) mathematically proven it, but it seems we can
215// optimally(?) linearise cycles by:
216// * if likely step < unlikely step:
217// pick unassigned rows from 0 to N/2-1, and complete each cycle
218// * if likely step > unlikely step:
219// pick unassigned rows from N/2-1 to 0, and complete each cycle
220//
221// The routine calcPermute() below calculates these optimal(?) linearizations.
222// More in detail it calculates a permutation table: the i-th element in this
223// table tells where in memory the i-th logical row of the original (half)
224// resample coefficient table is physically stored.
225
226constexpr unsigned N = TAB_LEN;
227constexpr unsigned N1 = N - 1;
228constexpr unsigned N2 = N / 2;
229
230static constexpr unsigned mapIdx(unsigned x)
231{
232 unsigned t = x & N1; // first wrap
233 return (t < N2) ? t : N1 - t; // then fold
234}
235
236static constexpr std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
237{
238 return {mapIdx(x + step), mapIdx(N1 - x + step)};
239}
240
241static void calcPermute(double ratio, int16_t* permute)
242{
243 double r2 = ratio * N;
244 double fract = r2 - floor(r2);
245 unsigned step = floor(r2);
246 bool incr = [&] {
247 if (fract > 0.5) {
248 // mostly (> 50%) take steps of 'floor(r2) + 1'
249 step += 1;
250 return false; // assign from high to low
251 } else {
252 // mostly take steps of 'floor(r2)'
253 return true; // assign from low to high
254 }
255 }();
256
257 // initially set all as unassigned
258 std::fill_n(permute, N2, -1);
259
260 unsigned restart = incr ? 0 : N2 - 1;
261 unsigned curr = restart;
262 // check for chain (instead of cycles)
263 if (incr) {
264 for (auto i : xrange(N2)) {
265 auto [nxt1, nxt2] = next(i, step);
266 if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
267 }
268 } else {
269 for (unsigned i = N2 - 1; int(i) >= 0; --i) {
270 auto [nxt1, nxt2] = next(i, step);
271 if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
272 }
273 }
274
275 // assign all rows (in chain of cycle(s))
276 unsigned cnt = 0;
277 while (true) {
278 assert(permute[curr] == -1);
279 assert(cnt < N2);
280 permute[curr] = cnt++;
281
282 auto [nxt1, nxt2] = next(curr, step);
283 if (permute[nxt1] == -1) {
284 curr = nxt1;
285 continue;
286 } else if (permute[nxt2] == -1) {
287 curr = nxt2;
288 continue;
289 }
290
291 // finished chain or cycle
292 if (cnt == N2) break; // done
293
294 // continue with next cycle
295 while (permute[restart] != -1) {
296 if (incr) {
297 ++restart;
298 assert(restart != N2);
299 } else {
300 assert(restart != 0);
301 --restart;
302 }
303 }
304 curr = restart;
305 }
306
307#ifdef DEBUG
308 int16_t testPerm[N2];
309 ranges::iota(testPerm, 0);
310 assert(std::is_permutation(permute, permute + N2, testPerm));
311#endif
312}
313
314static constexpr double getCoeff(FilterIndex index)
315{
316 double fraction = index.fractionAsDouble();
317 int indx = index.toInt();
318 return double(coeffs[indx]) +
319 fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
320}
321
322ResampleCoeffs::Table ResampleCoeffs::calcTable(
323 double ratio, int16_t* permute, unsigned& filterLen)
324{
325 calcPermute(ratio, permute);
326
327 double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
328 double normFactor = floatIncr / INDEX_INC;
329 auto increment = FilterIndex(floatIncr);
330 FilterIndex maxFilterIndex(COEFF_HALF_LEN);
331
332 int min_idx = -maxFilterIndex.divAsInt(increment);
333 int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
334 int idx_cnt = max_idx - min_idx + 1;
335 filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
336 min_idx -= (filterLen - idx_cnt) / 2;
337 Table table(HALF_TAB_LEN * filterLen);
338 memset(table.data(), 0, HALF_TAB_LEN * filterLen * sizeof(float));
339
340 for (auto t : xrange(HALF_TAB_LEN)) {
341 float* tab = &table[permute[t] * filterLen];
342 double lastPos = (double(t) + 0.5) / TAB_LEN;
343 FilterIndex startFilterIndex(lastPos * floatIncr);
344
345 FilterIndex filterIndex(startFilterIndex);
346 int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
347 filterIndex += increment * coeffCount;
348 int bufIndex = -coeffCount;
349 do {
350 tab[bufIndex - min_idx] =
351 float(getCoeff(filterIndex) * normFactor);
352 filterIndex -= increment;
353 bufIndex += 1;
354 } while (filterIndex >= FilterIndex(0));
355
356 filterIndex = increment - startFilterIndex;
357 coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
358 filterIndex += increment * coeffCount;
359 bufIndex = 1 + coeffCount;
360 do {
361 tab[bufIndex - min_idx] =
362 float(getCoeff(filterIndex) * normFactor);
363 filterIndex -= increment;
364 bufIndex -= 1;
365 } while (filterIndex > FilterIndex(0));
366 }
367 return table;
368}
369
370
371template<unsigned CHANNELS>
373 ResampledSoundDevice& input_, const DynamicClock& hostClock_)
374 : ResampleAlgo(input_)
375 , hostClock(hostClock_)
376 , ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
377{
378 ResampleCoeffs::instance().getCoeffs(double(ratio), permute, table, filterLen);
379
380 // fill buffer with 'enough' zero's
381 unsigned extra = int(filterLen + 1 + ratio + 1);
382 bufStart = 0;
383 bufEnd = extra;
384 nonzeroSamples = 0;
385 unsigned initialSize = 4000; // buffer grows dynamically if this is too small
386 buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
387}
388
389template<unsigned CHANNELS>
391{
393}
394
395#ifdef __SSE2__
396template<bool REVERSE>
397static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, float* out)
398{
399 assert((len % 4) == 0);
400 assert((uintptr_t(tab_) % 16) == 0);
401
402 ptrdiff_t x = (len & ~7) * sizeof(float);
403 assert((x % 32) == 0);
404 const char* buf = reinterpret_cast<const char*>(buf_) + x;
405 const char* tab = reinterpret_cast<const char*>(tab_) + (REVERSE ? -x : x);
406 x = -x;
407
408 __m128 a0 = _mm_setzero_ps();
409 __m128 a1 = _mm_setzero_ps();
410 do {
411 __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
412 __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
413 __m128 t0, t1;
414 if constexpr (REVERSE) {
415 t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 16));
416 t1 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 32));
417 } else {
418 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 0));
419 t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
420 }
421 __m128 m0 = _mm_mul_ps(b0, t0);
422 __m128 m1 = _mm_mul_ps(b1, t1);
423 a0 = _mm_add_ps(a0, m0);
424 a1 = _mm_add_ps(a1, m1);
425 x += 2 * sizeof(__m128);
426 } while (x < 0);
427 if (len & 4) {
428 __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
429 __m128 t0;
430 if constexpr (REVERSE) {
431 t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
432 } else {
433 t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
434 }
435 __m128 m0 = _mm_mul_ps(b0, t0);
436 a0 = _mm_add_ps(a0, m0);
437 }
438
439 __m128 a = _mm_add_ps(a0, a1);
440 // The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
441 // intrinsic, but not worth the trouble.
442 __m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
443 __m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
444
445 _mm_store_ss(out, s);
446}
447
448template<int N> static inline __m128 shuffle(__m128 x)
449{
450 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
451}
452template<bool REVERSE>
453static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, float* out)
454{
455 assert((len % 4) == 0);
456 assert((uintptr_t(tab_) % 16) == 0);
457
458 ptrdiff_t x = 2 * (len & ~7) * sizeof(float);
459 const char* buf = reinterpret_cast<const char*>(buf_) + x;
460 const char* tab = reinterpret_cast<const char*>(tab_);
461 x = -x;
462
463 __m128 a0 = _mm_setzero_ps();
464 __m128 a1 = _mm_setzero_ps();
465 __m128 a2 = _mm_setzero_ps();
466 __m128 a3 = _mm_setzero_ps();
467 do {
468 __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 0));
469 __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
470 __m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 32));
471 __m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 48));
472 __m128 ta, tb;
473 if constexpr (REVERSE) {
474 ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
475 tb = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 32));
476 tab -= 2 * sizeof(__m128);
477 } else {
478 ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
479 tb = _mm_load_ps (reinterpret_cast<const float*>(tab + 16));
480 tab += 2 * sizeof(__m128);
481 }
482 __m128 t0 = shuffle<0x50>(ta);
483 __m128 t1 = shuffle<0xFA>(ta);
484 __m128 t2 = shuffle<0x50>(tb);
485 __m128 t3 = shuffle<0xFA>(tb);
486 __m128 m0 = _mm_mul_ps(b0, t0);
487 __m128 m1 = _mm_mul_ps(b1, t1);
488 __m128 m2 = _mm_mul_ps(b2, t2);
489 __m128 m3 = _mm_mul_ps(b3, t3);
490 a0 = _mm_add_ps(a0, m0);
491 a1 = _mm_add_ps(a1, m1);
492 a2 = _mm_add_ps(a2, m2);
493 a3 = _mm_add_ps(a3, m3);
494 x += 4 * sizeof(__m128);
495 } while (x < 0);
496 if (len & 4) {
497 __m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 0));
498 __m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
499 __m128 ta;
500 if constexpr (REVERSE) {
501 ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
502 } else {
503 ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
504 }
505 __m128 t0 = shuffle<0x50>(ta);
506 __m128 t1 = shuffle<0xFA>(ta);
507 __m128 m0 = _mm_mul_ps(b0, t0);
508 __m128 m1 = _mm_mul_ps(b1, t1);
509 a0 = _mm_add_ps(a0, m0);
510 a1 = _mm_add_ps(a1, m1);
511 }
512
513 __m128 a01 = _mm_add_ps(a0, a1);
514 __m128 a23 = _mm_add_ps(a2, a3);
515 __m128 a = _mm_add_ps(a01, a23);
516 // Can faster with SSE3, but (like above) not worth the trouble.
517 __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
518 _mm_store_ss(&out[0], s);
519 _mm_store_ss(&out[1], shuffle<0x55>(s));
520}
521
522#endif
523
524template<unsigned CHANNELS>
525void ResampleHQ<CHANNELS>::calcOutput(
526 float pos, float* __restrict output)
527{
528 assert((filterLen & 3) == 0);
529
530 int bufIdx = int(pos) + bufStart;
531 assert((bufIdx + filterLen) <= bufEnd);
532 bufIdx *= CHANNELS;
533 const float* buf = &buffer[bufIdx];
534
535 int t = unsigned(lrintf(pos * TAB_LEN)) % TAB_LEN;
536 if (!(t & HALF_TAB_LEN)) {
537 // first half, begin of row 't'
538 t = permute[t];
539 const float* tab = &table[t * filterLen];
540
541#ifdef __SSE2__
542 if constexpr (CHANNELS == 1) {
543 calcSseMono <false>(buf, tab, filterLen, output);
544 } else {
545 calcSseStereo<false>(buf, tab, filterLen, output);
546 }
547 return;
548#endif
549
550 // c++ version, both mono and stereo
551 for (auto ch : xrange(CHANNELS)) {
552 float r0 = 0.0f;
553 float r1 = 0.0f;
554 float r2 = 0.0f;
555 float r3 = 0.0f;
556 for (unsigned i = 0; i < filterLen; i += 4) {
557 r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
558 r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
559 r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
560 r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
561 }
562 output[ch] = r0 + r1 + r2 + r3;
563 ++buf;
564 }
565 } else {
566 // 2nd half, end of row 'TAB_LEN - 1 - t'
567 t = permute[TAB_LEN - 1 - t];
568 const float* tab = &table[(t + 1) * filterLen];
569
570#ifdef __SSE2__
571 if constexpr (CHANNELS == 1) {
572 calcSseMono <true>(buf, tab, filterLen, output);
573 } else {
574 calcSseStereo<true>(buf, tab, filterLen, output);
575 }
576 return;
577#endif
578
579 // c++ version, both mono and stereo
580 for (auto ch : xrange(CHANNELS)) {
581 float r0 = 0.0f;
582 float r1 = 0.0f;
583 float r2 = 0.0f;
584 float r3 = 0.0f;
585 for (int i = 0; i < int(filterLen); i += 4) {
586 r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
587 r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
588 r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
589 r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
590 }
591 output[ch] = r0 + r1 + r2 + r3;
592 ++buf;
593 }
594 }
595}
596
597template<unsigned CHANNELS>
598void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
599{
600 // Still enough free space at end of buffer?
601 unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
602 if (free < emuNum) {
603 // No, then move everything to the start
604 // (data needs to be in a contiguous memory block)
605 unsigned available = bufEnd - bufStart;
606 memmove(&buffer[0], &buffer[bufStart * CHANNELS],
607 available * CHANNELS * sizeof(float));
608 bufStart = 0;
609 bufEnd = available;
610
611 free = unsigned(buffer.size() / CHANNELS) - bufEnd;
612 int missing = emuNum - free;
613 if (missing > 0) [[unlikely]] {
614 // Still not enough room: grow the buffer.
615 // TODO an alternative is to instead of using a large
616 // buffer, chop the work in multiple smaller pieces.
617 // That may have the advantage that the data fits in
618 // the CPU's data cache. OTOH too small chunks have
619 // more overhead. (Not yet implemented because it's
620 // more complex).
621 buffer.resize(buffer.size() + missing * CHANNELS);
622 }
623 }
624 VLA_SSE_ALIGNED(float, tmpBuf, emuNum * CHANNELS + 3);
625 if (input.generateInput(tmpBuf, emuNum)) {
626 memcpy(&buffer[bufEnd * CHANNELS], tmpBuf,
627 emuNum * CHANNELS * sizeof(float));
628 bufEnd += emuNum;
629 nonzeroSamples = bufEnd - bufStart;
630 } else {
631 memset(&buffer[bufEnd * CHANNELS], 0,
632 emuNum * CHANNELS * sizeof(float));
633 bufEnd += emuNum;
634 }
635
636 assert(bufStart <= bufEnd);
637 assert(bufEnd <= (buffer.size() / CHANNELS));
638}
639
640template<unsigned CHANNELS>
642 float* __restrict dataOut, unsigned hostNum, EmuTime::param time)
643{
644 auto& emuClk = getEmuClock();
645 unsigned emuNum = emuClk.getTicksTill(time);
646 if (emuNum > 0) {
647 prepareData(emuNum);
648 }
649
650 bool notMuted = nonzeroSamples > 0;
651 if (notMuted) {
652 // main processing loop
653 EmuTime host1 = hostClock.getFastAdd(1);
654 assert(host1 > emuClk.getTime());
655 float pos = emuClk.getTicksTillDouble(host1);
656 assert(pos <= (ratio + 2));
657 for (auto i : xrange(hostNum)) {
658 calcOutput(pos, &dataOut[i * CHANNELS]);
659 pos += ratio;
660 }
661 }
662 emuClk += emuNum;
663 bufStart += emuNum;
664 nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
665
666 assert(bufStart <= bufEnd);
667 unsigned available = bufEnd - bufStart;
668 unsigned extra = int(filterLen + 1 + ratio + 1);
669 assert(available == extra); (void)available; (void)extra;
670
671 return notMuted;
672}
673
674// Force template instantiation.
675template class ResampleHQ<1>;
676template class ResampleHQ<2>;
677
678} // namespace openmsx
TclObject t
constexpr unsigned CHANNELS
Definition: YM2413Test.cc:20
Represents a clock with a variable frequency.
Definition: DynamicClock.hh:16
const T * data() const
Returns pointer to the start of the memory buffer.
Definition: MemBuffer.hh:81
static ResampleCoeffs & instance()
Definition: ResampleHQ.cc:82
ResampleCoeffs(const ResampleCoeffs &)=delete
void getCoeffs(double ratio, int16_t *&permute, float *&table, unsigned &filterLen)
Definition: ResampleHQ.cc:88
ResampleCoeffs & operator=(const ResampleCoeffs &)=delete
void releaseCoeffs(double ratio)
Definition: ResampleHQ.cc:110
bool generateOutputImpl(float *dataOut, unsigned num, EmuTime::param time) override
Definition: ResampleHQ.cc:641
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock)
Definition: ResampleHQ.cc:372
~ResampleHQ() override
Definition: ResampleHQ.cc:390
constexpr auto step
Definition: eeprom.cc:9
ALWAYS_INLINE unsigned count(const uint8_t *pIn, const uint8_t *pMatch, const uint8_t *pInLimit)
Definition: lz4.cc:146
This file implemented 3 utility functions:
Definition: Autofire.cc:9
constexpr float coeffs[]
Definition: ResampleHQ.cc:36
constexpr int COEFF_HALF_LEN
Definition: ResampleHQ.cc:44
constexpr Table table
Definition: CPUCore.cc:259
constexpr unsigned N2
Definition: ResampleHQ.cc:228
constexpr unsigned N1
Definition: ResampleHQ.cc:227
FixedPoint< 16 > FilterIndex
Definition: ResampleHQ.cc:40
constexpr unsigned N
Definition: ResampleHQ.cc:226
constexpr unsigned TAB_LEN
Definition: ResampleHQ.cc:45
constexpr int COEFF_LEN
Definition: ResampleHQ.cc:43
constexpr unsigned HALF_TAB_LEN
Definition: ResampleHQ.cc:46
constexpr KeyMatrixPosition x
Keyboard bindings.
Definition: Keyboard.cc:127
constexpr int INDEX_INC
Definition: ResampleHQ.cc:42
constexpr void iota(ForwardIt first, ForwardIt last, T value)
Definition: ranges.hh:263
auto find(InputRange &&range, const T &value)
Definition: ranges.hh:144
size_t size(std::string_view utf8)
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition: stl.hh:125
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
Definition: stl.hh:100
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
Definition: vla.hh:44
constexpr auto xrange(T e)
Definition: xrange.hh:133
constexpr auto end(const zstring_view &x)