ResampleHQ.cc
// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
//
// The code is simplified in several ways:
// - the resample algorithm is no longer switchable, we took this variant:
//     Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
// - don't allow changing the sample rate on-the-fly
// - assume input (and thus also output) signals have infinite length, so
//   there is no special code to handle the end of the signal
// - changed/simplified the API to better match the openMSX usage model
//   (e.g. removed all error checking)

#include "ResampleHQ.hh"
#include "ResampledSoundDevice.hh"
#include "FixedPoint.hh"
#include "MemBuffer.hh"
#include "aligned.hh"
#include "narrow.hh"
#include "ranges.hh"
#include "stl.hh"
#include "vla.hh"
#include "xrange.hh"
#include <array>
#include <cmath>
#include <cstddef>
#include <cstring> // memmove()
#include <cassert>
#include <iterator>
#include <vector>
#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace openmsx {

// Letting the compiler deduce the (type and) size of the std::array works fine
// with gcc, msvc and clang-11, but is broken from clang-12 onwards. More
// specifically it only works for sizes up to 256. For more details see:
//   https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg50598.html
//   https://reviews.llvm.org/D86936
// As a workaround we hardcode the (type and) size here:
//static constexpr std::array coeffs = {
static constexpr std::array<float, 2464> coeffs = {
	#include "ResampleCoeffs.ii"
};

using FilterIndex = FixedPoint<16>;

static constexpr int INDEX_INC = 128;
static constexpr int COEFF_LEN = int(std::size(coeffs));
static constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
static constexpr size_t TAB_LEN = ResampleHQ<1>::TAB_LEN;
static constexpr size_t HALF_TAB_LEN = ResampleHQ<1>::HALF_TAB_LEN;

class ResampleCoeffs
{
public:
	ResampleCoeffs(const ResampleCoeffs&) = delete;
	ResampleCoeffs& operator=(const ResampleCoeffs&) = delete;

	static ResampleCoeffs& instance();
	void getCoeffs(double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute, float*& table, unsigned& filterLen);
	void releaseCoeffs(double ratio);

private:
	using Table = MemBuffer<float, SSE_ALIGNMENT>;

	ResampleCoeffs() = default;
	~ResampleCoeffs();

	static Table calcTable(double ratio, std::span<int16_t, HALF_TAB_LEN> permute, unsigned& filterLen);

	struct Element {
		double ratio;
		std::array<int16_t, HALF_TAB_LEN> permute;
		Table table;
		unsigned filterLen;
		unsigned count;
	};
	std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
};

ResampleCoeffs::~ResampleCoeffs()
{
	assert(cache.empty());
}

ResampleCoeffs& ResampleCoeffs::instance()
{
	static ResampleCoeffs resampleCoeffs;
	return resampleCoeffs;
}

void ResampleCoeffs::getCoeffs(
	double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute, float*& table, unsigned& filterLen)
{
	if (auto it = ranges::find(cache, ratio, &Element::ratio);
	    it != end(cache)) {
		permute   = it->permute;
		table     = it->table.data();
		filterLen = it->filterLen;
		it->count++;
		return;
	}
	Element elem;
	elem.ratio = ratio;
	elem.count = 1;
	elem.table = calcTable(ratio, elem.permute, elem.filterLen);
	permute   = elem.permute;
	table     = elem.table.data();
	filterLen = elem.filterLen;
	cache.push_back(std::move(elem));
}

void ResampleCoeffs::releaseCoeffs(double ratio)
{
	auto it = rfind_unguarded(cache, ratio, &Element::ratio);
	it->count--;
	if (it->count == 0) {
		move_pop_back(cache, it);
	}
}
// -- Permutation stuff --
//
// The rows in the resample coefficient table are not visited sequentially.
// Instead, depending on the resample-ratio, we take fixed non-integer jumps
// from one row to the next.
//
// In reality the table has 4096 rows (of which only 2048 are actually stored).
// But for simplicity I'll work out the examples here for a table with only 16
// rows (of which 8 are stored).
//
// Let's first assume a jump of '5.2'. This means that after we've used row
// 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
// integers, so a jump of 5.2 actually means that 80% of the time we advance 5
// rows and 20% of the time we advance 6 rows.
//
// The rows in the (full) table are circular. This means that once we're past
// row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
// (modulo arithmetic). We also only store the 1st half of the table; the
// entries for the 2nd half are 'folded' back to the 1st half according to the
// formula: y = 15 - x.
//
// Let's now calculate the possible transitions. If we're currently on row '0',
// the next row will be either '5' (80% chance) or row '6' (20% chance). When
// we're on row '5' the next most likely row will be '10', but after folding
// '10' becomes '15 - 10 = 5' (so 5 goes to itself (80% chance)). Row '10' most
// likely goes to '15', after folding we get that '5' goes to '0'. Row '15'
// most likely goes to '20', and after wrapping and folding that becomes '0'
// goes to '4'. Calculating this for all rows gives:
//   0 -> 5 or 4 (80%)    0 -> 6 or 5 (20%)
//   1 -> 6 or 3          1 -> 7 or 4
//   2 -> 7 or 2          2 -> 7 or 3
//   3 -> 7 or 1          3 -> 6 or 2
//   4 -> 6 or 0          4 -> 5 or 1
//   5 -> 5 or 0          5 -> 4 or 0
//   6 -> 4 or 1          6 -> 3 or 0
//   7 -> 3 or 2          7 -> 2 or 1
// So every row has 4 possible successors (2 more and 2 less likely). Possibly
// some of these 4 are the same, or even the same as the starting row. Note
// that if row x goes to row y (x->y) then also y->x; this turns out to be true
// in general.
//
// For cache efficiency it's best if rows that are needed one after the other
// in time are also stored sequentially in memory (either before or after is
// fine). Clearly storing the rows in numeric order will not read the memory
// sequentially. For this specific example we could store the rows in the
// order:
//   2, 7, 3, 1, 6, 4, 0, 5
// With this order all likely transitions are sequential. The less likely
// transitions are not. But I don't believe there exists an order that's good
// for both the likely and the unlikely transitions. Do let me know if I'm
// wrong.
//
// In this example the transitions form a single chain (it turns out this is
// often the case). But, for example, for a step-size of 4.3 we get
//   0 -> 4 or 3 (70%)    0 -> 5 or 4 (30%)
//   1 -> 5 or 2          1 -> 6 or 3
//   2 -> 6 or 1          2 -> 7 or 2
//   3 -> 7 or 0          3 -> 7 or 1
//   4 -> 7 or 0          4 -> 6 or 0
//   5 -> 6 or 1          5 -> 5 or 0
//   6 -> 5 or 2          6 -> 4 or 1
//   7 -> 4 or 3          7 -> 3 or 2
// Only looking at the more likely transitions, we get 2 cycles of length 4:
//   0, 4, 7, 3
//   1, 5, 6, 2
//
// So the previous example gave a single chain with 2 clear end-points. Now we
// have 2 separate cycles. It turns out that for any possible step-size we
// either get a single chain or k cycles of size N/k. (So e.g. a chain of
// length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
// plus 2 cycles of length 2 is impossible.) To be honest, I've only partially
// proven this mathematically, but at least I've verified it for N=16 and
// N=4096 for all possible step-sizes.
//
// To linearize a chain in memory there are only 2 (good) possibilities: start
// at either end-point. But to store a cycle any point is as good as any other.
// Also the order in which to store the cycles themselves can still be chosen.
//
// Let's come back to the example with step-size 4.3. If we linearize this as
//   | 0, 4, 7, 3 | 1, 5, 6, 2 |
// then most of the more likely transitions are sequential. The exceptions are
//   0 <-> 3 and 1 <-> 2
// but those are unavoidable with cycles. In return 2 of the less likely
// transitions '3 <-> 1' are now sequential. I believe this is the best
// possible linearization (better said: there are other linearizations that
// are equally good, but none is better). But do let me know if you find a
// better one!
//
// For step-size '8.4' an optimal(?) linearization seems to be
//   | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
// For step-size '7.9' the order is:
//   | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
// And for step-size '3.8':
//   | 7, 4, 0, 3 | 6, 5, 1, 2 |
//
// I've again not (fully) mathematically proven it, but it seems we can
// optimally(?) linearize cycles by:
// * if likely step < unlikely step:
//     pick unassigned rows from 0 to N/2-1, and complete each cycle
// * if likely step > unlikely step:
//     pick unassigned rows from N/2-1 to 0, and complete each cycle
//
// The routine calcPermute() below calculates these optimal(?) linearizations.
// In more detail: it calculates a permutation table, where the i-th element
// tells where in memory the i-th logical row of the original (half) resample
// coefficient table is physically stored.

static constexpr unsigned N  = TAB_LEN;
static constexpr unsigned N1 = N - 1;
static constexpr unsigned N2 = N / 2;

static constexpr unsigned mapIdx(unsigned x)
{
	unsigned t = x & N1;          // first wrap
	return (t < N2) ? t : N1 - t; // then fold
}

static constexpr std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
{
	return {mapIdx(x + step), mapIdx(N1 - x + step)};
}
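
// Illustrative sketch (not part of openMSX, kept out of the build with
// '#if 0'): a standalone program that reproduces the 16-row transition table
// from the comment above. For a jump of '5.2' the likely step is 5 (80%) and
// the unlikely step is 6 (20%). The local names exN/exMap/exNext mirror
// N/mapIdx/next() for N = 16.
#if 0
#include <cstdio>
#include <utility>

int main()
{
	constexpr unsigned exN = 16, exN1 = exN - 1, exN2 = exN / 2;
	auto exMap = [&](unsigned x) {
		unsigned t = x & exN1;            // first wrap
		return (t < exN2) ? t : exN1 - t; // then fold
	};
	auto exNext = [&](unsigned x, unsigned step) {
		return std::pair{exMap(x + step), exMap(exN1 - x + step)};
	};
	for (unsigned r = 0; r < exN2; ++r) {
		auto [a1, b1] = exNext(r, 5); // 80% case
		auto [a2, b2] = exNext(r, 6); // 20% case
		printf("%u -> %u or %u (80%%)    %u -> %u or %u (20%%)\n",
		       r, a1, b1, r, a2, b2);
	}
}
#endif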

static void calcPermute(double ratio, std::span<int16_t, HALF_TAB_LEN> permute)
{
	double r2 = ratio * N;
	double fract = r2 - floor(r2);
	auto step = narrow_cast<unsigned>(floor(r2));
	bool incr = [&] {
		if (fract > 0.5) {
			// mostly (> 50%) take steps of 'floor(r2) + 1'
			step += 1;
			return false; // assign from high to low
		} else {
			// mostly take steps of 'floor(r2)'
			return true; // assign from low to high
		}
	}();

	// initially set all as unassigned
	ranges::fill(permute, -1);

	unsigned restart = incr ? 0 : N2 - 1;
	unsigned curr = restart;
	// check for chain (instead of cycles)
	if (incr) {
		for (auto i : xrange(N2)) {
			auto [nxt1, nxt2] = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	} else {
		for (unsigned i = N2 - 1; int(i) >= 0; --i) {
			auto [nxt1, nxt2] = next(i, step);
			if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
		}
	}

	// assign all rows (in chain or cycle(s))
	unsigned cnt = 0;
	while (true) {
		assert(permute[curr] == -1);
		assert(cnt < N2);
		permute[curr] = narrow<int16_t>(cnt++);

		auto [nxt1, nxt2] = next(curr, step);
		if (permute[nxt1] == -1) {
			curr = nxt1;
			continue;
		} else if (permute[nxt2] == -1) {
			curr = nxt2;
			continue;
		}

		// finished chain or cycle
		if (cnt == N2) break; // done

		// continue with next cycle
		while (permute[restart] != -1) {
			if (incr) {
				++restart;
				assert(restart != N2);
			} else {
				assert(restart != 0);
				--restart;
			}
		}
		curr = restart;
	}

#ifdef DEBUG
	std::array<int16_t, N2> testPerm;
	ranges::iota(testPerm, int16_t(0));
	assert(std::is_permutation(permute.begin(), permute.end(), testPerm.begin()));
#endif
}
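
// Worked example (N = 16, step-size 4.3, see the comment above): calcPermute()
// produces the linearization | 0, 4, 7, 3 | 1, 5, 6, 2 |, i.e.
//   permute[0]=0, permute[4]=1, permute[7]=2, permute[3]=3,
//   permute[1]=4, permute[5]=5, permute[6]=6, permute[2]=7
// calcTable() below then stores logical row 't' at physical offset
// 'permute[t] * filterLen'.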

// Linearly interpolate between two adjacent entries of the (oversampled)
// sinc coefficient table.
static constexpr double getCoeff(FilterIndex index)
{
	double fraction = index.fractionAsDouble();
	int indx = index.toInt();
	return double(coeffs[indx]) +
	       fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
}

ResampleCoeffs::Table ResampleCoeffs::calcTable(
	double ratio, std::span<int16_t, HALF_TAB_LEN> permute, unsigned& filterLen)
{
	calcPermute(ratio, permute);

	// When downsampling (ratio > 1) the filter is stretched: this lowers its
	// cut-off frequency to match the (lower) output Nyquist frequency.
	double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
	double normFactor = floatIncr / INDEX_INC;
	auto increment = FilterIndex(floatIncr);
	FilterIndex maxFilterIndex(COEFF_HALF_LEN);

	int min_idx = -maxFilterIndex.divAsInt(increment);
	int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
	int idx_cnt = max_idx - min_idx + 1;
	filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
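	// Note: with the 2464-entry coefficient table above, this seems to yield
	// filterLen == 40 for upsampling (ratio <= 1); for downsampling,
	// filterLen grows roughly proportionally with 'ratio'.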
	min_idx -= (narrow<int>(filterLen) - idx_cnt) / 2;
	Table table(HALF_TAB_LEN * filterLen);
	ranges::fill(std::span{table.data(), HALF_TAB_LEN * filterLen}, 0);

	for (auto t : xrange(HALF_TAB_LEN)) {
		float* tab = &table[permute[t] * filterLen];
		double lastPos = (double(t) + 0.5) / TAB_LEN;
		FilterIndex startFilterIndex(lastPos * floatIncr);

		FilterIndex filterIndex(startFilterIndex);
		int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		int bufIndex = -coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex += 1;
		} while (filterIndex >= FilterIndex(0));

		filterIndex = increment - startFilterIndex;
		coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
		filterIndex += increment * coeffCount;
		bufIndex = 1 + coeffCount;
		do {
			tab[bufIndex - min_idx] =
				float(getCoeff(filterIndex) * normFactor);
			filterIndex -= increment;
			bufIndex -= 1;
		} while (filterIndex > FilterIndex(0));
	}
	return table;
}

static const std::array<int16_t, HALF_TAB_LEN> dummyPermute = {};

template<unsigned CHANNELS>
ResampleHQ<CHANNELS>::ResampleHQ(
		ResampledSoundDevice& input_, const DynamicClock& hostClock_)
	: ResampleAlgo(input_)
	, hostClock(hostClock_)
	, ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
	, permute(dummyPermute) // Any better way to do this? (that also works with debug-STL)
{
	ResampleCoeffs::instance().getCoeffs(double(ratio), permute, table, filterLen);

	// fill buffer with 'enough' zeros
	unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
	bufStart = 0;
	bufEnd = extra;
	size_t initialSize = 4000; // buffer grows dynamically if this is too small
	buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
}

template<unsigned CHANNELS>
ResampleHQ<CHANNELS>::~ResampleHQ()
{
	ResampleCoeffs::instance().releaseCoeffs(double(ratio));
}

#ifdef __SSE2__
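// SSE2 helpers: compute the dot product of 'len' input samples with 'len'
// filter coefficients. With REVERSE the coefficient table is walked
// backwards; this is used for the mirrored 2nd half of a filter row.
// 'tab_' must be 16-byte aligned and 'len' a multiple of 4 (see asserts).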
template<bool REVERSE>
static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	auto x = narrow<ptrdiff_t>((len & ~7) * sizeof(float));
	assert((x % 32) == 0);
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_) + (REVERSE ? -x : x);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 t0, t1;
		if constexpr (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 16));
			t1 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - x - 32));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab + x +  0));
			t1 = _mm_load_ps (reinterpret_cast<const float*>(tab + x + 16));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		x += 2 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf));
		__m128 t0;
		if constexpr (REVERSE) {
			t0 = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			t0 = _mm_load_ps (reinterpret_cast<const float*>(tab));
		}
		__m128 m0 = _mm_mul_ps(b0, t0);
		a0 = _mm_add_ps(a0, m0);
	}

	__m128 a = _mm_add_ps(a0, a1);
	// The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
	// intrinsic, but not worth the trouble.
	__m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
	__m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));

	_mm_store_ss(out, s);
}

template<int N> static inline __m128 shuffle(__m128 x)
{
	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
}
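
// Stereo variant: samples are interleaved L,R, so each filter coefficient is
// applied to both channels. shuffle<0x50> duplicates the lower two
// coefficients ({c0,c0,c1,c1}) and shuffle<0xFA> the upper two ({c2,c2,c3,c3}).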
template<bool REVERSE>
static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, float* out)
{
	assert((len % 4) == 0);
	assert((uintptr_t(tab_) % 16) == 0);

	auto x = narrow<ptrdiff_t>(2 * (len & ~7) * sizeof(float));
	const char* buf = reinterpret_cast<const char*>(buf_) + x;
	const char* tab = reinterpret_cast<const char*>(tab_);
	x = -x;

	__m128 a0 = _mm_setzero_ps();
	__m128 a1 = _mm_setzero_ps();
	__m128 a2 = _mm_setzero_ps();
	__m128 a3 = _mm_setzero_ps();
	do {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 16));
		__m128 b2 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 32));
		__m128 b3 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + x + 48));
		__m128 ta, tb;
		if constexpr (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
			tb = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 32));
			tab -= 2 * sizeof(__m128);
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab +  0));
			tb = _mm_load_ps (reinterpret_cast<const float*>(tab + 16));
			tab += 2 * sizeof(__m128);
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 t2 = shuffle<0x50>(tb);
		__m128 t3 = shuffle<0xFA>(tb);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		__m128 m2 = _mm_mul_ps(b2, t2);
		__m128 m3 = _mm_mul_ps(b3, t3);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
		a2 = _mm_add_ps(a2, m2);
		a3 = _mm_add_ps(a3, m3);
		x += 4 * sizeof(__m128);
	} while (x < 0);
	if (len & 4) {
		__m128 b0 = _mm_loadu_ps(reinterpret_cast<const float*>(buf +  0));
		__m128 b1 = _mm_loadu_ps(reinterpret_cast<const float*>(buf + 16));
		__m128 ta;
		if constexpr (REVERSE) {
			ta = _mm_loadr_ps(reinterpret_cast<const float*>(tab - 16));
		} else {
			ta = _mm_load_ps (reinterpret_cast<const float*>(tab + 0));
		}
		__m128 t0 = shuffle<0x50>(ta);
		__m128 t1 = shuffle<0xFA>(ta);
		__m128 m0 = _mm_mul_ps(b0, t0);
		__m128 m1 = _mm_mul_ps(b1, t1);
		a0 = _mm_add_ps(a0, m0);
		a1 = _mm_add_ps(a1, m1);
	}

	__m128 a01 = _mm_add_ps(a0, a1);
	__m128 a23 = _mm_add_ps(a2, a3);
	__m128 a = _mm_add_ps(a01, a23);
	// Can be faster with SSE3, but (like above) not worth the trouble.
	__m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
	_mm_store_ss(&out[0], s);
	_mm_store_ss(&out[1], shuffle<0x55>(s));
}

#endif

template<unsigned CHANNELS>
void ResampleHQ<CHANNELS>::calcOutput(
	float pos, float* __restrict output)
{
	assert((filterLen & 3) == 0);

	int bufIdx = int(pos) + bufStart;
	assert((bufIdx + filterLen) <= bufEnd);
	bufIdx *= CHANNELS;
	const float* buf = &buffer[bufIdx];

	auto t = size_t(lrintf(pos * TAB_LEN)) % TAB_LEN;
	if (!(t & HALF_TAB_LEN)) {
		// first half, begin of row 't'
		t = permute[t];
		const float* tab = &table[t * filterLen];

#ifdef __SSE2__
		if constexpr (CHANNELS == 1) {
			calcSseMono <false>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<false>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (auto ch : xrange(CHANNELS)) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (unsigned i = 0; i < filterLen; i += 4) {
				r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
				r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
				r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
				r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	} else {
		// 2nd half, end of row 'TAB_LEN - 1 - t'
		t = permute[TAB_LEN - 1 - t];
		const float* tab = &table[(t + 1) * filterLen];

#ifdef __SSE2__
		if constexpr (CHANNELS == 1) {
			calcSseMono <true>(buf, tab, filterLen, output);
		} else {
			calcSseStereo<true>(buf, tab, filterLen, output);
		}
		return;
#endif

		// C++ version, both mono and stereo
		for (auto ch : xrange(CHANNELS)) {
			float r0 = 0.0f;
			float r1 = 0.0f;
			float r2 = 0.0f;
			float r3 = 0.0f;
			for (int i = 0; i < int(filterLen); i += 4) {
				r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
				r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
				r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
				r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
			}
			output[ch] = r0 + r1 + r2 + r3;
			++buf;
		}
	}
}

template<unsigned CHANNELS>
void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
{
	// Still enough free space at the end of the buffer?
	unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
	if (free < emuNum) {
		// No, then move everything to the start
		// (data needs to be in a contiguous memory block)
		unsigned available = bufEnd - bufStart;
		memmove(&buffer[0], &buffer[bufStart * size_t(CHANNELS)],
		        available * size_t(CHANNELS) * sizeof(float));
		bufStart = 0;
		bufEnd = available;

		free = unsigned(buffer.size() / CHANNELS) - bufEnd;
		auto missing = narrow_cast<int>(emuNum - free);
		if (missing > 0) [[unlikely]] {
			// Still not enough room: grow the buffer.
			// TODO An alternative is to chop the work into multiple
			// smaller pieces instead of using one large buffer. That
			// may have the advantage that the data fits in the CPU's
			// data cache. OTOH too small chunks have more overhead.
			// (Not yet implemented because it's more complex.)
			buffer.resize(buffer.size() + missing * size_t(CHANNELS));
		}
	}
	VLA_SSE_ALIGNED(float, tmpBufExtra, emuNum * CHANNELS + 3);
	auto tmpBuf = tmpBufExtra.subspan(0, emuNum * CHANNELS);
	if (input.generateInput(tmpBufExtra.data(), emuNum)) {
		ranges::copy(tmpBuf,
		             subspan(buffer, bufEnd * CHANNELS));
		bufEnd += emuNum;
		nonzeroSamples = bufEnd - bufStart;
	} else {
		ranges::fill(subspan(buffer, bufEnd * CHANNELS, emuNum * CHANNELS), 0);
		bufEnd += emuNum;
	}

	assert(bufStart <= bufEnd);
	assert(bufEnd <= (buffer.size() / CHANNELS));
}

template<unsigned CHANNELS>
bool ResampleHQ<CHANNELS>::generateOutputImpl(
	float* __restrict dataOut, size_t hostNum, EmuTime::param time)
{
	auto& emuClk = getEmuClock();
	unsigned emuNum = emuClk.getTicksTill(time);
	if (emuNum > 0) {
		prepareData(emuNum);
	}

	bool notMuted = nonzeroSamples > 0;
	if (notMuted) {
		// main processing loop
		EmuTime host1 = hostClock.getFastAdd(1);
		assert(host1 > emuClk.getTime());
		float pos = narrow_cast<float>(emuClk.getTicksTillDouble(host1));
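		// 'pos' is the offset (in emu samples, relative to bufStart)
		// where the first host sample is taken; each following host
		// sample lies 'ratio' emu samples further.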
		assert(pos <= (ratio + 2));
		for (auto i : xrange(hostNum)) {
			calcOutput(pos, &dataOut[i * CHANNELS]);
			pos += ratio;
		}
	}
	emuClk += emuNum;
	bufStart += emuNum;
	nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);

	assert(bufStart <= bufEnd);
	unsigned available = bufEnd - bufStart;
	unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
	assert(available == extra); (void)available; (void)extra;

	return notMuted;
}

// Force template instantiation.
template class ResampleHQ<1>;
template class ResampleHQ<2>;

} // namespace openmsx