openMSX
ResampleHQ.cc
Go to the documentation of this file.
1// Based on libsamplerate-0.1.2 (aka Secret Rabbit Code)
2//
3// simplified code in several ways:
4// - resample algorithm is no longer switchable, we took this variant:
5// Band limited sinc interpolation, fastest, 97dB SNR, 80% BW
6// - don't allow to change sample rate on-the-fly
7// - assume input (and thus also output) signals have infinite length, so
8// there is no special code to handle the ending of the signal
9// - changed/simplified API to better match openmsx use model
10// (e.g. remove all error checking)
11
12#include "ResampleHQ.hh"
13
15
16#include "FixedPoint.hh"
17#include "MemBuffer.hh"
18#include "aligned.hh"
19#include "narrow.hh"
20#include "ranges.hh"
21#include "small_buffer.hh"
22#include "stl.hh"
23#include "xrange.hh"
24
25#include <array>
26#include <bit>
27#include <cmath>
28#include <cstddef>
29#include <cassert>
30#include <iterator>
31#include <vector>
32#ifdef __SSE2__
33#include <emmintrin.h>
34#endif
35
36namespace openmsx {
37
38// Letting the compiler deduce the (type and) size of the std::array works fine
39// with gcc, msvc and clang-11 but is broken from clang-12 onwards. More
40// specifically it only works for sizes upto 256. For more details see:
41// https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg50598.html
42// https://reviews.llvm.org/D86936
43// As a workaround we do hardcode the (type and) size here:
44//static constexpr std::array coeffs = {
45static constexpr std::array<float, 2464> coeffs = {
46 #include "ResampleCoeffs.ii"
47};
48
50
51static constexpr int INDEX_INC = 128;
52static constexpr int COEFF_LEN = int(std::size(coeffs));
53static constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
54static constexpr size_t TAB_LEN = ResampleHQ<1>::TAB_LEN;
55static constexpr size_t HALF_TAB_LEN = ResampleHQ<1>::HALF_TAB_LEN;
56
58{
59public:
64
65 static ResampleCoeffs& instance();
66 void getCoeffs(double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute, float*& table, unsigned& filterLen);
67 void releaseCoeffs(double ratio);
68
69private:
71 using PermuteTable = MemBuffer<int16_t>; // array<int16_t, HALF_TAB_LEN>
72
73 ResampleCoeffs() = default;
75
76 static Table calcTable(double ratio, std::span<int16_t, HALF_TAB_LEN> permute, unsigned& filterLen);
77
78 struct Element {
79 double ratio;
80 PermuteTable permute; // need stable address (can't directly use std::array)
81 Table table;
82 unsigned filterLen;
83 unsigned count;
84 };
85 std::vector<Element> cache; // typically 1-4 entries -> unsorted vector
86};
87
88ResampleCoeffs::~ResampleCoeffs()
89{
90 assert(cache.empty());
91}
92
94{
95 static ResampleCoeffs resampleCoeffs;
96 return resampleCoeffs;
97}
98
100 double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute, float*& table, unsigned& filterLen)
101{
102 if (auto it = ranges::find(cache, ratio, &Element::ratio);
103 it != end(cache)) {
104 permute = std::span<int16_t, HALF_TAB_LEN>{it->permute};
105 table = it->table.data();
106 filterLen = it->filterLen;
107 it->count++;
108 return;
109 }
110 Element elem;
111 elem.ratio = ratio;
112 elem.count = 1;
113 elem.permute = PermuteTable(HALF_TAB_LEN);
114 auto perm = std::span<int16_t, HALF_TAB_LEN>{elem.permute};
115 elem.table = calcTable(ratio, perm, elem.filterLen);
116 permute = perm;
117 table = elem.table.data();
118 filterLen = elem.filterLen;
119 cache.push_back(std::move(elem));
120}
121
123{
124 auto it = rfind_unguarded(cache, ratio, &Element::ratio);
125 it->count--;
126 if (it->count == 0) {
127 move_pop_back(cache, it);
128 }
129}
130
131// -- Permutation stuff --
132//
133// The rows in the resample coefficient table are not visited sequentially.
134// Instead, depending on the resample-ratio, we take fixed non-integer jumps
135// from one row to the next.
136//
137// In reality the table has 4096 rows (of which only 2048 are actually stored).
138// But for simplicity I'll here work out examples for a table with only 16 rows
139// (of which 8 are stored).
140//
141// Let's first assume a jump of '5.2'. This means that after we've used row
142// 'r', the next row we need is 'r + 5.2'. Of course row numbers must be
143// integers, so a jump of 5.2 actually means that 80% of the time we advance 5
144// rows and 20% of the time we advance 6 rows.
145//
146// The rows in the (full) table are circular. This means that once we're past
147// row 15 (in this example) we restart at row 0. So rows 'wrap' past the end
148// (modulo arithmetic). We also only store the 1st half of the table, the
149// entries for the 2nd half are 'folded' back to the 1st half according to the
150// formula: y = 15 - x.
151//
152// Let's now calculate the possible transitions. If we're currently on row '0',
153// the next row will be either '5' (80% chance) or row '6' (20% chance). When
154// we're on row '5' the next most likely row will be '10', but after folding
155// '10' becomes '15-10 = 5' (so 5 goes to itself (80% chance)). Row '10' most
156// likely goes to '15', after folding we get that '5' goes to '0'. Row '15'
157// most likely goes to '20', and after wrapping and folding that becomes '0'
158// goes to '4'. Calculating this for all rows gives:
159// 0 -> 5 or 4 (80%) 0 -> 6 or 5 (20%)
160// 1 -> 6 or 3 1 -> 7 or 4
161// 2 -> 7 or 2 2 -> 7 or 3
162// 3 -> 7 or 1 3 -> 6 or 2
163// 4 -> 6 or 0 4 -> 5 or 1
164// 5 -> 5 or 0 5 -> 4 or 0
165// 6 -> 4 or 1 6 -> 3 or 0
166// 7 -> 3 or 2 7 -> 2 or 1
167// So every row has 4 possible successors (2 more and 2 less likely). Possibly
168// some of these 4 are the same, or even the same as the starting row. Note
169// that if row x goes to row y (x->y) then also y->x, this turns out to be true
170// in general.
171//
172// For cache efficiency it's best if rows that are needed after each other in
173// time are also stored sequentially in memory (both before or after is fine).
174// Clearly storing the rows in numeric order will not read the memory
175// sequentially. For this specific example we could stores the rows in the
176// order:
177// 2, 7, 3, 1, 6, 4, 0, 5
178// With this order all likely transitions are sequential. The less likely
179// transitions are not. But I don't believe there exists an order that's good
180// for both the likely and the unlikely transitions. Do let me know if I'm
181// wrong.
182//
183// In this example the transitions form a single chain (it turns out this is
184// often the case). But for example for a step-size of 4.3 we get
185// 0 -> 4 or 3 (70%) 0 -> 5 or 4 (30%)
186// 1 -> 5 or 2 1 -> 6 or 3
187// 2 -> 6 or 1 2 -> 7 or 2
188// 3 -> 7 or 0 3 -> 7 or 1
189// 4 -> 7 or 0 4 -> 6 or 0
190// 5 -> 6 or 1 5 -> 5 or 0
191// 6 -> 5 or 2 6 -> 4 or 1
192// 7 -> 4 or 3 7 -> 3 or 2
193// Only looking at the more likely transitions, we get 2 cycles of length 4:
194// 0, 4, 7, 3
195// 1, 5, 6, 2
196//
197// So the previous example gave a single chain with 2 clear end-points. Now we
198// have 2 separate cycles. It turns out that for any possible step-size we
199// either get a single chain or k cycles of size N/k. (So e.g. a chain of
200// length 5 plus a cycle of length 3 is impossible. Also 1 cycle of length 4
201// plus 2 cycles of length 2 is impossible). To be honest I've only partially
202// mathematically proven this, but at least I've verified it for N=16 and
203// N=4096 for all possible step-sizes.
204//
205// To linearize a chain in memory there are only 2 (good) possibilities: start
206// at either end-point. But to store a cycle any point is as good as any other.
207// Also the order in which to store the cycles themselves can still be chosen.
208//
209// Let's come back to the example with step-size 4.3. If we linearize this as
210// | 0, 4, 7, 3 | 1, 5, 6, 2 |
211// then most of the more likely transitions are sequential. The exceptions are
212// 0 <-> 3 and 1 <-> 2
213// but those are unavoidable with cycles. In return 2 of the less likely
214// transitions '3 <-> 1' are now sequential. I believe this is the best
215// possible linearization (better said: there are other linearizations that are
216// equally good, but none is better). But do let me know if you find a better
217// one!
218//
219// For step-size '8.4' an optimal(?) linearization seems to be
220// | 0, 7 | 1, 6 | 2, 5 | 3, 4 |
221// For step-size '7.9' the order is:
222// | 7, 0 | 6, 1 | 5, 2 | 4, 3 |
223// And for step-size '3.8':
224// | 7, 4, 0, 3 | 6, 5, 1, 2 |
225//
226// I've again not (fully) mathematically proven it, but it seems we can
227// optimally(?) linearize cycles by:
228// * if likely step < unlikely step:
229// pick unassigned rows from 0 to N/2-1, and complete each cycle
230// * if likely step > unlikely step:
231// pick unassigned rows from N/2-1 to 0, and complete each cycle
232//
233// The routine calcPermute() below calculates these optimal(?) linearizations.
234// More in detail it calculates a permutation table: the i-th element in this
235// table tells where in memory the i-th logical row of the original (half)
236// resample coefficient table is physically stored.
237
238static constexpr unsigned N = TAB_LEN;
239static constexpr unsigned N1 = N - 1;
240static constexpr unsigned N2 = N / 2;
241
242static constexpr unsigned mapIdx(unsigned x)
243{
244 unsigned t = x & N1; // first wrap
245 return (t < N2) ? t : N1 - t; // then fold
246}
247
248static constexpr std::pair<unsigned, unsigned> next(unsigned x, unsigned step)
249{
250 return {mapIdx(x + step), mapIdx(N1 - x + step)};
251}
252
253static void calcPermute(double ratio, std::span<int16_t, HALF_TAB_LEN> permute)
254{
255 double r2 = ratio * N;
256 double fract = r2 - floor(r2);
257 auto step = narrow_cast<unsigned>(floor(r2));
258 bool incr = [&] {
259 if (fract > 0.5) {
260 // mostly (> 50%) take steps of 'floor(r2) + 1'
261 step += 1;
262 return false; // assign from high to low
263 } else {
264 // mostly take steps of 'floor(r2)'
265 return true; // assign from low to high
266 }
267 }();
268
269 // initially set all as unassigned
270 ranges::fill(permute, -1);
271
272 unsigned restart = incr ? 0 : N2 - 1;
273 unsigned curr = restart;
274 // check for chain (instead of cycles)
275 if (incr) {
276 for (auto i : xrange(N2)) {
277 auto [nxt1, nxt2] = next(i, step);
278 if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
279 }
280 } else {
281 for (unsigned i = N2 - 1; int(i) >= 0; --i) {
282 auto [nxt1, nxt2] = next(i, step);
283 if ((nxt1 == i) || (nxt2 == i)) { curr = i; break; }
284 }
285 }
286
287 // assign all rows (in chain of cycle(s))
288 unsigned cnt = 0;
289 while (true) {
290 assert(permute[curr] == -1);
291 assert(cnt < N2);
292 permute[curr] = narrow<int16_t>(cnt++);
293
294 auto [nxt1, nxt2] = next(curr, step);
295 if (permute[nxt1] == -1) {
296 curr = nxt1;
297 continue;
298 } else if (permute[nxt2] == -1) {
299 curr = nxt2;
300 continue;
301 }
302
303 // finished chain or cycle
304 if (cnt == N2) break; // done
305
306 // continue with next cycle
307 while (permute[restart] != -1) {
308 if (incr) {
309 ++restart;
310 assert(restart != N2);
311 } else {
312 assert(restart != 0);
313 --restart;
314 }
315 }
316 curr = restart;
317 }
318
319#ifdef DEBUG
320 std::array<int16_t, N2> testPerm;
321 ranges::iota(testPerm, int16_t(0));
322 assert(std::is_permutation(permute.begin(), permute.end(), testPerm.begin()));
323#endif
324}
325
326static constexpr double getCoeff(FilterIndex index)
327{
328 double fraction = index.fractionAsDouble();
329 int indx = index.toInt();
330 return double(coeffs[indx]) +
331 fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
332}
333
334ResampleCoeffs::Table ResampleCoeffs::calcTable(
335 double ratio, std::span<int16_t, HALF_TAB_LEN> permute, unsigned& filterLen)
336{
337 calcPermute(ratio, permute);
338
339 double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
340 double normFactor = floatIncr / INDEX_INC;
341 auto increment = FilterIndex(floatIncr);
342 FilterIndex maxFilterIndex(COEFF_HALF_LEN);
343
344 int min_idx = -maxFilterIndex.divAsInt(increment);
345 int max_idx = 1 + (maxFilterIndex - (increment - FilterIndex(floatIncr))).divAsInt(increment);
346 int idx_cnt = max_idx - min_idx + 1;
347 filterLen = (idx_cnt + 3) & ~3; // round up to multiple of 4
348 min_idx -= (narrow<int>(filterLen) - idx_cnt) / 2;
349 Table table(HALF_TAB_LEN * filterLen);
350 ranges::fill(std::span{table}, 0.0f);
351
352 for (auto t : xrange(HALF_TAB_LEN)) {
353 float* tab = &table[permute[t] * filterLen];
354 double lastPos = (double(t) + 0.5) / TAB_LEN;
355 FilterIndex startFilterIndex(lastPos * floatIncr);
356
357 FilterIndex filterIndex(startFilterIndex);
358 int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
359 filterIndex += increment * coeffCount;
360 int bufIndex = -coeffCount;
361 do {
362 tab[bufIndex - min_idx] =
363 float(getCoeff(filterIndex) * normFactor);
364 filterIndex -= increment;
365 bufIndex += 1;
366 } while (filterIndex >= FilterIndex(0));
367
368 filterIndex = increment - startFilterIndex;
369 coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
370 filterIndex += increment * coeffCount;
371 bufIndex = 1 + coeffCount;
372 do {
373 tab[bufIndex - min_idx] =
374 float(getCoeff(filterIndex) * normFactor);
375 filterIndex -= increment;
376 bufIndex -= 1;
377 } while (filterIndex > FilterIndex(0));
378 }
379 return table;
380}
381
382static const std::array<int16_t, HALF_TAB_LEN> dummyPermute = {};
383
384template<unsigned CHANNELS>
386 ResampledSoundDevice& input_, const DynamicClock& hostClock_)
387 : ResampleAlgo(input_)
388 , hostClock(hostClock_)
389 , ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
390 , permute(dummyPermute) // Any better way to do this? (that also works with debug-STL)
391{
392 ResampleCoeffs::instance().getCoeffs(double(ratio), permute, table, filterLen);
393
394 // fill buffer with 'enough' zero's
395 unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
396 bufStart = 0;
397 bufEnd = extra;
398 size_t initialSize = 4000; // buffer grows dynamically if this is too small
399 buffer.resize((initialSize + extra) * CHANNELS); // zero-initialized
400}
401
402template<unsigned CHANNELS>
407
408#ifdef __SSE2__
409
410static inline __m128 reverse(__m128 x)
411{
412 return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3));
413}
414
415template<bool REVERSE>
416static inline void calcSseMono(const float* buf_, const float* tab_, size_t len, float* out)
417{
418 assert((len % 4) == 0);
419 assert((uintptr_t(tab_) % 16) == 0);
420
421 auto x = narrow<ptrdiff_t>((len & ~7) * sizeof(float));
422 assert((x % 32) == 0);
423 const char* buf = std::bit_cast<const char*>(buf_) + x;
424 const char* tab = std::bit_cast<const char*>(tab_) + (REVERSE ? -x : x);
425 x = -x;
426
427 __m128 a0 = _mm_setzero_ps();
428 __m128 a1 = _mm_setzero_ps();
429 do {
430 __m128 b0 = _mm_loadu_ps(std::bit_cast<const float*>(buf + x + 0));
431 __m128 b1 = _mm_loadu_ps(std::bit_cast<const float*>(buf + x + 16));
432 __m128 t0, t1;
433 if constexpr (REVERSE) {
434 t0 = reverse(_mm_loadu_ps(std::bit_cast<const float*>(tab - x - 16)));
435 t1 = reverse(_mm_loadu_ps(std::bit_cast<const float*>(tab - x - 32)));
436 } else {
437 t0 = _mm_loadu_ps (std::bit_cast<const float*>(tab + x + 0));
438 t1 = _mm_loadu_ps (std::bit_cast<const float*>(tab + x + 16));
439 }
440 __m128 m0 = _mm_mul_ps(b0, t0);
441 __m128 m1 = _mm_mul_ps(b1, t1);
442 a0 = _mm_add_ps(a0, m0);
443 a1 = _mm_add_ps(a1, m1);
444 x += 2 * sizeof(__m128);
445 } while (x < 0);
446 if (len & 4) {
447 __m128 b0 = _mm_loadu_ps(std::bit_cast<const float*>(buf));
448 __m128 t0;
449 if constexpr (REVERSE) {
450 t0 = reverse(_mm_loadu_ps(std::bit_cast<const float*>(tab - 16)));
451 } else {
452 t0 = _mm_loadu_ps (std::bit_cast<const float*>(tab));
453 }
454 __m128 m0 = _mm_mul_ps(b0, t0);
455 a0 = _mm_add_ps(a0, m0);
456 }
457
458 __m128 a = _mm_add_ps(a0, a1);
459 // The following can be _slightly_ faster by using the SSE3 _mm_hadd_ps()
460 // intrinsic, but not worth the trouble.
461 __m128 t = _mm_add_ps(a, _mm_movehl_ps(a, a));
462 __m128 s = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
463
464 _mm_store_ss(out, s);
465}
466
467template<int N> static inline __m128 shuffle(__m128 x)
468{
469 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
470}
471template<bool REVERSE>
472static inline void calcSseStereo(const float* buf_, const float* tab_, size_t len, float* out)
473{
474 assert((len % 4) == 0);
475 assert((uintptr_t(tab_) % 16) == 0);
476
477 auto x = narrow<ptrdiff_t>(2 * (len & ~7) * sizeof(float));
478 const auto* buf = std::bit_cast<const char*>(buf_) + x;
479 const auto* tab = std::bit_cast<const char*>(tab_);
480 x = -x;
481
482 __m128 a0 = _mm_setzero_ps();
483 __m128 a1 = _mm_setzero_ps();
484 __m128 a2 = _mm_setzero_ps();
485 __m128 a3 = _mm_setzero_ps();
486 do {
487 __m128 b0 = _mm_loadu_ps(std::bit_cast<const float*>(buf + x + 0));
488 __m128 b1 = _mm_loadu_ps(std::bit_cast<const float*>(buf + x + 16));
489 __m128 b2 = _mm_loadu_ps(std::bit_cast<const float*>(buf + x + 32));
490 __m128 b3 = _mm_loadu_ps(std::bit_cast<const float*>(buf + x + 48));
491 __m128 ta, tb;
492 if constexpr (REVERSE) {
493 ta = reverse(_mm_loadu_ps(std::bit_cast<const float*>(tab - 16)));
494 tb = reverse(_mm_loadu_ps(std::bit_cast<const float*>(tab - 32)));
495 tab -= 2 * sizeof(__m128);
496 } else {
497 ta = _mm_loadu_ps (std::bit_cast<const float*>(tab + 0));
498 tb = _mm_loadu_ps (std::bit_cast<const float*>(tab + 16));
499 tab += 2 * sizeof(__m128);
500 }
501 __m128 t0 = shuffle<0x50>(ta);
502 __m128 t1 = shuffle<0xFA>(ta);
503 __m128 t2 = shuffle<0x50>(tb);
504 __m128 t3 = shuffle<0xFA>(tb);
505 __m128 m0 = _mm_mul_ps(b0, t0);
506 __m128 m1 = _mm_mul_ps(b1, t1);
507 __m128 m2 = _mm_mul_ps(b2, t2);
508 __m128 m3 = _mm_mul_ps(b3, t3);
509 a0 = _mm_add_ps(a0, m0);
510 a1 = _mm_add_ps(a1, m1);
511 a2 = _mm_add_ps(a2, m2);
512 a3 = _mm_add_ps(a3, m3);
513 x += 4 * sizeof(__m128);
514 } while (x < 0);
515 if (len & 4) {
516 __m128 b0 = _mm_loadu_ps(std::bit_cast<const float*>(buf + 0));
517 __m128 b1 = _mm_loadu_ps(std::bit_cast<const float*>(buf + 16));
518 __m128 ta;
519 if constexpr (REVERSE) {
520 ta = reverse(_mm_loadu_ps(std::bit_cast<const float*>(tab - 16)));
521 } else {
522 ta = _mm_loadu_ps (std::bit_cast<const float*>(tab + 0));
523 }
524 __m128 t0 = shuffle<0x50>(ta);
525 __m128 t1 = shuffle<0xFA>(ta);
526 __m128 m0 = _mm_mul_ps(b0, t0);
527 __m128 m1 = _mm_mul_ps(b1, t1);
528 a0 = _mm_add_ps(a0, m0);
529 a1 = _mm_add_ps(a1, m1);
530 }
531
532 __m128 a01 = _mm_add_ps(a0, a1);
533 __m128 a23 = _mm_add_ps(a2, a3);
534 __m128 a = _mm_add_ps(a01, a23);
535 // Can faster with SSE3, but (like above) not worth the trouble.
536 __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
537 _mm_store_ss(&out[0], s);
538 _mm_store_ss(&out[1], shuffle<0x55>(s));
539}
540
541#endif
542
543template<unsigned CHANNELS>
544void ResampleHQ<CHANNELS>::calcOutput(
545 float pos, float* __restrict output)
546{
547 assert((filterLen & 3) == 0);
548
549 int bufIdx = int(pos) + bufStart;
550 assert((bufIdx + filterLen) <= bufEnd);
551 bufIdx *= CHANNELS;
552 const float* buf = &buffer[bufIdx];
553
554 auto t = size_t(lrintf(pos * TAB_LEN)) % TAB_LEN;
555 if (!(t & HALF_TAB_LEN)) {
556 // first half, begin of row 't'
557 t = permute[t];
558 const float* tab = &table[t * filterLen];
559
560#ifdef __SSE2__
561 if constexpr (CHANNELS == 1) {
562 calcSseMono <false>(buf, tab, filterLen, output);
563 } else {
564 calcSseStereo<false>(buf, tab, filterLen, output);
565 }
566 return;
567#endif
568
569 // c++ version, both mono and stereo
570 for (auto ch : xrange(CHANNELS)) {
571 float r0 = 0.0f;
572 float r1 = 0.0f;
573 float r2 = 0.0f;
574 float r3 = 0.0f;
575 for (size_t i = 0; i < filterLen; i += 4) {
576 r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
577 r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
578 r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
579 r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
580 }
581 output[ch] = r0 + r1 + r2 + r3;
582 ++buf;
583 }
584 } else {
585 // 2nd half, end of row 'TAB_LEN - 1 - t'
586 t = permute[TAB_LEN - 1 - t];
587 const float* tab = &table[(t + 1) * filterLen];
588
589#ifdef __SSE2__
590 if constexpr (CHANNELS == 1) {
591 calcSseMono <true>(buf, tab, filterLen, output);
592 } else {
593 calcSseStereo<true>(buf, tab, filterLen, output);
594 }
595 return;
596#endif
597
598 // c++ version, both mono and stereo
599 for (auto ch : xrange(CHANNELS)) {
600 float r0 = 0.0f;
601 float r1 = 0.0f;
602 float r2 = 0.0f;
603 float r3 = 0.0f;
604 for (ptrdiff_t i = 0; i < ptrdiff_t(filterLen); i += 4) {
605 r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
606 r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
607 r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
608 r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
609 }
610 output[ch] = r0 + r1 + r2 + r3;
611 ++buf;
612 }
613 }
614}
615
616template<unsigned CHANNELS>
617void ResampleHQ<CHANNELS>::prepareData(unsigned emuNum)
618{
619 // Still enough free space at end of buffer?
620 unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
621 if (free < emuNum) {
622 // No, then move everything to the start
623 // (data needs to be in a contiguous memory block)
624 unsigned available = bufEnd - bufStart;
625 memmove(&buffer[0], &buffer[bufStart * size_t(CHANNELS)],
626 available * size_t(CHANNELS) * sizeof(float));
627 bufStart = 0;
628 bufEnd = available;
629
630 free = unsigned(buffer.size() / CHANNELS) - bufEnd;
631 auto missing = narrow_cast<int>(emuNum - free);
632 if (missing > 0) [[unlikely]] {
633 // Still not enough room: grow the buffer.
634 // TODO an alternative is to instead of using a large
635 // buffer, chop the work in multiple smaller pieces.
636 // That may have the advantage that the data fits in
637 // the CPU's data cache. OTOH too small chunks have
638 // more overhead. (Not yet implemented because it's
639 // more complex).
640 buffer.resize(buffer.size() + missing * size_t(CHANNELS));
641 }
642 }
643 small_buffer<float, 8192> tmpBufExtra(uninitialized_tag{}, emuNum * CHANNELS + 3); // typical ~5194 (PSG, samples=1024) but could be larger
644 auto tmpBuf = subspan(tmpBufExtra, 0, emuNum * CHANNELS);
645 if (input.generateInput(tmpBufExtra.data(), emuNum)) {
646 ranges::copy(tmpBuf,
647 subspan(buffer, bufEnd * CHANNELS));
648 bufEnd += emuNum;
649 nonzeroSamples = bufEnd - bufStart;
650 } else {
651 ranges::fill(subspan(buffer, bufEnd * CHANNELS, emuNum * CHANNELS), 0);
652 bufEnd += emuNum;
653 }
654
655 assert(bufStart <= bufEnd);
656 assert(bufEnd <= (buffer.size() / CHANNELS));
657}
658
659template<unsigned CHANNELS>
661 float* __restrict dataOut, size_t hostNum, EmuTime::param time)
662{
663 auto& emuClk = getEmuClock();
664 unsigned emuNum = emuClk.getTicksTill(time);
665 if (emuNum > 0) {
666 prepareData(emuNum);
667 }
668
669 bool notMuted = nonzeroSamples > 0;
670 if (notMuted) {
671 // main processing loop
672 EmuTime host1 = hostClock.getFastAdd(1);
673 assert(host1 > emuClk.getTime());
674 auto pos = narrow_cast<float>(emuClk.getTicksTillDouble(host1));
675 assert(pos <= (ratio + 2));
676 for (auto i : xrange(hostNum)) {
677 calcOutput(pos, &dataOut[i * CHANNELS]);
678 pos += ratio;
679 }
680 }
681 emuClk += emuNum;
682 bufStart += emuNum;
683 nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
684
685 assert(bufStart <= bufEnd);
686 unsigned available = bufEnd - bufStart;
687 unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
688 assert(available == extra); (void)available; (void)extra;
689
690 return notMuted;
691}
692
693// Force template instantiation.
694template class ResampleHQ<1>;
695template class ResampleHQ<2>;
696
697} // namespace openmsx
TclObject t
Represents a clock with a variable frequency.
static ResampleCoeffs & instance()
Definition ResampleHQ.cc:93
ResampleCoeffs(const ResampleCoeffs &)=delete
ResampleCoeffs & operator=(ResampleCoeffs &&)=delete
void getCoeffs(double ratio, std::span< const int16_t, HALF_TAB_LEN > &permute, float *&table, unsigned &filterLen)
Definition ResampleHQ.cc:99
ResampleCoeffs & operator=(const ResampleCoeffs &)=delete
ResampleCoeffs(ResampleCoeffs &&)=delete
void releaseCoeffs(double ratio)
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock)
~ResampleHQ() override
bool generateOutputImpl(float *dataOut, size_t num, EmuTime::param time) override
This file implemented 3 utility functions:
Definition Autofire.cc:11
FixedPoint< 16 > FilterIndex
Definition ResampleHQ.cc:49
constexpr void fill(ForwardRange &&range, const T &value)
Definition ranges.hh:315
constexpr void iota(ForwardIt first, ForwardIt last, T value)
Definition ranges.hh:322
auto find(InputRange &&range, const T &value)
Definition ranges.hh:162
constexpr auto copy(InputRange &&range, OutputIter out)
Definition ranges.hh:252
constexpr auto reverse(Range &&range)
Definition view.hh:435
constexpr auto subspan(Range &&range, size_t offset, size_t count=std::dynamic_extent)
Definition ranges.hh:481
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
Definition stl.hh:137
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
Definition stl.hh:112
constexpr auto xrange(T e)
Definition xrange.hh:132
constexpr auto end(const zstring_view &x)