41static constexpr std::array<float, 2464> coeffs = {
42 #include "ResampleCoeffs.ii"
47static constexpr int INDEX_INC = 128;
48static constexpr int COEFF_LEN = int(
std::size(coeffs));
49static constexpr int COEFF_HALF_LEN = COEFF_LEN - 1;
60 void getCoeffs(
double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute,
float*& table,
unsigned& filterLen);
70 static Table calcTable(
double ratio, std::span<int16_t, HALF_TAB_LEN> permute,
unsigned& filterLen);
79 std::vector<Element> cache;
82ResampleCoeffs::~ResampleCoeffs()
84 assert(cache.empty());
90 return resampleCoeffs;
94 double ratio, std::span<const int16_t, HALF_TAB_LEN>& permute,
float*& table,
unsigned& filterLen)
96 if (
auto it =
ranges::find(cache, ratio, &Element::ratio);
98 permute = std::span<int16_t, HALF_TAB_LEN>{it->permute.data(), HALF_TAB_LEN};
99 table = it->table.data();
100 filterLen = it->filterLen;
108 auto perm = std::span<int16_t, HALF_TAB_LEN>{elem.permute.data(), HALF_TAB_LEN};
109 elem.table = calcTable(ratio, perm, elem.filterLen);
111 table = elem.table.data();
112 filterLen = elem.filterLen;
113 cache.push_back(std::move(elem));
120 if (it->count == 0) {
232static constexpr unsigned N = TAB_LEN;
233static constexpr unsigned N1 = N - 1;
234static constexpr unsigned N2 = N / 2;
236static constexpr unsigned mapIdx(
unsigned x)
239 return (
t < N2) ?
t : N1 -
t;
242static constexpr std::pair<unsigned, unsigned> next(
unsigned x,
unsigned step)
244 return {mapIdx(x + step), mapIdx(N1 - x + step)};
247static void calcPermute(
double ratio, std::span<int16_t, HALF_TAB_LEN> permute)
249 double r2 = ratio * N;
250 double fract = r2 - floor(r2);
251 auto step = narrow_cast<unsigned>(floor(r2));
266 unsigned restart = incr ? 0 : N2 - 1;
267 unsigned curr = restart;
270 for (
auto i :
xrange(N2)) {
271 auto [nxt1, nxt2] = next(i, step);
272 if ((nxt1 == i) || (nxt2 == i)) { curr = i;
break; }
275 for (
unsigned i = N2 - 1; int(i) >= 0; --i) {
276 auto [nxt1, nxt2] = next(i, step);
277 if ((nxt1 == i) || (nxt2 == i)) { curr = i;
break; }
284 assert(permute[curr] == -1);
286 permute[curr] = narrow<int16_t>(cnt++);
288 auto [nxt1, nxt2] = next(curr, step);
289 if (permute[nxt1] == -1) {
292 }
else if (permute[nxt2] == -1) {
298 if (cnt == N2)
break;
301 while (permute[restart] != -1) {
304 assert(restart != N2);
306 assert(restart != 0);
314 std::array<int16_t, N2> testPerm;
316 assert(std::is_permutation(permute.begin(), permute.end(), testPerm.begin()));
322 double fraction = index.fractionAsDouble();
323 int indx = index.toInt();
324 return double(coeffs[indx]) +
325 fraction * (double(coeffs[indx + 1]) - double(coeffs[indx]));
328ResampleCoeffs::Table ResampleCoeffs::calcTable(
329 double ratio, std::span<int16_t, HALF_TAB_LEN> permute,
unsigned& filterLen)
331 calcPermute(ratio, permute);
333 double floatIncr = (ratio > 1.0) ? INDEX_INC / ratio : INDEX_INC;
334 double normFactor = floatIncr / INDEX_INC;
338 int min_idx = -maxFilterIndex.divAsInt(increment);
339 int max_idx = 1 + (maxFilterIndex - (increment -
FilterIndex(floatIncr))).divAsInt(increment);
340 int idx_cnt = max_idx - min_idx + 1;
341 filterLen = (idx_cnt + 3) & ~3;
342 min_idx -= (narrow<int>(filterLen) - idx_cnt) / 2;
343 Table table(HALF_TAB_LEN * filterLen);
344 ranges::fill(std::span{table.data(), HALF_TAB_LEN * filterLen}, 0);
346 for (
auto t :
xrange(HALF_TAB_LEN)) {
347 float* tab = &table[permute[
t] * filterLen];
348 double lastPos = (double(
t) + 0.5) / TAB_LEN;
352 int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
353 filterIndex += increment * coeffCount;
354 int bufIndex = -coeffCount;
356 tab[bufIndex - min_idx] =
357 float(getCoeff(filterIndex) * normFactor);
358 filterIndex -= increment;
362 filterIndex = increment - startFilterIndex;
363 coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
364 filterIndex += increment * coeffCount;
365 bufIndex = 1 + coeffCount;
367 tab[bufIndex - min_idx] =
368 float(getCoeff(filterIndex) * normFactor);
369 filterIndex -= increment;
376static const std::array<int16_t, HALF_TAB_LEN> dummyPermute = {};
378template<
unsigned CHANNELS>
382 , hostClock(hostClock_)
383 , ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
384 , permute(dummyPermute)
389 unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
392 size_t initialSize = 4000;
393 buffer.resize((initialSize + extra) * CHANNELS);
396template<
unsigned CHANNELS>
403template<
bool REVERSE>
404static inline void calcSseMono(
const float* buf_,
const float* tab_,
size_t len,
float* out)
406 assert((len % 4) == 0);
407 assert((uintptr_t(tab_) % 16) == 0);
409 auto x = narrow<ptrdiff_t>((len & ~7) *
sizeof(
float));
410 assert((x % 32) == 0);
411 const char* buf =
reinterpret_cast<const char*
>(buf_) + x;
412 const char* tab =
reinterpret_cast<const char*
>(tab_) + (REVERSE ? -x : x);
415 __m128 a0 = _mm_setzero_ps();
416 __m128 a1 = _mm_setzero_ps();
418 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + x + 0));
419 __m128 b1 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + x + 16));
421 if constexpr (REVERSE) {
422 t0 = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - x - 16));
423 t1 = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - x - 32));
425 t0 = _mm_load_ps (
reinterpret_cast<const float*
>(tab + x + 0));
426 t1 = _mm_load_ps (
reinterpret_cast<const float*
>(tab + x + 16));
428 __m128 m0 = _mm_mul_ps(b0, t0);
429 __m128 m1 = _mm_mul_ps(b1, t1);
430 a0 = _mm_add_ps(a0, m0);
431 a1 = _mm_add_ps(a1, m1);
432 x += 2 *
sizeof(__m128);
435 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf));
437 if constexpr (REVERSE) {
438 t0 = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 16));
440 t0 = _mm_load_ps (
reinterpret_cast<const float*
>(tab));
442 __m128 m0 = _mm_mul_ps(b0, t0);
443 a0 = _mm_add_ps(a0, m0);
446 __m128 a = _mm_add_ps(a0, a1);
449 __m128
t = _mm_add_ps(a, _mm_movehl_ps(a, a));
450 __m128 s = _mm_add_ss(
t, _mm_shuffle_ps(
t,
t, 1));
452 _mm_store_ss(out, s);
455template<
int N>
static inline __m128 shuffle(__m128 x)
457 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x), N));
459template<
bool REVERSE>
460static inline void calcSseStereo(
const float* buf_,
const float* tab_,
size_t len,
float* out)
462 assert((len % 4) == 0);
463 assert((uintptr_t(tab_) % 16) == 0);
465 auto x = narrow<ptrdiff_t>(2 * (len & ~7) *
sizeof(
float));
466 const char* buf =
reinterpret_cast<const char*
>(buf_) + x;
467 const char* tab =
reinterpret_cast<const char*
>(tab_);
470 __m128 a0 = _mm_setzero_ps();
471 __m128 a1 = _mm_setzero_ps();
472 __m128 a2 = _mm_setzero_ps();
473 __m128 a3 = _mm_setzero_ps();
475 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + x + 0));
476 __m128 b1 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + x + 16));
477 __m128 b2 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + x + 32));
478 __m128 b3 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + x + 48));
480 if constexpr (REVERSE) {
481 ta = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 16));
482 tb = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 32));
483 tab -= 2 *
sizeof(__m128);
485 ta = _mm_load_ps (
reinterpret_cast<const float*
>(tab + 0));
486 tb = _mm_load_ps (
reinterpret_cast<const float*
>(tab + 16));
487 tab += 2 *
sizeof(__m128);
489 __m128 t0 = shuffle<0x50>(ta);
490 __m128 t1 = shuffle<0xFA>(ta);
491 __m128 t2 = shuffle<0x50>(tb);
492 __m128 t3 = shuffle<0xFA>(tb);
493 __m128 m0 = _mm_mul_ps(b0, t0);
494 __m128 m1 = _mm_mul_ps(b1, t1);
495 __m128 m2 = _mm_mul_ps(b2, t2);
496 __m128 m3 = _mm_mul_ps(b3, t3);
497 a0 = _mm_add_ps(a0, m0);
498 a1 = _mm_add_ps(a1, m1);
499 a2 = _mm_add_ps(a2, m2);
500 a3 = _mm_add_ps(a3, m3);
501 x += 4 *
sizeof(__m128);
504 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + 0));
505 __m128 b1 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + 16));
507 if constexpr (REVERSE) {
508 ta = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 16));
510 ta = _mm_load_ps (
reinterpret_cast<const float*
>(tab + 0));
512 __m128 t0 = shuffle<0x50>(ta);
513 __m128 t1 = shuffle<0xFA>(ta);
514 __m128 m0 = _mm_mul_ps(b0, t0);
515 __m128 m1 = _mm_mul_ps(b1, t1);
516 a0 = _mm_add_ps(a0, m0);
517 a1 = _mm_add_ps(a1, m1);
520 __m128 a01 = _mm_add_ps(a0, a1);
521 __m128 a23 = _mm_add_ps(a2, a3);
522 __m128 a = _mm_add_ps(a01, a23);
524 __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
525 _mm_store_ss(&out[0], s);
526 _mm_store_ss(&out[1], shuffle<0x55>(s));
531template<
unsigned CHANNELS>
532void ResampleHQ<CHANNELS>::calcOutput(
533 float pos,
float* __restrict output)
535 assert((filterLen & 3) == 0);
537 int bufIdx = int(pos) + bufStart;
538 assert((bufIdx + filterLen) <= bufEnd);
540 const float* buf = &buffer[bufIdx];
542 auto t = size_t(lrintf(pos * TAB_LEN)) % TAB_LEN;
543 if (!(
t & HALF_TAB_LEN)) {
546 const float* tab = &table[
t * filterLen];
549 if constexpr (CHANNELS == 1) {
550 calcSseMono <false>(buf, tab, filterLen, output);
552 calcSseStereo<false>(buf, tab, filterLen, output);
558 for (
auto ch :
xrange(CHANNELS)) {
563 for (
unsigned i = 0; i < filterLen; i += 4) {
564 r0 += tab[i + 0] * buf[CHANNELS * (i + 0)];
565 r1 += tab[i + 1] * buf[CHANNELS * (i + 1)];
566 r2 += tab[i + 2] * buf[CHANNELS * (i + 2)];
567 r3 += tab[i + 3] * buf[CHANNELS * (i + 3)];
569 output[ch] = r0 + r1 + r2 + r3;
574 t = permute[TAB_LEN - 1 -
t];
575 const float* tab = &table[(
t + 1) * filterLen];
578 if constexpr (CHANNELS == 1) {
579 calcSseMono <true>(buf, tab, filterLen, output);
581 calcSseStereo<true>(buf, tab, filterLen, output);
587 for (
auto ch :
xrange(CHANNELS)) {
592 for (
int i = 0; i < int(filterLen); i += 4) {
593 r0 += tab[-i - 1] * buf[CHANNELS * (i + 0)];
594 r1 += tab[-i - 2] * buf[CHANNELS * (i + 1)];
595 r2 += tab[-i - 3] * buf[CHANNELS * (i + 2)];
596 r3 += tab[-i - 4] * buf[CHANNELS * (i + 3)];
598 output[ch] = r0 + r1 + r2 + r3;
604template<
unsigned CHANNELS>
605void ResampleHQ<CHANNELS>::prepareData(
unsigned emuNum)
608 unsigned free = unsigned(buffer.size() / CHANNELS) - bufEnd;
612 unsigned available = bufEnd - bufStart;
613 memmove(&buffer[0], &buffer[bufStart *
size_t(CHANNELS)],
614 available *
size_t(CHANNELS) *
sizeof(
float));
618 free = unsigned(buffer.size() / CHANNELS) - bufEnd;
619 auto missing = narrow_cast<int>(emuNum - free);
620 if (missing > 0) [[unlikely]] {
628 buffer.resize(buffer.size() + missing *
size_t(CHANNELS));
632 auto tmpBuf = tmpBufExtra.subspan(0, emuNum * CHANNELS);
633 if (input.generateInput(tmpBufExtra.data(), emuNum)) {
635 subspan(buffer, bufEnd * CHANNELS));
637 nonzeroSamples = bufEnd - bufStart;
643 assert(bufStart <= bufEnd);
644 assert(bufEnd <= (buffer.size() / CHANNELS));
647template<
unsigned CHANNELS>
649 float* __restrict dataOut,
size_t hostNum, EmuTime::param time)
651 auto& emuClk = getEmuClock();
652 unsigned emuNum = emuClk.getTicksTill(time);
657 bool notMuted = nonzeroSamples > 0;
660 EmuTime host1 = hostClock.getFastAdd(1);
661 assert(host1 > emuClk.getTime());
662 float pos = narrow_cast<float>(emuClk.getTicksTillDouble(host1));
663 assert(pos <= (ratio + 2));
664 for (
auto i :
xrange(hostNum)) {
665 calcOutput(pos, &dataOut[i * CHANNELS]);
671 nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
673 assert(bufStart <= bufEnd);
674 unsigned available = bufEnd - bufStart;
675 unsigned extra = filterLen + 1 + narrow_cast<int>(ratio) + 1;
676 assert(available == extra); (void)available; (void)extra;
Represents a clock with a variable frequency.
static ResampleCoeffs & instance()
ResampleCoeffs(const ResampleCoeffs &)=delete
void getCoeffs(double ratio, std::span< const int16_t, HALF_TAB_LEN > &permute, float *&table, unsigned &filterLen)
ResampleCoeffs & operator=(const ResampleCoeffs &)=delete
void releaseCoeffs(double ratio)
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock)
bool generateOutputImpl(float *dataOut, size_t num, EmuTime::param time) override
ALWAYS_INLINE unsigned count(const uint8_t *pIn, const uint8_t *pMatch, const uint8_t *pInLimit)
This file implemented 3 utility functions:
FixedPoint< 16 > FilterIndex
constexpr void fill(ForwardRange &&range, const T &value)
auto copy(InputRange &&range, OutputIter out)
constexpr void iota(ForwardIt first, ForwardIt last, T value)
auto find(InputRange &&range, const T &value)
size_t size(std::string_view utf8)
constexpr auto subspan(Range &&range, size_t offset, size_t count=std::dynamic_extent)
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
constexpr auto xrange(T e)
constexpr auto end(const zstring_view &x)