29 #include <emmintrin.h>
38 #include "ResampleCoeffs.ii"
// Looks up the coefficient data for 'ratio' in the cache, or builds and
// caches it when absent. Results are handed back through the reference
// out-parameters 'permute', 'table' and 'filterLen'.
56 void getCoeffs(
double ratio, int16_t*& permute,
float*&
table,
unsigned& filterLen);
// Builds a coefficient Table for 'ratio'. Fills the caller-provided
// 'permute' array (via calcPermute) and reports the per-row filter
// length through 'filterLen'.
66 static Table calcTable(
double ratio, int16_t* permute,
unsigned& filterLen);
// Previously computed entries; getCoeffs() searches this by ratio and
// appends to it on a miss.
75 std::vector<Element> cache;
// Destructor: asserts that the cache holds no entries any more.
// NOTE(review): presumably every getCoeffs() is expected to be paired with
// a releaseCoeffs() before destruction -- confirm against the callers.
78 ResampleCoeffs::~ResampleCoeffs()
80 assert(cache.empty());
// Hands back the 'resampleCoeffs' object. The enclosing function header and
// the definition of 'resampleCoeffs' are not visible in this view
// (presumably this is the body of instance() -- confirm).
86 return resampleCoeffs;
// Parameter list of the getCoeffs() definition (the line carrying the
// function name is not visible here). 'ratio' is the lookup key; 'permute',
// 'table' and 'filterLen' are outputs.
90 double ratio, int16_t*& permute,
float*&
table,
unsigned& filterLen)
// Cache lookup: search 'cache' for an element whose 'ratio' member equals
// the requested ratio.
92 if (
auto it =
ranges::find(cache, ratio, &Element::ratio);
// Cache hit: hand out pointers/values taken from the cached element.
94 permute = it->permute.data();
96 filterLen = it->filterLen;
// Cache miss: compute a new table. calcTable() also fills elem.permute and
// sets elem.filterLen.
104 elem.table = calcTable(ratio, elem.permute.data(), elem.filterLen);
// The out-pointers are taken before 'elem' is moved into the cache.
// NOTE(review): this relies on the move keeping the underlying storage of
// permute/table at the same address -- confirm for the member types used.
105 permute = elem.permute.data();
106 table = elem.table.data();
107 filterLen = elem.filterLen;
108 cache.push_back(std::move(elem));
// Acts only once the entry's 'count' has reached zero. The surrounding
// function body is not visible here; presumably 'count' is a use count and
// the entry is dropped from the cache at this point -- confirm.
115 if (it->count == 0) {
// Compile-time helpers derived from 'N' (defined outside this view):
// N1 = N - 1 and N2 = N / 2. N2 is the number of entries that calcPermute()
// fills in its 'permute' array.
228 constexpr
unsigned N1 =
N - 1;
229 constexpr
unsigned N2 =
N / 2;
// constexpr helper taking and returning an unsigned index. The body is not
// visible in this view; presumably it maps between two index spaces --
// TODO confirm against the full source.
231 static constexpr
unsigned mapIdx(
unsigned x)
// constexpr helper returning a pair of indices computed from 'x' and 'step'.
// The body is not visible in this view; calcPermute() consumes the pair as
// the two candidate successors (nxt1, nxt2) of an index.
237 static constexpr std::pair<unsigned, unsigned> next(
unsigned x,
unsigned step)
// Fills permute[0 .. N2) with sequence numbers 0 .. N2-1 in an order derived
// from 'ratio'. Several lines of the body (loop headers, the computation of
// 'incr' and 'cnt', closing braces) are not visible in this view.
242 static void calcPermute(
double ratio, int16_t* permute)
// Scale the ratio by N and split it into fractional part and integer part;
// the integer part becomes the step handed to next().
244 double r2 = ratio *
N;
245 double fract = r2 - floor(r2);
246 unsigned step = floor(r2);
// Mark every slot as unassigned (-1).
259 std::fill_n(permute,
N2, -1);
// Default starting slot: the front or the back of the array, depending on
// 'incr' (computed on lines not visible here).
261 unsigned restart = incr ? 0 :
N2 - 1;
262 unsigned curr = restart;
// Prefer starting at an index that is its own successor, if one exists.
266 auto [nxt1, nxt2] = next(i,
step);
267 if ((nxt1 == i) || (nxt2 == i)) { curr = i;
break; }
// Same search, scanning from the last index downwards. 'i' is unsigned, so
// the int(i) >= 0 test is what ends the loop after i wraps below zero.
270 for (
unsigned i =
N2 - 1; int(i) >= 0; --i) {
271 auto [nxt1, nxt2] = next(i,
step);
272 if ((nxt1 == i) || (nxt2 == i)) { curr = i;
break; }
// The current slot must still be free; give it the next sequence number.
279 assert(permute[curr] == -1);
281 permute[curr] = cnt++;
// Continue with whichever successor of 'curr' is still unassigned.
283 auto [nxt1, nxt2] = next(curr,
step);
284 if (permute[nxt1] == -1) {
287 }
else if (permute[nxt2] == -1) {
// Finished once all N2 slots carry a number.
293 if (cnt ==
N2)
break;
// Neither successor was free: move 'restart' past slots that are already
// assigned. The asserts guard against running off either end of the array.
296 while (permute[restart] != -1) {
299 assert(restart !=
N2);
301 assert(restart != 0);
// Debug check: the result must be a permutation of the reference array
// 'testPerm' (its initialisation is not visible here).
309 int16_t testPerm[
N2];
311 assert(std::is_permutation(permute, permute +
N2, testPerm));
// Returns the filter coefficient at fixed-point position 'index', linearly
// interpolated between the two neighbouring entries of 'coeffs'.
// Reads coeffs[indx] and coeffs[indx + 1], so indx + 1 must be a valid index.
315 static constexpr
double getCoeff(
FilterIndex index)
// Split the position into its fractional and integer parts.
317 double fraction = index.fractionAsDouble();
318 int indx = index.toInt();
// coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]), in double.
319 return double(
coeffs[indx]) +
320 fraction * (double(
coeffs[indx + 1]) - double(
coeffs[indx]));
// Builds the coefficient table for 'ratio'. As side outputs it fills
// 'permute' (via calcPermute) and sets 'filterLen' to the length of one
// table row. Several lines (definitions of floatIncr, increment,
// maxFilterIndex, the table allocation and the loop headers) are not visible
// in this view.
323 ResampleCoeffs::Table ResampleCoeffs::calcTable(
324 double ratio, int16_t* permute,
unsigned& filterLen)
326 calcPermute(ratio, permute);
// Scale factor applied to every coefficient written below.
329 double normFactor = floatIncr /
INDEX_INC;
// Range of tap indices [min_idx, max_idx] covered by the filter.
333 int min_idx = -maxFilterIndex.divAsInt(increment);
334 int max_idx = 1 + (maxFilterIndex - (increment -
FilterIndex(floatIncr))).divAsInt(increment);
335 int idx_cnt = max_idx - min_idx + 1;
// Round the tap count up to a multiple of 4; the SSE kernels in this file
// assert (len % 4) == 0.
336 filterLen = (idx_cnt + 3) & ~3;
// Lower min_idx by half of the padding so that roughly half of the padding
// slots end up before the real taps (tab is indexed with bufIndex - min_idx).
337 min_idx -= (filterLen - idx_cnt) / 2;
// The row for table position 't' is stored at the permuted row index.
342 float* tab = &
table[permute[
t] * filterLen];
// Mid-point of slot t, normalised by TAB_LEN.
343 double lastPos = (double(
t) + 0.5) /
TAB_LEN;
// First side: jump to the furthest tap that is still within maxFilterIndex,
// then write one normalised coefficient per tap while stepping filterIndex
// back by 'increment' (loop header not visible here).
347 int coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
348 filterIndex += increment * coeffCount;
349 int bufIndex = -coeffCount;
351 tab[bufIndex - min_idx] =
352 float(getCoeff(filterIndex) * normFactor);
353 filterIndex -= increment;
// Second side: same procedure starting from increment - startFilterIndex,
// writing to the positive tap indices.
357 filterIndex = increment - startFilterIndex;
358 coeffCount = (maxFilterIndex - filterIndex).divAsInt(increment);
359 filterIndex += increment * coeffCount;
360 bufIndex = 1 + coeffCount;
362 tab[bufIndex - min_idx] =
363 float(getCoeff(filterIndex) * normFactor);
364 filterIndex -= increment;
// ResampleHQ constructor (the line with the constructor name and the first
// initialisers is not visible here): stores the host clock, derives the
// resampling ratio and sizes the sample buffer.
372 template<
unsigned CHANNELS>
376 , hostClock(hostClock_)
// ratio = host clock period / emulated clock period, stored as float.
377 , ratio(float(hostClock.getPeriod().toDouble() / getEmuClock().getPeriod().toDouble()))
// Head-room in samples beyond the nominal size. The same expression is used
// in the end-of-output assert that checks how many samples stay buffered.
382 unsigned extra = int(filterLen + 1 + ratio + 1);
// Initial size in samples; the buffer stores CHANNELS floats per sample.
// prepareData() enlarges the buffer when more room is needed.
386 unsigned initialSize = 4000;
387 buffer.resize((initialSize + extra) *
CHANNELS);
// Template header of a ResampleHQ<CHANNELS> member whose definition is not
// visible in this view.
390 template<
unsigned CHANNELS>
// SSE dot product for a single channel: multiplies 'len' input floats from
// buf_ with 'len' filter coefficients and writes the scalar sum to *out.
// With REVERSE == false the coefficients are read forwards starting at tab_;
// with REVERSE == true they are read backwards from the addresses just below
// tab_. The loop header, the 4-float tail guard and the declarations of
// t0/t1 are on lines not visible in this view.
397 template<
bool REVERSE>
398 static inline void calcSseMono(
const float* buf_,
const float* tab_,
size_t len,
float* out)
// Preconditions: length is a multiple of 4 and the coefficient pointer is
// 16-byte aligned (aligned loads are used on it below).
400 assert((len % 4) == 0);
401 assert((uintptr_t(tab_) % 16) == 0);
// Byte size of the part of the input that is a multiple of 8 floats.
403 ptrdiff_t
x = (len & ~7) *
sizeof(
float);
404 assert((
x % 32) == 0);
// Position both byte cursors at the far end of that part (mirrored for the
// coefficient cursor when REVERSE). The loads below address relative to
// these cursors using offset x; the lines that adjust x before the loop are
// not visible here.
405 const char* buf =
reinterpret_cast<const char*
>(buf_) +
x;
406 const char* tab =
reinterpret_cast<const char*
>(tab_) + (REVERSE ? -
x :
x);
// Two independent accumulators, each holding 4 partial sums.
409 __m128 a0 = _mm_setzero_ps();
410 __m128 a1 = _mm_setzero_ps();
// Load 8 input floats (unaligned loads).
412 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf +
x + 0));
413 __m128 b1 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf +
x + 16));
// Load the matching 8 coefficients: element order reversed (_mm_loadr_ps)
// and walking downwards when REVERSE, plain aligned loads otherwise.
415 if constexpr (REVERSE) {
416 t0 = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab -
x - 16));
417 t1 = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab -
x - 32));
419 t0 = _mm_load_ps (
reinterpret_cast<const float*
>(tab +
x + 0));
420 t1 = _mm_load_ps (
reinterpret_cast<const float*
>(tab +
x + 16));
// Multiply element-wise and accumulate.
422 __m128 m0 = _mm_mul_ps(b0, t0);
423 __m128 m1 = _mm_mul_ps(b1, t1);
424 a0 = _mm_add_ps(a0, m0);
425 a1 = _mm_add_ps(a1, m1);
// Advance by 8 floats (two SSE registers).
426 x += 2 *
sizeof(__m128);
// Tail: one more group of 4 floats located at the cursors themselves
// (presumably guarded by a test on len & 4 that is not visible here).
429 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf));
431 if constexpr (REVERSE) {
432 t0 = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 16));
434 t0 = _mm_load_ps (
reinterpret_cast<const float*
>(tab));
436 __m128 m0 = _mm_mul_ps(b0, t0);
437 a0 = _mm_add_ps(a0, m0);
// Merge the two accumulators.
440 __m128 a = _mm_add_ps(a0, a1);
// Horizontal sum: add the upper two lanes onto the lower two, then lane 1
// onto lane 0.
443 __m128
t = _mm_add_ps(a, _mm_movehl_ps(a, a));
444 __m128 s = _mm_add_ss(
t, _mm_shuffle_ps(
t,
t, 1));
// Store lane 0 as the result.
446 _mm_store_ss(out, s);
// Rearranges the four 32-bit lanes of 'x' as selected by the immediate N.
// Uses the integer shuffle (_mm_shuffle_epi32) on the float register; the
// two casts only reinterpret the bits, they do not convert values.
449 template<
int N>
static inline __m128 shuffle(__m128
x)
451 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(
x),
N));
// SSE dot product for two channels: buf_ holds 2 floats per sample and tab_
// holds 'len' coefficients. Each coefficient is duplicated into two adjacent
// lanes so that it multiplies both floats of one sample; the two resulting
// sums are written to out[0] and out[1]. With REVERSE the coefficients are
// read backwards from the addresses just below tab_. The loop header, the
// tail guard and the declarations of ta/tb are on lines not visible here.
453 template<
bool REVERSE>
454 static inline void calcSseStereo(
const float* buf_,
const float* tab_,
size_t len,
float* out)
// Preconditions: length is a multiple of 4 and the coefficient pointer is
// 16-byte aligned.
456 assert((len % 4) == 0);
457 assert((uintptr_t(tab_) % 16) == 0);
// Byte size of the multiple-of-8-samples part of the input; the factor 2
// accounts for the two floats per sample.
459 ptrdiff_t
x = 2 * (len & ~7) *
sizeof(
float);
// The input cursor is placed at the end of that part and addressed with
// offset x; the coefficient cursor starts at tab_ and is moved explicitly
// after each group of loads.
460 const char* buf =
reinterpret_cast<const char*
>(buf_) +
x;
461 const char* tab =
reinterpret_cast<const char*
>(tab_);
// Four independent accumulators.
464 __m128 a0 = _mm_setzero_ps();
465 __m128 a1 = _mm_setzero_ps();
466 __m128 a2 = _mm_setzero_ps();
467 __m128 a3 = _mm_setzero_ps();
// Load 16 input floats = 8 stereo samples (unaligned loads).
469 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf +
x + 0));
470 __m128 b1 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf +
x + 16));
471 __m128 b2 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf +
x + 32));
472 __m128 b3 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf +
x + 48));
// Load the matching 8 coefficients and move the coefficient cursor:
// reversed element order and downwards when REVERSE, forwards otherwise.
474 if constexpr (REVERSE) {
475 ta = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 16));
476 tb = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 32));
477 tab -= 2 *
sizeof(__m128);
479 ta = _mm_load_ps (
reinterpret_cast<const float*
>(tab + 0));
480 tb = _mm_load_ps (
reinterpret_cast<const float*
>(tab + 16));
481 tab += 2 *
sizeof(__m128);
// Duplicate coefficients per sample: 0x50 selects lanes (0,0,1,1) and 0xFA
// selects lanes (2,2,3,3).
483 __m128 t0 = shuffle<0x50>(ta);
484 __m128 t1 = shuffle<0xFA>(ta);
485 __m128 t2 = shuffle<0x50>(tb);
486 __m128 t3 = shuffle<0xFA>(tb);
// Multiply element-wise and accumulate.
487 __m128 m0 = _mm_mul_ps(b0, t0);
488 __m128 m1 = _mm_mul_ps(b1, t1);
489 __m128 m2 = _mm_mul_ps(b2, t2);
490 __m128 m3 = _mm_mul_ps(b3, t3);
491 a0 = _mm_add_ps(a0, m0);
492 a1 = _mm_add_ps(a1, m1);
493 a2 = _mm_add_ps(a2, m2);
494 a3 = _mm_add_ps(a3, m3);
// Advance the input offset by 16 floats (four SSE registers).
495 x += 4 *
sizeof(__m128);
// Tail: one more group of 4 samples (8 input floats, 4 coefficients),
// presumably guarded by a test on len & 4 that is not visible here.
498 __m128 b0 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + 0));
499 __m128 b1 = _mm_loadu_ps(
reinterpret_cast<const float*
>(buf + 16));
501 if constexpr (REVERSE) {
502 ta = _mm_loadr_ps(
reinterpret_cast<const float*
>(tab - 16));
504 ta = _mm_load_ps (
reinterpret_cast<const float*
>(tab + 0));
506 __m128 t0 = shuffle<0x50>(ta);
507 __m128 t1 = shuffle<0xFA>(ta);
508 __m128 m0 = _mm_mul_ps(b0, t0);
509 __m128 m1 = _mm_mul_ps(b1, t1);
510 a0 = _mm_add_ps(a0, m0);
511 a1 = _mm_add_ps(a1, m1);
// Merge the four accumulators into one register.
514 __m128 a01 = _mm_add_ps(a0, a1);
515 __m128 a23 = _mm_add_ps(a2, a3);
516 __m128 a = _mm_add_ps(a01, a23);
// Add the upper two lanes onto the lower two: lane 0 then holds the sum of
// the even lanes, lane 1 the sum of the odd lanes.
518 __m128 s = _mm_add_ps(a, _mm_movehl_ps(a, a));
// Store lane 0 to out[0] and lane 1 (broadcast by 0x55) to out[1].
519 _mm_store_ss(&out[0], s);
520 _mm_store_ss(&out[1], shuffle<0x55>(s));
// Computes one output sample per channel at fractional input position 'pos'
// and writes the result(s) to 'output'. The selection between the forward
// and the mirrored table row, the choice between the SSE kernels and the
// generic loop, and the computation of 't' are on lines not visible here.
525 template<
unsigned CHANNELS>
526 void ResampleHQ<CHANNELS>::calcOutput(
527 float pos,
float* __restrict output)
// The kernels below process taps in groups of 4.
529 assert((filterLen & 3) == 0);
// The integer part of 'pos' selects the first input sample of the filter
// window; the whole window must lie inside the valid buffer region.
531 int bufIdx = int(pos) + bufStart;
532 assert((bufIdx + filterLen) <= bufEnd);
534 const float* buf = &buffer[bufIdx];
// Forward case: use row 't' of the table, read front to back.
540 const float* tab = &
table[
t * filterLen];
// SSE kernels for one resp. two channels.
544 calcSseMono <false>(buf, tab, filterLen, output);
546 calcSseStereo<false>(buf, tab, filterLen, output);
// Generic path: 4-way unrolled dot product; consecutive samples of one
// channel are CHANNELS floats apart in 'buf'.
557 for (
unsigned i = 0; i < filterLen; i += 4) {
558 r0 += tab[i + 0] * buf[
CHANNELS * (i + 0)];
559 r1 += tab[i + 1] * buf[
CHANNELS * (i + 1)];
560 r2 += tab[i + 2] * buf[
CHANNELS * (i + 2)];
561 r3 += tab[i + 3] * buf[
CHANNELS * (i + 3)];
563 output[ch] = r0 + r1 + r2 + r3;
// Mirrored case: point at the end of row 't' (the start of row t + 1) and
// read the coefficients backwards from there.
569 const float* tab = &
table[(
t + 1) * filterLen];
573 calcSseMono <true>(buf, tab, filterLen, output);
575 calcSseStereo<true>(buf, tab, filterLen, output);
// Generic path for the mirrored case: same unrolled loop with negative
// coefficient indices. 'i' is a signed int here so that -i - 1 is negative.
586 for (
int i = 0; i < int(filterLen); i += 4) {
587 r0 += tab[-i - 1] * buf[
CHANNELS * (i + 0)];
588 r1 += tab[-i - 2] * buf[
CHANNELS * (i + 1)];
589 r2 += tab[-i - 3] * buf[
CHANNELS * (i + 2)];
590 r3 += tab[-i - 4] * buf[
CHANNELS * (i + 3)];
592 output[ch] = r0 + r1 + r2 + r3;
// Makes room for 'emuNum' new input samples behind the currently buffered
// ones and appends them (or silence). bufStart/bufEnd count samples; the
// buffer stores CHANNELS floats per sample. The conditions guarding the
// compaction and the resize, the remaining memcpy/memset arguments and the
// update of bufEnd are on lines not visible in this view.
598 template<
unsigned CHANNELS>
599 void ResampleHQ<CHANNELS>::prepareData(
unsigned emuNum)
// Free space, in samples, behind the current end of the data.
602 unsigned free = unsigned(buffer.size() /
CHANNELS) - bufEnd;
// Compaction: move the samples in [bufStart, bufEnd) to the front of the
// buffer. memmove is used because source and destination may overlap.
606 unsigned available = bufEnd - bufStart;
607 memmove(&buffer[0], &buffer[bufStart *
CHANNELS],
608 available *
CHANNELS *
sizeof(
float));
// Re-evaluate the free space; 'missing' is the shortfall in samples
// (signed, so it can be negative when there is room to spare).
612 free = unsigned(buffer.size() /
CHANNELS) - bufEnd;
613 int missing = emuNum - free;
// Grow the buffer by the shortfall.
622 buffer.resize(buffer.size() + missing *
CHANNELS);
// generateInput() returned true: copy the fresh samples from tmpBuf to the
// position just behind the existing data.
626 if (input.generateInput(tmpBuf, emuNum)) {
627 memcpy(&buffer[bufEnd *
CHANNELS], tmpBuf,
// Record the current window size as the number of samples that may be
// non-silent.
630 nonzeroSamples = bufEnd - bufStart;
// Other branch (its header is not visible here): fill the new region with
// zeros instead.
632 memset(&buffer[bufEnd *
CHANNELS], 0,
// Invariants: the window is well-formed and stays inside the buffer.
637 assert(bufStart <= bufEnd);
638 assert(bufEnd <= (buffer.size() /
CHANNELS));
// Produces 'hostNum' output samples (CHANNELS floats each) into 'dataOut',
// covering emulated time up to 'time'. The line with the function name is
// not visible in this view (presumably ResampleHQ::generateOutputImpl --
// confirm); neither are the call that fetches new input, the muted path, the
// per-sample advance of 'pos' or the return statement.
641 template<
unsigned CHANNELS>
643 float* __restrict dataOut,
unsigned hostNum, EmuTime::param time)
// Number of emulated-clock ticks up to 'time'.
645 auto& emuClk = getEmuClock();
646 unsigned emuNum = emuClk.getTicksTill(time);
// Output only needs to be computed while non-silent samples are buffered.
651 bool notMuted = nonzeroSamples > 0;
// Position of the first host sample, measured in emulated-clock ticks from
// the current emulated time; it must lie in the future and within ratio + 2.
654 EmuTime host1 = hostClock.getFastAdd(1);
655 assert(host1 > emuClk.getTime());
656 float pos = emuClk.getTicksTillDouble(host1);
657 assert(pos <= (ratio + 2));
// One calcOutput() call per host sample.
658 for (
auto i :
xrange(hostNum)) {
659 calcOutput(pos, &dataOut[i *
CHANNELS]);
// Reduce the count of possibly non-silent samples by the number of emulated
// ticks, clamped at zero.
665 nonzeroSamples = std::max<int>(0, nonzeroSamples - emuNum);
// Post-condition (debug builds only): exactly 'extra' samples remain
// buffered. The (void) casts silence unused-variable warnings when assert
// compiles to nothing.
667 assert(bufStart <= bufEnd);
668 unsigned available = bufEnd - bufStart;
669 unsigned extra = int(filterLen + 1 + ratio + 1);
670 assert(available == extra); (void)available; (void)extra;
constexpr unsigned CHANNELS
Represents a clock with a variable frequency.
const T * data() const
Returns pointer to the start of the memory buffer.
static ResampleCoeffs & instance()
ResampleCoeffs(const ResampleCoeffs &)=delete
ResampleCoeffs & operator=(const ResampleCoeffs &)=delete
void getCoeffs(double ratio, int16_t *&permute, float *&table, unsigned &filterLen)
void releaseCoeffs(double ratio)
bool generateOutputImpl(float *dataOut, unsigned num, EmuTime::param time) override
ResampleHQ(ResampledSoundDevice &input, const DynamicClock &hostClock)
ALWAYS_INLINE unsigned count(const uint8_t *pIn, const uint8_t *pMatch, const uint8_t *pInLimit)
This file implements 3 utility functions:
constexpr int COEFF_HALF_LEN
FixedPoint< 16 > FilterIndex
constexpr unsigned TAB_LEN
constexpr unsigned HALF_TAB_LEN
constexpr KeyMatrixPosition x
Keyboard bindings.
constexpr void iota(ForwardIt first, ForwardIt last, T value)
auto find(InputRange &&range, const T &value)
size_t size(std::string_view utf8)
void move_pop_back(VECTOR &v, typename VECTOR::iterator it)
Erase the pointed to element from the given vector.
auto rfind_unguarded(RANGE &range, const VAL &val, Proj proj={})
Similar to the find(_if)_unguarded functions above, but searches from the back to front.
#define VLA_SSE_ALIGNED(TYPE, NAME, LENGTH)
constexpr auto xrange(T e)
constexpr auto end(const zstring_view &x)