Skip to content

Commit 38cf226

Browse files
committed
Version 5.4.
Added compiler specializations to previously optimized inner loops. "Shuffled" SIMD interpolation code is not efficient on Apple M1. The Intel C++ Compiler vectorizes "whole stepping" interpolation as well as manually written SSE code. Reorganized SIMD instructions for slightly better performance. Changed internal buffer sizes of half-band resamplers (1-2% performance boost). Fixed compiler warnings in PFFFT code. Added several asserts to the code.
1 parent 8c32e8e commit 38cf226

17 files changed

+98
-53
lines changed

CDSPBlockConvolver.h

+2
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ class CDSPBlockConvolver : public CDSPProcessor
157157
}
158158
}
159159

160+
R8BASSERT( Latency >= 0 );
161+
160162
fftin = new CDSPRealFFTKeeper( fftinBits );
161163

162164
if( fftoutBits == fftinBits )

CDSPFracInterpolator.h

+32-23
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ class CDSPFracDelayFilterBank : public R8B_BASECLASS
141141
p += ElementSize;
142142
}
143143

144-
#if defined( R8B_SSE2 ) || defined( R8B_NEON )
144+
#if defined( R8B_SIMD_ISH )
145145
shuffle2_3( Table, TableEnd );
146146
#endif // SIMD
147147
}
@@ -160,7 +160,7 @@ class CDSPFracDelayFilterBank : public R8B_BASECLASS
160160
p += ElementSize;
161161
}
162162

163-
#if defined( R8B_SSE2 ) || defined( R8B_NEON )
163+
#if defined( R8B_SIMD_ISH )
164164
shuffle2_4( Table, TableEnd );
165165
#endif // SIMD
166166
}
@@ -177,7 +177,7 @@ class CDSPFracDelayFilterBank : public R8B_BASECLASS
177177
p += ElementSize;
178178
}
179179

180-
#if defined( R8B_SSE2 ) || defined( R8B_NEON )
180+
#if defined( R8B_SIMD_ISH )
181181
shuffle2_2( Table, TableEnd );
182182
#endif // SIMD
183183
}
@@ -717,12 +717,13 @@ class CDSPFracInterpolator : public CDSPProcessor
717717
R8BASSERT( DstSampleRate > 0.0 );
718718
R8BASSERT( PrevLatency >= 0.0 );
719719
R8BASSERT( BufLenBits >= 5 );
720-
R8BASSERT(( 1 << BufLenBits ) >= FilterLen * 3 );
721720

722721
InitFracPos = PrevLatency;
723722
Latency = (int) InitFracPos;
724723
InitFracPos -= Latency;
725724

725+
R8BASSERT( Latency >= 0 );
726+
726727
#if R8B_FLTTEST
727728

728729
IsWhole = false;
@@ -756,6 +757,8 @@ class CDSPFracInterpolator : public CDSPProcessor
756757
fll = fl2 - 1;
757758
flo = fll + fl2;
758759

760+
R8BASSERT(( 1 << BufLenBits ) >= FilterLen * 3 );
761+
759762
static const CConvolveFn FltConvFn0[ 13 ] = {
760763
&CDSPFracInterpolator :: convolve0< 6 >,
761764
&CDSPFracInterpolator :: convolve0< 8 >,
@@ -1004,7 +1007,7 @@ class CDSPFracInterpolator : public CDSPProcessor
10041007
const double* const rp = Buf + ReadPos;
10051008
int i;
10061009

1007-
#if defined( R8B_SSE2 )
1010+
#if defined( R8B_SSE2 ) && !defined( __INTEL_COMPILER )
10081011

10091012
__m128d s = _mm_setzero_pd();
10101013

@@ -1024,10 +1027,7 @@ class CDSPFracInterpolator : public CDSPProcessor
10241027

10251028
for( i = 0; i < fltlen; i += 2 )
10261029
{
1027-
const float64x2_t m = vmulq_f64( vld1q_f64( ftp + i ),
1028-
vld1q_f64( rp + i ));
1029-
1030-
s = vaddq_f64( s, m );
1030+
s = vmlaq_f64( s, vld1q_f64( ftp + i ), vld1q_f64( rp + i ));
10311031
}
10321032

10331033
*op = vaddvq_f64( s );
@@ -1081,40 +1081,49 @@ class CDSPFracInterpolator : public CDSPProcessor
10811081
const double* const rp = Buf + ReadPos;
10821082
int i;
10831083

1084-
#if defined( R8B_SSE2 )
1084+
#if defined( R8B_SSE2 ) && defined( R8B_SIMD_ISH )
10851085

10861086
const __m128d x1 = _mm_set1_pd( x );
10871087
const __m128d x2 = _mm_set1_pd( x2d );
10881088
__m128d s = _mm_setzero_pd();
10891089

10901090
for( i = 0; i < fltlen; i += 2 )
10911091
{
1092-
const __m128d xx1 = _mm_mul_pd( _mm_load_pd( ftp + 2 ), x1 );
1093-
const __m128d xx2 = _mm_mul_pd( _mm_load_pd( ftp + 4 ), x2 );
1094-
const __m128d xxs1 = _mm_add_pd( xx1, xx2 );
1095-
const __m128d xxs2 = _mm_add_pd( _mm_load_pd( ftp ), xxs1 );
1096-
1097-
s = _mm_add_pd( s, _mm_mul_pd( xxs2, _mm_loadu_pd( rp + i )));
1092+
const __m128d ftp2 = _mm_load_pd( ftp + 2 );
1093+
const __m128d xx1 = _mm_mul_pd( ftp2, x1 );
1094+
const __m128d ftp4 = _mm_load_pd( ftp + 4 );
1095+
const __m128d xx2 = _mm_mul_pd( ftp4, x2 );
1096+
const __m128d ftp0 = _mm_load_pd( ftp );
10981097
ftp += 6;
1098+
1099+
const __m128d rpi = _mm_loadu_pd( rp + i );
1100+
const __m128d xxs = _mm_add_pd( ftp0, _mm_add_pd( xx1, xx2 ));
1101+
1102+
s = _mm_add_pd( s, _mm_mul_pd( rpi, xxs ));
10991103
}
11001104

11011105
_mm_storel_pd( op, _mm_add_pd( s, _mm_shuffle_pd( s, s, 1 )));
11021106

1103-
#elif defined( R8B_NEON )
1107+
#elif defined( R8B_NEON ) && defined( R8B_SIMD_ISH )
11041108

11051109
const float64x2_t x1 = vdupq_n_f64( x );
11061110
const float64x2_t x2 = vdupq_n_f64( x2d );
11071111
float64x2_t s = vdupq_n_f64( 0.0 );
11081112

11091113
for( i = 0; i < fltlen; i += 2 )
11101114
{
1111-
const float64x2_t xx1 = vmulq_f64( vld1q_f64( ftp + 2 ), x1 );
1112-
const float64x2_t xx2 = vmulq_f64( vld1q_f64( ftp + 4 ), x2 );
1113-
const float64x2_t xxs1 = vaddq_f64( xx1, xx2 );
1114-
const float64x2_t xxs2 = vaddq_f64( vld1q_f64( ftp ), xxs1 );
1115-
1116-
s = vaddq_f64( s, vmulq_f64( xxs2, vld1q_f64( rp + i )));
1115+
const float64x2_t ftp2 = vld1q_f64( ftp + 2 );
1116+
const float64x2_t xx1 = vmulq_f64( ftp2, x1 );
1117+
const float64x2_t ftp4 = vld1q_f64( ftp + 4 );
1118+
const float64x2_t xx2 = vmulq_f64( ftp4, x2 );
1119+
const float64x2_t ftp0 = vld1q_f64( ftp );
11171120
ftp += 6;
1121+
1122+
const float64x2_t rpi = vld1q_f64( rp + i );
1123+
const float64x2_t xxs = vaddq_f64( ftp0,
1124+
vaddq_f64( xx1, xx2 ));
1125+
1126+
s = vmlaq_f64( s, rpi, xxs );
11181127
}
11191128

11201129
*op = vaddvq_f64( s );

CDSPHBDownsampler.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ class CDSPHBDownsampler : public CDSPProcessor
7979
Latency = (int) LatencyFrac;
8080
LatencyFrac -= Latency;
8181

82+
R8BASSERT( Latency >= 0 );
83+
8284
R8BCONSOLE( "CDSPHBDownsampler: taps=%i third=%i att=%.1f io=1/2\n",
8385
fltt, (int) IsThird, att );
8486

@@ -171,7 +173,7 @@ class CDSPHBDownsampler : public CDSPProcessor
171173
}
172174

173175
private:
174-
static const int BufLenBits = 8; ///< The length of the ring buffer,
176+
static const int BufLenBits = 10; ///< The length of the ring buffer,
175177
///< expressed as Nth power of 2. This value can be reduced if it is
176178
///< known that only short input buffers will be passed to the
177179
///< interpolator. The minimum value of this parameter is 5, and
@@ -223,7 +225,7 @@ class CDSPHBDownsampler : public CDSPProcessor
223225
const double* const rp0, int& ReadPos0 ) \
224226
{ \
225227
int rpos = ReadPos0; \
226-
while( op < opend ) \
228+
while( op != opend ) \
227229
{ \
228230
const double* const rp = rp0 + rpos; \
229231
*op = rp[ 0 ] +

CDSPHBUpsampler.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,8 @@ class CDSPHBUpsampler : public CDSPProcessor
591591
Latency = (int) LatencyFrac;
592592
LatencyFrac -= Latency;
593593

594+
R8BASSERT( Latency >= 0 );
595+
594596
R8BCONSOLE( "CDSPHBUpsampler: sti=%i third=%i taps=%i att=%.1f "
595597
"io=2/1\n", SteepIndex, (int) IsThird, fltt, att );
596598

@@ -683,7 +685,7 @@ class CDSPHBUpsampler : public CDSPProcessor
683685
}
684686

685687
private:
686-
static const int BufLenBits = 8; ///< The length of the ring buffer,
688+
static const int BufLenBits = 9; ///< The length of the ring buffer,
687689
///< expressed as Nth power of 2. This value can be reduced if it is
688690
///< known that only short input buffers will be passed to the
689691
///< interpolator. The minimum value of this parameter is 5, and
@@ -735,7 +737,7 @@ class CDSPHBUpsampler : public CDSPProcessor
735737
const double* const rp0, int& ReadPos0 ) \
736738
{ \
737739
int rpos = ReadPos0; \
738-
while( op < opend ) \
740+
while( op != opend ) \
739741
{ \
740742
const double* const rp = rp0 + rpos; \
741743
op[ 0 ] = rp[ 0 ]; \

DLL/Win32/r8bsrc.dll

512 Bytes
Binary file not shown.

DLL/Win32/r8bsrc.lib

0 Bytes
Binary file not shown.

DLL/Win64/r8bsrc.dll

0 Bytes
Binary file not shown.

DLL/Win64/r8bsrc.lib

0 Bytes
Binary file not shown.

README.md

+20-9
Original file line numberDiff line numberDiff line change
@@ -104,16 +104,15 @@ style. To generate the documentation locally you may run the
104104
`doxygen ./other/r8bdoxy.txt` command from the library's directory.
105105

106106
Preliminary tests show that the r8b::CDSPResampler24 resampler class achieves
107-
`61.2*n_cores` Mflops (`83.3*n_cores` for Intel IPP FFT) when converting 1
107+
`31*n_cores` Mrops (`46*n_cores` for Intel IPP FFT) when converting 1
108108
channel of 24-bit audio from 44100 to 96000 sample rate (2% transition band),
109-
on an Intel Core i7-7700K processor-based 64-bit AVX2-enabled system without
110-
overclocking. This approximately translates to a real-time resampling of
111-
`637*n_cores` (`868*n_cores`) audio streams, at 100% CPU load. Speed
112-
performance when converting to other sample rates may vary greatly. When
113-
comparing performance of this resampler library to another library make sure
114-
that the competing library is also tuned to produce a fully linear-phase
115-
response, has similar stop-band characteristics, and similar sample timing
116-
precision.
109+
on a Ryzen 3700X processor-based 64-bit system. This approximately translates
110+
to a real-time resampling of `700*n_cores` (`1000*n_cores`) audio streams, at
111+
100% CPU load. Speed performance when converting to other sample rates may
112+
vary greatly. When comparing performance of this resampler library to another
113+
library make sure that the competing library is also tuned to produce a fully
114+
linear-phase response, has similar stop-band characteristics, and similar
115+
sample timing precision.
117116

118117
## Dynamic Link Library ##
119118

@@ -207,6 +206,18 @@ inclusion into this list is not mandatory.
207206

208207
## Change Log ##
209208

209+
Version 5.4:
210+
211+
* Added compiler specializations to previously optimized inner loops.
212+
"Shuffled" SIMD interpolation code is not efficient on Apple M1. Intel C++
213+
Compiler vectorizes "whole stepping" interpolation as well as
214+
manually written SSE code.
215+
* Reorganized SIMD instructions for slightly better performance.
216+
* Changed internal buffer sizes of half-band resamplers (1-2% performance
217+
boost).
218+
* Fixed compiler warnings in PFFFT code.
219+
* Added several asserts to the code.
220+
210221
Version 5.3:
211222

212223
* Optimized inner loops of the fractional interpolator, added SSE2 and NEON

bench/Win64/r8bfreesrc.exe

0 Bytes
Binary file not shown.

bench/masstest.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,18 @@ VOXMAIN
125125

126126
const TClock t1( CSystem :: getClock() );
127127
Resamp -> oneshot( &Ref[ 0 ], InBufSize, &OutBuf[ 0 ], ol );
128-
const double perf = 1e-6 * ol / CSystem :: getClockDiffSec( t1 );
128+
double perf = 1e-6 * InBufSize /
129+
CSystem :: getClockDiffSec( t1 );
129130

130131
// addSine( OutBuf, ol, ( SrcSampleRate + DstSampleRate ) * 0.25,
131132
// DstSampleRate );
132133

133134
Resamp = new CResamp( DstSampleRate, SrcSampleRate, MaxInLen, tb );
135+
136+
const TClock t2( CSystem :: getClock() );
134137
Resamp -> oneshot( &OutBuf[ 0 ], ol, &OutBuf2[ 0 ], InBufSize );
138+
perf = ( perf + 1e-6 * InBufSize /
139+
CSystem :: getClockDiffSec( t2 )) * 0.5;
135140

136141
const double r = calcRMS( &Ref[ 5000 ], &OutBuf2[ 5000 ],
137142
InBufSize - 10000, peakd );
@@ -150,7 +155,7 @@ VOXMAIN
150155
printf( "avg rms %.2f\n", 10.0 * log( avgr / TestCount ) / log( 10.0 ));
151156
printf( "max rms %.2f\n", 20.0 * log( maxr ) / log( 10.0 ));
152157
printf( "peak diff %.2f\n", 20.0 * log( peakd ) / log( 10.0 ));
153-
printf( "avg perf %.2f Mflops\n", avgperf / TestCount );
158+
printf( "avg perf %.2f Mrops\n", avgperf / TestCount );
154159
printf( "avg latency %.0f\n", avglatency / TestCount );
155160

156161
VOXRET;

bench/r8bfreesrc.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ VOXMAIN
9292
int64_t ol = (int64_t) ( inf.SampleCount * OutSampleRate /
9393
InSampleRate );
9494

95+
const int64_t ol0 = inf.SampleCount * inf.ChannelCount;
9596
int64_t ool = 0;
9697
double srct = 0.0;
9798
CArray< double* > opp( inf.ChannelCount );
@@ -136,8 +137,8 @@ VOXMAIN
136137

137138
VOXCHECK( outf.finalize() );
138139

139-
printf( "Resampled in %.4f s, %.3f Mflops (excluding IO operations)\n",
140-
srct, 1e-6 * ool / srct );
140+
printf( "Resampled in %.4f s, %.3f Mrops (excluding IO operations)\n",
141+
srct, 1e-6 * ol0 / srct );
141142

142143
VOXRET;
143144
}

bench/zerotest.cpp

+7-3
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,13 @@ VOXMAIN
110110

111111
const TClock t1( CSystem :: getClock() );
112112
Resamp1 -> oneshot( &Ref[ 0 ], InBufSize, &OutBuf1[ 0 ], ol1 );
113-
const double perf = 1e-6 * ol1 / CSystem :: getClockDiffSec( t1 );
113+
double perf = 1e-6 * InBufSize /
114+
CSystem :: getClockDiffSec( t1 );
114115

116+
const TClock t2( CSystem :: getClock() );
115117
Resamp2 -> oneshot( &OutBuf1[ 0 ], ol1, &OutBuf2[ 0 ], InBufSize );
118+
perf = ( perf + 1e-6 * InBufSize /
119+
CSystem :: getClockDiffSec( t2 )) * 0.5;
116120

117121
const double r = calcRMS( &Ref[ 5000 ], &OutBuf2[ 5000 ],
118122
InBufSize - 10000, peakd );
@@ -132,12 +136,12 @@ VOXMAIN
132136
printf( "%7.2f", 20.0 * log( r ) / log( 10.0 ));
133137
}
134138

135-
printf( "\t%.2f\tMflops\n", perf );
139+
printf( "\t%.2f\tMrops\n", perf );
136140
}
137141

138142
printf( "Average rms %.2f\n", 10.0 * log( avgr / avgc ) / log( 10.0 ));
139143
printf( "Peak diff %.2f\n", 20.0 * log( peakd ) / log( 10.0 ));
140-
printf( "Average perf %.2f Mflops\n", avgperf / avgc );
144+
printf( "Average perf %.2f Mrops\n", avgperf / avgc );
141145
printf( "Average latency %.0f\n", avglatency / avgc );
142146

143147
VOXRET;

pffft.cpp

+8-6
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,12 @@ void validate_pffft_simd() {
266266

267267
/* SSE and co like 16-bytes aligned pointers */
268268
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
269+
#define MALLOC_V4SF_SIZE (MALLOC_V4SF_ALIGNMENT+sizeof(void*))
269270
void *pffft_aligned_malloc(size_t nb_bytes) {
270-
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
271+
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_SIZE);
271272
if (!p0) return (void *) 0;
272-
p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
273+
p = (void *) (((uintptr_t) p0 + MALLOC_V4SF_SIZE) &
274+
~(uintptr_t) (MALLOC_V4SF_ALIGNMENT-1));
273275
*((void **) p - 1) = p0;
274276
return p;
275277
}
@@ -1180,19 +1182,19 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
11801182
int ix2 = iw + idot;
11811183
int ix3 = ix2 + idot;
11821184
int ix4 = ix3 + idot;
1183-
passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1185+
passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], (float)isign);
11841186
} break;
11851187
case 4: {
11861188
int ix2 = iw + idot;
11871189
int ix3 = ix2 + idot;
1188-
passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1190+
passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], (float)isign);
11891191
} break;
11901192
case 2: {
1191-
passf2_ps(idot, l1, in, out, &wa[iw], isign);
1193+
passf2_ps(idot, l1, in, out, &wa[iw], (float)isign);
11921194
} break;
11931195
case 3: {
11941196
int ix2 = iw + idot;
1195-
passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1197+
passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], (float)isign);
11961198
} break;
11971199
default:
11981200
assert(0);

pffft_double/pffft_double.c

+3-1
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
# include <alloca.h>
8080
#endif
8181

82+
#include <stdint.h>
8283
#include <stdlib.h>
8384
#include <stdio.h>
8485
#include <math.h>
@@ -149,7 +150,8 @@
149150
static void * Valigned_malloc(size_t nb_bytes) {
150151
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_SIZE);
151152
if (!p0) return (void *) 0;
152-
p = (void *) (((size_t) p0 + MALLOC_V4SF_SIZE) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
153+
p = (void *) (((uintptr_t) p0 + MALLOC_V4SF_SIZE) &
154+
~(uintptr_t) (MALLOC_V4SF_ALIGNMENT-1));
153155
*((void **) p - 1) = p0;
154156
return p;
155157
}

pffft_double/pffft_priv_impl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -1818,7 +1818,7 @@ void FUNC_VALIDATE_SIMD_A() {
18181818

18191819
static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo )
18201820
{
1821-
if ( !( fabsf( result - ref ) < 0.01F ) )
1821+
if ( !( fabs( result - ref ) < 0.01 ) )
18221822
{
18231823
fprintf(stderr, "%s: assert for %s at %s(%d)\n expected %f value %f\n", functxt, vartxt, f, lineNo, ref, result);
18241824
++(*numErrs);

0 commit comments

Comments
 (0)