avaneev
diff --git a/‎CDSPBlockConvolver.h
+2 b/‎CDSPBlockConvolver.h
+2
diff --git a/‎CDSPFracInterpolator.h
+32-23 b/‎CDSPFracInterpolator.h
+32-23
diff --git a/‎CDSPHBDownsampler.h
+4-2 b/‎CDSPHBDownsampler.h
+4-2
diff --git a/‎CDSPHBUpsampler.h
+4-2 b/‎CDSPHBUpsampler.h
+4-2
diff --git a/‎DLL/Win32/r8bsrc.dll
512 Bytes b/‎DLL/Win32/r8bsrc.dll
512 Bytes
diff --git a/‎DLL/Win32/r8bsrc.lib
0 Bytes b/‎DLL/Win32/r8bsrc.lib
0 Bytes
diff --git a/‎DLL/Win64/r8bsrc.dll
0 Bytes b/‎DLL/Win64/r8bsrc.dll
0 Bytes
diff --git a/‎DLL/Win64/r8bsrc.lib
0 Bytes b/‎DLL/Win64/r8bsrc.lib
0 Bytes
diff --git a/‎README.md
+20-9 b/‎README.md
+20-9
diff --git a/‎bench/Win64/r8bfreesrc.exe
0 Bytes b/‎bench/Win64/r8bfreesrc.exe
0 Bytes
diff --git a/‎bench/masstest.cpp
+7-2 b/‎bench/masstest.cpp
+7-2
diff --git a/‎bench/r8bfreesrc.cpp
+3-2 b/‎bench/r8bfreesrc.cpp
+3-2
diff --git a/‎bench/zerotest.cpp
+7-3 b/‎bench/zerotest.cpp
+7-3
diff --git a/‎pffft.cpp
+8-6 b/‎pffft.cpp
+8-6
diff --git a/‎pffft_double/pffft_double.c
+3-1 b/‎pffft_double/pffft_double.c
+3-1
diff --git a/‎pffft_double/pffft_priv_impl.h
+1-1 b/‎pffft_double/pffft_priv_impl.h
+1-1
@@ -157,6 +157,8 @@ class CDSPBlockConvolver : public CDSPProcessor
 			}
 		}
 
+		R8BASSERT( Latency >= 0 );
+
 		fftin = new CDSPRealFFTKeeper( fftinBits );
 
 		if( fftoutBits == fftinBits )
 
@@ -141,7 +141,7 @@ class CDSPFracDelayFilterBank : public R8B_BASECLASS
 					p += ElementSize;
 				}
 
-				#if defined( R8B_SSE2 ) || defined( R8B_NEON )
+				#if defined( R8B_SIMD_ISH )
 					shuffle2_3( Table, TableEnd );
 				#endif // SIMD
 			}
@@ -160,7 +160,7 @@ class CDSPFracDelayFilterBank : public R8B_BASECLASS
 					p += ElementSize;
 				}
 
-				#if defined( R8B_SSE2 ) || defined( R8B_NEON )
+				#if defined( R8B_SIMD_ISH )
 					shuffle2_4( Table, TableEnd );
 				#endif // SIMD
 			}
@@ -177,7 +177,7 @@ class CDSPFracDelayFilterBank : public R8B_BASECLASS
 					p += ElementSize;
 				}
 
-				#if defined( R8B_SSE2 ) || defined( R8B_NEON )
+				#if defined( R8B_SIMD_ISH )
 					shuffle2_2( Table, TableEnd );
 				#endif // SIMD
 			}
@@ -717,12 +717,13 @@ class CDSPFracInterpolator : public CDSPProcessor
 		R8BASSERT( DstSampleRate > 0.0 );
 		R8BASSERT( PrevLatency >= 0.0 );
 		R8BASSERT( BufLenBits >= 5 );
-		R8BASSERT(( 1 << BufLenBits ) >= FilterLen * 3 );
 
 		InitFracPos = PrevLatency;
 		Latency = (int) InitFracPos;
 		InitFracPos -= Latency;
 
+		R8BASSERT( Latency >= 0 );
+
 		#if R8B_FLTTEST
 
 			IsWhole = false;
@@ -756,6 +757,8 @@ class CDSPFracInterpolator : public CDSPProcessor
 		fll = fl2 - 1;
 		flo = fll + fl2;
 
+		R8BASSERT(( 1 << BufLenBits ) >= FilterLen * 3 );
+
 		static const CConvolveFn FltConvFn0[ 13 ] = {
 			&CDSPFracInterpolator :: convolve0< 6 >,
 			&CDSPFracInterpolator :: convolve0< 8 >,
@@ -1004,7 +1007,7 @@ class CDSPFracInterpolator : public CDSPProcessor
 			const double* const rp = Buf + ReadPos;
 			int i;
 
-		#if defined( R8B_SSE2 )
+		#if defined( R8B_SSE2 ) && !defined( __INTEL_COMPILER )
 
 			__m128d s = _mm_setzero_pd();
 
@@ -1024,10 +1027,7 @@ class CDSPFracInterpolator : public CDSPProcessor
 
 			for( i = 0; i < fltlen; i += 2 )
 			{
-				const float64x2_t m = vmulq_f64( vld1q_f64( ftp + i ),
-					vld1q_f64( rp + i ));
-
-				s = vaddq_f64( s, m );
+				s = vmlaq_f64( s, vld1q_f64( ftp + i ), vld1q_f64( rp + i ));
 			}
 
 			*op = vaddvq_f64( s );
@@ -1081,40 +1081,49 @@ class CDSPFracInterpolator : public CDSPProcessor
 			const double* const rp = Buf + ReadPos;
 			int i;
 
-		#if defined( R8B_SSE2 )
+		#if defined( R8B_SSE2 ) && defined( R8B_SIMD_ISH )
 
 			const __m128d x1 = _mm_set1_pd( x );
 			const __m128d x2 = _mm_set1_pd( x2d );
 			__m128d s = _mm_setzero_pd();
 
 			for( i = 0; i < fltlen; i += 2 )
 			{
-				const __m128d xx1 = _mm_mul_pd( _mm_load_pd( ftp + 2 ), x1 );
-				const __m128d xx2 = _mm_mul_pd( _mm_load_pd( ftp + 4 ), x2 );
-				const __m128d xxs1 = _mm_add_pd( xx1, xx2 );
-				const __m128d xxs2 = _mm_add_pd( _mm_load_pd( ftp ), xxs1 );
-
-				s = _mm_add_pd( s, _mm_mul_pd( xxs2, _mm_loadu_pd( rp + i )));
+				const __m128d ftp2 = _mm_load_pd( ftp + 2 );
+				const __m128d xx1 = _mm_mul_pd( ftp2, x1 );
+				const __m128d ftp4 = _mm_load_pd( ftp + 4 );
+				const __m128d xx2 = _mm_mul_pd( ftp4, x2 );
+				const __m128d ftp0 = _mm_load_pd( ftp );
 				ftp += 6;
+
+				const __m128d rpi = _mm_loadu_pd( rp + i );
+				const __m128d xxs = _mm_add_pd( ftp0, _mm_add_pd( xx1, xx2 ));
+
+				s = _mm_add_pd( s, _mm_mul_pd( rpi, xxs ));
 			}
 
 			_mm_storel_pd( op, _mm_add_pd( s, _mm_shuffle_pd( s, s, 1 )));
 
-		#elif defined( R8B_NEON )
+		#elif defined( R8B_NEON ) && defined( R8B_SIMD_ISH )
 
 			const float64x2_t x1 = vdupq_n_f64( x );
 			const float64x2_t x2 = vdupq_n_f64( x2d );
 			float64x2_t s = vdupq_n_f64( 0.0 );
 
 			for( i = 0; i < fltlen; i += 2 )
 			{
-				const float64x2_t xx1 = vmulq_f64( vld1q_f64( ftp + 2 ), x1 );
-				const float64x2_t xx2 = vmulq_f64( vld1q_f64( ftp + 4 ), x2 );
-				const float64x2_t xxs1 = vaddq_f64( xx1, xx2 );
-				const float64x2_t xxs2 = vaddq_f64( vld1q_f64( ftp ), xxs1 );
-
-				s = vaddq_f64( s, vmulq_f64( xxs2, vld1q_f64( rp + i )));
+				const float64x2_t ftp2 = vld1q_f64( ftp + 2 );
+				const float64x2_t xx1 = vmulq_f64( ftp2, x1 );
+				const float64x2_t ftp4 = vld1q_f64( ftp + 4 );
+				const float64x2_t xx2 = vmulq_f64( ftp4, x2 );
+				const float64x2_t ftp0 = vld1q_f64( ftp );
 				ftp += 6;
+
+				const float64x2_t rpi = vld1q_f64( rp + i );
+				const float64x2_t xxs = vaddq_f64( ftp0,
+					vaddq_f64( xx1, xx2 ));
+
+				s = vmlaq_f64( s, rpi, xxs );
 			}
 
 			*op = vaddvq_f64( s );
 
@@ -79,6 +79,8 @@ class CDSPHBDownsampler : public CDSPProcessor
 		Latency = (int) LatencyFrac;
 		LatencyFrac -= Latency;
 
+		R8BASSERT( Latency >= 0 );
+
 		R8BCONSOLE( "CDSPHBDownsampler: taps=%i third=%i att=%.1f io=1/2\n",
 			fltt, (int) IsThird, att );
 
@@ -171,7 +173,7 @@ class CDSPHBDownsampler : public CDSPProcessor
 	}
 
 private:
-	static const int BufLenBits = 8; ///< The length of the ring buffer,
+	static const int BufLenBits = 10; ///< The length of the ring buffer,
 		///< expressed as Nth power of 2. This value can be reduced if it is
 		///< known that only short input buffers will be passed to the
 		///< interpolator. The minimum value of this parameter is 5, and
@@ -223,7 +225,7 @@ class CDSPHBDownsampler : public CDSPProcessor
 		const double* const rp0, int& ReadPos0 ) \
 	{ \
 		int rpos = ReadPos0; \
-		while( op < opend ) \
+		while( op != opend ) \
 		{ \
 			const double* const rp = rp0 + rpos; \
 			*op = rp[ 0 ] +
 
@@ -591,6 +591,8 @@ class CDSPHBUpsampler : public CDSPProcessor
 		Latency = (int) LatencyFrac;
 		LatencyFrac -= Latency;
 
+		R8BASSERT( Latency >= 0 );
+
 		R8BCONSOLE( "CDSPHBUpsampler: sti=%i third=%i taps=%i att=%.1f "
 			"io=2/1\n", SteepIndex, (int) IsThird, fltt, att );
 
@@ -683,7 +685,7 @@ class CDSPHBUpsampler : public CDSPProcessor
 	}
 
 private:
-	static const int BufLenBits = 8; ///< The length of the ring buffer,
+	static const int BufLenBits = 9; ///< The length of the ring buffer,
 		///< expressed as Nth power of 2. This value can be reduced if it is
 		///< known that only short input buffers will be passed to the
 		///< interpolator. The minimum value of this parameter is 5, and
@@ -735,7 +737,7 @@ class CDSPHBUpsampler : public CDSPProcessor
 		const double* const rp0, int& ReadPos0 ) \
 	{ \
 		int rpos = ReadPos0; \
-		while( op < opend ) \
+		while( op != opend ) \
 		{ \
 			const double* const rp = rp0 + rpos; \
 			op[ 0 ] = rp[ 0 ]; \
 
@@ -104,16 +104,15 @@ style.  To generate the documentation locally you may run the
 `doxygen ./other/r8bdoxy.txt` command from the library's directory.
 
 Preliminary tests show that the r8b::CDSPResampler24 resampler class achieves
-`61.2*n_cores` Mflops (`83.3*n_cores` for Intel IPP FFT) when converting 1
+`31*n_cores` Mrops (`46*n_cores` for Intel IPP FFT) when converting 1
 channel of 24-bit audio from 44100 to 96000 sample rate (2% transition band),
-on an Intel Core i7-7700K processor-based 64-bit AVX2-enabled system without
-overclocking.  This approximately translates to a real-time resampling of
-`637*n_cores` (`868*n_cores`) audio streams, at 100% CPU load.  Speed
-performance when converting to other sample rates may vary greatly.  When
-comparing performance of this resampler library to another library make sure
-that the competing library is also tuned to produce a fully linear-phase
-response, has similar stop-band characteristics, and similar sample timing
-precision.
+on a Ryzen 3700X processor-based 64-bit system.  This approximately translates
+to a real-time resampling of `700*n_cores` (`1000*n_cores`) audio streams, at
+100% CPU load.  Speed performance when converting to other sample rates may
+vary greatly.  When comparing performance of this resampler library to another
+library make sure that the competing library is also tuned to produce a fully
+linear-phase response, has similar stop-band characteristics, and similar
+sample timing precision.
 
 ## Dynamic Link Library ##
 
@@ -207,6 +206,18 @@ inclusion into this list is not mandatory.
 
 ## Change Log ##
 
+Version 5.4:
+
+* Added compiler specializations to previously optimized inner loops.
+"Shuffled" SIMD interpolation code is not efficient on Apple M1. Intel C++
+Compiler vectorizes "whole stepping" interpolation as well as
+manually-written SSE code.
+* Reorganized SIMD instructions for slightly better performance.
+* Changed internal buffer sizes of half-band resamplers (1-2% performance
+boost).
+* Fixed compiler warnings in PFFFT code.
+* Added several asserts to the code.
+
 Version 5.3:
 
 * Optimized inner loops of the fractional interpolator, added SSE2 and NEON
 
@@ -125,13 +125,18 @@ VOXMAIN
 
 		const TClock t1( CSystem :: getClock() );
 		Resamp -> oneshot( &Ref[ 0 ], InBufSize, &OutBuf[ 0 ], ol );
-		const double perf = 1e-6 * ol / CSystem :: getClockDiffSec( t1 );
+		double perf = 1e-6 * InBufSize /
+			CSystem :: getClockDiffSec( t1 );
 
 //		addSine( OutBuf, ol, ( SrcSampleRate + DstSampleRate ) * 0.25,
 //			DstSampleRate );
 
 		Resamp = new CResamp( DstSampleRate, SrcSampleRate, MaxInLen, tb );
+
+		const TClock t2( CSystem :: getClock() );
 		Resamp -> oneshot( &OutBuf[ 0 ], ol, &OutBuf2[ 0 ], InBufSize );
+		perf = ( perf + 1e-6 * InBufSize /
+			CSystem :: getClockDiffSec( t2 )) * 0.5;
 
 		const double r = calcRMS( &Ref[ 5000 ], &OutBuf2[ 5000 ],
 			InBufSize - 10000, peakd );
@@ -150,7 +155,7 @@ VOXMAIN
 	printf( "avg rms %.2f\n", 10.0 * log( avgr / TestCount ) / log( 10.0 ));
 	printf( "max rms %.2f\n", 20.0 * log( maxr ) / log( 10.0 ));
 	printf( "peak diff %.2f\n", 20.0 * log( peakd ) / log( 10.0 ));
-	printf( "avg perf %.2f Mflops\n", avgperf / TestCount );
+	printf( "avg perf %.2f Mrops\n", avgperf / TestCount );
 	printf( "avg latency %.0f\n", avglatency / TestCount );
 
 	VOXRET;
 
@@ -92,6 +92,7 @@ VOXMAIN
 	int64_t ol = (int64_t) ( inf.SampleCount * OutSampleRate /
 		InSampleRate );
 
+	const int64_t ol0 = inf.SampleCount * inf.ChannelCount;
 	int64_t ool = 0;
 	double srct = 0.0;
 	CArray< double* > opp( inf.ChannelCount );
@@ -136,8 +137,8 @@ VOXMAIN
 
 	VOXCHECK( outf.finalize() );
 
-	printf( "Resampled in %.4f s, %.3f Mflops (excluding IO operations)\n",
-		srct, 1e-6 * ool / srct );
+	printf( "Resampled in %.4f s, %.3f Mrops (excluding IO operations)\n",
+		srct, 1e-6 * ol0 / srct );
 
 	VOXRET;
 }
@@ -110,9 +110,13 @@ VOXMAIN
 
 		const TClock t1( CSystem :: getClock() );
 		Resamp1 -> oneshot( &Ref[ 0 ], InBufSize, &OutBuf1[ 0 ], ol1 );
-		const double perf = 1e-6 * ol1 / CSystem :: getClockDiffSec( t1 );
+		double perf = 1e-6 * InBufSize /
+			CSystem :: getClockDiffSec( t1 );
 
+		const TClock t2( CSystem :: getClock() );
 		Resamp2 -> oneshot( &OutBuf1[ 0 ], ol1, &OutBuf2[ 0 ], InBufSize );
+		perf = ( perf + 1e-6 * InBufSize /
+			CSystem :: getClockDiffSec( t2 )) * 0.5;
 
 		const double r = calcRMS( &Ref[ 5000 ], &OutBuf2[ 5000 ],
 			InBufSize - 10000, peakd );
@@ -132,12 +136,12 @@ VOXMAIN
 			printf( "%7.2f", 20.0 * log( r ) / log( 10.0 ));
 		}
 
-		printf( "\t%.2f\tMflops\n", perf );
+		printf( "\t%.2f\tMrops\n", perf );
 	}
 
 	printf( "Average rms %.2f\n", 10.0 * log( avgr / avgc ) / log( 10.0 ));
 	printf( "Peak diff %.2f\n", 20.0 * log( peakd ) / log( 10.0 ));
-	printf( "Average perf %.2f Mflops\n", avgperf / avgc );
+	printf( "Average perf %.2f Mrops\n", avgperf / avgc );
 	printf( "Average latency %.0f\n", avglatency / avgc );
 
 	VOXRET;
 
@@ -266,10 +266,12 @@ void validate_pffft_simd() {
 
 /* SSE and co like 16-bytes aligned pointers */
 #define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
+#define MALLOC_V4SF_SIZE (MALLOC_V4SF_ALIGNMENT+sizeof(void*))
 void *pffft_aligned_malloc(size_t nb_bytes) {
-  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_SIZE);
   if (!p0) return (void *) 0;
-  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+  p = (void *) (((uintptr_t) p0 + MALLOC_V4SF_SIZE) &
+    ~(uintptr_t) (MALLOC_V4SF_ALIGNMENT-1));
   *((void **) p - 1) = p0;
   return p;
 }
@@ -1180,19 +1182,19 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
         int ix2 = iw + idot;
         int ix3 = ix2 + idot;
         int ix4 = ix3 + idot;
-        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], (float)isign);
       } break;
       case 4: {
         int ix2 = iw + idot;
         int ix3 = ix2 + idot;
-        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
+        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], (float)isign);
       } break;
       case 2: {
-        passf2_ps(idot, l1, in, out, &wa[iw], isign);
+        passf2_ps(idot, l1, in, out, &wa[iw], (float)isign);
       } break;
       case 3: {
         int ix2 = iw + idot;
-        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
+        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], (float)isign);
       } break;
       default:
         assert(0);
 
@@ -79,6 +79,7 @@
 #  include <alloca.h>
 #endif
 
+#include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -149,7 +150,8 @@
 static void * Valigned_malloc(size_t nb_bytes) {
   void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_SIZE);
   if (!p0) return (void *) 0;
-  p = (void *) (((size_t) p0 + MALLOC_V4SF_SIZE) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+  p = (void *) (((uintptr_t) p0 + MALLOC_V4SF_SIZE) &
+    ~(uintptr_t) (MALLOC_V4SF_ALIGNMENT-1));
   *((void **) p - 1) = p0;
   return p;
 }
 
@@ -1818,7 +1818,7 @@ void FUNC_VALIDATE_SIMD_A() {
 
 static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo )
 {
-  if ( !( fabsf( result - ref ) < 0.01F ) )
+  if ( !( fabs( result - ref ) < 0.01 ) )
   {
     fprintf(stderr, "%s: assert for %s at %s(%d)\n  expected %f  value %f\n", functxt, vartxt, f, lineNo, ref, result);
     ++(*numErrs);
Original file line number	Diff line number	Diff line change
`@@ -157,6 +157,8 @@ class CDSPBlockConvolver : public CDSPProcessor`
`157`	`157`	`}`
`158`	`158`	`}`
`159`	`159`
	`160`	`+ R8BASSERT( Latency >= 0 );`
	`161`	`+`
`160`	`162`	`fftin = new CDSPRealFFTKeeper( fftinBits );`
`161`	`163`
`162`	`164`	`if( fftoutBits == fftinBits )`
Original file line number	Diff line number	Diff line change
`@@ -1818,7 +1818,7 @@ void FUNC_VALIDATE_SIMD_A() {`
`1818`	`1818`
`1819`	`1819`	`static void pffft_assert1( float result, float ref, const char * vartxt, const char * functxt, int * numErrs, const char * f, int lineNo )`
`1820`	`1820`	`{`
`1821`		`- if ( !( fabsf( result - ref ) < 0.01F ) )`
	`1821`	`+ if ( !( fabs( result - ref ) < 0.01 ) )`
`1822`	`1822`	`{`
`1823`	`1823`	`fprintf(stderr, "%s: assert for %s at %s(%d)\n expected %f value %f\n", functxt, vartxt, f, lineNo, ref, result);`
`1824`	`1824`	`++(*numErrs);`