From 14f7dad3b7d728159bbeab72deb9e7878d108760 Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Tue, 22 Sep 2020 16:52:15 +0800
Subject: [PATCH] performance improved

---
 kernel/simd/intrin.h        | 20 ++++++++++++++++++++
 kernel/simd/intrin_avx.h    | 10 ++++++++++
 kernel/simd/intrin_avx512.h |  4 +++-
 kernel/simd/intrin_sse.h    | 11 +++++++++++
 kernel/x86_64/daxpy.c       |  4 ++--
 5 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h
index ef599f065..5997bb6ac 100644
--- a/kernel/simd/intrin.h
+++ b/kernel/simd/intrin.h
@@ -1,6 +1,26 @@
 #ifndef _INTRIN_H_
 #define _INTRIN_H_
 
+#if defined(_MSC_VER)
+#define BLAS_INLINE __inline
+#elif defined(__GNUC__)
+#if defined(__STRICT_ANSI__)
+#define BLAS_INLINE __inline__
+#else
+#define BLAS_INLINE inline
+#endif
+#else
+#define BLAS_INLINE
+#endif
+
+#ifdef _MSC_VER
+#define BLAS_FINLINE static __forceinline
+#elif defined(__GNUC__)
+#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
+#else
+#define BLAS_FINLINE static
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h
index 726254429..f6257ae98 100644
--- a/kernel/simd/intrin_avx.h
+++ b/kernel/simd/intrin_avx.h
@@ -10,6 +10,16 @@ arithmetic
 */
 #define v_add_f32 _mm256_add_ps
 #define v_mul_f32 _mm256_mul_ps
+
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm256_fmadd_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // !HAVE_FMA3
+
 /*
 memory
 */
diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h
index 775fe7aa5..cb116a9a3 100644
--- a/kernel/simd/intrin_avx512.h
+++ b/kernel/simd/intrin_avx512.h
@@ -10,10 +10,12 @@ arithmetic
 */
 #define v_add_f32 _mm512_add_ps
 #define v_mul_f32 _mm512_mul_ps
+// multiply and add, a*b + c
+#define v_muladd_f32 _mm512_fmadd_ps
 /*
 memory
 */
 // unaligned load
 #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
-#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR))
+#define v_storeu_f32 _mm512_storeu_ps
 #define v_setall_f32(VAL) _mm512_set1_ps(VAL)
diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h
index 0cc159aa7..260112028 100644
--- a/kernel/simd/intrin_sse.h
+++ b/kernel/simd/intrin_sse.h
@@ -10,6 +10,17 @@ arithmetic
 */
 #define v_add_f32 _mm_add_ps
 #define v_mul_f32 _mm_mul_ps
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_fmadd_ps
+#elif defined(HAVE_FMA4)
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_macc_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // HAVE_FMA3
 /*
 memory
 */
diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
index 9836faca1..b62e3dcb3 100644
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef HAVE_KERNEL_8
 #include"../simd/intrin.h"
-void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
     BLASLONG register i = 0;
     FLOAT a = *alpha;
@@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
     __alpha = v_setall_f32(*alpha);
     const int vstep = v_nlanes_f32;
     for (; i < n; i += vstep) {
-        tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i )));
+        tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
        v_storeu_f32(y + i, tmp);
     }
 #else
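
Reviewer note, not part of the patch: the daxpy hunk replaces the separate v_mul_f32/v_add_f32 pair with the new v_muladd_f32 wrapper, which maps to a hardware fused multiply-add (_mm_fmadd_ps / _mm256_fmadd_ps / _mm512_fmadd_ps) when FMA is available and otherwise falls back to a multiply followed by an add. The sketch below is a minimal standalone illustration of the same pattern under stated assumptions: it tests the compiler-defined __FMA__ macro rather than the build system's HAVE_FMA3, and the names vec_muladd and axpy_f32 are invented for this example, not OpenBLAS identifiers.

/* Sketch of an FMA-with-fallback muladd, mirroring the v_muladd_f32 idea.
 * Build with e.g. `gcc -O2 -mavx -mfma sketch.c` (drop -mfma to exercise
 * the mul+add fallback). Assumes an AVX-capable compiler and CPU. */
#include <immintrin.h>
#include <stdio.h>

#ifdef __FMA__
/* Hardware FMA: computes a*b + c in a single instruction. */
#define vec_muladd(a, b, c) _mm256_fmadd_ps((a), (b), (c))
#else
/* Fallback: separate multiply and add, same result shape as the patch. */
static inline __m256 vec_muladd(__m256 a, __m256 b, __m256 c)
{ return _mm256_add_ps(_mm256_mul_ps(a, b), c); }
#endif

/* y[i] += alpha * x[i], the same loop shape as the daxpy kernel hunk. */
static void axpy_f32(int n, float alpha, const float *x, float *y)
{
    const __m256 valpha = _mm256_set1_ps(alpha);
    int i = 0;
    for (; i + 8 <= n; i += 8) {
        __m256 tmp = vec_muladd(valpha, _mm256_loadu_ps(x + i),
                                _mm256_loadu_ps(y + i));
        _mm256_storeu_ps(y + i, tmp);
    }
    for (; i < n; i++)      /* scalar tail for the remaining elements */
        y[i] += alpha * x[i];
}

int main(void)
{
    float x[11], y[11];
    for (int i = 0; i < 11; i++) { x[i] = (float)i; y[i] = 1.0f; }
    axpy_f32(11, 2.0f, x, y);
    printf("%g %g %g\n", y[0], y[5], y[10]); /* expect 1 11 21 */
    return 0;
}

Besides saving an instruction per element, the fused form rounds once instead of twice, so FMA and non-FMA builds may differ in the last bit of each result.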