From c9c3ae07afaf7833f14025164360da1efe3eb4df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 18:10:45 +0200 Subject: [PATCH 1/4] Add double precision operations --- kernel/simd/intrin_sse.h | 48 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 9de7e1b27..7449a5a0b 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -3,25 +3,59 @@ /*************************** * Data Type ***************************/ +#ifdef DOUBLE +typedef __m128d v_f32; +#else typedef __m128 v_f32; +#endif + #define v_nlanes_f32 4 /*************************** * Arithmetic ***************************/ +#ifdef DOUBLE +#define v_add_f32 _mm_add_pd +#define v_mul_f32 _mm_mul_pd +#else #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps +#endif #ifdef HAVE_FMA3 // multiply and add, a*b + c - #define v_muladd_f32 _mm_fmadd_ps +#ifdef DOUBLE + #define v_muladd_f32 _mm_fmadd_pd +#else + #define v_muladd_f32 _mm_fmadd_ps +#endif #elif defined(HAVE_FMA4) // multiply and add, a*b + c - #define v_muladd_f32 _mm_macc_ps + #ifdef DOUBLE + #define v_muladd_f32 _mm_macc_pd + #else + #define v_muladd_f32 _mm_macc_ps + #endif #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 +// Horizontal add: Calculates the sum of all vector elements. +#ifdef DOUBLE +BLAS_FINLINE double v_sum_f32(__m128d a) +{ +#ifdef HAVE_SSE3 + __m128d sum_halves = _mm_hadd_pd(a, a); + return _mm_cvtsd_f64(_mm_hadd_pd(sum_halves, sum_halves)); +#else + __m128d t1 = _mm_movehl_pd(a, a); + __m128d t2 = _mm_add_pd(a, t1); + __m128d t3 = _mm_shuffle_pd(t2, t2, 1); + __m128d t4 = _mm_add_ss(t2, t3); + return _mm_cvtsd_f64(t4); +#endif +} +#else // Horizontal add: Calculates the sum of all vector elements. BLAS_FINLINE float v_sum_f32(__m128 a) { @@ -36,11 +70,19 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } +#endif /*************************** * memory ***************************/ // unaligned load +#ifdef DOUBLE +#define v_loadu_f32 _mm_loadu_pd +#define v_storeu_f32 _mm_storeu_pd +#define v_setall_f32(VAL) _mm_set1_pd(VAL) +#define v_zero_f32 _mm_setzero_pd +#else #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps \ No newline at end of file +#define v_zero_f32 _mm_setzero_ps +#endif From ca160bb4400a298f10ac358dce328eabb8c49a70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 19:18:07 +0200 Subject: [PATCH 2/4] Add -msse4.1 when SSE4.1 is supported --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 8a3fc4eae..27eb571ee 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -16,6 +16,10 @@ ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3 endif +ifdef HAVE_SSE4_1 +CCOMMON_OPT += -msse4.1 +FCOMMON_OPT += -msse4.1 +endif endif endif From ebf0470fc25fd902a923d743977804ae672d4d20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 20:34:33 +0200 Subject: [PATCH 3/4] add sse4.1 for DYNAMIC_ARCH kernels --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index c95c15f56..abe2e08d6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,7 +45,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse3 -mssse3 + override CFLAGS += -msse3 -mssse3 -msse4.1 endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From ae6ac83991539d688095bcfc66bfb22f054860be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 08:37:02 +0200 Subject: [PATCH 4/4] Revert "add double precision SSE" --- kernel/simd/intrin_sse.h | 48 +++------------------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 7449a5a0b..9de7e1b27 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -3,59 +3,25 @@ /*************************** * Data Type ***************************/ -#ifdef DOUBLE -typedef __m128d v_f32; -#else typedef __m128 v_f32; -#endif - #define v_nlanes_f32 4 /*************************** * Arithmetic ***************************/ -#ifdef DOUBLE -#define v_add_f32 _mm_add_pd -#define v_mul_f32 _mm_mul_pd -#else #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps -#endif #ifdef HAVE_FMA3 // multiply and add, a*b + c -#ifdef DOUBLE - #define v_muladd_f32 _mm_fmadd_pd -#else - #define v_muladd_f32 _mm_fmadd_ps -#endif + #define v_muladd_f32 _mm_fmadd_ps #elif defined(HAVE_FMA4) // multiply and add, a*b + c - #ifdef DOUBLE - #define v_muladd_f32 _mm_macc_pd - #else - #define v_muladd_f32 _mm_macc_ps - #endif + #define v_muladd_f32 _mm_macc_ps #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 -// Horizontal add: Calculates the sum of all vector elements. -#ifdef DOUBLE -BLAS_FINLINE double v_sum_f32(__m128d a) -{ -#ifdef HAVE_SSE3 - __m128d sum_halves = _mm_hadd_pd(a, a); - return _mm_cvtsd_f64(_mm_hadd_pd(sum_halves, sum_halves)); -#else - __m128d t1 = _mm_movehl_pd(a, a); - __m128d t2 = _mm_add_pd(a, t1); - __m128d t3 = _mm_shuffle_pd(t2, t2, 1); - __m128d t4 = _mm_add_ss(t2, t3); - return _mm_cvtsd_f64(t4); -#endif -} -#else // Horizontal add: Calculates the sum of all vector elements. BLAS_FINLINE float v_sum_f32(__m128 a) { @@ -70,19 +36,11 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } -#endif /*************************** * memory ***************************/ // unaligned load -#ifdef DOUBLE -#define v_loadu_f32 _mm_loadu_pd -#define v_storeu_f32 _mm_storeu_pd -#define v_setall_f32(VAL) _mm_set1_pd(VAL) -#define v_zero_f32 _mm_setzero_pd -#else #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps -#endif +#define v_zero_f32 _mm_setzero_ps \ No newline at end of file