From 325b539c26414f05666c0b0bfb2d6fe3e95cb039 Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Tue, 22 Sep 2020 10:38:35 +0800
Subject: [PATCH 1/3] Optimize the performance of daxpy by using universal
 intrinsics

---
 kernel/simd/intrin.h        | 51 +++++++++++++++++++++++++++++++++++++
 kernel/simd/intrin_avx.h    | 19 ++++++++++++++
 kernel/simd/intrin_avx512.h | 19 ++++++++++++++
 kernel/simd/intrin_sse.h    | 19 ++++++++++++++
 kernel/x86_64/daxpy.c       | 39 ++++++++++++++++------------
 5 files changed, 131 insertions(+), 16 deletions(-)
 create mode 100644 kernel/simd/intrin.h
 create mode 100644 kernel/simd/intrin_avx.h
 create mode 100644 kernel/simd/intrin_avx512.h
 create mode 100644 kernel/simd/intrin_sse.h

diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h
new file mode 100644
index 000000000..ef599f065
--- /dev/null
+++ b/kernel/simd/intrin.h
@@ -0,0 +1,51 @@
+#ifndef _INTRIN_H_
+#define _INTRIN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// include headers
+/** SSE **/
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+/** SSE2 **/
+#ifdef HAVE_SSE2
+#include <emmintrin.h>
+#endif
+/** SSE3 **/
+#ifdef HAVE_SSE3
+#include <pmmintrin.h>
+#endif
+/** SSSE3 **/
+#ifdef HAVE_SSSE3
+#include <tmmintrin.h>
+#endif
+/** SSE41 **/
+#ifdef HAVE_SSE4_1
+#include <smmintrin.h>
+#endif
+
+/** AVX **/
+#ifdef HAVE_AVX
+#include <immintrin.h>
+#endif
+
+// distribute: pick the widest available extension
+#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
+#include "intrin_avx512.h"
+#elif defined(HAVE_AVX2)
+#include "intrin_avx.h"
+#elif defined(HAVE_SSE2)
+#include "intrin_sse.h"
+#endif
+
+#ifndef V_SIMD
+  #define V_SIMD 0
+  #define V_SIMD_F64 0
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // _INTRIN_H_

diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h
new file mode 100644
index 000000000..726254429
--- /dev/null
+++ b/kernel/simd/intrin_avx.h
@@ -0,0 +1,19 @@
+#define V_SIMD 256
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m256 v_f32;
+#define v_nlanes_f32 8
+/*
+arithmetic
+*/
+#define v_add_f32 _mm256_add_ps
+#define v_mul_f32 _mm256_mul_ps
+/*
+memory
+*/
+// unaligned load
+#define v_loadu_f32 _mm256_loadu_ps
+#define v_storeu_f32 _mm256_storeu_ps
+#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
\ No newline at end of file

diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h
new file mode 100644
index 000000000..775fe7aa5
--- /dev/null
+++ b/kernel/simd/intrin_avx512.h
@@ -0,0 +1,19 @@
+#define V_SIMD 512
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m512 v_f32;
+#define v_nlanes_f32 16
+/*
+arithmetic
+*/
+#define v_add_f32 _mm512_add_ps
+#define v_mul_f32 _mm512_mul_ps
+/*
+memory
+*/
+// unaligned load
+#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
+#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR))
+#define v_setall_f32(VAL) _mm512_set1_ps(VAL)

diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h
new file mode 100644
index 000000000..0cc159aa7
--- /dev/null
+++ b/kernel/simd/intrin_sse.h
@@ -0,0 +1,19 @@
+#define V_SIMD 128
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m128 v_f32;
+#define v_nlanes_f32 4
+/*
+arithmetic
+*/
+#define v_add_f32 _mm_add_ps
+#define v_mul_f32 _mm_mul_ps
+/*
+memory
+*/
+// unaligned load
+#define v_loadu_f32 _mm_loadu_ps
+#define v_storeu_f32 _mm_storeu_ps
+#define v_setall_f32(VAL) _mm_set1_ps(VAL)
\ No newline at end of file

diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
index d84c0c221..9836faca1 100644
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_sandy-2.c" #endif - #ifndef HAVE_KERNEL_8 +#include"../simd/intrin.h" -static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; - +#if V_SIMD + v_f32 __alpha, tmp; + __alpha = v_setall_f32(*alpha); + const int vstep = v_nlanes_f32; + for (; i < n; i += vstep) { + tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i ))); + v_storeu_f32(y + i, tmp); + } +#else while(i < n) - { - y[i] += a * x[i]; - y[i+1] += a * x[i+1]; - y[i+2] += a * x[i+2]; - y[i+3] += a * x[i+3]; - y[i+4] += a * x[i+4]; - y[i+5] += a * x[i+5]; - y[i+6] += a * x[i+6]; - y[i+7] += a * x[i+7]; - i+=8 ; - - } - + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + } +#endif } #endif From 14f7dad3b7d728159bbeab72deb9e7878d108760 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 22 Sep 2020 16:52:15 +0800 Subject: [PATCH 2/3] performance improved --- kernel/simd/intrin.h | 20 ++++++++++++++++++++ kernel/simd/intrin_avx.h | 10 ++++++++++ kernel/simd/intrin_avx512.h | 4 +++- kernel/simd/intrin_sse.h | 11 +++++++++++ kernel/x86_64/daxpy.c | 4 ++-- 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index ef599f065..5997bb6ac 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -1,6 +1,26 @@ #ifndef _INTRIN_H_ #define _INTRIN_H_ +#if defined(_MSC_VER) +#define BLAS_INLINE __inline +#elif defined(__GNUC__) +#if defined(__STRICT_ANSI__) +#define BLAS_INLINE __inline__ +#else +#define BLAS_INLINE inline +#endif +#else +#define BLAS_INLINE +#endif + +#ifdef _MSC_VER +#define BLAS_FINLINE static __forceinline +#elif defined(__GNUC__) +#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline)) +#else +#define BLAS_FINLINE static +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 726254429..f6257ae98 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -10,6 +10,16 @@ arithmetic */ #define v_add_f32 _mm256_add_ps #define v_mul_f32 _mm256_mul_ps + +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm256_fmadd_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // !HAVE_FMA3 + /* memory */ diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index 775fe7aa5..cb116a9a3 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -10,10 +10,12 @@ arithmetic */ #define v_add_f32 _mm512_add_ps #define v_mul_f32 _mm512_mul_ps +// multiply and add, a*b + c +#define v_muladd_f32 _mm512_fmadd_ps /* memory */ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) -#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR)) +#define v_storeu_f32 _mm512_storeu_ps #define v_setall_f32(VAL) _mm512_set1_ps(VAL) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 0cc159aa7..260112028 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -10,6 +10,17 @@ arithmetic */ #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm_fmadd_ps +#elif defined(HAVE_FMA4) + // multiply and add, 
+    #define v_muladd_f32 _mm_macc_ps
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+#endif // HAVE_FMA3
 /*
 memory
 */

diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
index 9836faca1..b62e3dcb3 100644
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef HAVE_KERNEL_8
 #include"../simd/intrin.h"
 
-void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
 	BLASLONG register i = 0;
 	FLOAT a = *alpha;
@@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 	__alpha = v_setall_f32(*alpha);
 	const int vstep = v_nlanes_f32;
 	for (; i < n; i += vstep) {
-		tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i )));
+		tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
 		v_storeu_f32(y + i, tmp);
 	}
 #else

From 881c15179f93c96d9567ef74dceef1dfdbd5ccfa Mon Sep 17 00:00:00 2001
From: Qiyu8
Date: Sun, 27 Sep 2020 09:35:50 +0800
Subject: [PATCH 3/3] Remove default FMA4 support on AMD Bulldozer-family
 targets

---
 getarch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/getarch.c b/getarch.c
index 83043bdf2..e2c22d3a0 100644
--- a/getarch.c
+++ b/getarch.c
@@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
 "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
 "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
-"-DHAVE_AVX -DHAVE_FMA4"
+"-DHAVE_AVX"
 #define LIBNAME "bulldozer"
 #define CORENAME "BULLDOZER"
 #endif
@@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+"-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME "piledriver"
 #define CORENAME "PILEDRIVER"
 #endif
@@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+"-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME "steamroller"
 #define CORENAME "STEAMROLLER"
 #endif
@@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+"-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME "excavator"
 #define CORENAME "EXCAVATOR"
 #endif
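
Taken together, the series reduces each kernel to a small set of v_* macros: the build system defines the HAVE_SSE*/HAVE_AVX* flags, intrin.h dispatches to the widest extension available, and the kernel body is written once against v_loadu_f32 / v_muladd_f32 / v_storeu_f32. The standalone sketch below illustrates that dispatch pattern; it is not part of the patches. The axpy_f32 name and the main() harness are invented for illustration, the guards key off the compiler's __AVX__/__FMA__ macros rather than the build system's HAVE_* flags, it stays in single precision because the headers above only define f32 operations, and unlike daxpy_kernel_8, which relies on its caller to pass a length that is a multiple of the vector step, it finishes leftover elements in a scalar tail loop.

/*
 * demo.c - standalone sketch of the universal-intrinsic AXPY pattern.
 * Illustrative only; not part of the patch series.
 * Build: gcc -O2 -mavx -mfma demo.c
 */
#include <stdio.h>

#if defined(__AVX__)
#include <immintrin.h>
#define V_SIMD 256
typedef __m256 v_f32;
#define v_nlanes_f32 8
#define v_loadu_f32  _mm256_loadu_ps
#define v_storeu_f32 _mm256_storeu_ps
#define v_setall_f32 _mm256_set1_ps
#ifdef __FMA__
/* fused multiply-add: a*b + c in one instruction */
#define v_muladd_f32 _mm256_fmadd_ps
#else
/* fallback: separate multiply and add, like the non-FMA3 path above */
static inline v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return _mm256_add_ps(_mm256_mul_ps(a, b), c); }
#endif
#else
#define V_SIMD 0
#endif

static void axpy_f32(long n, float a, const float *x, float *y)
{
    long i = 0;
#if V_SIMD
    const v_f32 va = v_setall_f32(a);
    /* vector body; bounds-checked here, whereas daxpy_kernel_8 relies
       on the caller to round n down to a multiple of the step */
    for (; i + v_nlanes_f32 <= n; i += v_nlanes_f32)
        v_storeu_f32(y + i,
                     v_muladd_f32(va, v_loadu_f32(x + i), v_loadu_f32(y + i)));
#endif
    for (; i < n; i++)          /* scalar tail for the remainder */
        y[i] += a * x[i];
}

int main(void)
{
    float x[11], y[11];
    for (int i = 0; i < 11; i++) { x[i] = (float)i; y[i] = 1.0f; }
    axpy_f32(11, 2.0f, x, y);   /* y = 2*x + y */
    for (int i = 0; i < 11; i++)
        printf("%g ", y[i]);    /* expect: 1 3 5 ... 21 */
    printf("\n");
    return 0;
}

The guard-plus-fallback shape is the point of patch 2: the same v_muladd_f32 call compiles to a single FMA instruction where the target provides one and to mul+add everywhere else, which is also why patch 3 can drop the default FMA4 flags without touching any kernel code.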