Merge pull request #2853 from Qiyu8/usimd-daxpy

Optimize the performance of daxpy by using universal intrinsics
This commit is contained in:
Martin Kroeker 2020-09-27 23:19:59 +02:00 committed by GitHub
commit ba31c8f5f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 177 additions and 19 deletions

View File

@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
"-DHAVE_AVX -DHAVE_FMA4" "-DHAVE_AVX"
#define LIBNAME "bulldozer" #define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER" #define CORENAME "BULLDOZER"
#endif #endif
@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" "-DHAVE_AVX -DHAVE_FMA3"
#define LIBNAME "piledriver" #define LIBNAME "piledriver"
#define CORENAME "PILEDRIVER" #define CORENAME "PILEDRIVER"
#endif #endif
@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" "-DHAVE_AVX -DHAVE_FMA3"
#define LIBNAME "steamroller" #define LIBNAME "steamroller"
#define CORENAME "STEAMROLLER" #define CORENAME "STEAMROLLER"
#endif #endif
@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" "-DHAVE_AVX -DHAVE_FMA3"
#define LIBNAME "excavator" #define LIBNAME "excavator"
#define CORENAME "EXCAVATOR" #define CORENAME "EXCAVATOR"
#endif #endif

71
kernel/simd/intrin.h Normal file
View File

@ -0,0 +1,71 @@
#ifndef _INTRIN_H_
#define _INTRIN_H_
/*
 * Universal-intrinsics front end: defines portable inline qualifiers,
 * includes whichever x86 intrinsic headers the build flags enable, and
 * dispatches to the widest available SIMD wrapper header.
 * NOTE(review): the guard name "_INTRIN_H_" is a reserved identifier
 * (leading underscore followed by an uppercase letter, C11 7.1.3) —
 * consider renaming it to avoid clashing with the implementation.
 */
/* Portable "inline": MSVC spells it __inline; strict-ANSI GCC needs
   __inline__; unknown compilers fall back to no qualifier at all. */
#if defined(_MSC_VER)
#define BLAS_INLINE __inline
#elif defined(__GNUC__)
#if defined(__STRICT_ANSI__)
#define BLAS_INLINE __inline__
#else
#define BLAS_INLINE inline
#endif
#else
#define BLAS_INLINE
#endif
/* BLAS_FINLINE: internal-linkage helper that is (force-)inlined where the
   compiler supports it; plain "static" is the lowest common denominator. */
#ifdef _MSC_VER
#define BLAS_FINLINE static __forceinline
#elif defined(__GNUC__)
#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
#else
#define BLAS_FINLINE static
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Pull in the intrinsic headers advertised by the build's HAVE_* flags.
/** SSE **/
#ifdef HAVE_SSE
#include <xmmintrin.h>
#endif
/** SSE2 **/
#ifdef HAVE_SSE2
#include <emmintrin.h>
#endif
/** SSE3 **/
#ifdef HAVE_SSE3
#include <pmmintrin.h>
#endif
/** SSSE3 **/
#ifdef HAVE_SSSE3
#include <tmmintrin.h>
#endif
/** SSE41 **/
#ifdef HAVE_SSE4_1
#include <smmintrin.h>
#endif
/** AVX **/
#ifdef HAVE_AVX
#include <immintrin.h>
#endif
// Dispatch to the widest SIMD wrapper the target supports.
// NOTE(review): AVX-512 is selected on HAVE_AVX512VL/HAVE_AVX512BF16 while
// intrin_avx512.h itself uses plain AVX-512F ops — confirm the flag choice.
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
#include "intrin_avx512.h"
#elif defined(HAVE_AVX2)
#include "intrin_avx.h"
#elif defined(HAVE_SSE2)
#include "intrin_sse.h"
#endif
/* Scalar fallback: no wrapper was selected, so kernels must not take the
   vector path (V_SIMD == 0 disables the #if V_SIMD branches). */
#ifndef V_SIMD
#define V_SIMD 0
#define V_SIMD_F64 0
#endif
#ifdef __cplusplus
}
#endif
#endif // _INTRIN_H_

29
kernel/simd/intrin_avx.h Normal file
View File

@ -0,0 +1,29 @@
/* AVX/AVX2 flavor of the universal intrinsics: 256-bit vectors, 8 f32 lanes. */
#define V_SIMD 256
/* NOTE(review): this header defines no double-precision vector type or
   operations, yet advertises V_SIMD_F64 = 1 — verify that no kernel keys
   f64 vectorization off this flag. */
#define V_SIMD_F64 1
/*
Data Type
*/
typedef __m256 v_f32;
#define v_nlanes_f32 8
/*
arithmetic
*/
#define v_add_f32 _mm256_add_ps
#define v_mul_f32 _mm256_mul_ps
#ifdef HAVE_FMA3
// multiply and add, a*b + c (single fused instruction, one rounding)
#define v_muladd_f32 _mm256_fmadd_ps
#else
// multiply and add, a*b + c (mul+add fallback; rounds twice)
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return v_add_f32(v_mul_f32(a, b), c); }
#endif // !HAVE_FMA3
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm256_loadu_ps
// unaligned store
#define v_storeu_f32 _mm256_storeu_ps
// broadcast one scalar to all 8 lanes
#define v_setall_f32(VAL) _mm256_set1_ps(VAL)

View File

@ -0,0 +1,21 @@
/* AVX-512 flavor of the universal intrinsics: 512-bit vectors, 16 f32 lanes. */
#define V_SIMD 512
/* NOTE(review): no double-precision vector type or operations are defined
   here although V_SIMD_F64 is advertised as 1 — confirm before any kernel
   relies on f64 support from this header. */
#define V_SIMD_F64 1
/*
Data Type
*/
typedef __m512 v_f32;
#define v_nlanes_f32 16
/*
arithmetic
*/
#define v_add_f32 _mm512_add_ps
#define v_mul_f32 _mm512_mul_ps
// multiply and add, a*b + c (AVX-512F always provides FMA)
#define v_muladd_f32 _mm512_fmadd_ps
/*
memory
*/
// unaligned load; _mm512_loadu_ps takes void const*, so cast to
// const float* — casting to const __m512* (as before) wrongly implied
// a vector-typed, potentially aligned object behind the pointer
#define v_loadu_f32(PTR) _mm512_loadu_ps((const float*)(PTR))
// unaligned store
#define v_storeu_f32 _mm512_storeu_ps
// broadcast one scalar to all 16 lanes
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)

30
kernel/simd/intrin_sse.h Normal file
View File

@ -0,0 +1,30 @@
/* SSE2 flavor of the universal intrinsics: 128-bit vectors, 4 f32 lanes. */
#define V_SIMD 128
/* NOTE(review): this header defines no double-precision vector type or
   operations, yet advertises V_SIMD_F64 = 1 — verify that no kernel keys
   f64 vectorization off this flag. */
#define V_SIMD_F64 1
/*
Data Type
*/
typedef __m128 v_f32;
#define v_nlanes_f32 4
/*
arithmetic
*/
#define v_add_f32 _mm_add_ps
#define v_mul_f32 _mm_mul_ps
#ifdef HAVE_FMA3
// multiply and add, a*b + c (FMA3 fused instruction, one rounding)
#define v_muladd_f32 _mm_fmadd_ps
#elif defined(HAVE_FMA4)
// multiply and add, a*b + c (AMD FMA4 encoding)
#define v_muladd_f32 _mm_macc_ps
#else
// multiply and add, a*b + c (mul+add fallback; rounds twice)
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return v_add_f32(v_mul_f32(a, b), c); }
#endif // HAVE_FMA3
/*
memory
*/
// unaligned load
#define v_loadu_f32 _mm_loadu_ps
// unaligned store
#define v_storeu_f32 _mm_storeu_ps
// broadcast one scalar to all 4 lanes
#define v_setall_f32(VAL) _mm_set1_ps(VAL)

View File

@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "daxpy_microk_sandy-2.c" #include "daxpy_microk_sandy-2.c"
#endif #endif
#ifndef HAVE_KERNEL_8

#include "../simd/intrin.h"

/*
 * Generic AXPY kernel: y[i] += a * x[i] for i in [0, n).
 * Callers guarantee n is a non-negative multiple of 8 (unit stride).
 *
 * FIX(review): in this translation unit FLOAT is double (daxpy), but the
 * universal-intrinsics layer currently provides only f32 operations, so
 * the original "#if V_SIMD" path loaded doubles through float intrinsics —
 * numerically wrong. The vector path is now gated on !defined(DOUBLE).
 * FIX(review): the original vector loop had no tail handling; with AVX-512
 * vstep is 16 while n is only guaranteed a multiple of 8, so the loop could
 * read/write past the end of x and y. Full vectors are now followed by a
 * scalar remainder loop.
 */
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;
	FLOAT a = *alpha;
#if V_SIMD && !defined(DOUBLE)
	v_f32 __alpha = v_setall_f32(a);
	const int vstep = v_nlanes_f32;
	/* Process only complete vectors of vstep lanes. */
	const BLASLONG vn = n & -((BLASLONG)vstep);
	for (; i < vn; i += vstep) {
		v_f32 tmp = v_muladd_f32(__alpha, v_loadu_f32(x + i), v_loadu_f32(y + i));
		v_storeu_f32(y + i, tmp);
	}
	/* Scalar tail for the remaining (< vstep) elements. */
	for (; i < n; i++)
		y[i] += a * x[i];
#else
	/* Scalar path, manually unrolled by 8 (n is a multiple of 8). */
	while (i < n)
	{
		y[i]   += a * x[i];
		y[i+1] += a * x[i+1];
		y[i+2] += a * x[i+2];
		y[i+3] += a * x[i+3];
		y[i+4] += a * x[i+4];
		y[i+5] += a * x[i+5];
		y[i+6] += a * x[i+6];
		y[i+7] += a * x[i+7];
		i += 8;
	}
#endif
}

#endif