diff --git a/getarch.c b/getarch.c
index 83043bdf2..e2c22d3a0 100644
--- a/getarch.c
+++ b/getarch.c
@@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
 		"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
 		"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
-		"-DHAVE_AVX -DHAVE_FMA4"
+		"-DHAVE_AVX"
 #define LIBNAME   "bulldozer"
 #define CORENAME  "BULLDOZER"
 #endif
@@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-		"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+		"-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME   "piledriver"
 #define CORENAME  "PILEDRIVER"
 #endif
@@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-		"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+		"-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME   "steamroller"
 #define CORENAME  "STEAMROLLER"
 #endif
@@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 		"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 		"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
 		"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
-		"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+		"-DHAVE_AVX -DHAVE_FMA3"
 #define LIBNAME   "excavator"
 #define CORENAME  "EXCAVATOR"
 #endif
diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h
new file mode 100644
index 000000000..5997bb6ac
--- /dev/null
+++ b/kernel/simd/intrin.h
@@ -0,0 +1,71 @@
+#ifndef _INTRIN_H_
+#define _INTRIN_H_
+
+#if defined(_MSC_VER)
+#define BLAS_INLINE __inline
+#elif defined(__GNUC__)
+#if defined(__STRICT_ANSI__)
+#define BLAS_INLINE __inline__
+#else
+#define BLAS_INLINE inline
+#endif
+#else
+#define BLAS_INLINE
+#endif
+
+#ifdef _MSC_VER
+#define BLAS_FINLINE static __forceinline
+#elif defined(__GNUC__)
+#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline))
+#else
+#define BLAS_FINLINE static
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// include headers
+/** SSE **/
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif
+/** SSE2 **/
+#ifdef HAVE_SSE2
+#include <emmintrin.h>
+#endif
+/** SSE3 **/
+#ifdef HAVE_SSE3
+#include <pmmintrin.h>
+#endif
+/** SSSE3 **/
+#ifdef HAVE_SSSE3
+#include <tmmintrin.h>
+#endif
+/** SSE41 **/
+#ifdef HAVE_SSE4_1
+#include <smmintrin.h>
+#endif
+
+/** AVX **/
+#ifdef HAVE_AVX
+#include <immintrin.h>
+#endif
+
+// distribute
+#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
+#include "intrin_avx512.h"
+#elif defined(HAVE_AVX2)
+#include "intrin_avx.h"
+#elif defined(HAVE_SSE2)
+#include "intrin_sse.h"
+#endif
+
+#ifndef V_SIMD
+    #define V_SIMD 0
+    #define V_SIMD_F64 0
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // _INTRIN_H_
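Note: a minimal sketch of how a kernel consumes the layer above (scale_add_f32 is a hypothetical name, not part of this patch). One source compiles to SSE2, AVX2, or AVX-512 code depending on which HAVE_* macros getarch put into CFLAGS; the scalar tail covers lengths that are not a multiple of the vector width.

    #include "intrin.h"

    static void scale_add_f32(int n, float alpha, const float *x, float *y)
    {
        int i = 0;
    #if V_SIMD
        const int vstep = v_nlanes_f32;      /* 4, 8, or 16 lanes */
        v_f32 va = v_setall_f32(alpha);      /* broadcast alpha to every lane */
        for (; i <= n - vstep; i += vstep) { /* whole vectors only */
            v_f32 tmp = v_muladd_f32(va, v_loadu_f32(x + i), v_loadu_f32(y + i));
            v_storeu_f32(y + i, tmp);
        }
    #endif
        for (; i < n; i++)                   /* scalar tail */
            y[i] += alpha * x[i];
    }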
diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h
new file mode 100644
index 000000000..f6257ae98
--- /dev/null
+++ b/kernel/simd/intrin_avx.h
@@ -0,0 +1,39 @@
+#define V_SIMD 256
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m256  v_f32;
+typedef __m256d v_f64;
+#define v_nlanes_f32 8
+#define v_nlanes_f64 4
+/*
+arithmetic
+*/
+#define v_add_f32 _mm256_add_ps
+#define v_add_f64 _mm256_add_pd
+#define v_mul_f32 _mm256_mul_ps
+#define v_mul_f64 _mm256_mul_pd
+
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm256_fmadd_ps
+    #define v_muladd_f64 _mm256_fmadd_pd
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+    BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
+    { return v_add_f64(v_mul_f64(a, b), c); }
+#endif // !HAVE_FMA3
+
+/*
+memory
+*/
+// unaligned load/store
+#define v_loadu_f32 _mm256_loadu_ps
+#define v_loadu_f64 _mm256_loadu_pd
+#define v_storeu_f32 _mm256_storeu_ps
+#define v_storeu_f64 _mm256_storeu_pd
+#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
+#define v_setall_f64(VAL) _mm256_set1_pd(VAL)
diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h
new file mode 100644
index 000000000..cb116a9a3
--- /dev/null
+++ b/kernel/simd/intrin_avx512.h
@@ -0,0 +1,29 @@
+#define V_SIMD 512
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m512  v_f32;
+typedef __m512d v_f64;
+#define v_nlanes_f32 16
+#define v_nlanes_f64 8
+/*
+arithmetic
+*/
+#define v_add_f32 _mm512_add_ps
+#define v_add_f64 _mm512_add_pd
+#define v_mul_f32 _mm512_mul_ps
+#define v_mul_f64 _mm512_mul_pd
+// multiply and add, a*b + c
+#define v_muladd_f32 _mm512_fmadd_ps
+#define v_muladd_f64 _mm512_fmadd_pd
+/*
+memory
+*/
+// unaligned load/store
+#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
+#define v_loadu_f64(PTR) _mm512_loadu_pd((const __m512d*)(PTR))
+#define v_storeu_f32 _mm512_storeu_ps
+#define v_storeu_f64 _mm512_storeu_pd
+#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
+#define v_setall_f64(VAL) _mm512_set1_pd(VAL)
diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h
new file mode 100644
index 000000000..260112028
--- /dev/null
+++ b/kernel/simd/intrin_sse.h
@@ -0,0 +1,41 @@
+#define V_SIMD 128
+#define V_SIMD_F64 1
+/*
+Data Type
+*/
+typedef __m128  v_f32;
+typedef __m128d v_f64;
+#define v_nlanes_f32 4
+#define v_nlanes_f64 2
+/*
+arithmetic
+*/
+#define v_add_f32 _mm_add_ps
+#define v_add_f64 _mm_add_pd
+#define v_mul_f32 _mm_mul_ps
+#define v_mul_f64 _mm_mul_pd
+#ifdef HAVE_FMA3
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_fmadd_ps
+    #define v_muladd_f64 _mm_fmadd_pd
+#elif defined(HAVE_FMA4)
+    // multiply and add, a*b + c
+    #define v_muladd_f32 _mm_macc_ps
+    #define v_muladd_f64 _mm_macc_pd
+#else
+    // multiply and add, a*b + c
+    BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
+    { return v_add_f32(v_mul_f32(a, b), c); }
+    BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
+    { return v_add_f64(v_mul_f64(a, b), c); }
+#endif // HAVE_FMA3
+/*
+memory
+*/
+// unaligned load/store
+#define v_loadu_f32 _mm_loadu_ps
+#define v_loadu_f64 _mm_loadu_pd
+#define v_storeu_f32 _mm_storeu_ps
+#define v_storeu_f64 _mm_storeu_pd
+#define v_setall_f32(VAL) _mm_set1_ps(VAL)
+#define v_setall_f64(VAL) _mm_set1_pd(VAL)
diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
index d84c0c221..b62e3dcb3 100644
--- a/kernel/x86_64/daxpy.c
+++ b/kernel/x86_64/daxpy.c
@@ -45,28 +45,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "daxpy_microk_sandy-2.c"
 #endif
 
-
 #ifndef HAVE_KERNEL_8
+#include "../simd/intrin.h"
 
 static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 {
 	BLASLONG register i = 0;
 	FLOAT a = *alpha;
-
+#if V_SIMD_F64
+	// daxpy operates on doubles, so use the f64 lanes (the f32 ops would
+	// misread the data); n arrives as a multiple of 16, hence no tail here
+	v_f64 __alpha, tmp;
+	__alpha = v_setall_f64(a);
+	const int vstep = v_nlanes_f64;
+	for (; i < n; i += vstep) {
+		tmp = v_muladd_f64(__alpha, v_loadu_f64(x + i), v_loadu_f64(y + i));
+		v_storeu_f64(y + i, tmp);
+	}
+#else
 	while(i < n)
-	{
-	y[i]   += a * x[i];
-	y[i+1] += a * x[i+1];
-	y[i+2] += a * x[i+2];
-	y[i+3] += a * x[i+3];
-	y[i+4] += a * x[i+4];
-	y[i+5] += a * x[i+5];
-	y[i+6] += a * x[i+6];
-	y[i+7] += a * x[i+7];
-	i+=8 ;
-
-	}
-
+	{
+		y[i]   += a * x[i];
+		y[i+1] += a * x[i+1];
+		y[i+2] += a * x[i+2];
+		y[i+3] += a * x[i+3];
+		y[i+4] += a * x[i+4];
+		y[i+5] += a * x[i+5];
+		y[i+6] += a * x[i+6];
+		y[i+7] += a * x[i+7];
+		i+=8 ;
+	}
+#endif
 }
 
 #endif
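Note: a hypothetical driver (not part of this patch) showing the contract the kernel relies on: the unrolled/vector body is only ever handed a length that is a multiple of 16, so neither the SIMD loop nor the 8-way scalar unroll needs tail handling; the caller peels the remainder, as the axpy front ends in OpenBLAS do.

    static void daxpy_simple(BLASLONG n, FLOAT alpha, FLOAT *x, FLOAT *y)
    {
        BLASLONG n1 = n & -16;                /* largest multiple of 16 <= n */
        if (n1)
            daxpy_kernel_8(n1, x, y, &alpha); /* vector / unrolled body */
        for (BLASLONG i = n1; i < n; i++)     /* scalar remainder */
            y[i] += alpha * x[i];
    }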