diff --git a/README.md b/README.md index f8226f5cb..6d44129c2 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,10 @@ Building OpenBLAS requires the following to be installed: Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. -The full target list is in the file `TargetList.txt`. +The full target list is in the file `TargetList.txt`. For building with `cmake`, the +usual conventions apply, i.e. create a build directory either underneath the toplevel +OpenBLAS source directory or separate from it, and invoke `cmake` there with the path +to the source tree and any build options you plan to set. ### Cross compile @@ -152,13 +155,17 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Falkor**: same as A57 (different cpu specifications) - **ThunderX**: Optimized some Level-1 functions - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 +- **ThunderX3T110** - **TSV110**: Optimized some Level-3 helper functions - **EMAG 8180**: preliminary support based on A57 +- **Neoverse N1**: (AWS Graviton2) preliminary support +- **Apple Vortex**: preliminary support based on ARMV8 #### PPC/PPC64 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. +- **POWER10**: #### IBM zEnterprise System @@ -226,7 +233,8 @@ We provide the following functions to control the number of threads at runtime: void goto_set_num_threads(int num_threads); void openblas_set_num_threads(int num_threads); ``` - +Note that these are only used once at library initialization, and are not available for +fine-tuning thread numbers in individual BLAS calls. If you compile this library with `USE_OPENMP=1`, you should use the above functions too. ## Reporting bugs diff --git a/getarch.c b/getarch.c index 83043bdf2..e2c22d3a0 100644 --- a/getarch.c +++ b/getarch.c @@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ - "-DHAVE_AVX -DHAVE_FMA4" + "-DHAVE_AVX" #define LIBNAME "bulldozer" #define CORENAME "BULLDOZER" #endif @@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "piledriver" #define CORENAME "PILEDRIVER" #endif @@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "steamroller" #define CORENAME "STEAMROLLER" #endif @@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "excavator" #define CORENAME "EXCAVATOR" #endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ec02e09ad..d0cda7fb6 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -151,9 +151,9 @@ endif ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy.c -DCOPYKERNEL = dcopy.c +DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy.c -ZCOPYKERNEL = zcopy.c +ZCOPYKERNEL = zcopy_power10.c # SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/power/dcopy_microk_power10.c b/kernel/power/dcopy_microk_power10.c new file mode 100644 index 000000000..8940e0db9 --- /dev/null +++ b/kernel/power/dcopy_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_64 1 + +static void dcopy_kernel_64 (long n, double *x, double *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" + "lxvp 54, 352(%2) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" + "lxvp 62, 480(%2) \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "stxvp 34, 32(%3) \n\t" + "lxvp 34, 32(%2) \n\t" + "stxvp 36, 64(%3) \n\t" + "lxvp 36, 64(%2) \n\t" + "stxvp 38, 96(%3) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxvp 40, 128(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "stxvp 42, 160(%3) \n\t" + "lxvp 42, 160(%2) \n\t" + "stxvp 44, 192(%3) \n\t" + "lxvp 44, 192(%2) \n\t" + "stxvp 46, 224(%3) \n\t" + "lxvp 46, 224(%2) \n\t" + + "stxvp 48, 256(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "stxvp 50, 288(%3) \n\t" + "lxvp 50, 288(%2) \n\t" + "stxvp 52, 320(%3) \n\t" + "lxvp 52, 320(%2) \n\t" + "stxvp 54, 352(%3) \n\t" + "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "stxvp 58, 416(%3) \n\t" + "lxvp 58, 416(%2) \n\t" + "stxvp 60, 448(%3) \n\t" + "lxvp 60, 448(%2) \n\t" + "stxvp 62, 480(%3) \n\t" + "lxvp 62, 480(%2) \n\t" + + "addi %3, %3, 512 \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + "stxvp 48, 256(%3) \n\t" + "stxvp 50, 288(%3) \n\t" + "stxvp 52, 320(%3) \n\t" + "stxvp 54, 352(%3) \n\t" + "stxvp 56, 384(%3) \n\t" + "stxvp 58, 416(%3) \n\t" + "stxvp 60, 448(%3) \n\t" + "stxvp 62, 480(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c new file mode 100644 index 000000000..32530d570 --- /dev/null +++ b/kernel/power/dcopy_power10.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "dcopy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_64 + +static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dcopy_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zcopy_microk_power10.c b/kernel/power/zcopy_microk_power10.c new file mode 100644 index 000000000..f2f2119a3 --- /dev/null +++ b/kernel/power/zcopy_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void zcopy_kernel_32 (long n, double *x, double *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" + "lxvp 54, 352(%2) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" + "lxvp 62, 480(%2) \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "stxvp 34, 32(%3) \n\t" + "lxvp 34, 32(%2) \n\t" + "stxvp 36, 64(%3) \n\t" + "lxvp 36, 64(%2) \n\t" + "stxvp 38, 96(%3) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxvp 40, 128(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "stxvp 42, 160(%3) \n\t" + "lxvp 42, 160(%2) \n\t" + "stxvp 44, 192(%3) \n\t" + "lxvp 44, 192(%2) \n\t" + "stxvp 46, 224(%3) \n\t" + "lxvp 46, 224(%2) \n\t" + + "stxvp 48, 256(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "stxvp 50, 288(%3) \n\t" + "lxvp 50, 288(%2) \n\t" + "stxvp 52, 320(%3) \n\t" + "lxvp 52, 320(%2) \n\t" + "stxvp 54, 352(%3) \n\t" + "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "stxvp 58, 416(%3) \n\t" + "lxvp 58, 416(%2) \n\t" + "stxvp 60, 448(%3) \n\t" + "lxvp 60, 448(%2) \n\t" + "stxvp 62, 480(%3) \n\t" + "lxvp 62, 480(%2) \n\t" + + "addi %3, %3, 512 \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + "stxvp 48, 256(%3) \n\t" + "stxvp 50, 288(%3) \n\t" + "stxvp 52, 320(%3) \n\t" + "stxvp 54, 352(%3) \n\t" + "stxvp 56, 384(%3) \n\t" + "stxvp 58, 416(%3) \n\t" + "stxvp 60, 448(%3) \n\t" + "stxvp 62, 480(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/zcopy_power10.c b/kernel/power/zcopy_power10.c new file mode 100644 index 000000000..99d463b02 --- /dev/null +++ b/kernel/power/zcopy_power10.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "zcopy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zcopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h new file mode 100644 index 000000000..5997bb6ac --- /dev/null +++ b/kernel/simd/intrin.h @@ -0,0 +1,71 @@ +#ifndef _INTRIN_H_ +#define _INTRIN_H_ + +#if defined(_MSC_VER) +#define BLAS_INLINE __inline +#elif defined(__GNUC__) +#if defined(__STRICT_ANSI__) +#define BLAS_INLINE __inline__ +#else +#define BLAS_INLINE inline +#endif +#else +#define BLAS_INLINE +#endif + +#ifdef _MSC_VER +#define BLAS_FINLINE static __forceinline +#elif defined(__GNUC__) +#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline)) +#else +#define BLAS_FINLINE static +#endif + +#ifdef __cplusplus +extern "C" { +#endif +// include head +/** SSE **/ +#ifdef HAVE_SSE +#include +#endif +/** SSE2 **/ +#ifdef HAVE_SSE2 +#include +#endif +/** SSE3 **/ +#ifdef HAVE_SSE3 +#include +#endif +/** SSSE3 **/ +#ifdef HAVE_SSSE3 +#include +#endif +/** SSE41 **/ +#ifdef HAVE_SSE4_1 +#include +#endif + +/** AVX **/ +#ifdef HAVE_AVX +#include +#endif + +// distribute +#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16) +#include "intrin_avx512.h" +#elif defined(HAVE_AVX2) +#include "intrin_avx.h" +#elif defined(HAVE_SSE2) +#include "intrin_sse.h" +#endif + +#ifndef V_SIMD + #define V_SIMD 0 + #define V_SIMD_F64 0 +#endif + +#ifdef __cplusplus +} +#endif +#endif // _INTRIN_H_ diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h new file mode 100644 index 000000000..f6257ae98 --- /dev/null +++ b/kernel/simd/intrin_avx.h @@ -0,0 +1,29 @@ +#define V_SIMD 256 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m256 v_f32; +#define v_nlanes_f32 8 +/* +arithmetic +*/ +#define v_add_f32 _mm256_add_ps +#define v_mul_f32 _mm256_mul_ps + +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm256_fmadd_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // !HAVE_FMA3 + +/* +memory +*/ +// unaligned load +#define v_loadu_f32 _mm256_loadu_ps +#define v_storeu_f32 _mm256_storeu_ps +#define v_setall_f32(VAL) _mm256_set1_ps(VAL) \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h new file mode 100644 index 000000000..cb116a9a3 --- /dev/null +++ b/kernel/simd/intrin_avx512.h @@ -0,0 +1,21 @@ +#define V_SIMD 512 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m512 v_f32; +#define v_nlanes_f32 16 +/* +arithmetic +*/ +#define v_add_f32 _mm512_add_ps +#define v_mul_f32 _mm512_mul_ps +// multiply and add, a*b + c +#define v_muladd_f32 _mm512_fmadd_ps +/* +memory +*/ +// unaligned load +#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) +#define v_storeu_f32 _mm512_storeu_ps +#define v_setall_f32(VAL) _mm512_set1_ps(VAL) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h new file mode 100644 index 000000000..260112028 --- /dev/null +++ b/kernel/simd/intrin_sse.h @@ -0,0 +1,30 @@ +#define V_SIMD 128 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m128 v_f32; +#define v_nlanes_f32 4 +/* +arithmetic +*/ +#define v_add_f32 _mm_add_ps +#define v_mul_f32 _mm_mul_ps +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm_fmadd_ps +#elif defined(HAVE_FMA4) + // multiply and add, a*b + c + #define v_muladd_f32 _mm_macc_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // HAVE_FMA3 +/* +memory +*/ +// unaligned load +#define v_loadu_f32 _mm_loadu_ps +#define v_storeu_f32 _mm_storeu_ps +#define v_setall_f32(VAL) _mm_set1_ps(VAL) \ No newline at end of file diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index d84c0c221..b62e3dcb3 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_sandy-2.c" #endif - #ifndef HAVE_KERNEL_8 +#include"../simd/intrin.h" static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; - +#if V_SIMD + v_f32 __alpha, tmp; + __alpha = v_setall_f32(*alpha); + const int vstep = v_nlanes_f32; + for (; i < n; i += vstep) { + tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); + v_storeu_f32(y + i, tmp); + } +#else while(i < n) - { - y[i] += a * x[i]; - y[i+1] += a * x[i+1]; - y[i+2] += a * x[i+2]; - y[i+3] += a * x[i+3]; - y[i+4] += a * x[i+4]; - y[i+5] += a * x[i+5]; - y[i+6] += a * x[i+6]; - y[i+7] += a * x[i+7]; - i+=8 ; - - } - + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + } +#endif } #endif diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index d68481f7e..61b016f16 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -140,13 +140,16 @@ * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0 ) + PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, + $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL PARAMETER ( MULTPL = 4.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, - $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z + $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, + $ SAFMN2, SAFMX2 + INTEGER COUNT * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2 @@ -157,7 +160,11 @@ * .. * .. Executable Statements .. * + SAFMIN = DLAMCH( 'S' ) EPS = DLAMCH( 'P' ) + SAFMN2 = DLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) / + $ LOG( DLAMCH( 'B' ) ) / TWO ) + SAFMX2 = ONE / SAFMN2 IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO @@ -212,7 +219,24 @@ * Complex eigenvalues, or real (almost) equal eigenvalues. * Make diagonal elements equal. * + COUNT = 0 SIGMA = B + C + 10 CONTINUE + COUNT = COUNT + 1 + SCALE = MAX( ABS(TEMP), ABS(SIGMA) ) + IF( SCALE.GE.SAFMX2 ) THEN + SIGMA = SIGMA * SAFMN2 + TEMP = TEMP * SAFMN2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + IF( SCALE.LE.SAFMN2 ) THEN + SIGMA = SIGMA * SAFMX2 + TEMP = TEMP * SAFMX2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + P = HALF*TEMP TAU = DLAPY2( SIGMA, TEMP ) CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) ) SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA ) diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index 1163446fa..e678305f2 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -140,13 +140,16 @@ * * .. Parameters .. REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0 ) + PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, + $ TWO = 2.0E+0 ) REAL MULTPL PARAMETER ( MULTPL = 4.0E+0 ) * .. * .. Local Scalars .. REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, - $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z + $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, + $ SAFMN2, SAFMX2 + INTEGER COUNT * .. * .. External Functions .. REAL SLAMCH, SLAPY2 @@ -157,7 +160,11 @@ * .. * .. Executable Statements .. * + SAFMIN = SLAMCH( 'S' ) EPS = SLAMCH( 'P' ) + SAFMN2 = SLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) / + $ LOG( SLAMCH( 'B' ) ) / TWO ) + SAFMX2 = ONE / SAFMN2 IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO @@ -212,7 +219,24 @@ * Complex eigenvalues, or real (almost) equal eigenvalues. * Make diagonal elements equal. * + COUNT = 0 SIGMA = B + C + 10 CONTINUE + COUNT = COUNT + 1 + SCALE = MAX( ABS(TEMP), ABS(SIGMA) ) + IF( SCALE.GE.SAFMX2 ) THEN + SIGMA = SIGMA * SAFMN2 + TEMP = TEMP * SAFMN2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + IF( SCALE.LE.SAFMN2 ) THEN + SIGMA = SIGMA * SAFMX2 + TEMP = TEMP * SAFMX2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + P = HALF*TEMP TAU = SLAPY2( SIGMA, TEMP ) CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) ) SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA )