From 3b4c016110a7de5e52a76045aaa4be25965c8e6c Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 29 Nov 2020 17:17:07 +0800 Subject: [PATCH 01/20] link math lib on FreeBSD --- utest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 357e61301..0c99e0d12 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -58,7 +58,7 @@ add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) -if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") target_link_libraries(${OpenBLAS_utest_bin} m) endif() From ca17d3dc3d51589c8048f23355b2ac1cdf32771c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 13:19:51 +0100 Subject: [PATCH 02/20] Restore RISCV entries accidentally trashed by my PR 3005 --- getarch.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/getarch.c b/getarch.c index cf0be8d23..f107da3e9 100644 --- a/getarch.c +++ b/getarch.c @@ -983,6 +983,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1268,6 +1282,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1322,6 +1351,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 2e99e2699b6d381a7d5709ad2e0dbcd0269826ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 15:32:17 +0100 Subject: [PATCH 03/20] Add workaround for gcc 4.6 miscompiling assembly kernels with -mavx --- Makefile.system | 1 + Makefile.x86_64 | 4 ++++ c_check | 12 +++++++++++ getarch.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/Makefile.system b/Makefile.system index afc8ee207..b5974f872 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1561,6 +1561,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export NO_AVX2 export BUILD_BFLOAT16 export SBGEMM_UNROLL_M diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 43bfc9ecd..d806a4ed2 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -20,14 +20,18 @@ ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif +ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx FCOMMON_OPT += -mavx endif +endif +ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif +endif ifdef HAVE_FMA3 CCOMMON_OPT += -mfma FCOMMON_OPT += -mfma diff --git a/c_check b/c_check index 405963ae6..efea9b0fb 100644 --- a/c_check +++ b/c_check @@ -229,6 +229,16 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); + +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { +$no_avx2 = 0; +$oldgcc = 0; +$data = `$compiler_name -dumpversion`; +if ($data <= 4.6) { +$no_avx2 = 1; +$oldgcc = 1; +} +} $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { eval "use File::Temp qw(tempfile)"; @@ -368,6 +378,8 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; +print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; +print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; diff --git a/getarch.c b/getarch.c index cf0be8d23..9344defb5 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif +#endif #ifdef FORCE_SKYLAKEX #ifdef NO_AVX512 @@ -551,6 +562,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ @@ -565,6 +586,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "zen" #define CORENAME "ZEN" #endif +#endif #ifdef FORCE_SSE_GENERIC @@ -983,6 +1005,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1268,6 +1304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1322,6 +1373,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 62a2eb884f0d364716a94d12284e339d20ffcc29 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 15:33:07 +0100 Subject: [PATCH 04/20] Add SSE flags for x86 --- Makefile.x86 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile.x86 b/Makefile.x86 index f310f4973..0e27264d8 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -59,9 +59,11 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif - +ifdef HAVE_SSE2 +CCOMMON_OPT += -msse2 +FCOMMON_OPT += -msse2 +endif ifdef HAVE_SSE3 -ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 ifdef HAVE_SSSE3 @@ -73,5 +75,4 @@ CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif endif -endif From 7d46e31de1a206ea55ae31e7a0a1ae4b704458e0 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 29 Nov 2020 15:28:28 -0600 Subject: [PATCH 05/20] POWER10: Optimize dgemv_n Handling as 4x8 with vector pairs gives better performance than existing code in POWER10. --- kernel/power/dgemv_n_microk_power10.c | 150 +++++++++++++++++++-- kernel/power/dgemv_n_power10.c | 185 ++------------------------ 2 files changed, 155 insertions(+), 180 deletions(-) diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index 4be8a5f9b..e47de2cb5 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -/************************************************************************************** -* 2016/03/30 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) @@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } +static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + + double *a0; + double *a1; + double *a2; + double *a3; + double *a4; + double *a5; + double *a6; + double *a7; + long tmp; + __asm__ + ( + "lxvp 34, 0( %15) \n\t" // x0, x1 + "lxvp 38, 32( %15) \n\t" // x4, x5 + + XXSPLTD_S(58,%x14,0) // alpha, alpha + "sldi %10, %17, 3 \n\t" // lda * sizeof (double) + "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha + "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha + "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha + + "li %11, 32 \n\t" + + "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda + "add %10, %10, %10 \n\t" // 2 * lda + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha + + "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda + "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda + "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda + "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda + "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda + + "lxvp 40, 0( %3) \n\t" // a0[0], a0[1] + "lxvp 42, 0( %4) \n\t" // a1[0], a1[1] + "lxvp 44, 0( %5) \n\t" // a2[0], a2[1] + "lxvp 46, 0( %6) \n\t" // a3[0], a3[1] + "lxvp 50, 0( %7) \n\t" // a4[0] + "lxvp 52, 0( %8) \n\t" // a5[0] + "lxvp 54, 0( %9) \n\t" // a6[0] + "lxvp 56, 0( %10) \n\t" // a7[0] + + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "lxvpx 50, %7, %11 \n\t" // a4[0] + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "lxvpx 52, %8, %11 \n\t" // a5[0] + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "lxvpx 54, %9, %11 \n\t" // a6[0] + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "lxvpx 56, %10, %11 \n\t" // a7[0] + "addi %11, %11, 32 \n\t" + + "stxvp 36, 0( %2) \n\t" // y0, y1 + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "stxvp 36, 0( %2) \n\t" // y0, y1 + + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3), // 6 + "=&b" (a4), // 7 + "=&b" (a5), // 8 + "=&b" (a6), // 9 + "=&b" (a7), // 10 + "=b" (tmp) + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 14 + "r" (x), // 15 + "3" (ap), // 16 + "4" (lda) // 17 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48", + "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c index ad5f1ba0d..aba15ab4e 100644 --- a/kernel/power/dgemv_n_power10.c +++ b/kernel/power/dgemv_n_power10.c @@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#include - -typedef __vector unsigned char vec_t; -typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef __vector_pair __attribute__((aligned(8))) vecp_t; #include "dgemv_n_microk_power10.c" -#define MMA(X, APTR, ACC) \ - rX = (vec_t *) & X; \ - rowA = *((vecp_t*)((void*)&APTR)); \ - __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); - -#define SAVE(ACC, Z) \ - rowC = (v4sf_t *) &y[Z]; \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - result[0][1] = result[1][0]; \ - result[2][1] = result[3][0]; \ - rowC[0] += valpha * result[0]; \ - rowC[1] += valpha * result[2]; - -void -dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, - FLOAT * y, FLOAT alpha) -{ - BLASLONG i, j, tmp; - FLOAT *a0 = a_ptr; - FLOAT *x1 = xo; - vector double valpha = { alpha, alpha }; - v4sf_t *rowC; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[4]; - vecp_t rowA; - vec_t *rX; - tmp = (n / 32) * 32; - for (i = 0; i < tmp; i += 32) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); - __builtin_mma_xxsetaccz (&acc2); - __builtin_mma_xxsetaccz (&acc3); - __builtin_mma_xxsetaccz (&acc4); - __builtin_mma_xxsetaccz (&acc5); - __builtin_mma_xxsetaccz (&acc6); - __builtin_mma_xxsetaccz (&acc7); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i + 0); - SAVE (&acc1, i + 4); - SAVE (&acc2, i + 8); - SAVE (&acc3, i + 12); - SAVE (&acc4, i + 16); - SAVE (&acc5, i + 20); - SAVE (&acc6, i + 24); - SAVE (&acc7, i + 28); - - } - for (i = tmp; i < n; i += 4) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i); - } -} - - #define NBMAX 4096 #ifndef HAVE_KERNEL_4x4 @@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; - BLASLONG n1; BLASLONG m1; BLASLONG m2; BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda128 = lda << 7; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT *ybuffer; @@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); ybuffer = buffer; - BLASLONG n128 = n >> 7; - n1 = (n - (n128 * 128)) >> 2; - n2 = (n - (n128 * 128)) & 3; + BLASLONG n8 = n >> 3; + n2 = n & 3; m3 = m & 3 ; m1 = m & -4 ; @@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); - a_ptr += lda128; - x_ptr += 128; + dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda8; + x_ptr += 8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); a_ptr += lda4; @@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - FLOAT xbuffer[128] __attribute__ ((aligned (16))); BLASLONG j; - for ( j = 0; j < 128 ; j++) + for ( j = 0; j < 8 ; j++) { xbuffer[j] = x_ptr[0]; x_ptr += inc_x; } - dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); - a_ptr += lda128; + dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; From f6620229942eb7b670d13a527e2b22bc5ac05441 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Nov 2020 17:24:27 +0100 Subject: [PATCH 06/20] Move the version check to avoid overwriting unprocessed compiler data --- c_check | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/c_check b/c_check index efea9b0fb..a841df153 100644 --- a/c_check +++ b/c_check @@ -229,16 +229,6 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); - -if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { -$no_avx2 = 0; -$oldgcc = 0; -$data = `$compiler_name -dumpversion`; -if ($data <= 4.6) { -$no_avx2 = 1; -$oldgcc = 1; -} -} $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { eval "use File::Temp qw(tempfile)"; @@ -286,6 +276,15 @@ if ($data =~ /HAVE_C11/) { } } +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { + $no_avx2 = 0; + $oldgcc = 0; + $data = `$compiler_name -dumpversion`; + if ($data <= 4.6) { + $no_avx2 = 1; + $oldgcc = 1; + } +} $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; From 22574b474eec3220b4fe78257f66898281502bd5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Nov 2020 21:41:51 +0100 Subject: [PATCH 07/20] Suppress -mfma as well for gcc 4.6 --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index d806a4ed2..00967bcb6 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -32,10 +32,12 @@ CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif endif +ifndef OLDGCC ifdef HAVE_FMA3 CCOMMON_OPT += -mfma FCOMMON_OPT += -mfma endif +endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH From b766c1e9bb592396b0c71ba47bf48e83534ca52c Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Tue, 1 Dec 2020 16:49:26 +0800 Subject: [PATCH 08/20] Improve the performance of zasum and casum with AVX512 intrinsic --- kernel/x86_64/KERNEL.SKYLAKEX | 3 + kernel/x86_64/casum.c | 144 ++++++++++ kernel/x86_64/casum_microk_skylakex-2.c | 349 ++++++++++++++++++++++++ kernel/x86_64/zasum.c | 144 ++++++++++ kernel/x86_64/zasum_microk_skylakex-2.c | 340 +++++++++++++++++++++++ 5 files changed, 980 insertions(+) create mode 100644 kernel/x86_64/casum.c create mode 100644 kernel/x86_64/casum_microk_skylakex-2.c create mode 100644 kernel/x86_64/zasum.c create mode 100644 kernel/x86_64/zasum_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b8b84c30..3d71584fe 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c + +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c new file mode 100644 index 000000000..dce30e9b0 --- /dev/null +++ b/kernel/x86_64/casum.c @@ -0,0 +1,144 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "casum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_CASUM_KERNEL +static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + } + + while (i < n) { + sum4 += (ABS_K(x1[0]) + ABS_K(x1[1])); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = casum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c new file mode 100644 index 000000000..d51929f9f --- /dev/null +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -0,0 +1,349 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_CASUM_KERNEL 1 + +#include + +#include + +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + if (n2 < 64) { + __m128 accum_10, accum_11, accum_12, accum_13; + __m128 abs_mask1; + + accum_10 = _mm_setzero_ps(); + accum_11 = _mm_setzero_ps(); + accum_12 = _mm_setzero_ps(); + accum_13 = _mm_setzero_ps(); + + abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); + abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + + if (n2 >= 32){ + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + _mm_prefetch(&x1[16], _MM_HINT_T0); + __m128 x04 = _mm_loadu_ps(&x1[16]); + __m128 x05 = _mm_loadu_ps(&x1[20]); + __m128 x06 = _mm_loadu_ps(&x1[24]); + __m128 x07 = _mm_loadu_ps(&x1[28]); + + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + x02 = _mm_and_ps(x02, abs_mask1); + x03 = _mm_and_ps(x03, abs_mask1); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + x04 = _mm_and_ps(x04, abs_mask1); + x05 = _mm_and_ps(x05, abs_mask1); + x06 = _mm_and_ps(x06, abs_mask1); + x07 = _mm_and_ps(x07, abs_mask1); + + accum_10 = _mm_add_ps(accum_10, x04); + accum_11 = _mm_add_ps(accum_11, x05); + accum_12 = _mm_add_ps(accum_12, x06); + accum_13 = _mm_add_ps(accum_13, x07); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + x02 = _mm_and_ps(x02, abs_mask1); + x03 = _mm_and_ps(x03, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + x00 = _mm_and_ps(x00, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (ABS_K(x1[0]) + ABS_K(x1[1])); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; + __m512 x00, x01, x02, x03, x04, x05, x06, x07; + __m512 abs_mask = (__m512)_mm512_set1_epi32(0x7fffffff); + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c new file mode 100644 index 000000000..514ce2434 --- /dev/null +++ b/kernel/x86_64/zasum.c @@ -0,0 +1,144 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "zasum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_ZASUM_KERNEL +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x1[0]); + temp1 = ABS_K(x1[1]); + temp2 = ABS_K(x1[2]); + temp3 = ABS_K(x1[3]); + temp4 = ABS_K(x1[4]); + temp5 = ABS_K(x1[5]); + temp6 = ABS_K(x1[6]); + temp7 = ABS_K(x1[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += ABS_K(x1[0]) + ABS_K(x1[1]); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = zasum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c new file mode 100644 index 000000000..b44c53801 --- /dev/null +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -0,0 +1,340 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_ZASUM_KERNEL 1 + +#include + +#include + +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + + if (n2 < 32) { + __m128d accum_10, accum_11, accum_12, accum_13; + __m128d abs_mask1; + + accum_10 = _mm_setzero_pd(); + accum_11 = _mm_setzero_pd(); + accum_12 = _mm_setzero_pd(); + accum_13 = _mm_setzero_pd(); + + // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); + abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); + abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + if (n2 >= 16){ + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + _mm_prefetch(&x1[8], _MM_HINT_T0); + __m128d x04 = _mm_loadu_pd(&x1[ 8]); + __m128d x05 = _mm_loadu_pd(&x1[10]); + __m128d x06 = _mm_loadu_pd(&x1[12]); + __m128d x07 = _mm_loadu_pd(&x1[14]); + + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + x02 = _mm_and_pd(x02, abs_mask1); + x03 = _mm_and_pd(x03, abs_mask1); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + x04 = _mm_and_pd(x04, abs_mask1); + x05 = _mm_and_pd(x05, abs_mask1); + x06 = _mm_and_pd(x06, abs_mask1); + x07 = _mm_and_pd(x07, abs_mask1); + + accum_10 = _mm_add_pd(accum_10, x04); + accum_11 = _mm_add_pd(accum_11, x05); + accum_12 = _mm_add_pd(accum_12, x06); + accum_13 = _mm_add_pd(accum_13, x07); + + x1 += 16; + n2 -= 16; + } + + if (n2 >= 8) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + x02 = _mm_and_pd(x02, abs_mask1); + x03 = _mm_and_pd(x03, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + x00 = _mm_and_pd(x00, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + } + + accum_10 = _mm_add_pd(accum_10, accum_11); + accum_12 = _mm_add_pd(accum_12, accum_13); + accum_10 = _mm_add_pd(accum_10, accum_12); + + accum_10 = _mm_hadd_pd(accum_10, accum_10); + + sumf = accum_10[0]; + } + else { + __m512d accum_0, accum_1, accum_2, accum_3; + __m512d x00, x01, x02, x03, x04, x05, x06, x07; + __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff); + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 128) { + if (n2 >= 64) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x04 = _mm512_loadu_pd(&x1[32]); + x05 = _mm512_loadu_pd(&x1[40]); + x06 = _mm512_loadu_pd(&x1[48]); + x07 = _mm512_loadu_pd(&x1[56]); + + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + } + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + // n2 >= 128, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (0 != align_header) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x04 = _mm512_load_pd(&x1[32]); + x05 = _mm512_load_pd(&x1[40]); + x06 = _mm512_load_pd(&x1[48]); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + + while (n2 >= 64) { + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + x00 = _mm512_load_pd(&x1[ 0]); + accum_1 = _mm512_add_pd(accum_1, x01); + x01 = _mm512_load_pd(&x1[ 8]); + accum_2 = _mm512_add_pd(accum_2, x02); + x02 = _mm512_load_pd(&x1[16]); + accum_3 = _mm512_add_pd(accum_3, x03); + x03 = _mm512_load_pd(&x1[24]); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x04); + x04 = _mm512_load_pd(&x1[32]); + accum_1 = _mm512_add_pd(accum_1, x05); + x05 = _mm512_load_pd(&x1[40]); + accum_2 = _mm512_add_pd(accum_2, x06); + x06 = _mm512_load_pd(&x1[48]); + accum_3 = _mm512_add_pd(accum_3, x07); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + } + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + if (n2 >= 32) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_load_pd(&x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + } + + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + } + + return sumf; +} +#endif From 9621062ebabcfb8f75a318fbcaf9558b26de9799 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Dec 2020 12:23:30 +0100 Subject: [PATCH 09/20] Update OSX xcode version to 11.5 --- .travis.yml | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3f917ce72..909d1eddb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -211,7 +211,7 @@ matrix: - &test-macos os: osx - osx_image: xcode10.1 + osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update @@ -238,17 +238,23 @@ matrix: - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 From 77a538d4ba34b2736014346285006b43ece2d0a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Dec 2020 22:05:35 +0100 Subject: [PATCH 10/20] Update an overlooked instance of xcode 10.0 as well --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 909d1eddb..7fe2ab388 100644 --- a/.travis.yml +++ b/.travis.yml @@ -233,7 +233,7 @@ matrix: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - <<: *test-macos - osx_image: xcode10.0 + osx_image: xcode11.5 env: - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" From 0cb7a403b25ebd623f9de97123742c0274fb7147 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Wed, 2 Dec 2020 09:51:52 +0800 Subject: [PATCH 11/20] fix error declare function blas_level1_thread_with_return_value --- kernel/x86_64/casum.c | 2 +- kernel/x86_64/zasum.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index dce30e9b0..a1bd76f33 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -93,7 +93,7 @@ static int asum_thread_function(BLASLONG n, return 0; } -extern int blas_level1_thread_with_value(int mode, +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 514ce2434..6e758e2e3 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -93,7 +93,7 @@ static int asum_thread_function(BLASLONG n, return 0; } -extern int blas_level1_thread_with_value(int mode, +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, From c361313564b9909aea1587435d56a0f5ffe8fcf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 07:49:43 +0100 Subject: [PATCH 12/20] Disable deprecated 32bit xcode --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7fe2ab388..d532899fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,8 +214,6 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: @@ -232,10 +230,10 @@ matrix: env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode11.5 - env: - - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" + # - <<: *test-macos + # osx_image: xcode10 + # env: + # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos osx_image: xcode11.5 From 57456c248b6b240d396cc628b4e361836afb1a10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 15:56:21 +0100 Subject: [PATCH 13/20] fix gfortran requirement in osx interface64 test --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d532899fe..83237662f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -216,8 +216,10 @@ matrix: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - brew update + - brew install gcc-10 env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - <<: *test-macos osx_image: xcode12 From dcbb3b5ef1e2aecad926526d21cf080d659eb6fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 23:13:13 +0100 Subject: [PATCH 14/20] fix misplaced lines --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 83237662f..771e70d42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,10 +214,10 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - brew update - - brew install gcc-10 env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" From 72a553f5bc032a2c9fdb08729e6a5e8a0b722d07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 09:17:27 +0100 Subject: [PATCH 15/20] Update .travis.yml --- .travis.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 771e70d42..6c5fb2f96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,23 +214,19 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - <<: *test-macos osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" # - <<: *test-macos # osx_image: xcode10 From a6692dc129acdd317f011c6dab1ea0a7e5080931 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 14:32:21 +0100 Subject: [PATCH 16/20] use gfortran-10 with xcode 12 --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6c5fb2f96..bde0e202d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -223,10 +223,12 @@ matrix: osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" # - <<: *test-macos # osx_image: xcode10 From da0c94c76f1494b50274e9e41227a3f15e4765ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 21:25:57 +0100 Subject: [PATCH 17/20] Avoid linking both GNU libgomp and LLVM libomp in clang/gfortran builds --- f_check | 3 +++ 1 file changed, 3 insertions(+) diff --git a/f_check b/f_check index 9ef7b8086..cb869b3bb 100644 --- a/f_check +++ b/f_check @@ -330,6 +330,9 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } + if ($flags =~ /-lgomp/ && $CC == /clang/) { + $flags = "-lomp"; + } if ( ($flags =~ /^\-l/) From 74b585058145ee362ab57fbcbbc5c0d19332b432 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 21:28:10 +0100 Subject: [PATCH 18/20] Add libomp to the LAPACK(-test) dependencies in clang/gfortran builds --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index a9af62a22..54dd3be41 100644 --- a/Makefile +++ b/Makefile @@ -268,7 +268,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc From 41fe6e864ed70860cda1b1ccef09b55caf41fec9 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 3 Dec 2020 14:40:11 -0600 Subject: [PATCH 19/20] POWER10: Update param.h Increasing the values of DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q helps in improving performance ~10% for DGEMM. --- param.h | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index 7789c83c7..ee5ad17fb 100644 --- a/param.h +++ b/param.h @@ -2388,7 +2388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER9) || defined(POWER10) +#if defined(POWER9) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2426,6 +2426,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 832 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 1026 +#define DGEMM_DEFAULT_Q 960 +#define CGEMM_DEFAULT_Q 1026 +#define ZGEMM_DEFAULT_Q 1026 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 8 + #undef SBGEMM_DEFAULT_UNROLL_N #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_P @@ -2436,10 +2469,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SBGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_R 4096 -#undef DGEMM_DEFAULT_UNROLL_M -#undef DGEMM_DEFAULT_UNROLL_N -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 8 #endif #if defined(SPARC) && defined(V7) From a1eecccda28cf7d00a5ffbbcd5afb4ca6ef6c6a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 23:43:17 +0100 Subject: [PATCH 20/20] Update f_check --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index cb869b3bb..42241ae10 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC == /clang/) { + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { $flags = "-lomp"; }