From 2e3b15d68bc108c112abdc0ea3dc8074134b3815 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:43:55 +0200 Subject: [PATCH 1/6] Add CMakeLists.txt --- cpp_thread_test/CMakeLists.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 cpp_thread_test/CMakeLists.txt diff --git a/cpp_thread_test/CMakeLists.txt b/cpp_thread_test/CMakeLists.txt new file mode 100644 index 000000000..5eccb12ce --- /dev/null +++ b/cpp_thread_test/CMakeLists.txt @@ -0,0 +1,23 @@ +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) + +enable_language(CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") + +if (USE_OPENMP) +if (CPP_THREAD_SAFETY_TEST) + message(STATUS building thread safety test) + add_executable(dgemm_thread_safety dgemm_thread_safety.cpp) + target_link_libraries(dgemm_thread_safety ${OpenBLAS_LIBNAME}) + add_test( dgemm_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemm_thread_safety) +endif() + + +if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_executable(dgemv_thread_safety dgemv_thread_safety.cpp) + target_link_libraries(dgemv_thread_safety ${OpenBLAS_LIBNAME}) + add_test(dgemv_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemv_thread_safety) +endif() + +endif() From 8c5c991bd7e4eb89fc46d6c5ac41bd5ab9363836 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:45:40 +0200 Subject: [PATCH 2/6] Add cpp_thread_test options --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b82d7670..954c053e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() +option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -234,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES From 84c00c3c6e3f8f1344d632a559610d03a861f9fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:46:41 +0200 Subject: [PATCH 3/6] Support running just the GEMV version of the thread safety test --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 7a03b08f0..93e8af2eb 100644 --- a/Makefile +++ b/Makefile @@ -146,6 +146,9 @@ ifneq ($(NO_CBLAS), 1) ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all endif +ifeq ($(CPP_THREAD_SAFETY_GEMV), 1) + $(MAKE) -C cpp_thread_test dgemv_tester +endif endif endif From 6abca76c4e0171a598ffc7f3bef8279c13d71546 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:49:24 +0200 Subject: [PATCH 4/6] Add option for running only the less demanding GEMV version of the thread safety tests --- Makefile.rule | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 40bd1a854..4d6f2d313 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,6 +272,9 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 +# +# use this to run only the less memory-hungry GEMV test +# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support From 75d440caa083a32ca3b30809f18f1e29c75a967b Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Thu, 17 Sep 2020 16:45:07 +0200 Subject: [PATCH 5/6] s390x/DYNAMIC_ARCH: fixup broken merge and reapply simplification An unrelated commit and merge inadvertently reverted our recent two changes for simplifying DYNAMIC_ARCH on s390x. Simply reapply the changes. Simplify detection of which kernels we can compile on s390x. Instead of decoding the gcc version in a complicated manner, just check if CC supports a given -march=archXY flag. Together with the next patch, we thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1. To enable builds with DYNAMIC_ARCH with older compiler releases, the Makefile and drivers/other/dynamic_arch.c need a common view of the architecture support built into the library. We follow the notation from x86 when used with DYNAMIC_LIST, where defines DYN_ denote support for a given generation to be built in. Since there are far fewer architecture generations in OpenBLAS for s390x, that does not bloat command lines too much. Closes: #2842 Fixes: ba644378dce7 ("Copy BUILD_ options available to the compiler flags" Signed-off-by: Marius Hillenbrand --- Makefile.system | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/Makefile.system b/Makefile.system index 0ccf9eaed..c46c88581 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,7 +295,6 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -594,34 +593,36 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -ifeq ($(GCCVERSIONGT5), 1) - ZARCH_SUPPORT_Z13 := 1 -else ifeq ($(GCCVERSIONEQ5), 1) -ifeq ($(GCCMINORVERSIONGTEQ2), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif +# if the compiler accepts -march=arch11 or -march=z13 and can compile a file +# with z13-specific inline assembly, then we can include support for Z13. +# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases +# only support one or the other. +# note: LLVM version 6.x supported -march=z13 yet could not handle vector +# registers in inline assembly, so the check for supporting the -march flag is +# not enough. +ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null +ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) -ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(ZARCH_SUPPORT_Z13), 1) +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 +CCOMMON_OPT += -DDYN_Z13 else -$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif -ifeq ($(GCCVERSIONGTEQ7), 1) +# as above for z13, check for -march=arch12 and z14 support in the compiler. +ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 +CCOMMON_OPT += -DDYN_Z14 else -$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) -endif +$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif +endif # ARCH zarch + ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 From be43d2cb9651d37aed44307037dc98b837f95358 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 17 Sep 2020 12:56:28 -0500 Subject: [PATCH 6/6] Optimize daxpy/zaxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. Tested in simulator and no new failures. --- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/daxpy_microk_power10.c | 131 ++++++++++++++++++ kernel/power/daxpy_power10.c | 121 +++++++++++++++++ kernel/power/zaxpy_microk_power10.c | 200 ++++++++++++++++++++++++++++ kernel/power/zaxpy_power10.c | 126 ++++++++++++++++++ 5 files changed, 580 insertions(+), 2 deletions(-) create mode 100644 kernel/power/daxpy_microk_power10.c create mode 100644 kernel/power/daxpy_power10.c create mode 100644 kernel/power/zaxpy_microk_power10.c create mode 100644 kernel/power/zaxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index f390fac61..ec02e09ad 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -142,13 +142,13 @@ CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c -DAXPYKERNEL = daxpy.c +DAXPYKERNEL = daxpy_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power9.S else CAXPYKERNEL = caxpy.c endif -ZAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy.c DCOPYKERNEL = dcopy.c diff --git a/kernel/power/daxpy_microk_power10.c b/kernel/power/daxpy_microk_power10.c new file mode 100644 index 000000000..bc9199efd --- /dev/null +++ b/kernel/power/daxpy_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) +{ + __vector double t0; + + __asm__ + ( + XXSPLTD_S(%x4,%x6,0) + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 36, 32, %x4 \n\t" + "xvmaddadp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddadp 38, 34, %x4 \n\t" + "xvmaddadp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + + "lxvp 36, 128(%3) \n\t" + "lxvp 38, 160(%3) \n\t" + + "xvmaddadp 44, 40, %x4 \n\t" + "xvmaddadp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddadp 46, 42, %x4 \n\t" + "xvmaddadp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 36, 32, %x4 \n\t" + "xvmaddadp 37, 33, %x4 \n\t" + "xvmaddadp 38, 34, %x4 \n\t" + "xvmaddadp 39, 35, %x4 \n\t" + + "xvmaddadp 44, 40, %x4 \n\t" + "xvmaddadp 45, 41, %x4 \n\t" + "xvmaddadp 46, 42, %x4 \n\t" + "xvmaddadp 47, 43, %x4 \n\t" + + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=wa" (t0) // 4 + : + "m" (*x), + "d" (alpha) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); + +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c new file mode 100644 index 000000000..ebe91a80f --- /dev/null +++ b/kernel/power/daxpy_power10.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "daxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + daxpy_kernel_8(n1, x, y, da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c new file mode 100644 index 000000000..8e593bbfa --- /dev/null +++ b/kernel/power/zaxpy_microk_power10.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4 (long n, double *x, double *y, + double alpha_r, double alpha_i) +{ +#if !defined(CONJ) + static const double mvec[2] = { 1.0, -1.0 }; +#else + static const double mvec[2] = { -1.0, 1.0 }; +#endif + const double *mvecp = mvec; + + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + long ytmp; + + __asm__ + ( + XXSPLTD_S(32,%x15,0) // alpha_r + XXSPLTD_S(33,%x16,0) // alpha_i + "lxvd2x 36, 0, %17 \n\t" // mvec + +#if !defined(CONJ) + "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + + "mr %12, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + XXSWAPD_S(%x4,40) // exchange real and imag part + XXSWAPD_S(%x5,41) // exchange real and imag part + XXSWAPD_S(%x6,42) // exchange real and imag part + XXSWAPD_S(%x7,43) // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + XXSWAPD_S(%x8,44) // exchange real and imag part + XXSWAPD_S(%x9,45) // exchange real and imag part + XXSWAPD_S(%x10,46) // exchange real and imag part + XXSWAPD_S(%x11,47) // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddadp 34, 44, 32 \n\t" + "xvmaddadp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddadp 38, 46, 32 \n\t" + "xvmaddadp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddadp 49, %x5, 33 \n\t" + "xvmaddadp 50, %x6, 33 \n\t" + "xvmaddadp 51, %x7, 33 \n\t" + + "xvmaddadp 34, %x8, 33 \n\t" + "xvmaddadp 35, %x9, 33 \n\t" + "xvmaddadp 38, %x10, 33 \n\t" + "xvmaddadp 39, %x11, 33 \n\t" + + "stxvp 48, 0(%12) \n\t" + "stxvp 50, 32(%12) \n\t" + "stxvp 34, 64(%12) \n\t" + "stxvp 38, 96(%12) \n\t" + + "addi %12, %12, 128 \n\t" + + XXSWAPD_S(%x4,40) // exchange real and imag part + XXSWAPD_S(%x5,41) // exchange real and imag part + "lxvp 48, 0(%3) \n\t" // y0 + XXSWAPD_S(%x6,42) // exchange real and imag part + XXSWAPD_S(%x7,43) // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + XXSWAPD_S(%x8,44) // exchange real and imag part + XXSWAPD_S(%x9,45) // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + XXSWAPD_S(%x10,46) // exchange real and imag part + XXSWAPD_S(%x11,47) // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 34, 44, 32 \n\t" + "xvmaddadp 35, 45, 32 \n\t" + "xvmaddadp 38, 46, 32 \n\t" + "xvmaddadp 39, 47, 32 \n\t" + + "xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddadp 49, %x5, 33 \n\t" + "xvmaddadp 50, %x6, 33 \n\t" + "xvmaddadp 51, %x7, 33 \n\t" + + "xvmaddadp 34, %x8, 33 \n\t" + "xvmaddadp 35, %x9, 33 \n\t" + "xvmaddadp 38, %x10, 33 \n\t" + "xvmaddadp 39, %x11, 33 \n\t" + + "stxvp 48, 0(%12) \n\t" + "stxvp 50, 32(%12) \n\t" + "stxvp 34, 64(%12) \n\t" + "stxvp 38, 96(%12) \n\t" + + "#n=%1 x=%13=%2 y=%0=%3 alpha=(%15,%16) mvecp=%14=%17 ytmp=%12\n" + "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=wa" (t0), // 4 + "=wa" (t1), // 5 + "=wa" (t2), // 6 + "=wa" (t3), // 7 + "=wa" (t4), // 8 + "=wa" (t5), // 9 + "=wa" (t6), // 10 + "=wa" (t7), // 11 + "=b" (ytmp) // 12 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 15 + "d" (alpha_i), // 16 + "12" (mvecp) // 17 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/power/zaxpy_power10.c b/kernel/power/zaxpy_power10.c new file mode 100644 index 000000000..54cfb8fd7 --- /dev/null +++ b/kernel/power/zaxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "zaxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + zaxpy_kernel_4 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + +