From bd2498c88643834f49f6d0bc764754631a71ee50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:07:58 +0200 Subject: [PATCH 01/24] Use POWER6 GEMM parameters on 32bit POWER8 --- param.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index e8cf53f0a..efe0e1096 100644 --- a/param.h +++ b/param.h @@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL - +#if defined(__32BIT__) +#warning using BINARY32==POWER6 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 - +#endif #define SGEMM_DEFAULT_P 1280UL #define DGEMM_DEFAULT_P 640UL #define CGEMM_DEFAULT_P 640UL From b144423f0f4d91e0f642b4c4c66b1cf919fcae0e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:10:12 +0200 Subject: [PATCH 02/24] Do not define USE_TRMM for 32bit POWER8 --- kernel/Makefile.L3 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index dfdaf5cf4..1904264be 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,8 +44,10 @@ USE_TRMM = 1 endif ifeq ($(CORE), POWER8) +ifeq ($(BINARY64),1) USE_TRMM = 1 endif +endif ifeq ($(CORE), POWER9) USE_TRMM = 1 @@ -514,7 +516,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -530,7 +532,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s From f8c2697701dfbcc3cba307245aab06134c86f53f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:11:19 +0200 Subject: [PATCH 03/24] Use POWER6 GEMM, TRMM and DTRSM on 32bit POWER8 --- kernel/power/KERNEL.POWER8 | 84 ++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 7fba5b4d6..dc6646d50 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,51 @@ +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +$(info baue power6) +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S + +CAXPYKERNEL = zaxpy.S + +else + +$(info baue power8) #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -47,16 +95,21 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +#DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +#DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +#DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +#DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -153,15 +206,15 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power8.S -else -CAXPYKERNEL = caxpy.c -endif -else -CAXPYKERNEL = caxpy.c -endif +#ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +#ifneq ($(GCCVERSIONGTEQ9),1) +#CAXPYKERNEL = caxpy_power8.S +#else +#CAXPYKERNEL = caxpy.c +#endif +#else +#CAXPYKERNEL = caxpy.c +#endif # ZAXPYKERNEL = zaxpy.c # @@ -173,7 +226,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = cdot.c +CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -183,7 +236,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -CROTKERNEL = crot.c +#CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c @@ -239,4 +292,3 @@ IDAMINKERNEL = ../arm/iamin.c IZAMAXKERNEL = ../arm/izamax.c IZAMINKERNEL = ../arm/izamin.c endif - From da17abec871ed96e1c959eee4ad11a1346d25b2d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:20:03 +0200 Subject: [PATCH 04/24] fix trailing whitespace --- kernel/Makefile.L3 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1904264be..d5de070a5 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -516,7 +516,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -532,7 +532,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s From f308e741b2cad79196b096fde3aad9b562b1410a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Jul 2020 10:00:07 +0200 Subject: [PATCH 05/24] remove debug output and revert changes to cdot and crot --- kernel/power/KERNEL.POWER8 | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index dc6646d50..bb93a6a23 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,5 +1,4 @@ ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) -$(info baue power6) SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = SGEMMITCOPY = @@ -45,7 +44,6 @@ CAXPYKERNEL = zaxpy.S else -$(info baue power8) #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -226,7 +224,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -236,7 +234,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = crot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c From 0033f8be0d8fcc5c8ae9ba8f0cae556297015c81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Jul 2020 23:32:54 +0200 Subject: [PATCH 06/24] Use vec_vsx_ld/st to fix misaligned accesses flagged by asan --- kernel/power/saxpy.c | 96 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc..360d64146 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -28,6 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 +#define offset_4 64 +#define offset_5 80 +#define offset_6 96 +#define offset_7 112 +#define offset_8 128 +#define offset_9 144 +#define offset_10 160 +#define offset_11 176 +#define offset_12 192 +#define offset_13 208 +#define offset_14 224 +#define offset_15 240 @@ -37,12 +53,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG i = 0; - __vector float v_a = {alpha,alpha,alpha,alpha}; - __vector float * v_y=(__vector float *)y; - __vector float * v_x=(__vector float *)x; + __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; + __vector float * vptr_y =(__vector float *)y; + __vector float * vptr_x =(__vector float *)x; for(; i Date: Thu, 23 Jul 2020 17:03:28 +0200 Subject: [PATCH 07/24] Avoid undefining NAME,CNAME etc for pgcc as it makes it ignore the new defininitions --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index d62c66ad3..cc72c02e8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1241,7 +1241,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) From 661c6bfa5a245fdcfd0788d29dff4ce83a508e1e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:08:20 +0200 Subject: [PATCH 08/24] Exclude altivec code paths if the compiler does not support them --- kernel/power/casum.c | 2 ++ kernel/power/ccopy.c | 2 ++ kernel/power/cdot.c | 4 ++++ kernel/power/cgemv_n.c | 7 +++++-- kernel/power/cgemv_t.c | 7 +++++-- kernel/power/crot.c | 2 ++ kernel/power/cswap.c | 2 ++ kernel/power/dasum.c | 2 ++ kernel/power/daxpy.c | 2 ++ kernel/power/dcopy.c | 2 ++ kernel/power/ddot.c | 2 ++ kernel/power/dgemv_n.c | 2 ++ kernel/power/dgemv_t.c | 7 ++++++- kernel/power/drot.c | 2 ++ kernel/power/dscal.c | 2 ++ kernel/power/dswap.c | 2 ++ kernel/power/idamax.c | 9 +++++++++ kernel/power/idamin.c | 7 ++++++- kernel/power/izamax.c | 6 +++++- kernel/power/izamin.c | 8 +++++--- kernel/power/sasum.c | 2 ++ kernel/power/saxpy.c | 4 ++++ kernel/power/scopy.c | 2 ++ kernel/power/sdot.c | 3 +++ kernel/power/sgemv_n.c | 4 ++++ kernel/power/sgemv_t.c | 5 +++++ kernel/power/srot.c | 2 ++ kernel/power/sscal.c | 2 ++ kernel/power/sswap.c | 2 ++ kernel/power/zasum.c | 2 ++ kernel/power/zaxpy.c | 2 ++ kernel/power/zcopy.c | 2 ++ kernel/power/zdot.c | 12 +++++++++++- kernel/power/zgemv_n_4.c | 3 +++ kernel/power/zgemv_t_4.c | 3 +++ kernel/power/zrot.c | 5 ++++- kernel/power/zscal.c | 2 ++ kernel/power/zswap.c | 2 ++ 38 files changed, 126 insertions(+), 12 deletions(-) diff --git a/kernel/power/casum.c b/kernel/power/casum.c index 3478a39ef..06982bfba 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "casum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index cbe5b48d2..5e58034dd 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ccopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index d5b18729a..ef5e4710f 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zdot.c" +#else #include "common.h" #ifndef HAVE_KERNEL_8 @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA return (result); } +#endif diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index eec3fa37c..8663039c5 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_n.c" +#else #include #include @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 691f7a3d3..1bfc235db 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else #include "common.h" @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 5c1d44620..fb4860dcd 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) } #endif +#endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 88cb1d638..5144a2e93 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "cswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 09e06d909..999dc677a 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index 018beafd1..2de4e0911 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "daxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index cf203e71e..24279f8a2 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index bd9e1fb97..c5493015a 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ddot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index b4dfda550..ac365b3b2 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dgemv_n_microk_power8.c" #endif +#endif #define NBMAX 4096 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 5d43f673f..09abd5a43 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" +#else + #include "common.h" #define NBMAX 1024 //#define PREFETCH 1 + #include #define HAVE_KERNEL4x8_ASM 1 - #if defined(HAVE_KERNEL4x8_ASM) static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return (0); } +#endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index b808ab566..951c2f9c9 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "drot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 7e0fe48c0..39293252b 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_8) diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 795bb10b4..ff3f95c79 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 195a8c68e..5016f67dd 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include +#if defined(__VEC__) || defined(__ALTIVEC__) #include +#endif + #if defined(DOUBLE) #define ABS fabs @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { return index; } +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) > maxf) { diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 8a5538821..e37718c48 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find minimum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } - +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) < minf) { diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index 7149da28b..fe9d5bf95 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find maximum index @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { } - +#endif @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 692315b89..94f2383e0 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabs #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find minimum index @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } +#endif @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - - diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index b259d7d76..733137012 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc..d005427b5 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -30,6 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#if defined(__VEC__) || defined(__ALTIVEC__) #ifndef HAVE_KERNEL_8 #include @@ -62,6 +63,7 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) } } #endif +#endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { @@ -74,11 +76,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { BLASLONG n1 = n & -64; +#if defined(__VEC__) || defined(__ALTIVEC__) if ( n1 ) saxpy_kernel_64(n1, x, y, da); i = n1; +#endif while(i < n) { diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 5207d386e..8ff8cb329 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "scopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index 8de434e41..ffeab6638 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + #include "sdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 81ac031a3..5dfb18f5b 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" +#else #include "common.h" @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } +#endif diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 3d8a442dc..62c517a9d 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else #include "common.h" @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } +#endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 9638a59eb..a53342f61 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "srot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index ddd5b2c5b..de37e10a5 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_16) diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index a56434444..44522f0a0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index 8383e39ab..305e50ede 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index 4a7c26c69..3064d5435 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zaxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_4 diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index bb80decd2..453f4e551 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index 9086ef35b..690765797 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) - { + { /* __real__ result = 0.0 ; __imag__ result = 0.0 ; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) + /* __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; + */ + result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else + /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; + */ + result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index ba019d6a5..1f7199c89 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif +#endif // #define NBMAX 4096 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index b34199af6..4ed27d96b 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #define NBMAX 4096 +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 +#endif #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index c6d666178..5e7ca3b23 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zrot.c" +#else #include "common.h" @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - \ No newline at end of file +#endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 16b584bca..5526f4d67 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index c6508f032..3a5a8eb83 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 From 7c6e56b5dfa0dee6e39eef9cc17c10ea92c39ac2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:10:59 +0200 Subject: [PATCH 09/24] Rewrite assignment to complex for better portability --- kernel/arm/zdot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 733c235c6..a9f46dde7 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,10 +48,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; - +/* CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; - +*/ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -71,8 +72,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; + /*CREAL(result) = dot[0]; + CIMAG(result) = dot[1];*/ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); return(result); } From 21072e502ae620186dea2293e91b5685906bdc25 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:34:56 +0000 Subject: [PATCH 10/24] Typo fix --- kernel/power/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index 690765797..fe0e9284e 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -157,13 +157,13 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; */ - result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; */ - result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif From ca3561cab9d698b7816544a08848306853c17c01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 18:30:42 +0000 Subject: [PATCH 11/24] Add ifdefs around call to altivec microkernel --- kernel/power/crot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/crot.c b/kernel/power/crot.c index fb4860dcd..84ba5d913 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -185,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { - +#if defined(__VEC__) || defined(__ALTIVEC__) BLASLONG n1 = n & -8; if ( n1 > 0 ) { @@ -193,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; ix=2*n1; } - +#endif while(i < n) { temp[0] = c*x[ix] + s*y[ix] ; From bf1f0734ff8c90261bc0f3b0f3887b489a10f8b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 20:40:13 +0000 Subject: [PATCH 12/24] Use OPENBLAS_MAKE_COMPLEX_FLOAT on PPC only --- kernel/arm/zdot.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index a9f46dde7..ba0e57eb5 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,11 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -/* +#if !defined(__PPC__) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; -*/ +#else result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); +#endif if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -72,9 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - /*CREAL(result) = dot[0]; - CIMAG(result) = dot[1];*/ +#if !defined(__POWER__) + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; +#else result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); +#endif return(result); } From 95d37e15754955f5c73195d2ca09208e99600ab9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Jul 2020 10:13:46 +0000 Subject: [PATCH 13/24] Regroup the 32 and 64bit sections and restore 64bit CAXPY --- kernel/power/KERNEL.POWER8 | 49 ++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index bb93a6a23..cbf285913 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,4 @@ +# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = @@ -35,12 +36,6 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = trsm_kernel_power6_LN.S -DTRSMKERNEL_LT = trsm_kernel_power6_LT.S -DTRSMKERNEL_RN = trsm_kernel_power6_LT.S -DTRSMKERNEL_RT = trsm_kernel_power6_RT.S - -CAXPYKERNEL = zaxpy.S else @@ -93,10 +88,6 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c @@ -104,10 +95,17 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -#DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -#DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -#DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -#DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -204,15 +202,20 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -#ifneq ($(GCCVERSIONGTEQ9),1) -#CAXPYKERNEL = caxpy_power8.S -#else -#CAXPYKERNEL = caxpy.c -#endif -#else -#CAXPYKERNEL = caxpy.c -#endif +ä +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +CAXPYKERNEL = zaxpy.S +else +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power8.S +else +CAXPYKERNEL = caxpy.c +endif +else +CAXPYKERNEL = caxpy.c +endif +endif # ZAXPYKERNEL = zaxpy.c # From 251a09ec903fb05a93bbd36bd4138a73b330f09a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Jul 2020 16:04:58 +0000 Subject: [PATCH 14/24] Typo fix --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index cbf285913..c2f4cd204 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -202,7 +202,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ä +# ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) CAXPYKERNEL = zaxpy.S else From 9be2688c78e1646e406e425b4c79e6f82db9f94e Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 24 Jul 2020 23:08:11 -0500 Subject: [PATCH 15/24] Fix to store results in correct order for POWER10 GEMM kernels There is a recent compiler change in __builtin_mma_disassemble_acc() which affects the order of storing result in POWER10. Also removing new LDFLAG -mno-power10-stub as it is handled by linker automatically. --- Makefile.system | 3 - kernel/power/dgemm_kernel_power10.c | 66 ++++++++--------- kernel/power/sgemm_kernel_power10.c | 106 +++++++++++++-------------- kernel/power/shgemm_kernel_power10.c | 54 +++++++------- 4 files changed, 113 insertions(+), 116 deletions(-) diff --git a/Makefile.system b/Makefile.system index cc72c02e8..db651ef99 100644 --- a/Makefile.system +++ b/Makefile.system @@ -617,7 +617,6 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -627,11 +626,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs else ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b3ee301be..a0bc1a777 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ + rowC[0] = result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] = result[1] * alpha; \ - rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; -#define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ - rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ + rowC[0] += result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[1] * alpha; \ - rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; -#define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ - rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define SET_ACC_ZERO4() \ diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 01c122c6d..81a5ec76b 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #if defined(TRMMKERNEL) #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ + rowC[0] = result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] = result[1] * alpha; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[6] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[4] * alpha; \ + rowC[0] = result[2] * alpha; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] = result[0] * alpha; -#define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[6] * alpha; \ - rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] = result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ + rowC[0] += result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[1] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; -#define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ - rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define KERNEL(i, j) \ __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c index 7455f925c..1ae9e04bf 100644 --- a/kernel/power/shgemm_kernel_power10.c +++ b/kernel/power/shgemm_kernel_power10.c @@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) #define BF16TOF32(x) x #endif -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); @@ -64,54 +64,54 @@ vector char mask = #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ + rowC[0] += result[1] * alpha; \ rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[1] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ + rowC[0] += result[2] * alpha; \ rowC = (v2sf_t *) &CO[2* ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; -#define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ - rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ - rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[6] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #define SET_ACC_ZERO4() \ __builtin_mma_xxsetaccz (&acc0); \ From 4fda217f99f611df04f4dcec8378ee0441fdf6e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jul 2020 06:42:39 +0000 Subject: [PATCH 16/24] Delete potrf_parallel.c (moving it to ../potrf) --- lapack/getrf/potrf_parallel.c | 667 ---------------------------------- 1 file changed, 667 deletions(-) delete mode 100644 lapack/getrf/potrf_parallel.c diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c deleted file mode 100644 index 008fcb8cc..000000000 --- a/lapack/getrf/potrf_parallel.c +++ /dev/null @@ -1,667 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" - -#ifndef USE_SIMPLE_THREADED_LEVEL3 - -//The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. -#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD -#define USE_ALLOC_HEAP -#endif - - -static FLOAT dm1 = -1.; - -#ifndef KERNEL_FUNC -#ifndef LOWER -#define KERNEL_FUNC SYRK_KERNEL_U -#else -#define KERNEL_FUNC SYRK_KERNEL_L -#endif -#endif - -#ifndef LOWER -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_LT -#else -#define TRSM_KERNEL TRSM_KERNEL_LC -#endif -#else -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_RN -#else -#define TRSM_KERNEL TRSM_KERNEL_RR -#endif -#endif - -#ifndef CACHE_LINE_SIZE -#define CACHE_LINE_SIZE 8 -#endif - -#ifndef DIVIDE_RATE -#define DIVIDE_RATE 2 -#endif - -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - -#ifndef LOWER -#define TRANS -#endif - -#ifndef SYRK_LOCAL -#if !defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_UN -#elif !defined(LOWER) && defined(TRANS) -#define SYRK_LOCAL SYRK_UT -#elif defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_LN -#else -#define SYRK_LOCAL SYRK_LT -#endif -#endif - -typedef struct { -#ifdef HAVE_C11 - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; -} job_t; - - -#ifndef KERNEL_OPERATION -#ifndef COMPLEX -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#else -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#endif -#endif - -#ifndef ICOPY_OPERATION -#ifndef TRANS -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef OCOPY_OPERATION -#ifdef TRANS -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef S -#define S args -> a -#endif -#ifndef A -#define A args -> b -#endif -#ifndef C -#define C args -> c -#endif -#ifndef LDA -#define LDA args -> lda -#endif -#ifndef N -#define N args -> m -#endif -#ifndef K -#define K args -> k -#endif - -static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - - FLOAT *buffer[DIVIDE_RATE]; - - BLASLONG k, lda; - BLASLONG m_from, m_to; - - FLOAT *alpha; - FLOAT *a, *c; - job_t *job = (job_t *)args -> common; - BLASLONG xxx, bufferside; - - BLASLONG jjs, min_jj; - BLASLONG is, min_i, div_n; - - BLASLONG i, current; - - k = K; - - a = (FLOAT *)A; - c = (FLOAT *)C; - - lda = LDA; - - alpha = (FLOAT *)args -> alpha; - - m_from = range_n[mypos + 0]; - m_to = range_n[mypos + 1]; - -#if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); -#endif - - div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); - for (i = 1; i < DIVIDE_RATE; i++) { - buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; - } - -#ifndef LOWER - TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#else - TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#endif - - for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - - for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ - - min_jj = MIN(m_to, xxx + div_n) - jjs; - -#ifndef LOWER - if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; -#else - if (min_jj > GEMM_P) min_jj = GEMM_P; -#endif - -#ifndef LOWER - OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (k, min_jj, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - sb, - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - a + jjs * lda * COMPSIZE, lda, 0); -#else - ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (min_jj, k, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - sb, - a + jjs * COMPSIZE, lda, 0); -#endif - } - -#ifndef LOWER - for (i = 0; i <= mypos; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#else - for (i = mypos; i < args -> nthreads; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#endif - - WMB; - } - - min_i = m_to - m_from; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - /* thread has to wait */ - if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, m_from, xxx); - - if (m_from + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } - -#ifndef LOWER - current ++; -#else - current --; -#endif - } - - for(is = m_from + min_i; is < m_to; is += min_i){ - min_i = m_to - is; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, is, xxx); - - if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } -#ifndef LOWER - current ++; -#else - current --; -#endif - } - } - - for (i = 0; i < args -> nthreads; i++) { - if (i != mypos) { - for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; - } - } - } - - return 0; - } - -static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ - - blas_arg_t newarg; - -#ifndef USE_ALLOC_HEAP - job_t job[MAX_CPU_NUMBER]; -#else - job_t * job = NULL; -#endif - - blas_queue_t queue[MAX_CPU_NUMBER]; - - BLASLONG range[MAX_CPU_NUMBER + 100]; - - BLASLONG num_cpu; - - BLASLONG nthreads = args -> nthreads; - - BLASLONG width, i, j, k; - BLASLONG n, n_from, n_to; - int mode, mask; - double dnum; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; - mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_REAL; - mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; - mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; - mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif -#endif - - newarg.m = args -> m; - newarg.k = args -> k; - newarg.a = args -> a; - newarg.b = args -> b; - newarg.c = args -> c; - newarg.lda = args -> lda; - newarg.alpha = args -> alpha; - -#ifdef USE_ALLOC_HEAP - job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); - if(job==NULL){ - fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); - exit(1); - } -#endif - - newarg.common = (void *)job; - - n_from = 0; - n_to = args -> m; - -#ifndef LOWER - - range[MAX_CPU_NUMBER] = n_to - n_from; - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - - for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; - -#else - - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[num_cpu + 1] = range[num_cpu] + width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - queue[num_cpu].range_n = range; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - -#endif - - newarg.nthreads = num_cpu; - - if (num_cpu) { - - for (j = 0; j < num_cpu; j++) { - for (i = 0; i < num_cpu; i++) { - for (k = 0; k < DIVIDE_RATE; k++) { - job[j].working[i][CACHE_LINE_SIZE * k] = 0; - } - } - } - - queue[0].sa = sa; - queue[0].sb = sb; - queue[num_cpu - 1].next = NULL; - - exec_blas(num_cpu, queue); - } - -#ifdef USE_ALLOC_HEAP - free(job); -#endif - - return 0; -} - -#endif - -blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - - BLASLONG n, bk, i, blocking, lda; - BLASLONG info; - int mode; - blas_arg_t newarg; - FLOAT *a; - FLOAT alpha[2] = { -ONE, ZERO}; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; -#else - mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif - - if (args -> nthreads == 1) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); -#endif - return info; - } - - n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - - if (range_n) n = range_n[1] - range_n[0]; - - if (n <= GEMM_UNROLL_N * 2) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); -#endif - return info; - } - - newarg.lda = lda; - newarg.ldb = lda; - newarg.ldc = lda; - newarg.alpha = alpha; - newarg.beta = NULL; - newarg.nthreads = args -> nthreads; - - blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; - if (blocking > GEMM_Q) blocking = GEMM_Q; - - for (i = 0; i < n; i += blocking) { - bk = n - i; - if (bk > blocking) bk = blocking; - - newarg.m = bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - - info = CNAME(&newarg, NULL, NULL, sa, sb, 0); - if (info) return info + i; - - if (n - i - bk > 0) { -#ifndef USE_SIMPLE_THREADED_LEVEL3 - newarg.m = n - i - bk; - newarg.k = bk; -#ifndef LOWER - newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; -#else - newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; -#endif - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - - thread_driver(&newarg, sa, sb); -#else - -#ifndef LOWER - newarg.m = bk; - newarg.n = n - i - bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; - - gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); -#endif -#else - newarg.m = n - i - bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + bk + i * lda) * COMPSIZE; - - gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + (i + bk + i * lda) * COMPSIZE; - newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); -#endif -#endif - -#endif - } - } - return 0; -} From f194ad59e1399a7fc99e877a3ec26a8d7ff5c585 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jul 2020 08:52:24 +0200 Subject: [PATCH 17/24] Use _Atomic instead of volatile where available (file moved from ../getrf) must have misplaced this in ../getrf when I made that change in March 2018 (40160ff) the only changes since then were RFC : Add half precision gemm for bfloat16 in OpenBLAS Rajalakshmi Srinivasaraghavan Rajalakshmi Srinivasaraghavan committed on 14 Apr 2020 as 7ebbb50 Change _STDC_VERSION__ to __STDC_VERSION__ Zhiyong Dang committed on 11 May 2018 as 3716267 --- lapack/potrf/potrf_parallel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index e61e8decb..008fcb8cc 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#ifdef HAVE_C11 + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#elif defined(HALF) + mode = BLAS_HALF | BLAS_REAL; + mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; From 4e1be0e4813df72c26d94f8a452611b62576fcf9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 11 Jun 2020 04:12:49 -0700 Subject: [PATCH 18/24] ARM64: Add THUNDERX3T110 Target --- Makefile.arm64 | 10 ++ Makefile.system | 1 + TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 27 +++++ cpuid_arm64.c | 27 ++++- driver/others/dynamic_arm64.c | 8 +- getarch.c | 18 +++ interface/swap.c | 2 +- interface/zswap.c | 2 +- kernel/arm64/KERNEL.THUNDERX3T110 | 184 ++++++++++++++++++++++++++++++ param.h | 29 +++++ 12 files changed, 305 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/KERNEL.THUNDERX3T110 diff --git a/Makefile.arm64 b/Makefile.arm64 index a7cd82e3a..1091edfe5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(CORE), THUNDERX3T110) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +else +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +endif +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.system b/Makefile.system index db651ef99..d7e71d00a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 +DYNAMIC_CORE += THUNDERX3T110 endif ifeq ($(ARCH), zarch) diff --git a/TargetList.txt b/TargetList.txt index 4e54e3077..8ea2df9b7 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -96,6 +96,7 @@ FALKOR THUNDERX THUNDERX2T99 TSV110 +THUNDERX3T110 9.System Z: ZARCH_GENERIC diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d56ba99cb..5388156bc 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) endif () if (POWER) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 30256870c..e50483a2f 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -338,6 +338,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX3T110") + file(APPEND ${TARGET_CONF_TEMP} + "#define THUNDERX3T110\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t94371840\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "TSV110") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 4103216e6..6f41be604 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -40,6 +40,7 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +#define CPU_THUNDERX3T110 12 //Hisilicon #define CPU_TSV110 9 // Ampere @@ -57,7 +58,8 @@ static char *cpuname[] = { "THUNDERX2T99", "TSV110", "EMAG8180", - "NEOVERSEN1" + "NEOVERSEN1", + "THUNDERX3T110" }; static char *cpuname_lower[] = { @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { "thunderx2t99", "tsv110", "emag8180", - "neoversen1" + "neoversen1", + "thunderx3t110" }; int get_feature(char *search) @@ -158,6 +161,8 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8")) + return CPU_THUNDERX3T110; // HiSilicon else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) return CPU_TSV110; @@ -372,7 +377,25 @@ void get_cpuconfig(void) printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); + break; + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } get_cpucount(); } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 11ef2725c..157b03365 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +extern gotoblas_t gotoblas_THUNDERX3T110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 11 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -82,6 +83,7 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "thunderx3t110", "unknown" }; @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; return corename[NUM_CORETYPES]; } @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); + case 11: return (&gotoblas_THUNDERX3T110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX; case 0x0af: // ThunderX2 return &gotoblas_THUNDERX2T99; + case 0x0b8: // ThunderX3 + return &gotoblas_THUNDERX3T110; } break; case 0x48: // HiSilicon diff --git a/getarch.c b/getarch.c index 2cdf77259..51c9a84e5 100644 --- a/getarch.c +++ b/getarch.c @@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_THUNDERX3T110 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "THUNDERX3T110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTHUNDERX3T110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "thunderx3t110" +#define CORENAME "THUNDERX3T110" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/interface/swap.c b/interface/swap.c index 17a9868a9..ea40b1fc2 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/interface/zswap.c b/interface/zswap.c index 372b15447..43971b73e 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 new file mode 100644 index 000000000..a20d0d4a6 --- /dev/null +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -0,0 +1,184 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif diff --git a/param.h b/param.h index efe0e1096..476f237a1 100644 --- a/param.h +++ b/param.h @@ -2779,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX3T110) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(NEOVERSEN1) #define SGEMM_DEFAULT_UNROLL_M 16 From d557584b71578620520e7bcdea7e0f029d0a76e7 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 27 Jul 2020 14:11:07 -0500 Subject: [PATCH 19/24] Fix compilation issues with clang on POWER As gcc defaults to -malign-power, removing that option. Also adding -fno-integrated-as to use GNU assembler for powerpc assembly optimization files. Fixed other compilation errors reported in dgemv_t.c file. --- Makefile.power | 26 +++++++++++++------------- kernel/Makefile | 5 +++++ kernel/power/dgemv_t.c | 4 ++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Makefile.power b/Makefile.power index bf7037995..c1556fe82 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,34 +11,34 @@ endif ifeq ($(CORE), POWER10) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp else FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp endif else ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math else FCOMMON_OPT += -O2 -Mrecursive endif @@ -48,26 +48,26 @@ endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp else FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp endif else ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) -FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math else -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else FCOMMON_OPT += -O2 -Mrecursive diff --git a/kernel/Makefile b/kernel/Makefile index 9b468a6af..db3282c05 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) endif +ifeq ($(ARCH), power) +ifeq ($(C_COMPILER), CLANG) + override CFLAGS += -fno-integrated-as +endif +endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 09abd5a43..c07b3c223 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -359,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "stxvd2x 39, %[off], %[y] \n\t" "stxvd2x 40, %[off2], %[y] \n\t" - : [memy] "+m" (*(const double (*)[8])y), + : [memy] "+m" (*(double (*)[8])y), [n] "+&r" (n), [a0] "=b" (a0), [a1] "=&b" (a1), @@ -373,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do [off2]"=&b" (off2), [temp] "=&b" (tempR) : [memx] "m" (*(const double (*)[n])x), - [mem_ap] "m" (*(const double (*)[]) ap), + [mem_ap] "m" (*(const double (*)[n*8]) ap), [alpha] "d" (alpha), "[a0]" (ap), [x] "b" (x), From 921ec4e9e2ae5b1d32bcad04a19cdffe06e145c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jul 2020 19:54:46 +0000 Subject: [PATCH 20/24] Adjust A53 SGEMM parameters to reflect move to 8x8 kernel --- cmake/prebuild.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 30256870c..ff7715c4b 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFP\n" "#define HAVE_NEON\n" "#define ARMV8\n") +if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) +else + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) +endif set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) From 64e2e4aaf3d396740c0c0b66b5a10baf8fdef167 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jul 2020 20:19:22 +0000 Subject: [PATCH 21/24] missing braces --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ff7715c4b..4067138b4 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -198,10 +198,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) -else +else () set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 8) -endif +endif () set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) From 200f5c44cc14f356d7dba6af257044016a0573da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 13:45:23 +0000 Subject: [PATCH 22/24] Add AMD Renoir models and preliminary support for ZEN3 as ZEN2 also remap erroneous family 16 entry to BOBCAT and reclaim erroneous family 25 "Barcelona" for Zen3 --- cpuid_x86.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 356800b78..ea846a392 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1454,10 +1454,11 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 3: - case 7: - case 10: +// case 7: +// case 10: return CPUTYPE_BARCELONA; case 5: + case 7: return CPUTYPE_BOBCAT; case 6: switch (model) { @@ -1507,6 +1508,8 @@ int get_cpuname(void){ // AMD Ryzen case 8: // AMD Ryzen2 + default: + // Matisse/Renoir and other recent Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -1516,6 +1519,16 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } + break; + case 10: // Zen3 + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; } break; } @@ -2107,7 +2120,7 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. - case 5: // New EXCAVATOR + case 5: // New EXCAVATOR if(support_avx()) return CORE_EXCAVATOR; else @@ -2135,12 +2148,14 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8) { + } else if (exfamily == 8 || exfamily == 10) { switch (model) { case 1: // AMD Ryzen case 8: - // Ryzen 2 + // Ryzen 2 + default: + // Matisse,Renoir Ryzen2 models if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 12918358aa52aa9cdc194057d5e4b556933988aa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 13:53:17 +0000 Subject: [PATCH 23/24] Add AMD Renoir/Matisse and preliminary support for Zen3 as Zen2 also support AMD family 22 Jaguar/Puma as Bobcat --- driver/others/dynamic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index c03b0b21d..5d71b1b2c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else if (exfamily == 5) { + } else if (exfamily == 5 || exfamily == 7) { return &gotoblas_BOBCAT; } else if (exfamily == 6) { if(model == 1){ @@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1 || model == 8) { + /* if (model == 1 || model == 8) */ { if(support_avx()) return &gotoblas_ZEN; else{ @@ -718,16 +718,24 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } - } else if (exfamily == 9) { + } else if (exfamily == 9) { if(support_avx()) return &gotoblas_ZEN; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } + } + } else if (exfamily == 10) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } }else { return &gotoblas_BARCELONA; } + } } From 5fa581c87e0f3979d0fc70b4ea485fc0d898ffb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 14:22:41 +0000 Subject: [PATCH 24/24] Put hint to use git develop rather than master branch in README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e5e3e956..f8226f5cb 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code -using Git from https://github.com/xianyi/OpenBLAS.git. +using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be +sure to use the develop branch - master is several years out of date due to a change of maintainership.) Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Most can also be given directly on the make or cmake command line.