diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 8d7cfea2d..b025f8634 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -44,6 +44,11 @@ jobs: if: github.event_name != 'pull_request' run: brew update || true + - name: unlink installed gcc to allow updating + run: | + brew unlink gcc@8 || true + brew unlink gcc@9 || true + - name: Install prerequisites run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas diff --git a/Makefile.arm64 b/Makefile.arm64 index 62a877fff..c3fe583e4 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,4 +1,4 @@ - +ifneq ($(C_COMPILER), PGI) ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a @@ -77,4 +77,4 @@ CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif - +endif diff --git a/Makefile.system b/Makefile.system index ca0879fe6..abc2c3dc5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1279,6 +1279,10 @@ CCOMMON_OPT += -DUSE_PAPI EXTRALIB += -lpapi -lperfctr endif +ifdef BUFFERSIZE +CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) +endif + ifdef DYNAMIC_THREADS CCOMMON_OPT += -DDYNAMIC_THREADS endif diff --git a/cblas.h b/cblas.h index da00d46d6..f0220eb99 100644 --- a/cblas.h +++ b/cblas.h @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); +void cblas_csrot(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); +void 
cblas_zdrot(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_crotg(void *a, void *b, float *c, void *s); +void cblas_zrotg(void *a, void *b, double *c, void *s); + void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 8f25c1b27..29b5a067b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") + if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) + set (CMAKE_MATCH_1 CMAKE_C_COMPILER_ID) + endif () if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) # message (STATUS "condition is true") set (IfElse 1) diff --git a/common_arm64.h b/common_arm64.h index 9cdded305..2270ffba7 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define INLINE inline -#ifdef F_INTERFACE_FLANG +#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX diff --git a/cpuid_x86.c b/cpuid_x86.c index 84c12ff43..aca37da45 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1436,6 +1436,15 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -2014,6 +2023,19 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; } case 5: switch (model) { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 58f4d8b59..7845d6951 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ } } case 10: - if (model == 5 || model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { @@ -666,7 +666,20 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } - } + } + if (model == 7) { + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } return NULL; } case 0xf: diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 4f1b12f27..37c0694b6 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__("mrs %0, "#id : "=r" (var)); \ + __asm__ __volatile__("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { diff --git a/interface/Makefile b/interface/Makefile index 597956fdb..fab403c82 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) CZBLAS2OBJS = \ @@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) 
+cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + ifeq ($(BUILD_BFLOAT16),1) cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 6d8d759ad..f0793bdef 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) + string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) + if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) + if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 9249b54f8..79baa61b1 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && 
!defined(__SunOS) && !defined(__PGI) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 603e47d87..c8a53c86b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index e23133e52..db322dd0d 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S -DDOTKERNEL = dot.S -SDOTKERNEL = ../generic/dot.c -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index dcf2383a9..0be334893 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index cb02c7bc5..669f62698 100644 --- 
a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot_thunderx.c DDOTKERNEL = ddot_thunderx.c +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 1ce7bb7c0..54d016e17 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index d61f5194a..1cf7b0b7c 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c SDOTKERNEL = sdot_power10.c DDOTKERNEL = ddot_power10.c DSDOTKERNEL = sdot_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CDOTKERNEL = cdot_power9.S -else CDOTKERNEL = cdot.c -endif ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index ef5e4710f..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #include "common.h" +#if defined(POWER10) +#include "cdot_microk_power10.c" +#else #ifndef HAVE_KERNEL_8 #include <altivec.h> @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) } #endif +#endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { +#if defined(POWER10) + BLASLONG n1 = n & -16; +#else BLASLONG n1 = n & -8; +#endif BLASLONG j=0; if (n1){ diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c new file mode 100644 index 000000000..399f2b180 --- /dev/null +++ b/kernel/power/cdot_microk_power10.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void cdot_kernel_8 (long n, float *x, float *y, float *dot) +{ + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 48, 0(%3) \n\t" + + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 50, 32(%3) \n\t" + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 40, 0(%2) \n\t" + + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 42, 32(%2) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 52, 64(%3) \n\t" + + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 54, 96(%3) \n\t" + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 44, 64(%2) \n\t" + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 46, 96(%2) \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 33, 33, 35 \n\t" + "xvaddsp 37, 37, 39 \n\t" + + "xvaddsp 35, 32, 36 \n\t" + "xvaddsp 34, 33, 37 \n\t" + "xxswapd 32, 35 \n\t" + "xxswapd 33, 34 \n\t" + "xvaddsp 35, 35, 32 \n\t" + "xvaddsp 34, 34, 33 \n\t" + "xxpermdi 34, 34, 35, 2 \n\t" + "stxv 34, 0(%6) \n\t" + + "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" + : + "=m" (*dot), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y), + "b" (dot), // 6 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/param.h b/param.h index 6c5e0f107..6a790ab61 100644 --- a/param.h +++ b/param.h @@ -2399,6 +2399,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2435,6 +2438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 8