Merge pull request #1 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-12-06 18:52:51 +01:00 committed by GitHub
commit 3853014ea1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 1274 additions and 205 deletions

View File

@ -211,44 +211,48 @@ matrix:
- &test-macos - &test-macos
os: osx os: osx
osx_image: xcode10.1 osx_image: xcode11.5
before_script: before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
- brew install gcc@8 # for gfortran
script: script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env: env:
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
- <<: *test-macos - <<: *test-macos
osx_image: xcode12 osx_image: xcode12
before_script: before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update - brew update
- brew install gcc@10 # for gfortran - brew install gcc@10
script: script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env: env:
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
- <<: *test-macos # - <<: *test-macos
osx_image: xcode10.0 # osx_image: xcode10
env: # env:
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
- <<: *test-macos - <<: *test-macos
osx_image: xcode10.1 osx_image: xcode11.5
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
env: env:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
- <<: *test-macos - <<: *test-macos
osx_image: xcode10.1 osx_image: xcode11.5
env: env:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" # - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
- &test-graviton2 - &test-graviton2

View File

@ -268,7 +268,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc

View File

@ -1561,6 +1561,7 @@ export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE
export NO_AVX512 export NO_AVX512
export NO_AVX2
export BUILD_BFLOAT16 export BUILD_BFLOAT16
export SBGEMM_UNROLL_M export SBGEMM_UNROLL_M

View File

@ -59,9 +59,11 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
else else
LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
endif endif
ifdef HAVE_SSE2
CCOMMON_OPT += -msse2
FCOMMON_OPT += -msse2
endif
ifdef HAVE_SSE3 ifdef HAVE_SSE3
ifndef DYNAMIC_ARCH
CCOMMON_OPT += -msse3 CCOMMON_OPT += -msse3
FCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3
ifdef HAVE_SSSE3 ifdef HAVE_SSSE3
@ -73,5 +75,4 @@ CCOMMON_OPT += -msse4.1
FCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1
endif endif
endif endif
endif

View File

@ -20,18 +20,24 @@ ifdef HAVE_SSE4_1
CCOMMON_OPT += -msse4.1 CCOMMON_OPT += -msse4.1
FCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1
endif endif
ifndef OLDGCC
ifdef HAVE_AVX ifdef HAVE_AVX
CCOMMON_OPT += -mavx CCOMMON_OPT += -mavx
FCOMMON_OPT += -mavx FCOMMON_OPT += -mavx
endif endif
endif
ifndef NO_AVX2
ifdef HAVE_AVX2 ifdef HAVE_AVX2
CCOMMON_OPT += -mavx2 CCOMMON_OPT += -mavx2
FCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2
endif endif
endif
ifndef OLDGCC
ifdef HAVE_FMA3 ifdef HAVE_FMA3
CCOMMON_OPT += -mfma CCOMMON_OPT += -mfma
FCOMMON_OPT += -mfma FCOMMON_OPT += -mfma
endif endif
endif
ifeq ($(CORE), SKYLAKEX) ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH ifndef DYNAMIC_ARCH

11
c_check
View File

@ -276,6 +276,15 @@ if ($data =~ /HAVE_C11/) {
} }
} }
if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) {
$no_avx2 = 0;
$oldgcc = 0;
$data = `$compiler_name -dumpversion`;
if ($data <= 4.6) {
$no_avx2 = 1;
$oldgcc = 1;
}
}
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
@ -368,6 +377,8 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
$os =~ tr/[a-z]/[A-Z]/; $os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/;

View File

@ -330,6 +330,9 @@ if ($link ne "") {
$flags =~ s/\@/\,/g; $flags =~ s/\@/\,/g;
$linker_L .= "-Wl,". $flags . " " ; $linker_L .= "-Wl,". $flags . " " ;
} }
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
$flags = "-lomp";
}
if ( if (
($flags =~ /^\-l/) ($flags =~ /^\-l/)

View File

@ -326,6 +326,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
#ifdef NO_AVX2
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#else
#define SUBARCHITECTURE "HASWELL" #define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \ #define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -336,6 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "haswell" #define LIBNAME "haswell"
#define CORENAME "HASWELL" #define CORENAME "HASWELL"
#endif #endif
#endif
#ifdef FORCE_SKYLAKEX #ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512 #ifdef NO_AVX512
@ -551,6 +562,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
#ifdef NO_AVX2
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#else
#define SUBARCHITECTURE "ZEN" #define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG "-DZEN " \ #define ARCHCONFIG "-DZEN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
@ -565,6 +586,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "zen" #define LIBNAME "zen"
#define CORENAME "ZEN" #define CORENAME "ZEN"
#endif #endif
#endif
#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
@ -983,6 +1005,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#endif #endif
#ifdef FORCE_RISCV64_GENERIC
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "RISCV64_GENERIC"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DRISCV64_GENERIC " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "riscv64_generic"
#define CORENAME "RISCV64_GENERIC"
#else
#endif
#ifdef FORCE_CORTEXA15 #ifdef FORCE_CORTEXA15
#define FORCE #define FORCE
#define ARCHITECTURE "ARM" #define ARCHITECTURE "ARM"
@ -1268,6 +1304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z14" #define CORENAME "Z14"
#endif #endif
#ifdef FORCE_C910V
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "C910V"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DC910V " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "c910v"
#define CORENAME "C910V"
#else
#endif
#ifndef FORCE #ifndef FORCE
#ifdef USER_TARGET #ifdef USER_TARGET
@ -1322,6 +1373,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED #define OPENBLAS_SUPPORTED
#endif #endif
#ifdef __riscv
#include "cpuid_riscv64.c"
#endif
#ifdef __arm__ #ifdef __arm__
#include "cpuid_arm.c" #include "cpuid_arm.c"
#define OPENBLAS_SUPPORTED #define OPENBLAS_SUPPORTED

View File

@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
/**************************************************************************************
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_4x4 1 #define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
); );
} }
static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
{
double *a0;
double *a1;
double *a2;
double *a3;
double *a4;
double *a5;
double *a6;
double *a7;
long tmp;
__asm__
(
"lxvp 34, 0( %15) \n\t" // x0, x1
"lxvp 38, 32( %15) \n\t" // x4, x5
XXSPLTD_S(58,%x14,0) // alpha, alpha
"sldi %10, %17, 3 \n\t" // lda * sizeof (double)
"xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
"xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
"xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
"xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
"li %11, 32 \n\t"
"add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
"add %10, %10, %10 \n\t" // 2 * lda
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha
XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
"add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
"add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda
"add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda
"add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda
"add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda
"lxvp 40, 0( %3) \n\t" // a0[0], a0[1]
"lxvp 42, 0( %4) \n\t" // a1[0], a1[1]
"lxvp 44, 0( %5) \n\t" // a2[0], a2[1]
"lxvp 46, 0( %6) \n\t" // a3[0], a3[1]
"lxvp 50, 0( %7) \n\t" // a4[0]
"lxvp 52, 0( %8) \n\t" // a5[0]
"lxvp 54, 0( %9) \n\t" // a6[0]
"lxvp 56, 0( %10) \n\t" // a7[0]
"addic. %1, %1, -4 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1]
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1]
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1]
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1]
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
"lxvpx 50, %7, %11 \n\t" // a4[0]
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
"lxvpx 52, %8, %11 \n\t" // a5[0]
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
"lxvpx 54, %9, %11 \n\t" // a6[0]
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
"lxvpx 56, %10, %11 \n\t" // a7[0]
"addi %11, %11, 32 \n\t"
"stxvp 36, 0( %2) \n\t" // y0, y1
"addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"lxvp 36, 0( %2) \n\t" // y0, y1
"xvmaddadp 36, 40, 34 \n\t"
"xvmaddadp 37, 41, 34 \n\t"
"xvmaddadp 36, 42, 35 \n\t"
"xvmaddadp 37, 43, 35 \n\t"
"xvmaddadp 36, 44, 32 \n\t"
"xvmaddadp 37, 45, 32 \n\t"
"xvmaddadp 36, 46, 33 \n\t"
"xvmaddadp 37, 47, 33 \n\t"
"xvmaddadp 36, 50, 48 \n\t"
"xvmaddadp 37, 51, 48 \n\t"
"xvmaddadp 36, 52, 49 \n\t"
"xvmaddadp 37, 53, 49 \n\t"
"xvmaddadp 36, 54, 38 \n\t"
"xvmaddadp 37, 55, 38 \n\t"
"xvmaddadp 36, 56, 39 \n\t"
"xvmaddadp 37, 57, 39 \n\t"
"stxvp 36, 0( %2) \n\t" // y0, y1
:
"+m" (*y),
"+r" (n), // 1
"+b" (y), // 2
"=b" (a0), // 3
"=b" (a1), // 4
"=&b" (a2), // 5
"=&b" (a3), // 6
"=&b" (a4), // 7
"=&b" (a5), // 8
"=&b" (a6), // 9
"=&b" (a7), // 10
"=b" (tmp)
:
"m" (*x),
"m" (*ap),
"d" (alpha), // 14
"r" (x), // 15
"3" (ap), // 16
"4" (lda) // 17
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48",
"vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58"
);
}

View File

@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include "common.h" #include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef __vector_pair __attribute__((aligned(8))) vecp_t;
#include "dgemv_n_microk_power10.c" #include "dgemv_n_microk_power10.c"
#define MMA(X, APTR, ACC) \
rX = (vec_t *) & X; \
rowA = *((vecp_t*)((void*)&APTR)); \
__builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
#define SAVE(ACC, Z) \
rowC = (v4sf_t *) &y[Z]; \
__builtin_mma_disassemble_acc ((void *)result, ACC); \
result[0][1] = result[1][0]; \
result[2][1] = result[3][0]; \
rowC[0] += valpha * result[0]; \
rowC[1] += valpha * result[2];
void
dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
FLOAT * y, FLOAT alpha)
{
BLASLONG i, j, tmp;
FLOAT *a0 = a_ptr;
FLOAT *x1 = xo;
vector double valpha = { alpha, alpha };
v4sf_t *rowC;
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
v4sf_t result[4];
vecp_t rowA;
vec_t *rX;
tmp = (n / 32) * 32;
for (i = 0; i < tmp; i += 32)
{
xo = x1;
a0 = a_ptr;
__builtin_mma_xxsetaccz (&acc0);
__builtin_mma_xxsetaccz (&acc1);
__builtin_mma_xxsetaccz (&acc2);
__builtin_mma_xxsetaccz (&acc3);
__builtin_mma_xxsetaccz (&acc4);
__builtin_mma_xxsetaccz (&acc5);
__builtin_mma_xxsetaccz (&acc6);
__builtin_mma_xxsetaccz (&acc7);
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + 0 + j * lda], &acc0);
MMA (xo[j], a0[i + 4 + j * lda], &acc1);
MMA (xo[j], a0[i + 8 + j * lda], &acc2);
MMA (xo[j], a0[i + 12 + j * lda], &acc3);
MMA (xo[j], a0[i + 16 + j * lda], &acc4);
MMA (xo[j], a0[i + 20 + j * lda], &acc5);
MMA (xo[j], a0[i + 24 + j * lda], &acc6);
MMA (xo[j], a0[i + 28 + j * lda], &acc7);
}
xo += 32;
a0 += lda << 5;
SAVE (&acc0, i + 0);
SAVE (&acc1, i + 4);
SAVE (&acc2, i + 8);
SAVE (&acc3, i + 12);
SAVE (&acc4, i + 16);
SAVE (&acc5, i + 20);
SAVE (&acc6, i + 24);
SAVE (&acc7, i + 28);
}
for (i = tmp; i < n; i += 4)
{
xo = x1;
a0 = a_ptr;
__builtin_mma_xxsetaccz (&acc0);
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
for (j = 0; j < 32; j++)
{
__builtin_prefetch (xo+j);
__builtin_prefetch (a0+i+j+lda);
MMA (xo[j], a0[i + j * lda], &acc0);
}
xo += 32;
a0 += lda << 5;
SAVE (&acc0, i);
}
}
#define NBMAX 4096 #define NBMAX 4096
#ifndef HAVE_KERNEL_4x4 #ifndef HAVE_KERNEL_4x4
@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *x_ptr; FLOAT *x_ptr;
FLOAT *y_ptr; FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1; BLASLONG m1;
BLASLONG m2; BLASLONG m2;
BLASLONG m3; BLASLONG m3;
BLASLONG n2; BLASLONG n2;
BLASLONG lda4 = lda << 2; BLASLONG lda4 = lda << 2;
BLASLONG lda128 = lda << 7; BLASLONG lda8 = lda << 3;
FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT xbuffer[8] __attribute__ ((aligned (16)));
FLOAT *ybuffer; FLOAT *ybuffer;
@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( n < 1 ) return(0); if ( n < 1 ) return(0);
ybuffer = buffer; ybuffer = buffer;
BLASLONG n128 = n >> 7; BLASLONG n8 = n >> 3;
n1 = (n - (n128 * 128)) >> 2; n2 = n & 3;
n2 = (n - (n128 * 128)) & 3;
m3 = m & 3 ; m3 = m & 3 ;
m1 = m & -4 ; m1 = m & -4 ;
@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( inc_x == 1 ) if ( inc_x == 1 )
{ {
for( i = 0; i < n128 ; i++) for( i = 0; i < n8 ; i++)
{ {
dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda128; a_ptr += lda8;
x_ptr += 128; x_ptr += 8;
} }
for( i = 0; i < n1 ; i++) if( n & 4 )
{ {
dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda4; a_ptr += lda4;
@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }
else else
{ {
for( i = 0; i < n128 ; i++) for( i = 0; i < n8 ; i++)
{ {
FLOAT xbuffer[128] __attribute__ ((aligned (16)));
BLASLONG j; BLASLONG j;
for ( j = 0; j < 128 ; j++) for ( j = 0; j < 8 ; j++)
{ {
xbuffer[j] = x_ptr[0]; xbuffer[j] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;
} }
dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
a_ptr += lda128; a_ptr += lda8;
} }
for( i = 0; i < n1 ; i++) if( n & 4 )
{ {
xbuffer[0] = x_ptr[0]; xbuffer[0] = x_ptr[0];
x_ptr += inc_x; x_ptr += inc_x;

View File

@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
CSCALKERNEL = ../arm/zscal.c CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c

144
kernel/x86_64/casum.c Normal file
View File

@ -0,0 +1,144 @@
#include "common.h"
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif
#if defined(SKYLAKEX)
#include "casum_microk_skylakex-2.c"
#endif
#ifndef HAVE_CASUM_KERNEL
static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
BLASLONG n_8 = n & -8;
FLOAT *x = x1;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;
FLOAT sum4 = 0.0;
while (i < n_8) {
temp0 = ABS_K(x[0]);
temp1 = ABS_K(x[1]);
temp2 = ABS_K(x[2]);
temp3 = ABS_K(x[3]);
temp4 = ABS_K(x[4]);
temp5 = ABS_K(x[5]);
temp6 = ABS_K(x[6]);
temp7 = ABS_K(x[7]);
sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;
sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;
x+=8;
i+=4;
}
while (i < n) {
sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
x1 += 2;
i++;
}
return sum0+sum1+sum2+sum3+sum4;
}
#endif
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ip = 0;
BLASLONG inc_x2;
FLOAT sumf = 0.0;
if (n <= 0 || inc_x <= 0) return(sumf);
if (inc_x == 1) {
sumf = casum_kernel(n, x);
}
else {
inc_x2 = 2 * inc_x;
while (i < n) {
sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
ip += inc_x2;
i++;
}
}
return(sumf);
}
#if defined(SMP)
static int asum_thread_function(BLASLONG n,
BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
FLOAT *x, BLASLONG inc_x,
FLOAT * dummy3, BLASLONG dummy4,
FLOAT * result, BLASLONG dummy5)
{
*(FLOAT *) result = asum_compute(n, x, inc_x);
return 0;
}
extern int blas_level1_thread_with_return_value(int mode,
BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
void *a, BLASLONG lda,
void *b, BLASLONG ldb,
void *c, BLASLONG ldc,
int (*function)(),
int nthread);
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha[2];
#endif
FLOAT sumf = 0.0;
#if defined(SMP)
int num_cpu = num_cpu_avail(1);
if (n <= 10000 || inc_x <= 0)
nthreads = 1;
else
nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
if (nthreads == 1) {
sumf = asum_compute(n, x, inc_x);
}
else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) *2];
FLOAT *ptr;
#if !defined(DOUBLE)
mode = BLAS_SINGLE | BLAS_COMPLEX;
#else
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
ptr = (FLOAT *)result;
for (i = 0; i < nthreads; i++) {
sumf += (*ptr);
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
}
}
#else
sumf = asum_compute(n, x, inc_x);
#endif
return(sumf);
}

View File

@ -0,0 +1,349 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_CASUM_KERNEL 1
#include <immintrin.h>
#include <stdint.h>
static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
{
FLOAT *x1 = x;
FLOAT sumf=0.0;
BLASLONG n2 = n + n;
if (n2 < 64) {
__m128 accum_10, accum_11, accum_12, accum_13;
__m128 abs_mask1;
accum_10 = _mm_setzero_ps();
accum_11 = _mm_setzero_ps();
accum_12 = _mm_setzero_ps();
accum_13 = _mm_setzero_ps();
abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
_mm_prefetch(&x1[0], _MM_HINT_T0);
if (n2 >= 32){
__m128 x00 = _mm_loadu_ps(&x1[ 0]);
__m128 x01 = _mm_loadu_ps(&x1[ 4]);
__m128 x02 = _mm_loadu_ps(&x1[ 8]);
__m128 x03 = _mm_loadu_ps(&x1[12]);
_mm_prefetch(&x1[16], _MM_HINT_T0);
__m128 x04 = _mm_loadu_ps(&x1[16]);
__m128 x05 = _mm_loadu_ps(&x1[20]);
__m128 x06 = _mm_loadu_ps(&x1[24]);
__m128 x07 = _mm_loadu_ps(&x1[28]);
x00 = _mm_and_ps(x00, abs_mask1);
x01 = _mm_and_ps(x01, abs_mask1);
x02 = _mm_and_ps(x02, abs_mask1);
x03 = _mm_and_ps(x03, abs_mask1);
accum_10 = _mm_add_ps(accum_10, x00);
accum_11 = _mm_add_ps(accum_11, x01);
accum_12 = _mm_add_ps(accum_12, x02);
accum_13 = _mm_add_ps(accum_13, x03);
x04 = _mm_and_ps(x04, abs_mask1);
x05 = _mm_and_ps(x05, abs_mask1);
x06 = _mm_and_ps(x06, abs_mask1);
x07 = _mm_and_ps(x07, abs_mask1);
accum_10 = _mm_add_ps(accum_10, x04);
accum_11 = _mm_add_ps(accum_11, x05);
accum_12 = _mm_add_ps(accum_12, x06);
accum_13 = _mm_add_ps(accum_13, x07);
n2 -= 32;
x1 += 32;
}
if (n2 >= 16) {
__m128 x00 = _mm_loadu_ps(&x1[ 0]);
__m128 x01 = _mm_loadu_ps(&x1[ 4]);
__m128 x02 = _mm_loadu_ps(&x1[ 8]);
__m128 x03 = _mm_loadu_ps(&x1[12]);
x00 = _mm_and_ps(x00, abs_mask1);
x01 = _mm_and_ps(x01, abs_mask1);
x02 = _mm_and_ps(x02, abs_mask1);
x03 = _mm_and_ps(x03, abs_mask1);
accum_10 = _mm_add_ps(accum_10, x00);
accum_11 = _mm_add_ps(accum_11, x01);
accum_12 = _mm_add_ps(accum_12, x02);
accum_13 = _mm_add_ps(accum_13, x03);
n2 -= 16;
x1 += 16;
}
if (n2 >= 8) {
__m128 x00 = _mm_loadu_ps(&x1[ 0]);
__m128 x01 = _mm_loadu_ps(&x1[ 4]);
x00 = _mm_and_ps(x00, abs_mask1);
x01 = _mm_and_ps(x01, abs_mask1);
accum_10 = _mm_add_ps(accum_10, x00);
accum_11 = _mm_add_ps(accum_11, x01);
n2 -= 8;
x1 += 8;
}
if (n2 >= 4) {
__m128 x00 = _mm_loadu_ps(&x1[ 0]);
x00 = _mm_and_ps(x00, abs_mask1);
accum_10 = _mm_add_ps(accum_10, x00);
n2 -= 4;
x1 += 4;
}
if (n2) {
sumf += (ABS_K(x1[0]) + ABS_K(x1[1]));
}
accum_10 = _mm_add_ps(accum_10, accum_11);
accum_12 = _mm_add_ps(accum_12, accum_13);
accum_10 = _mm_add_ps(accum_10, accum_12);
accum_10 = _mm_hadd_ps(accum_10, accum_10);
accum_10 = _mm_hadd_ps(accum_10, accum_10);
sumf += accum_10[0];
}
else {
__m512 accum_0, accum_1, accum_2, accum_3;
__m512 x00, x01, x02, x03, x04, x05, x06, x07;
__m512 abs_mask = (__m512)_mm512_set1_epi32(0x7fffffff);
accum_0 = _mm512_setzero_ps();
accum_1 = _mm512_setzero_ps();
accum_2 = _mm512_setzero_ps();
accum_3 = _mm512_setzero_ps();
// alignment has side-effect when the size of input array is not large enough
if (n2 < 256) {
if (n2 >= 128) {
x00 = _mm512_loadu_ps(&x1[ 0]);
x01 = _mm512_loadu_ps(&x1[ 16]);
x02 = _mm512_loadu_ps(&x1[ 32]);
x03 = _mm512_loadu_ps(&x1[ 48]);
x04 = _mm512_loadu_ps(&x1[ 64]);
x05 = _mm512_loadu_ps(&x1[ 80]);
x06 = _mm512_loadu_ps(&x1[ 96]);
x07 = _mm512_loadu_ps(&x1[112]);
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
x02 = _mm512_and_ps(x02, abs_mask);
x03 = _mm512_and_ps(x03, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
accum_1 = _mm512_add_ps(accum_1, x01);
accum_2 = _mm512_add_ps(accum_2, x02);
accum_3 = _mm512_add_ps(accum_3, x03);
x04 = _mm512_and_ps(x04, abs_mask);
x05 = _mm512_and_ps(x05, abs_mask);
x06 = _mm512_and_ps(x06, abs_mask);
x07 = _mm512_and_ps(x07, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x04);
accum_1 = _mm512_add_ps(accum_1, x05);
accum_2 = _mm512_add_ps(accum_2, x06);
accum_3 = _mm512_add_ps(accum_3, x07);
n2 -= 128;
x1 += 128;
}
if (n2 >= 64) {
x00 = _mm512_loadu_ps(&x1[ 0]);
x01 = _mm512_loadu_ps(&x1[16]);
x02 = _mm512_loadu_ps(&x1[32]);
x03 = _mm512_loadu_ps(&x1[48]);
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
x02 = _mm512_and_ps(x02, abs_mask);
x03 = _mm512_and_ps(x03, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
accum_1 = _mm512_add_ps(accum_1, x01);
accum_2 = _mm512_add_ps(accum_2, x02);
accum_3 = _mm512_add_ps(accum_3, x03);
n2 -= 64;
x1 += 64;
}
if (n2 >= 32) {
x00 = _mm512_loadu_ps(&x1[ 0]);
x01 = _mm512_loadu_ps(&x1[16]);
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
accum_1 = _mm512_add_ps(accum_1, x01);
n2 -= 32;
x1 += 32;
}
if (n2 >= 16) {
x00 = _mm512_loadu_ps(&x1[ 0]);
x00 = _mm512_and_ps(x00, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
n2 -= 16;
x1 += 16;
}
if (n2) {
uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
x00 = _mm512_and_ps(x00, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
}
accum_0 = _mm512_add_ps(accum_0, accum_1);
accum_2 = _mm512_add_ps(accum_2, accum_3);
accum_0 = _mm512_add_ps(accum_0, accum_2);
sumf = _mm512_reduce_add_ps(accum_0);
}
// n2 >= 256, doing alignment
else {
int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;
if (0 != align_header) {
uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header));
x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]);
x00 = _mm512_and_ps(x00, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
n2 -= align_header;
x1 += align_header;
}
x00 = _mm512_load_ps(&x1[ 0]);
x01 = _mm512_load_ps(&x1[ 16]);
x02 = _mm512_load_ps(&x1[ 32]);
x03 = _mm512_load_ps(&x1[ 48]);
x04 = _mm512_load_ps(&x1[ 64]);
x05 = _mm512_load_ps(&x1[ 80]);
x06 = _mm512_load_ps(&x1[ 96]);
x07 = _mm512_load_ps(&x1[112]);
n2 -= 128;
x1 += 128;
while (n2 >= 128) {
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
x02 = _mm512_and_ps(x02, abs_mask);
x03 = _mm512_and_ps(x03, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
x00 = _mm512_load_ps(&x1[ 0]);
accum_1 = _mm512_add_ps(accum_1, x01);
x01 = _mm512_load_ps(&x1[ 16]);
accum_2 = _mm512_add_ps(accum_2, x02);
x02 = _mm512_load_ps(&x1[ 32]);
accum_3 = _mm512_add_ps(accum_3, x03);
x03 = _mm512_load_ps(&x1[ 48]);
x04 = _mm512_and_ps(x04, abs_mask);
x05 = _mm512_and_ps(x05, abs_mask);
x06 = _mm512_and_ps(x06, abs_mask);
x07 = _mm512_and_ps(x07, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x04);
x04 = _mm512_load_ps(&x1[ 64]);
accum_1 = _mm512_add_ps(accum_1, x05);
x05 = _mm512_load_ps(&x1[ 80]);
accum_2 = _mm512_add_ps(accum_2, x06);
x06 = _mm512_load_ps(&x1[ 96]);
accum_3 = _mm512_add_ps(accum_3, x07);
x07 = _mm512_load_ps(&x1[112]);
n2 -= 128;
x1 += 128;
}
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
x02 = _mm512_and_ps(x02, abs_mask);
x03 = _mm512_and_ps(x03, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
accum_1 = _mm512_add_ps(accum_1, x01);
accum_2 = _mm512_add_ps(accum_2, x02);
accum_3 = _mm512_add_ps(accum_3, x03);
x04 = _mm512_and_ps(x04, abs_mask);
x05 = _mm512_and_ps(x05, abs_mask);
x06 = _mm512_and_ps(x06, abs_mask);
x07 = _mm512_and_ps(x07, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x04);
accum_1 = _mm512_add_ps(accum_1, x05);
accum_2 = _mm512_add_ps(accum_2, x06);
accum_3 = _mm512_add_ps(accum_3, x07);
if (n2 >= 64) {
x00 = _mm512_load_ps(&x1[ 0]);
x01 = _mm512_load_ps(&x1[16]);
x02 = _mm512_load_ps(&x1[32]);
x03 = _mm512_load_ps(&x1[48]);
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
x02 = _mm512_and_ps(x02, abs_mask);
x03 = _mm512_and_ps(x03, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
accum_1 = _mm512_add_ps(accum_1, x01);
accum_2 = _mm512_add_ps(accum_2, x02);
accum_3 = _mm512_add_ps(accum_3, x03);
n2 -= 64;
x1 += 64;
}
if (n2 >= 32) {
x00 = _mm512_load_ps(&x1[ 0]);
x01 = _mm512_load_ps(&x1[16]);
x00 = _mm512_and_ps(x00, abs_mask);
x01 = _mm512_and_ps(x01, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
accum_1 = _mm512_add_ps(accum_1, x01);
n2 -= 32;
x1 += 32;
}
if (n2 >= 16) {
x00 = _mm512_load_ps(&x1[ 0]);
x00 = _mm512_and_ps(x00, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
n2 -= 16;
x1 += 16;
}
if (n2) {
uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
x00 = _mm512_and_ps(x00, abs_mask);
accum_0 = _mm512_add_ps(accum_0, x00);
}
accum_0 = _mm512_add_ps(accum_0, accum_1);
accum_2 = _mm512_add_ps(accum_2, accum_3);
accum_0 = _mm512_add_ps(accum_0, accum_2);
sumf = _mm512_reduce_add_ps(accum_0);
}
}
return sumf;
}
#endif

144
kernel/x86_64/zasum.c Normal file
View File

@ -0,0 +1,144 @@
#include "common.h"
#ifndef ABS_K
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
#endif
#if defined(SKYLAKEX)
#include "zasum_microk_skylakex-2.c"
#endif
#ifndef HAVE_ZASUM_KERNEL
static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
{
BLASLONG i=0;
BLASLONG n_8 = n & -8;
FLOAT *x1 = x;
FLOAT temp0, temp1, temp2, temp3;
FLOAT temp4, temp5, temp6, temp7;
FLOAT sum0 = 0.0;
FLOAT sum1 = 0.0;
FLOAT sum2 = 0.0;
FLOAT sum3 = 0.0;
FLOAT sum4 = 0.0;
while (i < n_8) {
temp0 = ABS_K(x1[0]);
temp1 = ABS_K(x1[1]);
temp2 = ABS_K(x1[2]);
temp3 = ABS_K(x1[3]);
temp4 = ABS_K(x1[4]);
temp5 = ABS_K(x1[5]);
temp6 = ABS_K(x1[6]);
temp7 = ABS_K(x1[7]);
sum0 += temp0;
sum1 += temp1;
sum2 += temp2;
sum3 += temp3;
sum0 += temp4;
sum1 += temp5;
sum2 += temp6;
sum3 += temp7;
x1+=8;
i+=4;
}
while (i < n) {
sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
x1 += 2;
i++;
}
return sum0+sum1+sum2+sum3+sum4;
}
#endif
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
BLASLONG ip = 0;
BLASLONG inc_x2;
FLOAT sumf = 0.0;
if (n <= 0 || inc_x <= 0) return(sumf);
if (inc_x == 1) {
sumf = zasum_kernel(n, x);
}
else {
inc_x2 = 2 * inc_x;
while (i < n) {
sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]);
ip += inc_x2;
i++;
}
}
return(sumf);
}
#if defined(SMP)
static int asum_thread_function(BLASLONG n,
BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
FLOAT *x, BLASLONG inc_x,
FLOAT * dummy3, BLASLONG dummy4,
FLOAT * result, BLASLONG dummy5)
{
*(FLOAT *) result = asum_compute(n, x, inc_x);
return 0;
}
extern int blas_level1_thread_with_return_value(int mode,
BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
void *a, BLASLONG lda,
void *b, BLASLONG ldb,
void *c, BLASLONG ldc,
int (*function)(),
int nthread);
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha[2];
#endif
FLOAT sumf = 0.0;
#if defined(SMP)
int num_cpu = num_cpu_avail(1);
if (n <= 10000 || inc_x <= 0)
nthreads = 1;
else
nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
if (nthreads == 1) {
sumf = asum_compute(n, x, inc_x);
}
else {
int mode, i;
char result[MAX_CPU_NUMBER * sizeof(double) *2];
FLOAT *ptr;
#if !defined(DOUBLE)
mode = BLAS_SINGLE | BLAS_COMPLEX;
#else
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
ptr = (FLOAT *)result;
for (i = 0; i < nthreads; i++) {
sumf += (*ptr);
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
}
}
#else
sumf = asum_compute(n, x, inc_x);
#endif
return(sumf);
}

View File

@ -0,0 +1,340 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_ZASUM_KERNEL 1
#include <immintrin.h>
#include <stdint.h>
static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
{
FLOAT *x1 = x;
FLOAT sumf=0.0;
BLASLONG n2 = n + n;
if (n2 < 32) {
__m128d accum_10, accum_11, accum_12, accum_13;
__m128d abs_mask1;
accum_10 = _mm_setzero_pd();
accum_11 = _mm_setzero_pd();
accum_12 = _mm_setzero_pd();
accum_13 = _mm_setzero_pd();
// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
_mm_prefetch(&x1[0], _MM_HINT_T0);
if (n2 >= 16){
__m128d x00 = _mm_loadu_pd(&x1[ 0]);
__m128d x01 = _mm_loadu_pd(&x1[ 2]);
__m128d x02 = _mm_loadu_pd(&x1[ 4]);
__m128d x03 = _mm_loadu_pd(&x1[ 6]);
_mm_prefetch(&x1[8], _MM_HINT_T0);
__m128d x04 = _mm_loadu_pd(&x1[ 8]);
__m128d x05 = _mm_loadu_pd(&x1[10]);
__m128d x06 = _mm_loadu_pd(&x1[12]);
__m128d x07 = _mm_loadu_pd(&x1[14]);
x00 = _mm_and_pd(x00, abs_mask1);
x01 = _mm_and_pd(x01, abs_mask1);
x02 = _mm_and_pd(x02, abs_mask1);
x03 = _mm_and_pd(x03, abs_mask1);
accum_10 = _mm_add_pd(accum_10, x00);
accum_11 = _mm_add_pd(accum_11, x01);
accum_12 = _mm_add_pd(accum_12, x02);
accum_13 = _mm_add_pd(accum_13, x03);
x04 = _mm_and_pd(x04, abs_mask1);
x05 = _mm_and_pd(x05, abs_mask1);
x06 = _mm_and_pd(x06, abs_mask1);
x07 = _mm_and_pd(x07, abs_mask1);
accum_10 = _mm_add_pd(accum_10, x04);
accum_11 = _mm_add_pd(accum_11, x05);
accum_12 = _mm_add_pd(accum_12, x06);
accum_13 = _mm_add_pd(accum_13, x07);
x1 += 16;
n2 -= 16;
}
if (n2 >= 8) {
__m128d x00 = _mm_loadu_pd(&x1[ 0]);
__m128d x01 = _mm_loadu_pd(&x1[ 2]);
__m128d x02 = _mm_loadu_pd(&x1[ 4]);
__m128d x03 = _mm_loadu_pd(&x1[ 6]);
x00 = _mm_and_pd(x00, abs_mask1);
x01 = _mm_and_pd(x01, abs_mask1);
x02 = _mm_and_pd(x02, abs_mask1);
x03 = _mm_and_pd(x03, abs_mask1);
accum_10 = _mm_add_pd(accum_10, x00);
accum_11 = _mm_add_pd(accum_11, x01);
accum_12 = _mm_add_pd(accum_12, x02);
accum_13 = _mm_add_pd(accum_13, x03);
n2 -= 8;
x1 += 8;
}
if (n2 >= 4) {
__m128d x00 = _mm_loadu_pd(&x1[ 0]);
__m128d x01 = _mm_loadu_pd(&x1[ 2]);
x00 = _mm_and_pd(x00, abs_mask1);
x01 = _mm_and_pd(x01, abs_mask1);
accum_10 = _mm_add_pd(accum_10, x00);
accum_11 = _mm_add_pd(accum_11, x01);
n2 -= 4;
x1 += 4;
}
if (n2) {
__m128d x00 = _mm_loadu_pd(&x1[ 0]);
x00 = _mm_and_pd(x00, abs_mask1);
accum_10 = _mm_add_pd(accum_10, x00);
}
accum_10 = _mm_add_pd(accum_10, accum_11);
accum_12 = _mm_add_pd(accum_12, accum_13);
accum_10 = _mm_add_pd(accum_10, accum_12);
accum_10 = _mm_hadd_pd(accum_10, accum_10);
sumf = accum_10[0];
}
else {
__m512d accum_0, accum_1, accum_2, accum_3;
__m512d x00, x01, x02, x03, x04, x05, x06, x07;
__m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff);
accum_0 = _mm512_setzero_pd();
accum_1 = _mm512_setzero_pd();
accum_2 = _mm512_setzero_pd();
accum_3 = _mm512_setzero_pd();
// alignment has side-effect when the size of input array is not large enough
if (n2 < 128) {
if (n2 >= 64) {
x00 = _mm512_loadu_pd(&x1[ 0]);
x01 = _mm512_loadu_pd(&x1[ 8]);
x02 = _mm512_loadu_pd(&x1[16]);
x03 = _mm512_loadu_pd(&x1[24]);
x04 = _mm512_loadu_pd(&x1[32]);
x05 = _mm512_loadu_pd(&x1[40]);
x06 = _mm512_loadu_pd(&x1[48]);
x07 = _mm512_loadu_pd(&x1[56]);
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
x02 = _mm512_and_pd(x02, abs_mask);
x03 = _mm512_and_pd(x03, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
accum_1 = _mm512_add_pd(accum_1, x01);
accum_2 = _mm512_add_pd(accum_2, x02);
accum_3 = _mm512_add_pd(accum_3, x03);
x04 = _mm512_and_pd(x04, abs_mask);
x05 = _mm512_and_pd(x05, abs_mask);
x06 = _mm512_and_pd(x06, abs_mask);
x07 = _mm512_and_pd(x07, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x04);
accum_1 = _mm512_add_pd(accum_1, x05);
accum_2 = _mm512_add_pd(accum_2, x06);
accum_3 = _mm512_add_pd(accum_3, x07);
n2 -= 64;
x1 += 64;
}
if (n2 >= 32) {
x00 = _mm512_loadu_pd(&x1[ 0]);
x01 = _mm512_loadu_pd(&x1[ 8]);
x02 = _mm512_loadu_pd(&x1[16]);
x03 = _mm512_loadu_pd(&x1[24]);
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
x02 = _mm512_and_pd(x02, abs_mask);
x03 = _mm512_and_pd(x03, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
accum_1 = _mm512_add_pd(accum_1, x01);
accum_2 = _mm512_add_pd(accum_2, x02);
accum_3 = _mm512_add_pd(accum_3, x03);
n2 -= 32;
x1 += 32;
}
if (n2 >= 16) {
x00 = _mm512_loadu_pd(&x1[ 0]);
x01 = _mm512_loadu_pd(&x1[ 8]);
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
accum_1 = _mm512_add_pd(accum_1, x01);
n2 -= 16;
x1 += 16;
}
if (n2 >= 8) {
x00 = _mm512_loadu_pd(&x1[ 0]);
x00 = _mm512_and_pd(x00, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
n2 -= 8;
x1 += 8;
}
if (n2) {
unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
x00 = _mm512_and_pd(x00, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
}
accum_0 = _mm512_add_pd(accum_0, accum_1);
accum_2 = _mm512_add_pd(accum_2, accum_3);
accum_0 = _mm512_add_pd(accum_0, accum_2);
sumf = _mm512_reduce_add_pd(accum_0);
}
// n2 >= 128, doing alignment
else {
int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
if (0 != align_header) {
unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header));
x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]);
x00 = _mm512_and_pd(x00, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
n2 -= align_header;
x1 += align_header;
}
x00 = _mm512_load_pd(&x1[ 0]);
x01 = _mm512_load_pd(&x1[ 8]);
x02 = _mm512_load_pd(&x1[16]);
x03 = _mm512_load_pd(&x1[24]);
x04 = _mm512_load_pd(&x1[32]);
x05 = _mm512_load_pd(&x1[40]);
x06 = _mm512_load_pd(&x1[48]);
x07 = _mm512_load_pd(&x1[56]);
n2 -= 64;
x1 += 64;
while (n2 >= 64) {
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
x02 = _mm512_and_pd(x02, abs_mask);
x03 = _mm512_and_pd(x03, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
x00 = _mm512_load_pd(&x1[ 0]);
accum_1 = _mm512_add_pd(accum_1, x01);
x01 = _mm512_load_pd(&x1[ 8]);
accum_2 = _mm512_add_pd(accum_2, x02);
x02 = _mm512_load_pd(&x1[16]);
accum_3 = _mm512_add_pd(accum_3, x03);
x03 = _mm512_load_pd(&x1[24]);
x04 = _mm512_and_pd(x04, abs_mask);
x05 = _mm512_and_pd(x05, abs_mask);
x06 = _mm512_and_pd(x06, abs_mask);
x07 = _mm512_and_pd(x07, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x04);
x04 = _mm512_load_pd(&x1[32]);
accum_1 = _mm512_add_pd(accum_1, x05);
x05 = _mm512_load_pd(&x1[40]);
accum_2 = _mm512_add_pd(accum_2, x06);
x06 = _mm512_load_pd(&x1[48]);
accum_3 = _mm512_add_pd(accum_3, x07);
x07 = _mm512_load_pd(&x1[56]);
n2 -= 64;
x1 += 64;
}
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
x02 = _mm512_and_pd(x02, abs_mask);
x03 = _mm512_and_pd(x03, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
accum_1 = _mm512_add_pd(accum_1, x01);
accum_2 = _mm512_add_pd(accum_2, x02);
accum_3 = _mm512_add_pd(accum_3, x03);
x04 = _mm512_and_pd(x04, abs_mask);
x05 = _mm512_and_pd(x05, abs_mask);
x06 = _mm512_and_pd(x06, abs_mask);
x07 = _mm512_and_pd(x07, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x04);
accum_1 = _mm512_add_pd(accum_1, x05);
accum_2 = _mm512_add_pd(accum_2, x06);
accum_3 = _mm512_add_pd(accum_3, x07);
if (n2 >= 32) {
x00 = _mm512_load_pd(&x1[ 0]);
x01 = _mm512_load_pd(&x1[ 8]);
x02 = _mm512_load_pd(&x1[16]);
x03 = _mm512_load_pd(&x1[24]);
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
x02 = _mm512_and_pd(x02, abs_mask);
x03 = _mm512_and_pd(x03, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
accum_1 = _mm512_add_pd(accum_1, x01);
accum_2 = _mm512_add_pd(accum_2, x02);
accum_3 = _mm512_add_pd(accum_3, x03);
n2 -= 32;
x1 += 32;
}
if (n2 >= 16) {
x00 = _mm512_load_pd(&x1[ 0]);
x01 = _mm512_load_pd(&x1[ 8]);
x00 = _mm512_and_pd(x00, abs_mask);
x01 = _mm512_and_pd(x01, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
accum_1 = _mm512_add_pd(accum_1, x01);
n2 -= 16;
x1 += 16;
}
if (n2 >= 8) {
x00 = _mm512_load_pd(&x1[ 0]);
x00 = _mm512_and_pd(x00, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
n2 -= 8;
x1 += 8;
}
if (n2) {
unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
x00 = _mm512_and_pd(x00, abs_mask);
accum_0 = _mm512_add_pd(accum_0, x00);
}
accum_0 = _mm512_add_pd(accum_0, accum_1);
accum_2 = _mm512_add_pd(accum_2, accum_3);
accum_0 = _mm512_add_pd(accum_0, accum_2);
sumf = _mm512_reduce_add_pd(accum_0);
}
}
return sumf;
}
#endif

39
param.h
View File

@ -2388,7 +2388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER9) || defined(POWER10) #if defined(POWER9)
#define SNUMOPT 16 #define SNUMOPT 16
#define DNUMOPT 8 #define DNUMOPT 8
@ -2426,6 +2426,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER10) #if defined(POWER10)
#define SNUMOPT 16
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 320
#define CGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 1026
#define DGEMM_DEFAULT_Q 960
#define CGEMM_DEFAULT_Q 1026
#define ZGEMM_DEFAULT_Q 1026
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 8
#undef SBGEMM_DEFAULT_UNROLL_N #undef SBGEMM_DEFAULT_UNROLL_N
#undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_M
#undef SBGEMM_DEFAULT_P #undef SBGEMM_DEFAULT_P
@ -2436,10 +2469,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SBGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_P 832
#define SBGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_Q 1026
#define SBGEMM_DEFAULT_R 4096 #define SBGEMM_DEFAULT_R 4096
#undef DGEMM_DEFAULT_UNROLL_M
#undef DGEMM_DEFAULT_UNROLL_N
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#endif #endif
#if defined(SPARC) && defined(V7) #if defined(SPARC) && defined(V7)

View File

@ -58,7 +58,7 @@ add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
target_link_libraries(${OpenBLAS_utest_bin} m) target_link_libraries(${OpenBLAS_utest_bin} m)
endif() endif()