diff --git a/.travis.yml b/.travis.yml index 6016ec1fe..9e18412e8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -160,26 +160,25 @@ matrix: os: osx osx_image: xcode10.1 before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" - <<: *test-macos - osx_image: xcode8.3 + osx_image: xcode10.0 env: - - BTYPE="BINARY=32 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos osx_image: xcode10.1 env: - - COMMON_FLAGS="NUM_THREADS=32" - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" - - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" # whitelist branches: diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 896ed94f5..df3b7898f 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -30,17 +30,20 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 #define CPU_Z14 2 +#define CPU_Z15 3 static char *cpuname[] = { "ZARCH_GENERIC", "Z13", - "Z14" + "Z14", + "Z15" }; static char *cpuname_lower[] = { "zarch_generic", "z13", - "z14" + "z14", + "z15" }; int detect(void) @@ -66,6 +69,8 @@ int detect(void) if (strstr(p, "2965")) return CPU_Z13; if (strstr(p, "3906")) return CPU_Z14; if (strstr(p, "3907")) return CPU_Z14; + if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 + if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 return CPU_GENERIC; } diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 4903aa5bd..21d431b60 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -408,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); @@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; - } + WMB; + } current = mypos; @@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); /* thread has to wait */ - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); @@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } } while (current != mypos); @@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } @@ -541,7 +544,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); @@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); /* thread has to wait */ - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); @@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + WMB; } } } while (current != mypos); @@ -677,7 +681,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) - while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; STOP_RPCC(waiting1); @@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); /* thread has to wait */ - while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; STOP_RPCC(waiting2); @@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } if (m_to - m_from == min_i) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; - } + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; +} } } while (current != mypos); @@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif if (is + min_i >= m_to) { /* Thread doesn't need this buffer any more */ - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; } } @@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;}; } } @@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ +#ifndef USE_OPENMP +#ifndef OS_WINDOWS +static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; +#else +CRITICAL_SECTION level3_lock; +InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + blas_arg_t newarg; blas_queue_t queue[MAX_CPU_NUMBER]; @@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; #endif +#ifndef USE_OPENMP +#ifndef OS_WINDOWS +pthread_mutex_lock(&level3_lock); +#else +EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + newarg.m = args -> m; newarg.n = args -> n; newarg.k = args -> k; @@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG free(job); #endif +#ifndef USE_OPENMP +#ifndef OS_WINDOWS + pthread_mutex_unlock(&level3_lock); +#else + LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); +#endif +#endif + return 0; } diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bace54a23..e27725baf 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){ for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects - WaitForSingleObject(blas_threads[i], 5); //INFINITE); + DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000); + #ifndef OS_WINDOWSSTORE -// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP - TerminateThread(blas_threads[i],0); + // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP + if (WAIT_OBJECT_0 != wait_thread_value) { + TerminateThread(blas_threads[i],0); + } #endif + CloseHandle(blas_threads[i]); } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index f1cd3c6e6..a4ff0e086 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -329,7 +329,7 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 1){ + if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } if((ebx & (1<<31)) != 0){ diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 17b7f5d60..6174d9b75 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -38,21 +38,29 @@ #include #include "common.h" - -#ifndef SMP -#define blas_cpu_number 1 -#else - -int blas_cpu_number = 1; - -int blas_get_cpu_number(void){ - - return blas_cpu_number; -} +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#include +#include #endif +#ifdef OS_HAIKU +#include +#endif + +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) +#include +#include +#endif + + #define FIXED_PAGESIZE 4096 + void *sa = NULL; void *sb = NULL; static double static_buffer[BUFFER_SIZE/sizeof(double)]; @@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)]; void *blas_memory_alloc(int numproc){ if (sa == NULL){ -#if 1 +#if 0 sa = (void *)qalloc(QFAST, BUFFER_SIZE); #else sa = (void *)malloc(BUFFER_SIZE); @@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){ return; } + + +extern void openblas_warning(int verbose, const char * msg); + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#if defined(OS_LINUX) || defined(OS_SUNOS) +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + + static int nums = 0; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif + + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); +#if !defined(OS_LINUX) + return nums; +#endif + +/* +#if !defined(__GLIBC_PREREQ) + return nums; +#else + #if !__GLIBC_PREREQ(2, 3) + return nums; + #endif + + #if !__GLIBC_PREREQ(2, 7) + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); + if (ret!=0) return nums; + n=0; + #if !__GLIBC_PREREQ(2, 6) + for (i=0;i= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } + #endif +#endif +*/ + return 1; +} +#endif +#endif + +#ifdef OS_ANDROID +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_HAIKU +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_AIX +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + return nums; +} +#endif + +#ifdef OS_WINDOWS + +int get_num_procs(void) { + + static int nums = 0; + + if (nums == 0) { + + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + + nums = sysinfo.dwNumberOfProcessors; + } + + return nums; +} + +#endif + +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) + +int get_num_procs(void) { + + static int nums = 0; + + int m[2]; + size_t len; + + if (nums == 0) { + m[0] = CTL_HW; + m[1] = HW_NCPU; + len = sizeof(int); + sysctl(m, 2, &nums, &len, NULL, 0); + } + + return nums; +} + +#endif + +#if defined(OS_DARWIN) +int get_num_procs(void) { + static int nums = 0; + size_t len; + if (nums == 0){ + len = sizeof(int); + sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); + } + return nums; +} +/* +void set_stack_limit(int limitMB){ + int result=0; + struct rlimit rl; + rlim_t StackSize; + + StackSize=limitMB*1024*1024; + result=getrlimit(RLIMIT_STACK, &rl); + if(result==0){ + if(rl.rlim_cur < StackSize){ + rl.rlim_cur=StackSize; + result=setrlimit(RLIMIT_STACK, &rl); + if(result !=0){ + fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result); + } + } + } +} +*/ +#endif + + +/* +OpenBLAS uses the numbers of CPU cores in multithreading. +It can be set by openblas_set_num_threads(int num_threads); +*/ +int blas_cpu_number = 0; +/* +The numbers of threads in the thread pool. +This value is equal or large than blas_cpu_number. This means some threads are sleep. +*/ +int blas_num_threads = 0; + +int goto_get_num_procs (void) { + return blas_cpu_number; +} + +void openblas_fork_handler() +{ + // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is + // built with "make USE_OPENMP=0". + // Hanging can still happen when OpenBLAS is built against the libgomp + // implementation of OpenMP. The problem is tracked at: + // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 + // In the mean time build with USE_OPENMP=0 or link against another + // implementation of OpenMP. +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) + int err; + err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); + if(err != 0) + openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); +#endif +} + +extern int openblas_num_threads_env(); +extern int openblas_goto_num_threads_env(); +extern int openblas_omp_num_threads_env(); + +int blas_get_cpu_number(void){ +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) + int max_num; +#endif + int blas_goto_num = 0; + int blas_omp_num = 0; + + if (blas_num_threads) return blas_num_threads; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) + max_num = get_num_procs(); +#endif + + // blas_goto_num = 0; +#ifndef USE_OPENMP + blas_goto_num=openblas_num_threads_env(); + if (blas_goto_num < 0) blas_goto_num = 0; + + if (blas_goto_num == 0) { + blas_goto_num=openblas_goto_num_threads_env(); + if (blas_goto_num < 0) blas_goto_num = 0; + } + +#endif + + // blas_omp_num = 0; + blas_omp_num=openblas_omp_num_threads_env(); + if (blas_omp_num < 0) blas_omp_num = 0; + + if (blas_goto_num > 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; +#endif +} diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c08f3fb00..fb9452a35 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ISAMAXKERNEL = isamax_power8.S +else +ISAMAXKERNEL = isamax.c +endif IDAMAXKERNEL = idamax.c +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ICAMAXKERNEL = icamax_power8.S +else +ICAMAXKERNEL = icamax.c +endif IZAMAXKERNEL = izamax.c # +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ISAMINKERNEL = isamin_power8.S +else +ISAMINKERNEL = isamin.c +endif IDAMINKERNEL = idamin.c +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) ICAMINKERNEL = icamin_power8.S +else +ICAMINKERNEL = icamin.c +endif IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) CAXPYKERNEL = caxpy_power8.S +else +CAXPYKERNEL = caxpy.c +endif ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 988a4b701..a0696b548 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) CDOTKERNEL = zdot_ppc440.S ZDOTKERNEL = zdot_ppc440.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif ISAMAXKERNEL = iamax_ppc440.S IDAMAXKERNEL = iamax_ppc440.S @@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S SROTKERNEL = rot_ppc440.S DROTKERNEL = rot_ppc440.S +ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) CROTKERNEL = zrot_ppc440.S ZROTKERNEL = zrot_ppc440.S +else +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +endif SSCALKERNEL = scal_ppc440.S DSCALKERNEL = scal_ppc440.S @@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S + +ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c +endif + diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S index 0ce61ca3b..b5f841d2e 100644 --- a/kernel/power/caxpy_power8.S +++ b/kernel/power/caxpy_power8.S @@ -12,11 +12,12 @@ PROLOGUE -caxpy_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry caxpy_k,.-caxpy_k +#endif mr. 7,3 ble 0,.L33 cmpdi 7,9,1 @@ -515,7 +516,9 @@ caxpy_k: b .L13 .long 0 .byte 0,0,0,0,0,4,0,0 +#if _CALL_ELF ==2 .size caxpy_k,.-caxpy_k +#endif .section .rodata .align 4 .set .LANCHOR0,. + 0 diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S index e3d66798e..f2993e83e 100644 --- a/kernel/power/icamin_power8.S +++ b/kernel/power/icamin_power8.S @@ -11,11 +11,12 @@ PROLOGUE -icamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry icamin_k,.-icamin_k +#endif mr. 9,3 ble 0,.L25 cmpdi 7,5,0 @@ -388,7 +389,9 @@ icamin_k: b .L21 .long 0 .byte 0,0,0,0,0,1,0,0 +#if _CALL_ELF ==2 .size icamin_k,.-icamin_k +#endif .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC2: diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 337fa54f8..95aa592c7 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -324,15 +324,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { -#if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -32; - if (n1 > 0) { +#if defined(_CALL_ELF) && (_CALL_ELF == 2) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } -#endif +#endif while (i < n) { if (ABS(x[i]) > maxf) { max = i; diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 85dd49ac1..323f9987e 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -328,13 +328,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } #endif - while (i < n) { if (ABS(x[i]) < minf) { min = i; diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S index c8fcaecc3..fa5433333 100644 --- a/kernel/power/isamax_power8.S +++ b/kernel/power/isamax_power8.S @@ -12,11 +12,12 @@ PROLOGUE -isamax_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry isamax_k,.-isamax_k +#endif mr. 11,3 ble 0,.L36 cmpdi 7,5,0 @@ -397,7 +398,9 @@ isamax_k: b .L61 .long 0 .byte 0,0,0,0,0,1,0,0 +#if _CALL_ELF ==2 .size isamax_k,.-isamax_k +#endif .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC2: diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S index 3873e879b..c9b6acb85 100644 --- a/kernel/power/isamin_power8.S +++ b/kernel/power/isamin_power8.S @@ -11,11 +11,12 @@ PROLOGUE -isamin_k: .LCF0: 0: addis 2,12,.TOC.-.LCF0@ha addi 2,2,.TOC.-.LCF0@l +#if _CALL_ELF ==2 .localentry isamin_k,.-isamin_k +#endif mr. 11,3 ble 0,.L36 cmpdi 7,5,0 @@ -380,7 +381,9 @@ isamin_k: b .L35 .long 0 .byte 0,0,0,0,0,1,0,0 +#if _CALL_ELF ==2 .size isamin_k,.-isamin_k +#endif .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC2: diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 8da2189c6..06a5537d8 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -316,14 +316,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; if (n1 > 0) { min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; ix = n1 << 1; } -#endif +#endif while(i < n) { diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 72878acfd..90a4c2b1d 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -2,7 +2,8 @@ #include #include -//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. + /* row-major c_block */ #define INNER_KERNEL_k1m1n8 \ "prefetcht0 384(%1);"\ @@ -13,18 +14,6 @@ INNER_KERNEL_k1m1n8\ "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" -#define INNER_KERNEL_k1m4n8 \ - INNER_KERNEL_k1m2n8\ - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" - -#define INNER_KERNEL_k1m8n8 \ - INNER_KERNEL_k1m4n8\ - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" - #define INNER_KERNEL_k1m1n16 \ "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ @@ -34,18 +23,6 @@ INNER_KERNEL_k1m1n16\ "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" -#define INNER_KERNEL_k1m4n16 \ - INNER_KERNEL_k1m2n16\ - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" - -#define INNER_KERNEL_k1m8n16 \ - INNER_KERNEL_k1m4n16\ - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" - #define INNER_KERNEL_k1m1n24 \ "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ @@ -55,18 +32,48 @@ INNER_KERNEL_k1m1n24\ "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" +/* row-major z-partition c_block */ +#define INNER_KERNEL_k1m4n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" + #define INNER_KERNEL_k1m4n24 \ - INNER_KERNEL_k1m2n24\ - "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ - "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" + INNER_KERNEL_k1m4n16\ + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" + +#define INNER_KERNEL_k1m8n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ + "prefetcht0 128(%1);"\ + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m8n8\ + "prefetcht0 128(%1,%%r12,2);"\ + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" #define INNER_KERNEL_k1m8n24 \ - INNER_KERNEL_k1m4n24\ - "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ - "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ - "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ - "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" + INNER_KERNEL_k1m8n16\ + "prefetcht0 128(%1,%%r12,4);"\ + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" +/* micro kernels */ #define INNER_KERNELm1(nn) \ "cmpq $1,%2;jb "#nn"3f;"\ #nn"4:\n\t"\ @@ -84,26 +91,28 @@ #define INNER_KERNELm4(nn) \ "cmpq $1,%2;jb "#nn"00f;"\ #nn"01:\n\t"\ - INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ #nn"00:\n\t" /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ #define INNER_KERNELm8(nn) \ - "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\ + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ #nn"008:\n\t"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ - "prefetcht1 (%11); addq $16,%11;"\ - "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%11); addq $32,%11;"\ + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ "movq %3,%10;"\ #nn"001:\n\t"\ "cmpq $1,%2;jb "#nn"000f;"\ "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ - INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ "decq %2;jmp "#nn"001b;"\ ""#nn"000:\n\t" @@ -207,24 +216,19 @@ INNER_STORE_m1n8(%%zmm13,8) #define INNER_TRANS_4x8(c1,c2,c3,c4) \ - "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ - "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ - "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ - "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ - "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ - "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ + +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ + "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - INNER_TRANS_4x8(c1,c2,c3,c4)\ - INNER_TRANS_4x8(c5,c6,c7,c8)\ - "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ - "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ - "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ - "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ - "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ - "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ - "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ - "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) //%7 for k01(input) only when m=4 #define INNER_STORE_4x8(c1,c2,c3,c4) \ @@ -250,20 +254,14 @@ INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) #define INNER_SAVE_m4n16 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ - INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ - INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ - INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) + INNER_SAVE_m4n8\ + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) #define INNER_SAVE_m4n24 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ - INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ - INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ - INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ - INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ - INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) + INNER_SAVE_m4n16\ + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) #define INNER_SAVE_m8n8 \ "movq %3,%10;"\ @@ -271,20 +269,14 @@ INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) #define INNER_SAVE_m8n16 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ - INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ - INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ - INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) + INNER_SAVE_m8n8\ + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) #define INNER_SAVE_m8n24 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ - INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ - INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ - INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ - INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ - INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) + INNER_SAVE_m8n16\ + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) #define COMPUTE_n8 {\ b_pref = packed_b_pointer + 8 * K;\ @@ -327,7 +319,7 @@ "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } #define COMPUTE_n16 {\ @@ -372,7 +364,7 @@ "leaq (%1,%%r12,4),%1;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } @@ -417,9 +409,9 @@ "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ - "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ a_block_pointer -= M * K;\ } static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 3246e681f..76b82e65b 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) { - unsigned long M = m, N = n, K = k; + unsigned long long M = m, N = n, K = k; if (M == 0) return 0; if (N == 0) @@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - int mnk = M * N * K; + unsigned long long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0; @@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict STORE_SCALAR(0, 0); } } -} \ No newline at end of file +} diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 5d491237b..ee3417505 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - int mnk = M * N * K; + unsigned long long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0; diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c index c0b8c6371..e9413da8e 100644 --- a/kernel/zarch/csum.c +++ b/kernel/zarch/csum.c @@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vrepf %%v25,%%v24,2\n\t" "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" + "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c index 178bc3462..8d44873c0 100644 --- a/kernel/zarch/dsum.c +++ b/kernel/zarch/dsum.c @@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v31\n\t" "vrepg %%v25,%%v24,1\n\t" "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" + "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c index a433ab592..3f3f46a85 100644 --- a/kernel/zarch/ssum.c +++ b/kernel/zarch/ssum.c @@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vrepf %%v25,%%v24,2\n\t" "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" + "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c index 7cfc1f17f..e0f978d87 100644 --- a/kernel/zarch/zsum.c +++ b/kernel/zarch/zsum.c @@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v31\n\t" "vrepg %%v25,%%v24,1\n\t" "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" + "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", diff --git a/param.h b/param.h index 9dc94c420..d39fc4a1d 100644 --- a/param.h +++ b/param.h @@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 768 -#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 168 #else #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 168 #endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128