commit
948d11fc51
13
.travis.yml
13
.travis.yml
|
@ -160,26 +160,25 @@ matrix:
|
|||
os: osx
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
osx_image: xcode10.0
|
||||
env:
|
||||
- BTYPE="BINARY=32 FC=gfortran-8"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- COMMON_FLAGS="NUM_THREADS=32"
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
|
||||
- CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
|
||||
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
|
|
@ -30,17 +30,20 @@
|
|||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14"
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14"
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
|
@ -66,6 +69,8 @@ int detect(void)
|
|||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
|
|
@ -408,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
|
@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
}
|
||||
WMB;
|
||||
}
|
||||
|
||||
current = mypos;
|
||||
|
||||
|
@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
|
@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -541,7 +544,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
|
@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
|
@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
@ -677,7 +681,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
|
@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
|
@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
}
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
}
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
||||
|
@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#endif
|
||||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
newarg.m = args -> m;
|
||||
newarg.n = args -> n;
|
||||
newarg.k = args -> k;
|
||||
|
@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
|
|
|
@ -329,7 +329,7 @@ int support_avx512(){
|
|||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
|
|
|
@ -38,21 +38,29 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
|
@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
|||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
|
@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
|||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Dummy Function */
|
||||
int goto_get_num_procs (void) { return 1;};
|
||||
void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int n;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
#else
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
return nums;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int m[2];
|
||||
size_t len;
|
||||
|
||||
if (nums == 0) {
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &nums, &len, NULL, 0);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
size_t len;
|
||||
if (nums == 0){
|
||||
len = sizeof(int);
|
||||
sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
|
||||
}
|
||||
return nums;
|
||||
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the numbers of CPU cores in multithreading.
|
||||
It can be set by openblas_set_num_threads(int num_threads);
|
||||
*/
|
||||
int blas_cpu_number = 0;
|
||||
/*
|
||||
The numbers of threads in the thread pool.
|
||||
This value is equal or large than blas_cpu_number. This means some threads are sleep.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
||||
void openblas_fork_handler()
|
||||
{
|
||||
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
|
||||
// built with "make USE_OPENMP=0".
|
||||
// Hanging can still happen when OpenBLAS is built against the libgomp
|
||||
// implementation of OpenMP. The problem is tracked at:
|
||||
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
|
||||
// In the mean time build with USE_OPENMP=0 or link against another
|
||||
// implementation of OpenMP.
|
||||
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
|
||||
int err;
|
||||
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
|
||||
if(err != 0)
|
||||
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
int openblas_get_num_procs(void) {
|
||||
#ifndef SMP
|
||||
return 1;
|
||||
#else
|
||||
return get_num_procs();
|
||||
#endif
|
||||
}
|
||||
|
||||
int openblas_get_num_threads(void) {
|
||||
#ifndef SMP
|
||||
return 1;
|
||||
#else
|
||||
// init blas_cpu_number if needed
|
||||
blas_get_cpu_number();
|
||||
return blas_cpu_number;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMAXKERNEL = isamax_power8.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMAXKERNEL = icamax_power8.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMINKERNEL = isamin_power8.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMINKERNEL = icamin_power8.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
|
@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c
|
|||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
|
|
|
@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S
|
|||
|
||||
SAXPYKERNEL = axpy_ppc440.S
|
||||
DAXPYKERNEL = axpy_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
else
|
||||
CAXPYKERNEL = zaxpy_ppc440.S
|
||||
ZAXPYKERNEL = zaxpy_ppc440.S
|
||||
endif
|
||||
|
||||
SDOTKERNEL = dot_ppc440.S
|
||||
DDOTKERNEL = dot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CDOTKERNEL = zdot_ppc440.S
|
||||
ZDOTKERNEL = zdot_ppc440.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
|
||||
ISAMAXKERNEL = iamax_ppc440.S
|
||||
IDAMAXKERNEL = iamax_ppc440.S
|
||||
|
@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
|||
|
||||
SROTKERNEL = rot_ppc440.S
|
||||
DROTKERNEL = rot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CROTKERNEL = zrot_ppc440.S
|
||||
ZROTKERNEL = zrot_ppc440.S
|
||||
else
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
SSCALKERNEL = scal_ppc440.S
|
||||
DSCALKERNEL = scal_ppc440.S
|
||||
|
@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S
|
|||
ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
endif
|
||||
|
||||
|
|
|
@ -12,11 +12,12 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
caxpy_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
mr. 7,3
|
||||
ble 0,.L33
|
||||
cmpdi 7,9,1
|
||||
|
@ -515,7 +516,9 @@ caxpy_k:
|
|||
b .L13
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,4,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
|
|
|
@ -11,11 +11,12 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
icamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry icamin_k,.-icamin_k
|
||||
#endif
|
||||
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
|
@ -388,7 +389,9 @@ icamin_k:
|
|||
b .L21
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size icamin_k,.-icamin_k
|
||||
#endif
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
|
|
|
@ -324,15 +324,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
|
||||
if (inc_x == 1) {
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
if (n1 > 0) {
|
||||
|
||||
max = diamax_kernel_32(n1, x, &maxf);
|
||||
|
||||
i = n1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) > maxf) {
|
||||
max = i;
|
||||
|
|
|
@ -328,13 +328,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
|
|||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
BLASLONG n1 = n & -32;
|
||||
if (n1 > 0) {
|
||||
if (n1 > 0) {
|
||||
|
||||
min = diamin_kernel_32(n1, x, &minf);
|
||||
i = n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (i < n) {
|
||||
if (ABS(x[i]) < minf) {
|
||||
min = i;
|
||||
|
|
|
@ -12,11 +12,12 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
isamax_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry isamax_k,.-isamax_k
|
||||
#endif
|
||||
mr. 11,3
|
||||
ble 0,.L36
|
||||
cmpdi 7,5,0
|
||||
|
@ -397,7 +398,9 @@ isamax_k:
|
|||
b .L61
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size isamax_k,.-isamax_k
|
||||
#endif
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
|
|
|
@ -11,11 +11,12 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
isamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry isamin_k,.-isamin_k
|
||||
#endif
|
||||
mr. 11,3
|
||||
ble 0,.L36
|
||||
cmpdi 7,5,0
|
||||
|
@ -380,7 +381,9 @@ isamin_k:
|
|||
b .L35
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size isamin_k,.-isamin_k
|
||||
#endif
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
|
|
|
@ -316,14 +316,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
minf = CABS1(x,0); //index will not be incremented
|
||||
|
||||
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
|
||||
BLASLONG n1 = n & -16;
|
||||
BLASLONG n1 = n & -16;
|
||||
if (n1 > 0) {
|
||||
|
||||
min = ziamin_kernel_16_TUNED(n1, x, &minf);
|
||||
i = n1;
|
||||
ix = n1 << 1;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -2,7 +2,8 @@
|
|||
#include <stdint.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
|
||||
//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
|
||||
|
||||
/* row-major c_block */
|
||||
#define INNER_KERNEL_k1m1n8 \
|
||||
"prefetcht0 384(%1);"\
|
||||
|
@ -13,18 +14,6 @@
|
|||
INNER_KERNEL_k1m1n8\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n8 \
|
||||
INNER_KERNEL_k1m2n8\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n8 \
|
||||
INNER_KERNEL_k1m4n8\
|
||||
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
|
||||
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
|
||||
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
|
||||
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
|
||||
|
||||
#define INNER_KERNEL_k1m1n16 \
|
||||
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
|
||||
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
|
||||
|
@ -34,18 +23,6 @@
|
|||
INNER_KERNEL_k1m1n16\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n16 \
|
||||
INNER_KERNEL_k1m2n16\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n16 \
|
||||
INNER_KERNEL_k1m4n16\
|
||||
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
|
||||
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
|
||||
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
|
||||
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
|
||||
|
||||
#define INNER_KERNEL_k1m1n24 \
|
||||
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
|
||||
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
|
||||
|
@ -55,18 +32,48 @@
|
|||
INNER_KERNEL_k1m1n24\
|
||||
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
|
||||
|
||||
/* row-major z-partition c_block */
|
||||
#define INNER_KERNEL_k1m4n8 \
|
||||
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
|
||||
"vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
|
||||
"vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n16 \
|
||||
INNER_KERNEL_k1m4n8\
|
||||
"vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
|
||||
"vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
|
||||
|
||||
#define INNER_KERNEL_k1m4n24 \
|
||||
INNER_KERNEL_k1m2n24\
|
||||
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
|
||||
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
|
||||
INNER_KERNEL_k1m4n16\
|
||||
"vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
|
||||
"vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n8 \
|
||||
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
|
||||
"vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
|
||||
"prefetcht0 128(%1);"\
|
||||
"vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
|
||||
"vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
|
||||
"vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
|
||||
"vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n16 \
|
||||
INNER_KERNEL_k1m8n8\
|
||||
"prefetcht0 128(%1,%%r12,2);"\
|
||||
"vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
|
||||
"vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
|
||||
"vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
|
||||
"vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
|
||||
|
||||
#define INNER_KERNEL_k1m8n24 \
|
||||
INNER_KERNEL_k1m4n24\
|
||||
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
|
||||
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
|
||||
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
|
||||
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
|
||||
INNER_KERNEL_k1m8n16\
|
||||
"prefetcht0 128(%1,%%r12,4);"\
|
||||
"vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
|
||||
"vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
|
||||
"vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
|
||||
"vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
|
||||
|
||||
/* micro kernels */
|
||||
#define INNER_KERNELm1(nn) \
|
||||
"cmpq $1,%2;jb "#nn"3f;"\
|
||||
#nn"4:\n\t"\
|
||||
|
@ -84,26 +91,28 @@
|
|||
#define INNER_KERNELm4(nn) \
|
||||
"cmpq $1,%2;jb "#nn"00f;"\
|
||||
#nn"01:\n\t"\
|
||||
INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
|
||||
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
|
||||
#nn"00:\n\t"
|
||||
|
||||
/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
|
||||
#define INNER_KERNELm8(nn) \
|
||||
"movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
|
||||
"movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\
|
||||
#nn"008:\n\t"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
"prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
"prefetcht1 (%11); addq $16,%11;"\
|
||||
"subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
"prefetcht1 (%11); addq $32,%11;"\
|
||||
"subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\
|
||||
"movq %3,%10;"\
|
||||
#nn"001:\n\t"\
|
||||
"cmpq $1,%2;jb "#nn"000f;"\
|
||||
"prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
|
||||
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
|
||||
"decq %2;jmp "#nn"001b;"\
|
||||
""#nn"000:\n\t"
|
||||
|
||||
|
@ -207,24 +216,19 @@
|
|||
INNER_STORE_m1n8(%%zmm13,8)
|
||||
|
||||
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
|
||||
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
|
||||
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
|
||||
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
|
||||
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
|
||||
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
|
||||
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
|
||||
"vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
|
||||
"vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
|
||||
"vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
|
||||
"vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
|
||||
|
||||
#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
|
||||
"vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
|
||||
"vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
|
||||
"vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
|
||||
"vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
|
||||
|
||||
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
|
||||
INNER_TRANS_4x8(c1,c2,c3,c4)\
|
||||
INNER_TRANS_4x8(c5,c6,c7,c8)\
|
||||
"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
|
||||
"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
|
||||
"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
|
||||
"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
|
||||
"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
|
||||
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
|
||||
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
|
||||
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
|
||||
INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
|
||||
|
||||
//%7 for k01(input) only when m=4
|
||||
#define INNER_STORE_4x8(c1,c2,c3,c4) \
|
||||
|
@ -250,20 +254,14 @@
|
|||
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
|
||||
|
||||
#define INNER_SAVE_m4n16 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
|
||||
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
|
||||
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
|
||||
INNER_SAVE_m4n8\
|
||||
INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
|
||||
INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
|
||||
|
||||
#define INNER_SAVE_m4n24 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
|
||||
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
|
||||
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
|
||||
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
|
||||
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
|
||||
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
|
||||
INNER_SAVE_m4n16\
|
||||
INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
|
||||
INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
|
||||
|
||||
#define INNER_SAVE_m8n8 \
|
||||
"movq %3,%10;"\
|
||||
|
@ -271,20 +269,14 @@
|
|||
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
|
||||
|
||||
#define INNER_SAVE_m8n16 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
|
||||
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
|
||||
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
|
||||
INNER_SAVE_m8n8\
|
||||
INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
|
||||
INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
|
||||
|
||||
#define INNER_SAVE_m8n24 \
|
||||
"movq %3,%10;"\
|
||||
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
|
||||
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
|
||||
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
|
||||
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
|
||||
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
|
||||
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
|
||||
INNER_SAVE_m8n16\
|
||||
INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
|
||||
INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
|
||||
|
||||
#define COMPUTE_n8 {\
|
||||
b_pref = packed_b_pointer + 8 * K;\
|
||||
|
@ -327,7 +319,7 @@
|
|||
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
|
||||
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
#define COMPUTE_n16 {\
|
||||
|
@ -372,7 +364,7 @@
|
|||
"leaq (%1,%%r12,4),%1;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
|
||||
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
|
||||
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
|
@ -417,9 +409,9 @@
|
|||
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
|
||||
"leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
|
||||
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
|
||||
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
|
||||
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
|
||||
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
|
||||
"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
|
||||
"zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
|
||||
a_block_pointer -= M * K;\
|
||||
}
|
||||
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8
|
||||
|
|
|
@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
int __attribute__ ((noinline))
|
||||
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc)
|
||||
{
|
||||
unsigned long M = m, N = n, K = k;
|
||||
unsigned long long M = m, N = n, K = k;
|
||||
if (M == 0)
|
||||
return 0;
|
||||
if (N == 0)
|
||||
|
@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo
|
|||
|
||||
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
|
||||
{
|
||||
int mnk = M * N * K;
|
||||
unsigned long long mnk = M * N * K;
|
||||
/* large matrixes -> not performant */
|
||||
if (mnk >= 28 * 512 * 512)
|
||||
return 0;
|
||||
|
@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
|
|||
STORE_SCALAR(0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
|
|||
|
||||
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
|
||||
{
|
||||
int mnk = M * N * K;
|
||||
unsigned long long mnk = M * N * K;
|
||||
/* large matrixes -> not performant */
|
||||
if (mnk >= 28 * 512 * 512)
|
||||
return 0;
|
||||
|
|
|
@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
|
|||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vrepf %%v25,%%v24,2\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vstef %%v24,%[asum],0"
|
||||
"vstef %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
|
|
@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
|
|||
"vfadb %%v24,%%v24,%%v31\n\t"
|
||||
"vrepg %%v25,%%v24,1\n\t"
|
||||
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||
"vsteg %%v24,%[asum],0"
|
||||
"vsteg %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
|
|
@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
|
|||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vrepf %%v25,%%v24,2\n\t"
|
||||
"vfasb %%v24,%%v24,%%v25\n\t"
|
||||
"vstef %%v24,%[asum],0"
|
||||
"vstef %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
|
|
@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
|
|||
"vfadb %%v24,%%v24,%%v31\n\t"
|
||||
"vrepg %%v25,%%v24,1\n\t"
|
||||
"vfadb %%v24,%%v24,%%v25\n\t"
|
||||
"vsteg %%v24,%[asum],0"
|
||||
"vsteg %%v24,%[sum],0"
|
||||
: [sum] "=Q"(sum),[n] "+&r"(n)
|
||||
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
|
||||
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
|
|
6
param.h
6
param.h
|
@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
|
||||
#define SGEMM_DEFAULT_P 768
|
||||
#define DGEMM_DEFAULT_P 512
|
||||
#define DGEMM_DEFAULT_P 384
|
||||
#define CGEMM_DEFAULT_P 384
|
||||
#define ZGEMM_DEFAULT_P 256
|
||||
|
||||
#ifdef WINDOWS_ABI
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define DGEMM_DEFAULT_Q 168
|
||||
#else
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define DGEMM_DEFAULT_Q 168
|
||||
#endif
|
||||
#define CGEMM_DEFAULT_Q 192
|
||||
#define ZGEMM_DEFAULT_Q 128
|
||||
|
|
Loading…
Reference in New Issue