Merge pull request #19 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2019-11-29 23:44:09 +01:00 committed by GitHub
commit 948d11fc51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 535 additions and 145 deletions

View File

@ -160,26 +160,25 @@ matrix:
os: osx
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
- brew install gcc@8 # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
- <<: *test-macos
osx_image: xcode8.3
osx_image: xcode10.0
env:
- BTYPE="BINARY=32 FC=gfortran-8"
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
- <<: *test-macos
osx_image: xcode10.1
env:
- COMMON_FLAGS="NUM_THREADS=32"
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
- CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
# whitelist
branches:

View File

@ -30,17 +30,20 @@
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13",
"Z14"
"Z14",
"Z15"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13",
"z14"
"z14",
"z15"
};
int detect(void)
@ -66,6 +69,8 @@ int detect(void)
if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
return CPU_GENERIC;
}

View File

@ -408,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
}
WMB;
}
current = mypos;
@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);
@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (is + min_i >= m_to) {
/* Thread doesn't need this buffer any more */
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
@ -541,7 +544,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);
@ -677,7 +681,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
/* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);
@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
}
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
} while (current != mypos);
@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif
if (is + min_i >= m_to) {
/* Thread doesn't need this buffer any more */
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
}
}
@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
}
}
@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg;
blas_queue_t queue[MAX_CPU_NUMBER];
@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
newarg.m = args -> m;
newarg.n = args -> n;
newarg.k = args -> k;
@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job);
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0;
}

View File

@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
if (WAIT_OBJECT_0 != wait_thread_value) {
TerminateThread(blas_threads[i],0);
}
#endif
CloseHandle(blas_threads[i]);
}

View File

@ -329,7 +329,7 @@ int support_avx512(){
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 1){
if((ebx & (1<<7)) == 0){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){

View File

@ -38,21 +38,29 @@
#include <stdio.h>
#include "common.h"
#ifndef SMP
#define blas_cpu_number 1
#else
int blas_cpu_number = 1;
int blas_get_cpu_number(void){
return blas_cpu_number;
}
#ifdef OS_LINUX
#include <sys/sysinfo.h>
#include <sched.h>
#include <errno.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
#endif
#ifdef OS_HAIKU
#include <unistd.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
#define FIXED_PAGESIZE 4096
void *sa = NULL;
void *sb = NULL;
static double static_buffer[BUFFER_SIZE/sizeof(double)];
@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
void *blas_memory_alloc(int numproc){
if (sa == NULL){
#if 1
#if 0
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
#else
sa = (void *)malloc(BUFFER_SIZE);
@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
return;
}
extern void openblas_warning(int verbose, const char * msg);
#ifndef SMP
#define blas_cpu_number 1
#define blas_num_threads 1
/* Dummy Function */
int goto_get_num_procs (void) { return 1;};
void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
#endif
/*
#if !defined(__GLIBC_PREREQ)
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
*/
return 1;
}
#endif
#endif
#ifdef OS_ANDROID
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_HAIKU
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_AIX
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {
static int nums = 0;
if (nums == 0) {
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
nums = sysinfo.dwNumberOfProcessors;
}
return nums;
}
#endif
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
static int nums = 0;
int m[2];
size_t len;
if (nums == 0) {
m[0] = CTL_HW;
m[1] = HW_NCPU;
len = sizeof(int);
sysctl(m, 2, &nums, &len, NULL, 0);
}
return nums;
}
#endif
#if defined(OS_DARWIN)
int get_num_procs(void) {
static int nums = 0;
size_t len;
if (nums == 0){
len = sizeof(int);
sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
}
return nums;
}
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
rlim_t StackSize;
StackSize=limitMB*1024*1024;
result=getrlimit(RLIMIT_STACK, &rl);
if(result==0){
if(rl.rlim_cur < StackSize){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
}
*/
#endif
/*
OpenBLAS uses the numbers of CPU cores in multithreading.
It can be set by openblas_set_num_threads(int num_threads);
*/
int blas_cpu_number = 0;
/*
The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/
int blas_num_threads = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;
}
void openblas_fork_handler()
{
// This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
// built with "make USE_OPENMP=0".
// Hanging can still happen when OpenBLAS is built against the libgomp
// implementation of OpenMP. The problem is tracked at:
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
#endif
}
extern int openblas_num_threads_env();
extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
int blas_omp_num = 0;
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
// blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
blas_goto_num=openblas_goto_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
}
#endif
// blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
#ifdef DEBUG
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
#endif
blas_cpu_number = blas_num_threads;
return blas_num_threads;
}
#endif
int openblas_get_num_procs(void) {
#ifndef SMP
return 1;
#else
return get_num_procs();
#endif
}
int openblas_get_num_threads(void) {
#ifndef SMP
return 1;
#else
// init blas_cpu_number if needed
blas_get_cpu_number();
return blas_cpu_number;
#endif
}

View File

@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#SMINKERNEL = ../arm/min.c
#DMINKERNEL = ../arm/min.c
#
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
ISAMAXKERNEL = isamax_power8.S
else
ISAMAXKERNEL = isamax.c
endif
IDAMAXKERNEL = idamax.c
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
ICAMAXKERNEL = icamax_power8.S
else
ICAMAXKERNEL = icamax.c
endif
IZAMAXKERNEL = izamax.c
#
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
ISAMINKERNEL = isamin_power8.S
else
ISAMINKERNEL = isamin.c
endif
IDAMINKERNEL = idamin.c
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
ICAMINKERNEL = icamin_power8.S
else
ICAMINKERNEL = icamin.c
endif
IZAMINKERNEL = izamin.c
#
#ISMAXKERNEL = ../arm/imax.c
@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c
#
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
CAXPYKERNEL = caxpy_power8.S
else
CAXPYKERNEL = caxpy.c
endif
ZAXPYKERNEL = zaxpy.c
#
SCOPYKERNEL = scopy.c

View File

@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S
SAXPYKERNEL = axpy_ppc440.S
DAXPYKERNEL = axpy_ppc440.S
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
else
CAXPYKERNEL = zaxpy_ppc440.S
ZAXPYKERNEL = zaxpy_ppc440.S
endif
SDOTKERNEL = dot_ppc440.S
DDOTKERNEL = dot_ppc440.S
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
CDOTKERNEL = zdot_ppc440.S
ZDOTKERNEL = zdot_ppc440.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
ISAMAXKERNEL = iamax_ppc440.S
IDAMAXKERNEL = iamax_ppc440.S
@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S
SROTKERNEL = rot_ppc440.S
DROTKERNEL = rot_ppc440.S
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
CROTKERNEL = zrot_ppc440.S
ZROTKERNEL = zrot_ppc440.S
else
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
endif
SSCALKERNEL = scal_ppc440.S
DSCALKERNEL = scal_ppc440.S
@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S
ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S
ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
endif

View File

@ -12,11 +12,12 @@
PROLOGUE
caxpy_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
#if _CALL_ELF ==2
.localentry caxpy_k,.-caxpy_k
#endif
mr. 7,3
ble 0,.L33
cmpdi 7,9,1
@ -515,7 +516,9 @@ caxpy_k:
b .L13
.long 0
.byte 0,0,0,0,0,4,0,0
#if _CALL_ELF ==2
.size caxpy_k,.-caxpy_k
#endif
.section .rodata
.align 4
.set .LANCHOR0,. + 0

View File

@ -11,11 +11,12 @@
PROLOGUE
icamin_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
#if _CALL_ELF ==2
.localentry icamin_k,.-icamin_k
#endif
mr. 9,3
ble 0,.L25
cmpdi 7,5,0
@ -388,7 +389,9 @@ icamin_k:
b .L21
.long 0
.byte 0,0,0,0,0,1,0,0
#if _CALL_ELF ==2
.size icamin_k,.-icamin_k
#endif
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:

View File

@ -324,15 +324,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
if (inc_x == 1) {
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
BLASLONG n1 = n & -32;
if (n1 > 0) {
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
if (n1 > 0) {
max = diamax_kernel_32(n1, x, &maxf);
i = n1;
}
#endif
#endif
while (i < n) {
if (ABS(x[i]) > maxf) {
max = i;

View File

@ -328,13 +328,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
BLASLONG n1 = n & -32;
if (n1 > 0) {
if (n1 > 0) {
min = diamin_kernel_32(n1, x, &minf);
i = n1;
}
#endif
while (i < n) {
if (ABS(x[i]) < minf) {
min = i;

View File

@ -12,11 +12,12 @@
PROLOGUE
isamax_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
#if _CALL_ELF ==2
.localentry isamax_k,.-isamax_k
#endif
mr. 11,3
ble 0,.L36
cmpdi 7,5,0
@ -397,7 +398,9 @@ isamax_k:
b .L61
.long 0
.byte 0,0,0,0,0,1,0,0
#if _CALL_ELF ==2
.size isamax_k,.-isamax_k
#endif
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:

View File

@ -11,11 +11,12 @@
PROLOGUE
isamin_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
#if _CALL_ELF ==2
.localentry isamin_k,.-isamin_k
#endif
mr. 11,3
ble 0,.L36
cmpdi 7,5,0
@ -380,7 +381,9 @@ isamin_k:
b .L35
.long 0
.byte 0,0,0,0,0,1,0,0
#if _CALL_ELF ==2
.size isamin_k,.-isamin_k
#endif
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:

View File

@ -316,14 +316,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
minf = CABS1(x,0); //index will not be incremented
#if defined(_CALL_ELF) && (_CALL_ELF == 2)
BLASLONG n1 = n & -16;
BLASLONG n1 = n & -16;
if (n1 > 0) {
min = ziamin_kernel_16_TUNED(n1, x, &minf);
i = n1;
ix = n1 << 1;
}
#endif
#endif
while(i < n)
{

View File

@ -2,7 +2,8 @@
#include <stdint.h>
#include <immintrin.h>
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
/* row-major c_block */
#define INNER_KERNEL_k1m1n8 \
"prefetcht0 384(%1);"\
@ -13,18 +14,6 @@
INNER_KERNEL_k1m1n8\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
#define INNER_KERNEL_k1m4n8 \
INNER_KERNEL_k1m2n8\
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
#define INNER_KERNEL_k1m8n8 \
INNER_KERNEL_k1m4n8\
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
#define INNER_KERNEL_k1m1n16 \
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
@ -34,18 +23,6 @@
INNER_KERNEL_k1m1n16\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
#define INNER_KERNEL_k1m4n16 \
INNER_KERNEL_k1m2n16\
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
#define INNER_KERNEL_k1m8n16 \
INNER_KERNEL_k1m4n16\
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
#define INNER_KERNEL_k1m1n24 \
"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
"vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
@ -55,18 +32,48 @@
INNER_KERNEL_k1m1n24\
"vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
/* row-major z-partition c_block */
#define INNER_KERNEL_k1m4n8 \
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
"vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
"vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
#define INNER_KERNEL_k1m4n16 \
INNER_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
"vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
#define INNER_KERNEL_k1m4n24 \
INNER_KERNEL_k1m2n24\
"vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
"vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
INNER_KERNEL_k1m4n16\
"vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
"vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
#define INNER_KERNEL_k1m8n8 \
"vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
"vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
"prefetcht0 128(%1);"\
"vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
"vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
"vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
"vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
#define INNER_KERNEL_k1m8n16 \
INNER_KERNEL_k1m8n8\
"prefetcht0 128(%1,%%r12,2);"\
"vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
"vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
"vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
"vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
#define INNER_KERNEL_k1m8n24 \
INNER_KERNEL_k1m4n24\
"vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
"vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
"vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
"vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
INNER_KERNEL_k1m8n16\
"prefetcht0 128(%1,%%r12,4);"\
"vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
"vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
"vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
"vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
/* micro kernels */
#define INNER_KERNELm1(nn) \
"cmpq $1,%2;jb "#nn"3f;"\
#nn"4:\n\t"\
@ -84,26 +91,28 @@
#define INNER_KERNELm4(nn) \
"cmpq $1,%2;jb "#nn"00f;"\
#nn"01:\n\t"\
INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
"decq %2;cmpq $1,%2;jnb "#nn"01b;"\
#nn"00:\n\t"
/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
#define INNER_KERNELm8(nn) \
"movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
"movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\
#nn"008:\n\t"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
"prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
"prefetcht1 (%11); addq $16,%11;"\
"subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
"prefetcht1 (%11); addq $32,%11;"\
"subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\
"movq %3,%10;"\
#nn"001:\n\t"\
"cmpq $1,%2;jb "#nn"000f;"\
"prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
"decq %2;jmp "#nn"001b;"\
""#nn"000:\n\t"
@ -207,24 +216,19 @@
INNER_STORE_m1n8(%%zmm13,8)
#define INNER_TRANS_4x8(c1,c2,c3,c4) \
"vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
"vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
"vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
"vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
"vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
"vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
"vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
"vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
"vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
"vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
"vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
"vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
"vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
"vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
INNER_TRANS_4x8(c1,c2,c3,c4)\
INNER_TRANS_4x8(c5,c6,c7,c8)\
"vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
"vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\
"vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
"vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\
"vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
"vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
//%7 for k01(input) only when m=4
#define INNER_STORE_4x8(c1,c2,c3,c4) \
@ -250,20 +254,14 @@
INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
#define INNER_SAVE_m4n16 \
"movq %3,%10;"\
INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
INNER_SAVE_m4n8\
INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
#define INNER_SAVE_m4n24 \
"movq %3,%10;"\
INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
INNER_SAVE_m4n16\
INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
#define INNER_SAVE_m8n8 \
"movq %3,%10;"\
@ -271,20 +269,14 @@
INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
#define INNER_SAVE_m8n16 \
"movq %3,%10;"\
INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
INNER_SAVE_m8n8\
INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
#define INNER_SAVE_m8n24 \
"movq %3,%10;"\
INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
INNER_SAVE_m8n16\
INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
#define COMPUTE_n8 {\
b_pref = packed_b_pointer + 8 * K;\
@ -327,7 +319,7 @@
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
#define COMPUTE_n16 {\
@ -372,7 +364,7 @@
"leaq (%1,%%r12,4),%1;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
@ -417,9 +409,9 @@
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
"leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
"+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
"zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
a_block_pointer -= M * K;\
}
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8

View File

@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
int __attribute__ ((noinline))
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc)
{
unsigned long M = m, N = n, K = k;
unsigned long long M = m, N = n, K = k;
if (M == 0)
return 0;
if (N == 0)
@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
{
int mnk = M * N * K;
unsigned long long mnk = M * N * K;
/* large matrixes -> not performant */
if (mnk >= 28 * 512 * 512)
return 0;
@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
STORE_SCALAR(0, 0);
}
}
}
}

View File

@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
{
int mnk = M * N * K;
unsigned long long mnk = M * N * K;
/* large matrixes -> not performant */
if (mnk >= 28 * 512 * 512)
return 0;

View File

@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
"vstef %%v24,%[sum],0"
: [sum] "=Q"(sum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",

View File

@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
"vsteg %%v24,%[sum],0"
: [sum] "=Q"(sum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",

View File

@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
"vfasb %%v24,%%v24,%%v25\n\t"
"vrepf %%v25,%%v24,2\n\t"
"vfasb %%v24,%%v24,%%v25\n\t"
"vstef %%v24,%[asum],0"
"vstef %%v24,%[sum],0"
: [sum] "=Q"(sum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",

View File

@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
"vfadb %%v24,%%v24,%%v31\n\t"
"vrepg %%v25,%%v24,1\n\t"
"vfadb %%v24,%%v24,%%v25\n\t"
"vsteg %%v24,%[asum],0"
"vsteg %%v24,%[sum],0"
: [sum] "=Q"(sum),[n] "+&r"(n)
: "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
: "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",

View File

@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define SGEMM_DEFAULT_P 768
#define DGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 384
#define CGEMM_DEFAULT_P 384
#define ZGEMM_DEFAULT_P 256
#ifdef WINDOWS_ABI
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define DGEMM_DEFAULT_Q 168
#else
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define DGEMM_DEFAULT_Q 168
#endif
#define CGEMM_DEFAULT_Q 192
#define ZGEMM_DEFAULT_Q 128