Merge branch 'develop' into fc-1847
This commit is contained in:
commit
5f8f0583d4
|
@ -48,6 +48,7 @@ ifndef NO_CBLAS
|
|||
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
endif
|
||||
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
|
@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
|||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
|
@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
|||
endif
|
||||
endif
|
||||
|
||||
else
|
||||
#install on AIX has different options syntax
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifndef NO_STATIC
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifndef NO_SHARED
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
|
|
|
@ -510,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT)
|
|||
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), arm64)
|
||||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
override DYNAMIC_ARCH=
|
||||
|
|
2
common.h
2
common.h
|
@ -183,7 +183,7 @@ extern "C" {
|
|||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
|
||||
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
|
|
|
@ -237,7 +237,6 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define ARMV8\n");
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
|
|
|
@ -2009,6 +2009,8 @@ int get_coretype(void){
|
|||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
case 8:
|
||||
// Ryzen 2
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
|
|
|
@ -48,6 +48,10 @@
|
|||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
#ifndef GEMM_PREFERED_SIZE
|
||||
#define GEMM_PREFERED_SIZE 1
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
|
@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Pad a partition width up to the next multiple of `multiple`.
 *
 * remainder : amount of work still to be distributed
 * width     : width proposed for the current partition
 * multiple  : preferred granularity of a partition
 *
 * The width is returned unchanged when the granularity exceeds the
 * remaining work, or when the width does not exceed the granularity.
 */
static int round_up(int remainder, int width, int multiple)
{
  int padding_applies = (multiple <= remainder) && (width > multiple);

  if (!padding_applies) {
    return width;
  }

  /* Integer ceiling division, then scale back to a multiple. */
  return ((width + multiple - 1) / multiple) * multiple;
}
|
||||
|
||||
|
||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb,
|
||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||
|
@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
num_parts = 0;
|
||||
while (m > 0){
|
||||
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
||||
|
||||
width = round_up(m, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
m -= width;
|
||||
|
||||
if (m < 0) width = width + m;
|
||||
range_M[num_parts + 1] = range_M[num_parts] + width;
|
||||
|
||||
num_parts ++;
|
||||
}
|
||||
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
||||
|
@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
if (width < SWITCH_RATIO) {
|
||||
width = SWITCH_RATIO;
|
||||
}
|
||||
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
n -= width;
|
||||
if (n < 0) width = width + n;
|
||||
range_N[num_parts + 1] = range_N[num_parts] + width;
|
||||
|
||||
num_parts ++;
|
||||
}
|
||||
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
||||
|
|
|
@ -15,7 +15,11 @@ endif
|
|||
# COMMONOBJS += info.$(SUFFIX)
|
||||
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(ARCH),arm64)
|
||||
COMMONOBJS += dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
|
@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c
|
|||
endif
|
||||
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(ARCH),arm64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
|
@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
long i;
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
|
|
|
@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
void goto_set_num_threads(int num_threads)
|
||||
{
|
||||
long i;
|
||||
long i;
|
||||
|
||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
if (num_threads < 1) num_threads = blas_cpu_number;
|
||||
|
||||
|
|
|
@ -0,0 +1,198 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
* that HWCAP_CPUID is defined
|
||||
*/
|
||||
#ifndef HWCAP_CPUID
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
asm("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
"armv8",
|
||||
"cortexa57",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype) {
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i=0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_ARMV8);
|
||||
case 1: return (&gotoblas_CORTEXA57);
|
||||
case 2: return (&gotoblas_THUNDERX);
|
||||
case 3: return (&gotoblas_THUNDERX2T99);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
* MIDR_EL1
|
||||
*
|
||||
* 31 24 23 20 19 16 15 4 3 0
|
||||
* -----------------------------------------------------------------
|
||||
* | Implementer | Variant | Architecture | Part Number | Revision |
|
||||
* -----------------------------------------------------------------
|
||||
*/
|
||||
implementer = (midr_el1 >> 24) & 0xFF;
|
||||
part = (midr_el1 >> 4) & 0xFFF;
|
||||
|
||||
switch(implementer)
|
||||
{
|
||||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
case 0xd07: // Cortex A57
|
||||
case 0xd08: // Cortex A72
|
||||
case 0xd03: // Cortex A53
|
||||
return &gotoblas_CORTEXA57;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
switch (part)
|
||||
{
|
||||
case 0x516: // Vulcan
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
case 0x43: // Cavium
|
||||
switch (part)
|
||||
{
|
||||
case 0x0a1: // ThunderX
|
||||
return &gotoblas_THUNDERX;
|
||||
case 0x0af: // ThunderX2
|
||||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_ARMV8;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
|
@ -259,6 +259,16 @@ int get_num_procs(void) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/*
 * Number of configured processors as reported by sysconf(); the value is
 * queried while the cached count is still zero and reused afterwards.
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }

  return cached_count;
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
@ -1739,6 +1749,22 @@ int get_num_procs(void) {
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
/*
 * Return the configured processor count from sysconf(), querying it only
 * while the stored value is zero.
 */
int get_num_procs(void) {
  static int stored_procs = 0;

  if (stored_procs != 0) {
    return stored_procs;
  }

  stored_procs = sysconf(_SC_NPROCESSORS_CONF);
  return stored_procs;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/*
 * Configured processor count obtained through sysconf(); the result is kept
 * in a function-local static and looked up again only while it is zero.
 */
int get_num_procs(void) {
  static int proc_total = 0;

  proc_total = proc_total ? proc_total : sysconf(_SC_NPROCESSORS_CONF);

  return proc_total;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
|
|
@ -730,35 +730,8 @@ void blas_set_parameter(void){
|
|||
|
||||
#if defined(ARCH_ARM64)
|
||||
|
||||
#if defined(VULCAN) || defined(THUNDERX2T99)
|
||||
unsigned long dgemm_prefetch_size_a;
|
||||
unsigned long dgemm_prefetch_size_b;
|
||||
unsigned long dgemm_prefetch_size_c;
|
||||
#endif
|
||||
|
||||
void blas_set_parameter(void)
|
||||
{
|
||||
#if defined(VULCAN) || defined(THUNDERX2T99)
|
||||
dgemm_p = 160;
|
||||
dgemm_q = 128;
|
||||
dgemm_r = 4096;
|
||||
|
||||
sgemm_p = 128;
|
||||
sgemm_q = 352;
|
||||
sgemm_r = 4096;
|
||||
|
||||
cgemm_p = 128;
|
||||
cgemm_q = 224;
|
||||
cgemm_r = 4096;
|
||||
|
||||
zgemm_p = 128;
|
||||
zgemm_q = 112;
|
||||
zgemm_r = 4096;
|
||||
|
||||
dgemm_prefetch_size_a = 3584;
|
||||
dgemm_prefetch_size_b = 512;
|
||||
dgemm_prefetch_size_c = 128;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN)
|
||||
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
|
||||
// Multithreaded swap gives performance benefits in ThunderX2T99
|
||||
#else
|
||||
// Disable multi-threading as it does not show any performance
|
||||
|
|
|
@ -43,6 +43,10 @@
|
|||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
// this is smallest dimension N of square input a to permit threading
|
||||
// see graph in issue #1820 for explanation
|
||||
#define MULTI_THREAD_MINIMAL 362
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XHEMV "
|
||||
#elif defined(DOUBLE)
|
||||
|
@ -195,7 +199,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
|
|||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(2);
|
||||
if (n<MULTI_THREAD_MINIMAL) {
|
||||
nthreads = 1 ;
|
||||
} else {
|
||||
nthreads = num_cpu_avail(2);
|
||||
};
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
|
|
@ -88,7 +88,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
|
|||
$(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F)
|
||||
|
||||
setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
|
||||
ifeq ($(USE_GEMM3M), 1)
|
||||
$(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@
|
||||
else
|
||||
$(CC) -c $(CFLAGS) $< -o $@
|
||||
endif
|
||||
|
||||
setparam$(TSUFFIX).c : setparam-ref.c
|
||||
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)
|
||||
|
|
|
@ -1,8 +1,3 @@
|
|||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
|
@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c
|
|||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
|
@ -30,92 +20,6 @@ IDMAXKERNEL = ../arm/imax.c
|
|||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_4x4.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SGEMMKERNEL = sgemm_kernel_4x4.S
|
||||
else
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
|
@ -136,6 +40,186 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
endif
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
else
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
|
|
|
@ -1,4 +1,49 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV8
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_4x4.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
|
@ -66,13 +111,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
|||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
@ -87,8 +132,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
|||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
|
@ -99,32 +144,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
|||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
|
|
@ -1,6 +1,133 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV8
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx.c
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot_thunderx.c
|
||||
DDOTKERNEL = ddot_thunderx.c
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_4x4.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x4.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SDOTKERNEL=dot_thunderx.c
|
||||
DDOTKERNEL=ddot_thunderx.c
|
||||
DAXPYKERNEL=daxpy_thunderx.c
|
||||
|
||||
|
|
|
@ -1,4 +1,137 @@
|
|||
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
|
@ -27,12 +160,12 @@ CNRM2KERNEL = scnrm2_thunderx2t99.c
|
|||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV8
|
|
@ -943,13 +943,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
|
||||
ldr A_PRE_SIZE, =dgemm_prefetch_size_a
|
||||
ldr A_PRE_SIZE, [A_PRE_SIZE]
|
||||
ldr B_PRE_SIZE, =dgemm_prefetch_size_b
|
||||
ldr B_PRE_SIZE, [B_PRE_SIZE]
|
||||
ldr C_PRE_SIZE, =dgemm_prefetch_size_c
|
||||
ldr C_PRE_SIZE, [C_PRE_SIZE]
|
||||
mov A_PRE_SIZE, #3584
|
||||
mov B_PRE_SIZE, #512
|
||||
mov C_PRE_SIZE, #128
|
||||
add A_PRE_SIZE_64, A_PRE_SIZE, #64
|
||||
add B_PRE_SIZE_64, B_PRE_SIZE, #64
|
||||
|
||||
|
|
|
@ -294,6 +294,8 @@ gotoblas_t TABLE_NAME = {
|
|||
chemm_outcopyTS, chemm_oltcopyTS,
|
||||
|
||||
0, 0, 0,
|
||||
|
||||
#if defined(USE_GEMM3M)
|
||||
#ifdef CGEMM3M_DEFAULT_UNROLL_M
|
||||
CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
|
||||
#else
|
||||
|
@ -324,6 +326,33 @@ gotoblas_t TABLE_NAME = {
|
|||
chemm3m_oucopybTS, chemm3m_olcopybTS,
|
||||
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
|
||||
chemm3m_oucopyiTS, chemm3m_olcopyiTS,
|
||||
#else
|
||||
0, 0, 0,
|
||||
|
||||
NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
#endif
|
||||
|
||||
#ifndef NO_LAPACK
|
||||
cneg_tcopyTS, claswp_ncopyTS,
|
||||
|
@ -400,6 +429,7 @@ gotoblas_t TABLE_NAME = {
|
|||
zhemm_outcopyTS, zhemm_oltcopyTS,
|
||||
|
||||
0, 0, 0,
|
||||
#if defined(USE_GEMM3M)
|
||||
#ifdef ZGEMM3M_DEFAULT_UNROLL_M
|
||||
ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
|
||||
#else
|
||||
|
@ -430,6 +460,33 @@ gotoblas_t TABLE_NAME = {
|
|||
zhemm3m_oucopybTS, zhemm3m_olcopybTS,
|
||||
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
|
||||
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
|
||||
#else
|
||||
0, 0, 0,
|
||||
|
||||
NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
#endif
|
||||
|
||||
#ifndef NO_LAPACK
|
||||
zneg_tcopyTS, zlaswp_ncopyTS,
|
||||
|
@ -503,6 +560,7 @@ gotoblas_t TABLE_NAME = {
|
|||
xhemm_outcopyTS, xhemm_oltcopyTS,
|
||||
|
||||
0, 0, 0,
|
||||
#if defined(USE_GEMM3M)
|
||||
QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
|
||||
|
||||
xgemm3m_kernelTS,
|
||||
|
@ -528,6 +586,33 @@ gotoblas_t TABLE_NAME = {
|
|||
xhemm3m_oucopybTS, xhemm3m_olcopybTS,
|
||||
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
|
||||
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
|
||||
#else
|
||||
0, 0, 0,
|
||||
|
||||
NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
NULL, NULL,
|
||||
#endif
|
||||
|
||||
#ifndef NO_LAPACK
|
||||
xneg_tcopyTS, xlaswp_ncopyTS,
|
||||
|
@ -561,6 +646,78 @@ gotoblas_t TABLE_NAME = {
|
|||
|
||||
};
|
||||
|
||||
#if defined(ARCH_ARM64)
|
||||
static void init_parameter(void) {
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
|
||||
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
|
||||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
|
||||
|
||||
#ifdef EXPRECISION
|
||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||
TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R;
|
||||
TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R;
|
||||
#endif
|
||||
|
||||
#if defined(USE_GEMM3M)
|
||||
#ifdef CGEMM3M_DEFAULT_P
|
||||
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
|
||||
#else
|
||||
TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
|
||||
#endif
|
||||
|
||||
#ifdef ZGEMM3M_DEFAULT_P
|
||||
TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
|
||||
#else
|
||||
TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
|
||||
#endif
|
||||
|
||||
#ifdef CGEMM3M_DEFAULT_Q
|
||||
TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
|
||||
#else
|
||||
TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q;
|
||||
#endif
|
||||
|
||||
#ifdef ZGEMM3M_DEFAULT_Q
|
||||
TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
|
||||
#else
|
||||
TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q;
|
||||
#endif
|
||||
|
||||
#ifdef CGEMM3M_DEFAULT_R
|
||||
TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R;
|
||||
#else
|
||||
TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r;
|
||||
#endif
|
||||
|
||||
#ifdef ZGEMM3M_DEFAULT_R
|
||||
TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R;
|
||||
#else
|
||||
TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r;
|
||||
#endif
|
||||
|
||||
#ifdef EXPRECISION
|
||||
TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
|
||||
TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q;
|
||||
TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
#else // defined(ARCH_ARM64)
|
||||
#ifdef ARCH_X86
|
||||
static int get_l2_size_old(void){
|
||||
int i, eax, ebx, ecx, edx, cpuid_level;
|
||||
|
@ -1146,3 +1303,4 @@ static void init_parameter(void) {
|
|||
|
||||
|
||||
}
|
||||
#endif //defined(ARCH_ARM64)
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
include $(KERNELDIR)/KERNEL.HASWELL
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
|
||||
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
|
||||
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
|
||||
|
||||
|
@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
|
|||
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
|
||||
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
|
||||
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
SGEMM_BETA = sgemm_beta_skylakex.c
|
||||
DGEMM_BETA = dgemm_beta_skylakex.c
|
||||
|
|
|
@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (m == 0 || n == 0)
|
||||
return 0;
|
||||
|
||||
c_offset = c;
|
||||
|
||||
|
@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
|
||||
i = m;
|
||||
|
||||
while (i > 32) {
|
||||
while (i >= 32) {
|
||||
_mm512_storeu_pd(c_offset1, z_zero);
|
||||
_mm512_storeu_pd(c_offset1 + 8, z_zero);
|
||||
_mm512_storeu_pd(c_offset1 + 16, z_zero);
|
||||
|
@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
c_offset1 += 32;
|
||||
i -= 32;
|
||||
}
|
||||
while (i > 8) {
|
||||
while (i >= 8) {
|
||||
_mm512_storeu_pd(c_offset1, z_zero);
|
||||
c_offset1 += 8;
|
||||
i -= 8;
|
||||
|
|
|
@ -55,13 +55,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (n == 0 || m == 0)
|
||||
return;
|
||||
|
||||
c_offset = c;
|
||||
|
||||
if (beta == ZERO){
|
||||
__m512 z_zero;
|
||||
__m256 y_zero;
|
||||
|
||||
z_zero = _mm512_setzero_ps();
|
||||
y_zero = _mm256_setzero_ps();
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
|
@ -69,16 +73,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
|
||||
i = m;
|
||||
|
||||
while (i > 32) {
|
||||
while (i >= 32) {
|
||||
_mm512_storeu_ps(c_offset1, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 8, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 16, z_zero);
|
||||
_mm512_storeu_ps(c_offset1 + 24 , z_zero);
|
||||
c_offset1 += 32;
|
||||
i -= 32;
|
||||
}
|
||||
while (i > 8) {
|
||||
_mm512_storeu_ps(c_offset1, z_zero);
|
||||
while (i >= 8) {
|
||||
_mm256_storeu_ps(c_offset1, y_zero);
|
||||
c_offset1 += 8;
|
||||
i -= 8;
|
||||
}
|
||||
|
|
|
@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
|
||||
#define INIT32x8() \
|
||||
row0 = _mm512_setzero_ps(); \
|
||||
row1 = _mm512_setzero_ps(); \
|
||||
row2 = _mm512_setzero_ps(); \
|
||||
row3 = _mm512_setzero_ps(); \
|
||||
row4 = _mm512_setzero_ps(); \
|
||||
row5 = _mm512_setzero_ps(); \
|
||||
row6 = _mm512_setzero_ps(); \
|
||||
row0b = _mm512_setzero_ps(); \
|
||||
row1b = _mm512_setzero_ps(); \
|
||||
row2b = _mm512_setzero_ps(); \
|
||||
row3b = _mm512_setzero_ps(); \
|
||||
row4b = _mm512_setzero_ps(); \
|
||||
row5b = _mm512_setzero_ps(); \
|
||||
row6b = _mm512_setzero_ps(); \
|
||||
row7b = _mm512_setzero_ps(); \
|
||||
|
||||
#define KERNEL32x8_SUB() \
|
||||
zmm0 = _mm512_loadu_ps(AO); \
|
||||
zmm0b = _mm512_loadu_ps(AOb); \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += zmm0 * zmm2; \
|
||||
row1 += zmm0 * zmm3; \
|
||||
row0b += zmm0b * zmm2; \
|
||||
row1b += zmm0b * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += zmm0 * zmm2; \
|
||||
row3 += zmm0 * zmm3; \
|
||||
row2b += zmm0b * zmm2; \
|
||||
row3b += zmm0b * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += zmm0 * zmm2; \
|
||||
row5 += zmm0 * zmm3; \
|
||||
row4b += zmm0b * zmm2; \
|
||||
row5b += zmm0b * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += zmm0 * zmm2; \
|
||||
row7 += zmm0 * zmm3; \
|
||||
row6b += zmm0b * zmm2; \
|
||||
row7b += zmm0b * zmm3; \
|
||||
BO += 8; \
|
||||
AO += 16; \
|
||||
AOb += 16;
|
||||
|
||||
|
||||
#define SAVE32x8(ALPHA) \
|
||||
zmm0 = _mm512_set1_ps(ALPHA); \
|
||||
row0 *= zmm0; \
|
||||
row1 *= zmm0; \
|
||||
row2 *= zmm0; \
|
||||
row3 *= zmm0; \
|
||||
row4 *= zmm0; \
|
||||
row5 *= zmm0; \
|
||||
row6 *= zmm0; \
|
||||
row7 *= zmm0; \
|
||||
row0b *= zmm0; \
|
||||
row1b *= zmm0; \
|
||||
row2b *= zmm0; \
|
||||
row3b *= zmm0; \
|
||||
row4b *= zmm0; \
|
||||
row5b *= zmm0; \
|
||||
row6b *= zmm0; \
|
||||
row7b *= zmm0; \
|
||||
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm512_storeu_ps(CO1 + 7 * ldc, row7); \
|
||||
row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \
|
||||
row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \
|
||||
row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \
|
||||
row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \
|
||||
row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \
|
||||
row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \
|
||||
row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \
|
||||
row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \
|
||||
_mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \
|
||||
_mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \
|
||||
_mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \
|
||||
_mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \
|
||||
_mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \
|
||||
_mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \
|
||||
_mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \
|
||||
_mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \
|
||||
|
||||
|
||||
#define INIT16x8() \
|
||||
row0 = _mm512_setzero_ps(); \
|
||||
row1 = _mm512_setzero_ps(); \
|
||||
row2 = _mm512_setzero_ps(); \
|
||||
row3 = _mm512_setzero_ps(); \
|
||||
row4 = _mm512_setzero_ps(); \
|
||||
row5 = _mm512_setzero_ps(); \
|
||||
row6 = _mm512_setzero_ps(); \
|
||||
row7 = _mm512_setzero_ps(); \
|
||||
|
||||
#define KERNEL16x8_SUB() \
|
||||
zmm0 = _mm512_loadu_ps(AO); \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += zmm0 * zmm2; \
|
||||
row1 += zmm0 * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += zmm0 * zmm2; \
|
||||
row3 += zmm0 * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += zmm0 * zmm2; \
|
||||
row5 += zmm0 * zmm3; \
|
||||
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += zmm0 * zmm2; \
|
||||
row7 += zmm0 * zmm3; \
|
||||
BO += 8; \
|
||||
AO += 16;
|
||||
|
||||
|
||||
#define SAVE16x8(ALPHA) \
|
||||
zmm0 = _mm512_set1_ps(ALPHA); \
|
||||
row0 *= zmm0; \
|
||||
row1 *= zmm0; \
|
||||
row2 *= zmm0; \
|
||||
row3 *= zmm0; \
|
||||
row4 *= zmm0; \
|
||||
row5 *= zmm0; \
|
||||
row6 *= zmm0; \
|
||||
row7 *= zmm0; \
|
||||
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm512_storeu_ps(CO1 + 7 * ldc, row7);
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT8x8() \
|
||||
row0 = _mm256_setzero_ps(); \
|
||||
row1 = _mm256_setzero_ps(); \
|
||||
row2 = _mm256_setzero_ps(); \
|
||||
row3 = _mm256_setzero_ps(); \
|
||||
row4 = _mm256_setzero_ps(); \
|
||||
row5 = _mm256_setzero_ps(); \
|
||||
row6 = _mm256_setzero_ps(); \
|
||||
row7 = _mm256_setzero_ps(); \
|
||||
|
||||
#define KERNEL8x8_SUB() \
|
||||
ymm0 = _mm256_loadu_ps(AO); \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += ymm0 * ymm2; \
|
||||
row1 += ymm0 * ymm3; \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += ymm0 * ymm2; \
|
||||
row3 += ymm0 * ymm3; \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += ymm0 * ymm2; \
|
||||
row5 += ymm0 * ymm3; \
|
||||
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += ymm0 * ymm2; \
|
||||
row7 += ymm0 * ymm3; \
|
||||
BO += 8; \
|
||||
AO += 8;
|
||||
|
||||
|
||||
#define SAVE8x8(ALPHA) \
|
||||
ymm0 = _mm256_set1_ps(ALPHA); \
|
||||
row0 *= ymm0; \
|
||||
row1 *= ymm0; \
|
||||
row2 *= ymm0; \
|
||||
row3 *= ymm0; \
|
||||
row4 *= ymm0; \
|
||||
row5 *= ymm0; \
|
||||
row6 *= ymm0; \
|
||||
row7 *= ymm0; \
|
||||
row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm256_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm256_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm256_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm256_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm256_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm256_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm256_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm256_storeu_ps(CO1 + 7 * ldc, row7); \
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT4x8() \
|
||||
row0 = _mm_setzero_ps(); \
|
||||
row1 = _mm_setzero_ps(); \
|
||||
row2 = _mm_setzero_ps(); \
|
||||
row3 = _mm_setzero_ps(); \
|
||||
row4 = _mm_setzero_ps(); \
|
||||
row5 = _mm_setzero_ps(); \
|
||||
row6 = _mm_setzero_ps(); \
|
||||
row7 = _mm_setzero_ps(); \
|
||||
|
||||
|
||||
#define KERNEL4x8_SUB() \
|
||||
xmm0 = _mm_loadu_ps(AO); \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
|
||||
row0 += xmm0 * xmm2; \
|
||||
row1 += xmm0 * xmm3; \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \
|
||||
row2 += xmm0 * xmm2; \
|
||||
row3 += xmm0 * xmm3; \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \
|
||||
row4 += xmm0 * xmm2; \
|
||||
row5 += xmm0 * xmm3; \
|
||||
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \
|
||||
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \
|
||||
row6 += xmm0 * xmm2; \
|
||||
row7 += xmm0 * xmm3; \
|
||||
BO += 8; \
|
||||
AO += 4;
|
||||
|
||||
|
||||
#define SAVE4x8(ALPHA) \
|
||||
xmm0 = _mm_set1_ps(ALPHA); \
|
||||
row0 *= xmm0; \
|
||||
row1 *= xmm0; \
|
||||
row2 *= xmm0; \
|
||||
row3 *= xmm0; \
|
||||
row4 *= xmm0; \
|
||||
row5 *= xmm0; \
|
||||
row6 *= xmm0; \
|
||||
row7 *= xmm0; \
|
||||
row0 += _mm_loadu_ps(CO1 + 0 * ldc); \
|
||||
row1 += _mm_loadu_ps(CO1 + 1 * ldc); \
|
||||
row2 += _mm_loadu_ps(CO1 + 2 * ldc); \
|
||||
row3 += _mm_loadu_ps(CO1 + 3 * ldc); \
|
||||
row4 += _mm_loadu_ps(CO1 + 4 * ldc); \
|
||||
row5 += _mm_loadu_ps(CO1 + 5 * ldc); \
|
||||
row6 += _mm_loadu_ps(CO1 + 6 * ldc); \
|
||||
row7 += _mm_loadu_ps(CO1 + 7 * ldc); \
|
||||
_mm_storeu_ps(CO1 + 0 * ldc, row0); \
|
||||
_mm_storeu_ps(CO1 + 1 * ldc, row1); \
|
||||
_mm_storeu_ps(CO1 + 2 * ldc, row2); \
|
||||
_mm_storeu_ps(CO1 + 3 * ldc, row3); \
|
||||
_mm_storeu_ps(CO1 + 4 * ldc, row4); \
|
||||
_mm_storeu_ps(CO1 + 5 * ldc, row5); \
|
||||
_mm_storeu_ps(CO1 + 6 * ldc, row6); \
|
||||
_mm_storeu_ps(CO1 + 7 * ldc, row7); \
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
#define INIT2x8() \
|
||||
row0a = row0b = 0; \
|
||||
row1a = row1b = 0; \
|
||||
row2a = row2b = 0; \
|
||||
row3a = row3b = 0; \
|
||||
row4a = row4b = 0; \
|
||||
row5a = row5b = 0; \
|
||||
row6a = row6b = 0; \
|
||||
row7a = row7b = 0; \
|
||||
|
||||
#define KERNEL2x8_SUB() \
|
||||
xmm0 = *(AO); \
|
||||
xmm1 = *(AO + 1); \
|
||||
xmm2 = *(BO + 0); \
|
||||
xmm3 = *(BO + 1); \
|
||||
row0a += xmm0 * xmm2; \
|
||||
row0b += xmm1 * xmm2; \
|
||||
row1a += xmm0 * xmm3; \
|
||||
row1b += xmm1 * xmm3; \
|
||||
xmm2 = *(BO + 2); \
|
||||
xmm3 = *(BO + 3); \
|
||||
row2a += xmm0 * xmm2; \
|
||||
row2b += xmm1 * xmm2; \
|
||||
row3a += xmm0 * xmm3; \
|
||||
row3b += xmm1 * xmm3; \
|
||||
xmm2 = *(BO + 4); \
|
||||
xmm3 = *(BO + 5); \
|
||||
row4a += xmm0 * xmm2; \
|
||||
row4b += xmm1 * xmm2; \
|
||||
row5a += xmm0 * xmm3; \
|
||||
row5b += xmm1 * xmm3; \
|
||||
xmm2 = *(BO + 6); \
|
||||
xmm3 = *(BO + 7); \
|
||||
row6a += xmm0 * xmm2; \
|
||||
row6b += xmm1 * xmm2; \
|
||||
row7a += xmm0 * xmm3; \
|
||||
row7b += xmm1 * xmm3; \
|
||||
BO += 8; \
|
||||
AO += 2;
|
||||
|
||||
|
||||
#define SAVE2x8(ALPHA) \
|
||||
xmm0 = ALPHA; \
|
||||
row0a *= xmm0; \
|
||||
row0b *= xmm0; \
|
||||
row1a *= xmm0; \
|
||||
row1b *= xmm0; \
|
||||
row2a *= xmm0; \
|
||||
row2b *= xmm0; \
|
||||
row3a *= xmm0; \
|
||||
row3b *= xmm0; \
|
||||
row4a *= xmm0; \
|
||||
row4b *= xmm0; \
|
||||
row5a *= xmm0; \
|
||||
row5b *= xmm0; \
|
||||
row6a *= xmm0; \
|
||||
row6b *= xmm0; \
|
||||
row7a *= xmm0; \
|
||||
row7b *= xmm0; \
|
||||
*(CO1 + 0 * ldc + 0) += row0a; \
|
||||
*(CO1 + 0 * ldc + 1) += row0b; \
|
||||
*(CO1 + 1 * ldc + 0) += row1a; \
|
||||
*(CO1 + 1 * ldc + 1) += row1b; \
|
||||
*(CO1 + 2 * ldc + 0) += row2a; \
|
||||
*(CO1 + 2 * ldc + 1) += row2b; \
|
||||
*(CO1 + 3 * ldc + 0) += row3a; \
|
||||
*(CO1 + 3 * ldc + 1) += row3b; \
|
||||
*(CO1 + 4 * ldc + 0) += row4a; \
|
||||
*(CO1 + 4 * ldc + 1) += row4b; \
|
||||
*(CO1 + 5 * ldc + 0) += row5a; \
|
||||
*(CO1 + 5 * ldc + 1) += row5b; \
|
||||
*(CO1 + 6 * ldc + 0) += row6a; \
|
||||
*(CO1 + 6 * ldc + 1) += row6b; \
|
||||
*(CO1 + 7 * ldc + 0) += row7a; \
|
||||
*(CO1 + 7 * ldc + 1) += row7b; \
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************************/
|
||||
|
||||
/*
 * INIT1x8(): zero the eight accumulators row0..row7, which must already be
 * declared in the caller's scope.  Braced so the expansion is one statement
 * and can be invoked with or without a trailing semicolon.
 */
#define INIT1x8() \
    { row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0; }
|
||||
|
||||
/*
 * KERNEL1x8_SUB(): one multiply-accumulate step.  Reads one value from AO
 * and eight consecutive values from BO, adds AO[0] * BO[j] into rowj for
 * j = 0..7, then advances BO by 8 and AO by 1.
 *
 * Expects AO, BO, row0..row7 and the scratch variables xmm0, xmm2, xmm3 in
 * the caller's scope.  The BO values are loaded two at a time, in the same
 * order as before.  Braced so the expansion is a single statement.
 */
#define KERNEL1x8_SUB()             \
    {                               \
        xmm0 = *(AO);               \
        xmm2 = *(BO + 0);           \
        xmm3 = *(BO + 1);           \
        row0 += xmm0 * xmm2;        \
        row1 += xmm0 * xmm3;        \
        xmm2 = *(BO + 2);           \
        xmm3 = *(BO + 3);           \
        row2 += xmm0 * xmm2;        \
        row3 += xmm0 * xmm3;        \
        xmm2 = *(BO + 4);           \
        xmm3 = *(BO + 5);           \
        row4 += xmm0 * xmm2;        \
        row5 += xmm0 * xmm3;        \
        xmm2 = *(BO + 6);           \
        xmm3 = *(BO + 7);           \
        row6 += xmm0 * xmm2;        \
        row7 += xmm0 * xmm3;        \
        BO += 8;                    \
        AO += 1;                    \
    }
|
||||
|
||||
|
||||
/*
 * SAVE1x8(ALPHA): scale the eight accumulators row0..row7 by ALPHA and add
 * rowJ into CO1[J * ldc] for J = 0..7.
 *
 * Expects xmm0 (scratch), row0..row7, CO1 and ldc in the caller's scope.
 * The accumulators are left scaled after the macro runs.
 *
 * Braced so the expansion is a single statement.  The final line carries no
 * line-continuation, so the text after the macro is never spliced into it.
 */
#define SAVE1x8(ALPHA)              \
    {                               \
        xmm0 = (ALPHA);             \
        row0 *= xmm0;               \
        row1 *= xmm0;               \
        row2 *= xmm0;               \
        row3 *= xmm0;               \
        row4 *= xmm0;               \
        row5 *= xmm0;               \
        row6 *= xmm0;               \
        row7 *= xmm0;               \
        *(CO1 + 0 * ldc) += row0;   \
        *(CO1 + 1 * ldc) += row1;   \
        *(CO1 + 2 * ldc) += row2;   \
        *(CO1 + 3 * ldc) += row3;   \
        *(CO1 + 4 * ldc) += row4;   \
        *(CO1 + 5 * ldc) += row5;   \
        *(CO1 + 6 * ldc) += row6;   \
        *(CO1 + 7 * ldc) += row7;   \
    }
|
||||
|
||||
|
||||
|
||||
|
@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
|
|||
return 0;
|
||||
|
||||
|
||||
|
||||
// L8_0
|
||||
while (N >= 8 && 0) {
|
||||
float *CO1;
|
||||
float *AO;
|
||||
int i;
|
||||
// L8_10
|
||||
CO1 = C;
|
||||
C += 8 * ldc;
|
||||
|
||||
AO = A;
|
||||
|
||||
i = m;
|
||||
|
||||
while (i >= 32 && 0) {
|
||||
float *BO, *AOb;
|
||||
// L8_11
|
||||
__m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
AOb = AO + 16 * K;
|
||||
|
||||
INIT32x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
// L12_17
|
||||
KERNEL32x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
// L8_19
|
||||
SAVE32x8(alpha)
|
||||
CO1 += 32;
|
||||
AO += 16 * K;
|
||||
|
||||
i -= 32;
|
||||
}
|
||||
while (i >= 16) {
|
||||
float *BO;
|
||||
// L8_11
|
||||
__m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
|
||||
INIT16x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
KERNEL16x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
SAVE16x8(alpha)
|
||||
CO1 += 16;
|
||||
|
||||
i -= 16;
|
||||
}
|
||||
while (i >= 8) {
|
||||
float *BO;
|
||||
// L8_11
|
||||
__m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
|
||||
INIT8x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
// L12_17
|
||||
KERNEL8x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
// L8_19
|
||||
SAVE8x8(alpha)
|
||||
CO1 += 8;
|
||||
|
||||
i -= 8;
|
||||
}
|
||||
while (i >= 4) {
|
||||
// L8_11
|
||||
float *BO;
|
||||
__m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
BO = B;
|
||||
int kloop = K;
|
||||
|
||||
INIT4x8()
|
||||
// L8_16
|
||||
while (kloop > 0) {
|
||||
// L12_17
|
||||
KERNEL4x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
// L8_19
|
||||
SAVE4x8(alpha)
|
||||
CO1 += 4;
|
||||
|
||||
i -= 4;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Rest of M
|
||||
***************************************************************************/
|
||||
|
||||
while (i >= 2) {
|
||||
float *BO;
|
||||
float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
|
||||
BO = B;
|
||||
|
||||
INIT2x8()
|
||||
int kloop = K;
|
||||
|
||||
while (kloop > 0) {
|
||||
KERNEL2x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
SAVE2x8(alpha)
|
||||
CO1 += 2;
|
||||
i -= 2;
|
||||
}
|
||||
// L13_40
|
||||
while (i >= 1) {
|
||||
float *BO;
|
||||
float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
int kloop = K;
|
||||
BO = B;
|
||||
INIT1x8()
|
||||
|
||||
while (kloop > 0) {
|
||||
KERNEL1x8_SUB()
|
||||
kloop--;
|
||||
}
|
||||
SAVE1x8(alpha)
|
||||
CO1 += 1;
|
||||
i -= 1;
|
||||
}
|
||||
|
||||
B += K * 8;
|
||||
N -= 8;
|
||||
}
|
||||
|
||||
while (N >= 4) {
|
||||
float *CO1;
|
||||
float *AO;
|
||||
|
|
|
@ -34,6 +34,13 @@
|
|||
#ifndef _LAPACKE_CONFIG_H_
|
||||
#define _LAPACKE_CONFIG_H_
|
||||
|
||||
// For Android prior to API 21 (no <complex> include)
|
||||
#if defined(__ANDROID__)
|
||||
#if __ANDROID_API__ < 21
|
||||
#define LAPACK_COMPLEX_STRUCTURE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#if defined(LAPACK_COMPLEX_CPP)
|
||||
#include <complex>
|
||||
|
|
72
param.h
72
param.h
|
@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SYMV_P 8
|
||||
|
||||
#define SWITCH_RATIO 32
|
||||
#define GEMM_PREFERED_SIZE 32
|
||||
|
||||
#ifdef ARCH_X86
|
||||
|
||||
|
@ -2583,6 +2584,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#if defined(ARMV8)
|
||||
|
||||
#if defined(OS_DARWIN) && defined(CROSS)
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
|
@ -2590,13 +2593,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#if defined(OS_DARWIN) && defined(CROSS)
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
/* SGEMM unroll in N for the OS_DARWIN && CROSS branch; pairs with SGEMM_DEFAULT_UNROLL_M above. */
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
#else
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#endif
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -2622,10 +2620,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#define SYMV_P 16
|
||||
#else
|
||||
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 160
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 352
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 224
|
||||
#define ZGEMM_DEFAULT_Q 112
|
||||
|
||||
#define SGEMM_DEFAULT_R 4096
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(THUNDERX)
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
@ -2685,20 +2721,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P sgemm_p
|
||||
#define DGEMM_DEFAULT_P dgemm_p
|
||||
#define CGEMM_DEFAULT_P cgemm_p
|
||||
#define ZGEMM_DEFAULT_P zgemm_p
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 160
|
||||
#define CGEMM_DEFAULT_P 128
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q sgemm_q
|
||||
#define DGEMM_DEFAULT_Q dgemm_q
|
||||
#define CGEMM_DEFAULT_Q cgemm_q
|
||||
#define ZGEMM_DEFAULT_Q zgemm_q
|
||||
#define SGEMM_DEFAULT_Q 352
|
||||
#define DGEMM_DEFAULT_Q 128
|
||||
#define CGEMM_DEFAULT_Q 224
|
||||
#define ZGEMM_DEFAULT_Q 112
|
||||
|
||||
#define SGEMM_DEFAULT_R sgemm_r
|
||||
#define DGEMM_DEFAULT_R dgemm_r
|
||||
#define CGEMM_DEFAULT_R cgemm_r
|
||||
#define ZGEMM_DEFAULT_R zgemm_r
|
||||
#define SGEMM_DEFAULT_R 4096
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue