Merge branch 'develop' into fc-1847

This commit is contained in:
Martin Kroeker 2018-11-07 08:47:52 +01:00 committed by GitHub
commit 5f8f0583d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
28 changed files with 1066 additions and 735 deletions

View File

@ -48,6 +48,7 @@ ifndef NO_CBLAS
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif
endif
else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
#Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"

View File

@ -510,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT)
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE
override DYNAMIC_ARCH=

View File

@ -183,7 +183,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
#ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_

View File

@ -237,7 +237,6 @@ void get_cpuconfig(void)
break;
case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");

View File

@ -2009,6 +2009,8 @@ int get_coretype(void){
switch (model) {
case 1:
// AMD Ryzen
case 8:
// Ryzen 2
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;

View File

@ -48,6 +48,10 @@
#define SWITCH_RATIO 2
#endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
return 0;
}
static int round_up(int remainder, int width, int multiple)
{
if (multiple > remainder || width <= multiple)
return width;
width = (width + multiple - 1) / multiple;
width = width * multiple;
return width;
}
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) {
@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0;
while (m > 0){
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
width = round_up(m, width, GEMM_PREFERED_SIZE);
m -= width;
if (m < 0) width = width + m;
range_M[num_parts + 1] = range_M[num_parts] + width;
num_parts ++;
}
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);
n -= width;
if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width;
num_parts ++;
}
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {

View File

@ -15,7 +15,11 @@ endif
# COMMONOBJS += info.$(SUFFIX)
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
COMMONOBJS += dynamic_arm64.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c
endif
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
long i;
#ifdef SMP_SERVER
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY

View File

@ -480,6 +480,11 @@ void goto_set_num_threads(int num_threads)
{
long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

View File

@ -0,0 +1,198 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#include <asm/hwcap.h>
#include <sys/auxv.h>
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 4
/*
* In case asm/hwcap.h is outdated on the build system, make sure
* that HWCAP_CPUID is defined
*/
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
#define get_cpu_ftr(id, var) ({ \
asm("mrs %0, "#id : "=r" (var)); \
})
static char *corename[] = {
"armv8",
"cortexa57",
"thunderx",
"thunderx2t99",
"unknown"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
return corename[NUM_CORETYPES];
}
static gotoblas_t *force_coretype(char *coretype) {
int i ;
int found = -1;
char message[128];
for ( i=0 ; i < NUM_CORETYPES; i++)
{
if (!strncasecmp(coretype, corename[i], 20))
{
found = i;
break;
}
}
switch (found)
{
case 0: return (&gotoblas_ARMV8);
case 1: return (&gotoblas_CORTEXA57);
case 2: return (&gotoblas_THUNDERX);
case 3: return (&gotoblas_THUNDERX2T99);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
return NULL;
}
static gotoblas_t *get_coretype(void) {
int implementer, variant, part, arch, revision, midr_el1;
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
char coremsg[128];
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
openblas_warning(1, coremsg);
return NULL;
}
get_cpu_ftr(MIDR_EL1, midr_el1);
/*
* MIDR_EL1
*
* 31 24 23 20 19 16 15 4 3 0
* -----------------------------------------------------------------
* | Implementer | Variant | Architecture | Part Number | Revision |
* -----------------------------------------------------------------
*/
implementer = (midr_el1 >> 24) & 0xFF;
part = (midr_el1 >> 4) & 0xFFF;
switch(implementer)
{
case 0x41: // ARM
switch (part)
{
case 0xd07: // Cortex A57
case 0xd08: // Cortex A72
case 0xd03: // Cortex A53
return &gotoblas_CORTEXA57;
}
break;
case 0x42: // Broadcom
switch (part)
{
case 0x516: // Vulcan
return &gotoblas_THUNDERX2T99;
}
break;
case 0x43: // Cavium
switch (part)
{
case 0x0a1: // ThunderX
return &gotoblas_THUNDERX;
case 0x0af: // ThunderX2
return &gotoblas_THUNDERX2T99;
}
break;
}
return NULL;
}
void gotoblas_dynamic_init(void) {
char coremsg[128];
char coren[22];
char *p;
if (gotoblas) return;
p = getenv("OPENBLAS_CORETYPE");
if ( p )
{
gotoblas = force_coretype(p);
}
else
{
gotoblas = get_coretype();
}
if (gotoblas == NULL)
{
snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n");
openblas_warning(1, coremsg);
gotoblas = &gotoblas_ARMV8;
}
if (gotoblas && gotoblas->init) {
strncpy(coren, gotoblas_corename(), 20);
sprintf(coremsg, "Core: %s\n", coren);
openblas_warning(2, coremsg);
gotoblas -> init();
} else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
}
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@ -259,6 +259,16 @@ int get_num_procs(void) {
}
#endif
#ifdef OS_AIX
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {
@ -1739,6 +1749,22 @@ int get_num_procs(void) {
}
#endif
#ifdef OS_HAIKU
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_AIX
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {

View File

@ -730,35 +730,8 @@ void blas_set_parameter(void){
#if defined(ARCH_ARM64)
#if defined(VULCAN) || defined(THUNDERX2T99)
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
dgemm_p = 160;
dgemm_q = 128;
dgemm_r = 4096;
sgemm_p = 128;
sgemm_q = 352;
sgemm_r = 4096;
cgemm_p = 128;
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;
#endif
}
#endif

View File

@ -42,7 +42,7 @@
#include "functable.h"
#endif
#if defined(THUNDERX2T99) || defined(VULCAN)
#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
// Multithreaded swap gives performance benefits in ThunderX2T99
#else
// Disable multi-threading as it does not show any performance

View File

@ -43,6 +43,10 @@
#include "functable.h"
#endif
// this is smallest dimension N of square input a to permit threading
// see graph in issue #1820 for explanation
#define MULTI_THREAD_MINIMAL 362
#ifdef XDOUBLE
#define ERROR_NAME "XHEMV "
#elif defined(DOUBLE)
@ -195,7 +199,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, void *VALPHA
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
if (n<MULTI_THREAD_MINIMAL) {
nthreads = 1 ;
} else {
nthreads = num_cpu_avail(2);
};
if (nthreads == 1) {
#endif

View File

@ -88,7 +88,11 @@ lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL)
$(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F)
setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h
ifeq ($(USE_GEMM3M), 1)
$(CC) -c $(CFLAGS) -DUSE_GEMM3M $< -o $@
else
$(CC) -c $(CFLAGS) $< -o $@
endif
setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)

View File

@ -1,8 +1,3 @@
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
@ -30,92 +20,6 @@ IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
ifneq ($(OS_DARWIN)$(CROSS),11)
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
endif
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ifneq ($(OS_DARWIN)$(CROSS),11)
SGEMMKERNEL = sgemm_kernel_4x4.S
else
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
endif
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@ -136,6 +40,186 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
ifneq ($(OS_DARWIN)$(CROSS),11)
SNRM2KERNEL = scnrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
endif
DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
ifneq ($(OS_DARWIN)$(CROSS),11)
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
endif
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
endif
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
endif
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
endif
else
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif

View File

@ -1,4 +1,49 @@
include $(KERNELDIR)/KERNEL.ARMV8
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
@ -66,13 +111,13 @@ STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
@ -87,8 +132,8 @@ DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
@ -99,32 +144,32 @@ DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -1,6 +1,133 @@
include $(KERNELDIR)/KERNEL.ARMV8
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx.c
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SDOTKERNEL = dot_thunderx.c
DDOTKERNEL = ddot_thunderx.c
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = sgemm_kernel_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SDOTKERNEL=dot_thunderx.c
DDOTKERNEL=ddot_thunderx.c
DAXPYKERNEL=daxpy_thunderx.c

View File

@ -1,4 +1,137 @@
include $(KERNELDIR)/KERNEL.CORTEXA57
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
@ -27,12 +160,12 @@ CNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DAXPYKERNEL = daxpy_thunderx2t99.S
DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S

View File

@ -1 +0,0 @@
include $(KERNELDIR)/KERNEL.ARMV8

View File

@ -943,13 +943,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
ldr A_PRE_SIZE, =dgemm_prefetch_size_a
ldr A_PRE_SIZE, [A_PRE_SIZE]
ldr B_PRE_SIZE, =dgemm_prefetch_size_b
ldr B_PRE_SIZE, [B_PRE_SIZE]
ldr C_PRE_SIZE, =dgemm_prefetch_size_c
ldr C_PRE_SIZE, [C_PRE_SIZE]
mov A_PRE_SIZE, #3584
mov B_PRE_SIZE, #512
mov C_PRE_SIZE, #128
add A_PRE_SIZE_64, A_PRE_SIZE, #64
add B_PRE_SIZE_64, B_PRE_SIZE, #64

View File

@ -294,6 +294,8 @@ gotoblas_t TABLE_NAME = {
chemm_outcopyTS, chemm_oltcopyTS,
0, 0, 0,
#if defined(USE_GEMM3M)
#ifdef CGEMM3M_DEFAULT_UNROLL_M
CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
#else
@ -324,6 +326,33 @@ gotoblas_t TABLE_NAME = {
chemm3m_oucopybTS, chemm3m_olcopybTS,
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
chemm3m_oucopyiTS, chemm3m_olcopyiTS,
#else
0, 0, 0,
NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
#endif
#ifndef NO_LAPACK
cneg_tcopyTS, claswp_ncopyTS,
@ -400,6 +429,7 @@ gotoblas_t TABLE_NAME = {
zhemm_outcopyTS, zhemm_oltcopyTS,
0, 0, 0,
#if defined(USE_GEMM3M)
#ifdef ZGEMM3M_DEFAULT_UNROLL_M
ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
#else
@ -430,6 +460,33 @@ gotoblas_t TABLE_NAME = {
zhemm3m_oucopybTS, zhemm3m_olcopybTS,
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
#else
0, 0, 0,
NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
#endif
#ifndef NO_LAPACK
zneg_tcopyTS, zlaswp_ncopyTS,
@ -503,6 +560,7 @@ gotoblas_t TABLE_NAME = {
xhemm_outcopyTS, xhemm_oltcopyTS,
0, 0, 0,
#if defined(USE_GEMM3M)
QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
xgemm3m_kernelTS,
@ -528,6 +586,33 @@ gotoblas_t TABLE_NAME = {
xhemm3m_oucopybTS, xhemm3m_olcopybTS,
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
#else
0, 0, 0,
NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
NULL, NULL,
#endif
#ifndef NO_LAPACK
xneg_tcopyTS, xlaswp_ncopyTS,
@ -561,6 +646,78 @@ gotoblas_t TABLE_NAME = {
};
#if defined(ARCH_ARM64)
static void init_parameter(void) {
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R;
TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R;
#endif
#if defined(USE_GEMM3M)
#ifdef CGEMM3M_DEFAULT_P
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
#else
TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
#endif
#ifdef ZGEMM3M_DEFAULT_P
TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
#else
TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
#endif
#ifdef CGEMM3M_DEFAULT_Q
TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
#else
TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q;
#endif
#ifdef ZGEMM3M_DEFAULT_Q
TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
#else
TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q;
#endif
#ifdef CGEMM3M_DEFAULT_R
TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R;
#else
TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r;
#endif
#ifdef ZGEMM3M_DEFAULT_R
TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R;
#else
TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r;
#endif
#ifdef EXPRECISION
TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q;
TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r;
#endif
#endif
}
#else // defined(ARCH_ARM64)
#ifdef ARCH_X86
static int get_l2_size_old(void){
int i, eax, ebx, ecx, edx, cpuid_level;
@ -1146,3 +1303,4 @@ static void init_parameter(void) {
}
#endif //defined(ARCH_ARM64)

View File

@ -1,6 +1,11 @@
include $(KERNELDIR)/KERNEL.HASWELL
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
@ -9,5 +14,5 @@ DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c
SGEMM_BETA = ../generic/gemm_beta.c
SGEMM_BETA = sgemm_beta_skylakex.c
DGEMM_BETA = dgemm_beta_skylakex.c

View File

@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
return 0;
}
if (m == 0 || n == 0)
return 0;
c_offset = c;
@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
i = m;
while (i > 32) {
while (i >= 32) {
_mm512_storeu_pd(c_offset1, z_zero);
_mm512_storeu_pd(c_offset1 + 8, z_zero);
_mm512_storeu_pd(c_offset1 + 16, z_zero);
@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset1 += 32;
i -= 32;
}
while (i > 8) {
while (i >= 8) {
_mm512_storeu_pd(c_offset1, z_zero);
c_offset1 += 8;
i -= 8;

View File

@ -55,13 +55,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
return 0;
}
if (n == 0 || m == 0)
return;
c_offset = c;
if (beta == ZERO){
__m512 z_zero;
__m256 y_zero;
z_zero = _mm512_setzero_ps();
y_zero = _mm256_setzero_ps();
j = n;
do {
c_offset1 = c_offset;
@ -69,16 +73,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
i = m;
while (i > 32) {
while (i >= 32) {
_mm512_storeu_ps(c_offset1, z_zero);
_mm512_storeu_ps(c_offset1 + 8, z_zero);
_mm512_storeu_ps(c_offset1 + 16, z_zero);
_mm512_storeu_ps(c_offset1 + 24 , z_zero);
c_offset1 += 32;
i -= 32;
}
while (i > 8) {
_mm512_storeu_ps(c_offset1, z_zero);
while (i >= 8) {
_mm256_storeu_ps(c_offset1, y_zero);
c_offset1 += 8;
i -= 8;
}

View File

@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INIT32x8() \
row0 = _mm512_setzero_ps(); \
row1 = _mm512_setzero_ps(); \
row2 = _mm512_setzero_ps(); \
row3 = _mm512_setzero_ps(); \
row4 = _mm512_setzero_ps(); \
row5 = _mm512_setzero_ps(); \
row6 = _mm512_setzero_ps(); \
row0b = _mm512_setzero_ps(); \
row1b = _mm512_setzero_ps(); \
row2b = _mm512_setzero_ps(); \
row3b = _mm512_setzero_ps(); \
row4b = _mm512_setzero_ps(); \
row5b = _mm512_setzero_ps(); \
row6b = _mm512_setzero_ps(); \
row7b = _mm512_setzero_ps(); \
#define KERNEL32x8_SUB() \
zmm0 = _mm512_loadu_ps(AO); \
zmm0b = _mm512_loadu_ps(AOb); \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
row0 += zmm0 * zmm2; \
row1 += zmm0 * zmm3; \
row0b += zmm0b * zmm2; \
row1b += zmm0b * zmm3; \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
row2 += zmm0 * zmm2; \
row3 += zmm0 * zmm3; \
row2b += zmm0b * zmm2; \
row3b += zmm0b * zmm3; \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
row4 += zmm0 * zmm2; \
row5 += zmm0 * zmm3; \
row4b += zmm0b * zmm2; \
row5b += zmm0b * zmm3; \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
row6 += zmm0 * zmm2; \
row7 += zmm0 * zmm3; \
row6b += zmm0b * zmm2; \
row7b += zmm0b * zmm3; \
BO += 8; \
AO += 16; \
AOb += 16;
#define SAVE32x8(ALPHA) \
zmm0 = _mm512_set1_ps(ALPHA); \
row0 *= zmm0; \
row1 *= zmm0; \
row2 *= zmm0; \
row3 *= zmm0; \
row4 *= zmm0; \
row5 *= zmm0; \
row6 *= zmm0; \
row7 *= zmm0; \
row0b *= zmm0; \
row1b *= zmm0; \
row2b *= zmm0; \
row3b *= zmm0; \
row4b *= zmm0; \
row5b *= zmm0; \
row6b *= zmm0; \
row7b *= zmm0; \
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
_mm512_storeu_ps(CO1 + 7 * ldc, row7); \
row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \
row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \
row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \
row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \
row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \
row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \
row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \
row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \
_mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \
_mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \
_mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \
_mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \
_mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \
_mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \
_mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \
_mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \
#define INIT16x8() \
row0 = _mm512_setzero_ps(); \
row1 = _mm512_setzero_ps(); \
row2 = _mm512_setzero_ps(); \
row3 = _mm512_setzero_ps(); \
row4 = _mm512_setzero_ps(); \
row5 = _mm512_setzero_ps(); \
row6 = _mm512_setzero_ps(); \
row7 = _mm512_setzero_ps(); \
#define KERNEL16x8_SUB() \
zmm0 = _mm512_loadu_ps(AO); \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
row0 += zmm0 * zmm2; \
row1 += zmm0 * zmm3; \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
row2 += zmm0 * zmm2; \
row3 += zmm0 * zmm3; \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
row4 += zmm0 * zmm2; \
row5 += zmm0 * zmm3; \
zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
row6 += zmm0 * zmm2; \
row7 += zmm0 * zmm3; \
BO += 8; \
AO += 16;
#define SAVE16x8(ALPHA) \
zmm0 = _mm512_set1_ps(ALPHA); \
row0 *= zmm0; \
row1 *= zmm0; \
row2 *= zmm0; \
row3 *= zmm0; \
row4 *= zmm0; \
row5 *= zmm0; \
row6 *= zmm0; \
row7 *= zmm0; \
row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
_mm512_storeu_ps(CO1 + 7 * ldc, row7);
/*******************************************************************************************/
#define INIT8x8() \
row0 = _mm256_setzero_ps(); \
row1 = _mm256_setzero_ps(); \
row2 = _mm256_setzero_ps(); \
row3 = _mm256_setzero_ps(); \
row4 = _mm256_setzero_ps(); \
row5 = _mm256_setzero_ps(); \
row6 = _mm256_setzero_ps(); \
row7 = _mm256_setzero_ps(); \
#define KERNEL8x8_SUB() \
ymm0 = _mm256_loadu_ps(AO); \
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
row0 += ymm0 * ymm2; \
row1 += ymm0 * ymm3; \
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \
row2 += ymm0 * ymm2; \
row3 += ymm0 * ymm3; \
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \
row4 += ymm0 * ymm2; \
row5 += ymm0 * ymm3; \
ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \
ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \
row6 += ymm0 * ymm2; \
row7 += ymm0 * ymm3; \
BO += 8; \
AO += 8;
#define SAVE8x8(ALPHA) \
ymm0 = _mm256_set1_ps(ALPHA); \
row0 *= ymm0; \
row1 *= ymm0; \
row2 *= ymm0; \
row3 *= ymm0; \
row4 *= ymm0; \
row5 *= ymm0; \
row6 *= ymm0; \
row7 *= ymm0; \
row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \
row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \
row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \
row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \
row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \
row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \
row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \
row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \
_mm256_storeu_ps(CO1 + 0 * ldc, row0); \
_mm256_storeu_ps(CO1 + 1 * ldc, row1); \
_mm256_storeu_ps(CO1 + 2 * ldc, row2); \
_mm256_storeu_ps(CO1 + 3 * ldc, row3); \
_mm256_storeu_ps(CO1 + 4 * ldc, row4); \
_mm256_storeu_ps(CO1 + 5 * ldc, row5); \
_mm256_storeu_ps(CO1 + 6 * ldc, row6); \
_mm256_storeu_ps(CO1 + 7 * ldc, row7); \
/*******************************************************************************************/
#define INIT4x8() \
row0 = _mm_setzero_ps(); \
row1 = _mm_setzero_ps(); \
row2 = _mm_setzero_ps(); \
row3 = _mm_setzero_ps(); \
row4 = _mm_setzero_ps(); \
row5 = _mm_setzero_ps(); \
row6 = _mm_setzero_ps(); \
row7 = _mm_setzero_ps(); \
#define KERNEL4x8_SUB() \
xmm0 = _mm_loadu_ps(AO); \
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
row0 += xmm0 * xmm2; \
row1 += xmm0 * xmm3; \
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \
row2 += xmm0 * xmm2; \
row3 += xmm0 * xmm3; \
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \
row4 += xmm0 * xmm2; \
row5 += xmm0 * xmm3; \
xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \
xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \
row6 += xmm0 * xmm2; \
row7 += xmm0 * xmm3; \
BO += 8; \
AO += 4;
#define SAVE4x8(ALPHA) \
xmm0 = _mm_set1_ps(ALPHA); \
row0 *= xmm0; \
row1 *= xmm0; \
row2 *= xmm0; \
row3 *= xmm0; \
row4 *= xmm0; \
row5 *= xmm0; \
row6 *= xmm0; \
row7 *= xmm0; \
row0 += _mm_loadu_ps(CO1 + 0 * ldc); \
row1 += _mm_loadu_ps(CO1 + 1 * ldc); \
row2 += _mm_loadu_ps(CO1 + 2 * ldc); \
row3 += _mm_loadu_ps(CO1 + 3 * ldc); \
row4 += _mm_loadu_ps(CO1 + 4 * ldc); \
row5 += _mm_loadu_ps(CO1 + 5 * ldc); \
row6 += _mm_loadu_ps(CO1 + 6 * ldc); \
row7 += _mm_loadu_ps(CO1 + 7 * ldc); \
_mm_storeu_ps(CO1 + 0 * ldc, row0); \
_mm_storeu_ps(CO1 + 1 * ldc, row1); \
_mm_storeu_ps(CO1 + 2 * ldc, row2); \
_mm_storeu_ps(CO1 + 3 * ldc, row3); \
_mm_storeu_ps(CO1 + 4 * ldc, row4); \
_mm_storeu_ps(CO1 + 5 * ldc, row5); \
_mm_storeu_ps(CO1 + 6 * ldc, row6); \
_mm_storeu_ps(CO1 + 7 * ldc, row7); \
/*******************************************************************************************/
#define INIT2x8() \
row0a = row0b = 0; \
row1a = row1b = 0; \
row2a = row2b = 0; \
row3a = row3b = 0; \
row4a = row4b = 0; \
row5a = row5b = 0; \
row6a = row6b = 0; \
row7a = row7b = 0; \
#define KERNEL2x8_SUB() \
xmm0 = *(AO); \
xmm1 = *(AO + 1); \
xmm2 = *(BO + 0); \
xmm3 = *(BO + 1); \
row0a += xmm0 * xmm2; \
row0b += xmm1 * xmm2; \
row1a += xmm0 * xmm3; \
row1b += xmm1 * xmm3; \
xmm2 = *(BO + 2); \
xmm3 = *(BO + 3); \
row2a += xmm0 * xmm2; \
row2b += xmm1 * xmm2; \
row3a += xmm0 * xmm3; \
row3b += xmm1 * xmm3; \
xmm2 = *(BO + 4); \
xmm3 = *(BO + 5); \
row4a += xmm0 * xmm2; \
row4b += xmm1 * xmm2; \
row5a += xmm0 * xmm3; \
row5b += xmm1 * xmm3; \
xmm2 = *(BO + 6); \
xmm3 = *(BO + 7); \
row6a += xmm0 * xmm2; \
row6b += xmm1 * xmm2; \
row7a += xmm0 * xmm3; \
row7b += xmm1 * xmm3; \
BO += 8; \
AO += 2;
#define SAVE2x8(ALPHA) \
xmm0 = ALPHA; \
row0a *= xmm0; \
row0b *= xmm0; \
row1a *= xmm0; \
row1b *= xmm0; \
row2a *= xmm0; \
row2b *= xmm0; \
row3a *= xmm0; \
row3b *= xmm0; \
row4a *= xmm0; \
row4b *= xmm0; \
row5a *= xmm0; \
row5b *= xmm0; \
row6a *= xmm0; \
row6b *= xmm0; \
row7a *= xmm0; \
row7b *= xmm0; \
*(CO1 + 0 * ldc + 0) += row0a; \
*(CO1 + 0 * ldc + 1) += row0b; \
*(CO1 + 1 * ldc + 0) += row1a; \
*(CO1 + 1 * ldc + 1) += row1b; \
*(CO1 + 2 * ldc + 0) += row2a; \
*(CO1 + 2 * ldc + 1) += row2b; \
*(CO1 + 3 * ldc + 0) += row3a; \
*(CO1 + 3 * ldc + 1) += row3b; \
*(CO1 + 4 * ldc + 0) += row4a; \
*(CO1 + 4 * ldc + 1) += row4b; \
*(CO1 + 5 * ldc + 0) += row5a; \
*(CO1 + 5 * ldc + 1) += row5b; \
*(CO1 + 6 * ldc + 0) += row6a; \
*(CO1 + 6 * ldc + 1) += row6b; \
*(CO1 + 7 * ldc + 0) += row7a; \
*(CO1 + 7 * ldc + 1) += row7b; \
/*******************************************************************************************/
#define INIT1x8() \
row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0;
#define KERNEL1x8_SUB() \
xmm0 = *(AO ); \
xmm2 = *(BO + 0); \
xmm3 = *(BO + 1); \
row0 += xmm0 * xmm2; \
row1 += xmm0 * xmm3; \
xmm2 = *(BO + 2); \
xmm3 = *(BO + 3); \
row2 += xmm0 * xmm2; \
row3 += xmm0 * xmm3; \
xmm2 = *(BO + 4); \
xmm3 = *(BO + 5); \
row4 += xmm0 * xmm2; \
row5 += xmm0 * xmm3; \
xmm2 = *(BO + 6); \
xmm3 = *(BO + 7); \
row6 += xmm0 * xmm2; \
row7 += xmm0 * xmm3; \
BO += 8; \
AO += 1;
#define SAVE1x8(ALPHA) \
xmm0 = ALPHA; \
row0 *= xmm0; \
row1 *= xmm0; \
row2 *= xmm0; \
row3 *= xmm0; \
row4 *= xmm0; \
row5 *= xmm0; \
row6 *= xmm0; \
row7 *= xmm0; \
*(CO1 + 0 * ldc) += row0; \
*(CO1 + 1 * ldc) += row1; \
*(CO1 + 2 * ldc) += row2; \
*(CO1 + 3 * ldc) += row3; \
*(CO1 + 4 * ldc) += row4; \
*(CO1 + 5 * ldc) += row5; \
*(CO1 + 6 * ldc) += row6; \
*(CO1 + 7 * ldc) += row7; \
@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
return 0;
// L8_0
while (N >= 8 && 0) {
float *CO1;
float *AO;
int i;
// L8_10
CO1 = C;
C += 8 * ldc;
AO = A;
i = m;
while (i >= 32 && 0) {
float *BO, *AOb;
// L8_11
__m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
BO = B;
int kloop = K;
AOb = AO + 16 * K;
INIT32x8()
while (kloop > 0) {
// L12_17
KERNEL32x8_SUB()
kloop--;
}
// L8_19
SAVE32x8(alpha)
CO1 += 32;
AO += 16 * K;
i -= 32;
}
while (i >= 16) {
float *BO;
// L8_11
__m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7;
BO = B;
int kloop = K;
INIT16x8()
while (kloop > 0) {
KERNEL16x8_SUB()
kloop--;
}
SAVE16x8(alpha)
CO1 += 16;
i -= 16;
}
while (i >= 8) {
float *BO;
// L8_11
__m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7;
BO = B;
int kloop = K;
INIT8x8()
while (kloop > 0) {
// L12_17
KERNEL8x8_SUB()
kloop--;
}
// L8_19
SAVE8x8(alpha)
CO1 += 8;
i -= 8;
}
while (i >= 4) {
// L8_11
float *BO;
__m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
BO = B;
int kloop = K;
INIT4x8()
// L8_16
while (kloop > 0) {
// L12_17
KERNEL4x8_SUB()
kloop--;
}
// L8_19
SAVE4x8(alpha)
CO1 += 4;
i -= 4;
}
/**************************************************************************
* Rest of M
***************************************************************************/
while (i >= 2) {
float *BO;
float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b;
BO = B;
INIT2x8()
int kloop = K;
while (kloop > 0) {
KERNEL2x8_SUB()
kloop--;
}
SAVE2x8(alpha)
CO1 += 2;
i -= 2;
}
// L13_40
while (i >= 1) {
float *BO;
float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7;
int kloop = K;
BO = B;
INIT1x8()
while (kloop > 0) {
KERNEL1x8_SUB()
kloop--;
}
SAVE1x8(alpha)
CO1 += 1;
i -= 1;
}
B += K * 8;
N -= 8;
}
while (N >= 4) {
float *CO1;
float *AO;

View File

@ -34,6 +34,13 @@
#ifndef _LAPACKE_CONFIG_H_
#define _LAPACKE_CONFIG_H_
// For Android prior to API 21 (no <complex> include)
#if defined(__ANDROID__)
#if __ANDROID_API__ < 21
#define LAPACK_COMPLEX_STRUCTURE
#endif
#endif
#ifdef __cplusplus
#if defined(LAPACK_COMPLEX_CPP)
#include <complex>

72
param.h
View File

@ -1627,6 +1627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 8
#define SWITCH_RATIO 32
#define GEMM_PREFERED_SIZE 32
#ifdef ARCH_X86
@ -2583,6 +2584,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#if defined(ARMV8)
#if defined(OS_DARWIN) && defined(CROSS)
#define SNUMOPT 2
#define DNUMOPT 2
@ -2590,13 +2593,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#if defined(OS_DARWIN) && defined(CROSS)
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL N 2
#else
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#endif
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
@ -2622,10 +2620,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#else
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#endif
#endif
#if defined(THUNDERX)
#define SNUMOPT 2
#define DNUMOPT 2
@ -2685,20 +2721,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P sgemm_p
#define DGEMM_DEFAULT_P dgemm_p
#define CGEMM_DEFAULT_P cgemm_p
#define ZGEMM_DEFAULT_P zgemm_p
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q sgemm_q
#define DGEMM_DEFAULT_Q dgemm_q
#define CGEMM_DEFAULT_Q cgemm_q
#define ZGEMM_DEFAULT_Q zgemm_q
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R sgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#endif