Merge pull request #35 from xianyi/develop

rebase
Martin Kroeker 2020-03-01 23:44:10 +01:00 committed by GitHub
commit 02d60c1563
22 changed files with 1423 additions and 153 deletions

View File

@ -24,6 +24,23 @@ CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
# Use Cortex-A72 tunings because Neoverse-N1 support
# is only available in GCC >= 9
ifeq ($(CORE), NEOVERSEN1)
ifeq ($(GCCVERSIONGTEQ7), 1)
ifeq ($(GCCVERSIONGTEQ9), 1)
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
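The gate above keys off the compiler's major version as reported by gcc -dumpversion. A minimal C probe (illustrative only, not part of the commit) makes the same decision at compile time with the predefined __GNUC__ macro, assuming a GCC-compatible compiler:

/* Illustrative probe of the same GCC-version gate used in the Makefile above. */
#include <stdio.h>

int main(void) {
#if defined(__GNUC__) && __GNUC__ >= 9
    puts("GCC >= 9: -march=armv8.2-a -mtune=neoverse-n1 is available");
#elif defined(__GNUC__) && __GNUC__ >= 7
    puts("GCC 7/8: fall back to -march=armv8.2-a -mtune=cortex-a72");
#else
    puts("older GCC: fall back to -march=armv8-a -mtune=cortex-a72");
#endif
    return 0;
}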
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -march=armv8-a -mtune=thunderx

View File

@ -328,6 +328,7 @@ ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
@ -554,6 +555,7 @@ DYNAMIC_CORE += CORTEXA53
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += NEOVERSEN1
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99

View File

@ -15,10 +15,12 @@ CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif

View File

@ -88,6 +88,7 @@ CORTEXA53
CORTEXA57
CORTEXA72
CORTEXA73
NEOVERSEN1
FALKOR
THUNDERX
THUNDERX2T99

View File

@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system
# ACML 6.1 custom
ACML=/home/saar/acml6.1/gfortran64_mp/lib
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
# Atlas Ubuntu
#ATLAS=/usr/lib/atlas-base
#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm
@ -56,6 +56,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
ssyr.goto dsyr.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto cger.goto zger.goto \
@ -83,6 +84,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
sgemm.acml dgemm.acml cgemm.acml zgemm.acml \
strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \
strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \
ssyr.acml dsyr.acml \
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
sger.acml dger.acml cger.acml zger.acml \
@ -109,6 +111,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \
strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \
strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \
ssyr.atlas dsyr.atlas \
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
sger.atlas dger.atlas cger.atlas zger.atlas\
@ -136,6 +139,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \
strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \
strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \
ssyr.mkl dsyr.mkl \
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
sger.mkl dger.mkl cger.mkl zger.mkl \
@ -162,6 +166,7 @@ else
goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
ssyr.goto dsyr.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto cger.goto zger.goto \
@ -188,6 +193,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
sgemm.acml dgemm.acml cgemm.acml zgemm.acml \
strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \
strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \
ssyr.acml dsyr.acml \
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
sger.acml dger.acml cger.acml zger.acml \
@ -214,6 +220,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \
strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \
strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \
ssyr.atlas dsyr.atlas \
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
sger.atlas dger.atlas cger.atlas zger.atlas\
@ -243,6 +250,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \
strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \
strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \
ssyr.mkl dsyr.mkl \
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
sger.mkl dger.mkl cger.mkl zger.mkl \
@ -280,6 +288,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \
strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \
strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \
ssyr.veclib dsyr.veclib \
ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \
ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \
sger.veclib dger.veclib cger.veclib zger.veclib \
@ -768,6 +777,36 @@ ztrsm.veclib : ztrsm.$(SUFFIX)
ztrsm.essl : ztrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ssyr ####################################################
ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
ssyr.acml : ssyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ssyr.atlas : ssyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ssyr.mkl : ssyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ssyr.veclib : ssyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dsyr ####################################################
dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dsyr.acml : dsyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dsyr.atlas : dsyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dsyr.mkl : dsyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dsyr.veclib : dsyr.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ssyrk ####################################################
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
@ -2078,6 +2117,12 @@ ctrsm.$(SUFFIX) : trsm.c
ztrsm.$(SUFFIX) : trsm.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
ssyr.$(SUFFIX) : syr.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dsyr.$(SUFFIX) : syr.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
ssyrk.$(SUFFIX) : syrk.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
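The -UCOMPLEX/-UDOUBLE defines let a single syr.c source produce both the single- and double-precision benchmark objects. A rough sketch of the type selection, following OpenBLAS's common.h conventions (an assumption here; the header itself is not shown in this diff):

/* Simplified sketch of how DOUBLE/COMPLEX select the element type. */
#include <stdio.h>

#ifdef DOUBLE
typedef double FLOAT;              /* built with -DDOUBLE -> dsyr */
#else
typedef float FLOAT;               /* built with -UDOUBLE -> ssyr */
#endif

#ifdef COMPLEX
#define COMPSIZE 2                 /* interleaved real/imag parts */
#else
#define COMPSIZE 1
#endif

int main(void) {
    printf("element size: %zu bytes, COMPSIZE=%d\n", sizeof(FLOAT), COMPSIZE);
    return 0;
}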

benchmark/syr.c Normal file (187 lines)
View File

@ -0,0 +1,187 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR
#ifdef DOUBLE
#define SYR BLASFUNC(dsyr)
#else
#define SYR BLASFUNC(ssyr)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x,*a;
FLOAT alpha[] = {1.0, 1.0};
char *p;
char uplo='U';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
blasint m, i, j;
blasint inc_x= 1;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
SYR (&uplo, &m, alpha, x, &inc_x, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6); /* SYR is a rank-1 update: O(m*m) flops */
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
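For reference, the operation being timed is the BLAS-2 symmetric rank-1 update A := alpha*x*x^T + A, restricted to one triangle; since it touches roughly m*m/2 entries, the MFlops formula above scales with m squared. A naive reference routine (hypothetical helper, not part of the commit; column-major, inc_x == 1) that could be used to sanity-check the benchmarked result:

/* Naive reference SYR: A := alpha*x*x^T + A on one triangle. */
static void syr_ref(int uplo_upper, int m, float alpha,
                    const float *x, float *a, int lda) {
    for (int j = 0; j < m; j++) {
        float tmp = alpha * x[j];
        int i_lo = uplo_upper ? 0 : j;
        int i_hi = uplo_upper ? j : m - 1;
        for (int i = i_lo; i <= i_hi; i++)
            a[i + (long)j * lda] += x[i] * tmp;
    }
}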

View File

@ -45,7 +45,7 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1)
endif ()
if (POWER)

View File

@ -229,6 +229,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "NEOVERSEN1")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t1048576\n\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"

View File

@ -53,16 +53,16 @@ static void __inline blas_lock(volatile BLASULONG *address){
BLASULONG ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"mov x4, #1 \n\t"
"sevl \n\t"
"1: \n\t"
"wfe \n\t"
"2: \n\t"
"ldaxr x2, [%1] \n\t"
"cbnz x2, 1b \n\t"
"2: \n\t"
"stxr w3, x4, [%1] \n\t"
"cbnz w3, 1b \n\t"
"cbnz w3, 2b \n\t"
"mov %0, #0 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
@ -81,10 +81,12 @@ static void __inline blas_lock(volatile BLASULONG *address){
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
static __inline BLASULONG rpcc(void){
BLASULONG ret = 0;
blasint shift;
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
__asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift));
return ret;
return ret << shift;
}
#define RPCC_DEFINED
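The change normalizes the raw cntvct_el0 reading by left-shifting with clz(cntfrq_el0), so timer readings from parts with different counter frequencies land in a comparable range. A user-space sketch of the same idea (AArch64-only, illustrative; the diff uses the 32-bit clz instruction, this sketch uses the 64-bit builtin):

/* AArch64-only sketch of the normalization done in rpcc() above. */
#include <stdio.h>
#include <stdint.h>

int main(void) {
    uint64_t ticks, freq;
    __asm__ __volatile__("isb; mrs %0, cntvct_el0" : "=r"(ticks));
    __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(freq));
    /* clz(freq) grows as freq shrinks, so ticks << clz(freq) rescales
       slower counters upward into a comparable range. */
    int shift = __builtin_clzll(freq);
    printf("raw=%llu freq=%llu scaled=%llu\n",
           (unsigned long long)ticks, (unsigned long long)freq,
           (unsigned long long)(ticks << shift));
    return 0;
}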

View File

@ -34,6 +34,7 @@
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
#define CPU_NEOVERSEN1 11
// Qualcomm
#define CPU_FALKOR 6
// Cavium
@ -55,7 +56,8 @@ static char *cpuname[] = {
"THUNDERX",
"THUNDERX2T99",
"TSV110",
"EMAG8180"
"EMAG8180",
"NEOVERSEN1"
};
static char *cpuname_lower[] = {
@ -69,7 +71,8 @@ static char *cpuname_lower[] = {
"thunderx",
"thunderx2t99",
"tsv110",
"emag8180"
"emag8180",
"neoversen1"
};
int get_feature(char *search)
@ -144,6 +147,8 @@ int detect(void)
return CPU_CORTEXA72;
else if (strstr(cpu_part, "0xd09"))
return CPU_CORTEXA73;
else if (strstr(cpu_part, "0xd0c"))
return CPU_NEOVERSEN1;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@ -285,6 +290,20 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_NEOVERSEN1:
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");

View File

@ -351,8 +351,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure no one else is still using the workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
STOP_RPCC(waiting1);
MB;
#if defined(FUSED_GEMM) && !defined(TIMING)
@ -395,10 +396,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif
WMB;
/* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
}
/* Get regions of B from other threads and apply kernel */
@ -417,8 +418,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */
START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
STOP_RPCC(waiting2);
MB;
/* Apply kernel with local region of A and part of other region of B */
START_RPCC();
@ -434,8 +436,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
}
}
} while (current != mypos);
@ -477,8 +479,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
}
}
@ -497,10 +499,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
}
}
STOP_RPCC(waiting3);
MB;
#ifdef TIMING
BLASLONG waiting = waiting1 + waiting2 + waiting3;
@ -705,7 +708,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
}
}
}
WMB;
/* Execute parallel computation */
exec_blas(nthreads, queue);
}
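The reshuffled MB/WMB placement implements a plain release/acquire handshake on the per-buffer flags: publish the data, fence, then store the flag; poll the flag, then fence before touching the data. In portable C11 atomics the same pattern looks like this (an analogy for illustration, not the OpenBLAS macros themselves):

/* C11 sketch of the flag handshake the MB/WMB moves above implement. */
#include <stdatomic.h>

static float buffer[1024];
static _Atomic(long) ready;

void producer(void) {
    buffer[0] = 42.0f;                                      /* fill region of B */
    atomic_store_explicit(&ready, 1, memory_order_release); /* WMB, then set flag */
}

float consumer(void) {
    while (atomic_load_explicit(&ready, memory_order_acquire) == 0)
        ;                                                   /* poll flag, then MB */
    return buffer[0];                                       /* safe to read */
}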

View File

@ -52,10 +52,11 @@ extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180;
extern gotoblas_t gotoblas_NEOVERSEN1;
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 10
#define NUM_CORETYPES 11
/*
* In case asm/hwcap.h is outdated on the build system, make sure
@ -80,6 +81,7 @@ static char *corename[] = {
"thunderx2t99",
"tsv110",
"emag8180",
"neoversen1",
"unknown"
};
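At runtime the dynamic-arch library honors the OPENBLAS_CORETYPE environment variable, matching it against this corename table before falling back to MIDR probing, so OPENBLAS_CORETYPE=NEOVERSEN1 selects index 10. A sketch of that lookup (the first table entries are reconstructed from the library and the case-insensitive compare is elided, so treat this as illustrative):

/* Sketch of the corename lookup force_coretype() performs. */
#include <stdio.h>
#include <string.h>

static const char *corename[] = {
    "armv8", "cortexa53", "cortexa57", "cortexa72", "cortexa73",
    "falkor", "thunderx", "thunderx2t99", "tsv110", "emag8180",
    "neoversen1", "unknown" };

int main(void) {
    const char *want = "neoversen1";  /* e.g. from getenv("OPENBLAS_CORETYPE") */
    for (int i = 0; i < 11; i++)
        if (!strcmp(corename[i], want)) { printf("core index %d\n", i); return 0; }
    puts("Core not found");
    return 1;
}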
@ -94,6 +96,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
return corename[NUM_CORETYPES];
}
@ -123,6 +126,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 7: return (&gotoblas_THUNDERX2T99);
case 8: return (&gotoblas_TSV110);
case 9: return (&gotoblas_EMAG8180);
case 10: return (&gotoblas_NEOVERSEN1);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@ -168,6 +172,8 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_CORTEXA72;
case 0xd09: // Cortex A73
return &gotoblas_CORTEXA73;
case 0xd0c: // Neoverse N1
return &gotoblas_NEOVERSEN1;
}
break;
case 0x42: // Broadcom

View File

@ -1028,6 +1028,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_NEOVERSEN1
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "NEOVERSEN1"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DNEOVERSEN1 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" \
"-march=armv8.2-a -mtune=cortex-a72"
#define LIBNAME "neoversen1"
#define CORENAME "NEOVERSEN1"
#else
#endif
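Note the trailing space after -DARMV8 above: adjacent C string literals concatenate with no separator, so without it the last define would fuse with -march= into a single broken flag. A two-line demonstration of the pitfall:

/* Adjacent string literals concatenate verbatim; a missing trailing
   space fuses "-DARMV8" with "-march=..." into one broken option. */
#include <stdio.h>

int main(void) {
    const char *bad  = "-DARMV8"  "-march=armv8.2-a";  /* "-DARMV8-march=..." */
    const char *good = "-DARMV8 " "-march=armv8.2-a";  /* "-DARMV8 -march=..." */
    printf("%s\n%s\n", bad, good);
    return 0;
}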
#ifdef FORCE_FALKOR
#define FORCE
#define ARCHITECTURE "ARM64"

View File

@ -0,0 +1,189 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DDOTKERNEL = dot_thunderx2t99.c
SDOTKERNEL = dot_thunderx2t99.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -32,7 +32,7 @@ CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
STRMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c
SGEMM_BETA = sgemm_beta_skylakex.c
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c

View File

@ -1,7 +1,7 @@
include $(KERNELDIR)/KERNEL.HASWELL
SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
STRMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c

View File

@ -31,7 +31,7 @@ CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
STRMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c

View File

@ -50,7 +50,7 @@
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
#define KERNEL_2_k1m8n4 \
"vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
"vpermilps $177,-64(%0),%%ymm0; vpermilps $177,-32(%0),%%ymm1;"\
acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
@ -93,7 +93,6 @@
"movq $10,%5; movq $84,%%r15;"\
#ndim"8881:\n\t"\
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\

View File

@ -1,8 +1,152 @@
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
/* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */
/* r10 to assist prefetch, r11 = m_counter, r12 = k << 4(const), r13 = k_todo, r14 = b_head_pos(const), r15 = %1 + 3r12 */
#include "common.h"
#include <stdint.h>
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
#define BACKWARDS 1
#else
#define BACKWARDS 0
#endif
#define REC_POINTER_1(ptr) "salq $2,%%r13; subq %%r13,"#ptr"; sarq $2,%%r13;"
#define REC_POINTER_2(ptr) "salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;"
#define REC_POINTER_4(ptr) "salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;"
#define REC_POINTER_8(ptr) "salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;"
#define REC_POINTER_16(ptr) "salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;"
#define INC_POINTER_1(ptr) "sarq $2,%%r12; addq %%r12,"#ptr"; salq $2,%%r12;"
#define INC_POINTER_2(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12;"
#define INC_POINTER_4(ptr) "addq %%r12,"#ptr";"
#define INC_POINTER_8(ptr) "leaq ("#ptr",%%r12,2),"#ptr";"
#define INC_POINTER_16(ptr) "leaq ("#ptr",%%r12,4),"#ptr";"
#define SET_POINTER(ptr,dim) REC_POINTER_##dim(ptr) INC_POINTER_##dim(ptr)
#define SET_PB_1 SET_POINTER(%1,1)
#define SET_PB_2 SET_POINTER(%1,2)
#define SET_PB_4 SET_POINTER(%1,4)
#define SET_PB_8 SET_POINTER(%1,4)
#define SET_PB_12 SET_POINTER(%1,4)
#define SET_PB_16 SET_POINTER(%1,4)
#define SET_PB_20 SET_POINTER(%1,4)
#define SET_PB_24 SET_POINTER(%1,4)
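Per the register-role comment at the top of the file, r12 holds k << 4 (k groups of 16 bytes, i.e. k four-float rows) and r13 holds the remaining k_todo, so each REC/INC pair rewinds a packed-panel pointer by k_todo rows and advances it by a full k rows of the given width; SET_PB_8 and above reuse width 4 because B stays packed in 4-wide panels beyond n = 4. A rough C rendering of the bookkeeping (an interpretation of the shifts, not a literal translation):

/* Rough C rendering of SET_POINTER(ptr,dim): rewind by k_todo rows,
   then advance by k rows, for a packed panel dim floats wide. */
static float *set_pointer(float *ptr, long dim, long k, long k_todo) {
    ptr -= k_todo * dim;   /* REC_POINTER_dim: salq/subq/sarq on r13 */
    ptr += k * dim;        /* INC_POINTER_dim: addq/leaq with r12   */
    return ptr;
}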
#ifdef TRMMKERNEL
#if BACKWARDS == 1
#define START_SET_PAPB(mdim,ndim) SET_POINTER(%0,mdim) "movq %%r14,%1;" SET_PB_##ndim "leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"
#define END_SET_PA(mdim) ""
#else
#define START_SET_PAPB(mdim,ndim) "movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"
#define END_SET_PA(mdim) SET_POINTER(%0,mdim)
#endif
#else
#define START_SET_PAPB(mdim,ndim) "movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"
#define END_SET_PA(mdim) ""
#endif
#define RECOVER_PA(mdim) REC_POINTER_##mdim(%0)
#if defined(TRMMKERNEL) && !defined(LEFT)
#if BACKWARDS == 1
#define KERNEL_HEAD_C_n8(mdim) \
KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $4,%4; addq $64,%%r15;"
#define KERNEL_HEAD_C_n12(mdim) KERNEL_HEAD_C_n8(mdim)\
KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 "subq $4,%4; addq $64,%%r15;"
#define KERNEL_HEAD_C_n16(mdim) KERNEL_HEAD_C_n12(mdim)\
KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 KERNEL_k1m##mdim##n12 "subq $4,%4; addq $64,%%r15;"
#define KERNEL_HEAD_C_n20(mdim) KERNEL_HEAD_C_n16(mdim)\
KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 KERNEL_k1m##mdim##n16 "subq $4,%4;"
#define KERNEL_HEAD_C_n24(mdim) KERNEL_HEAD_C_n20(mdim)\
KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 KERNEL_k1m##mdim##n20 "subq $4,%4;"
#define KERNEL_HEAD_R_n4(mdim) "subq $12,%4; addq $64,%%r15; addq $"#mdim"*48,%0;"
#define KERNEL_HEAD_R_n8(mdim) KERNEL_HEAD_R_n4(mdim)\
kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) kernel_k1m##mdim##n4(%%r15) "subq $4,%4;"
#define KERNEL_HEAD_R_n12(mdim) KERNEL_HEAD_R_n8(mdim)\
kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) kernel_k1m##mdim##n8(%%r15) "subq $4,%4;"
#define KERNEL_TAIL_C_n8(mdim) ""
#define KERNEL_TAIL_C_n12(mdim) ""
#define KERNEL_TAIL_C_n16(mdim) ""
#define KERNEL_TAIL_C_n20(mdim) ""
#define KERNEL_TAIL_C_n24(mdim) ""
#define KERNEL_TAIL_R_n4(mdim) ""
#define KERNEL_TAIL_R_n8(mdim) ""
#define KERNEL_TAIL_R_n12(mdim) ""
#else
#define KERNEL_HEAD_C_n8(mdim) ""
#define KERNEL_HEAD_C_n12(mdim) ""
#define KERNEL_HEAD_C_n16(mdim) ""
#define KERNEL_HEAD_C_n20(mdim) ""
#define KERNEL_HEAD_C_n24(mdim) ""
#define KERNEL_HEAD_R_n4(mdim) ""
#define KERNEL_HEAD_R_n8(mdim) ""
#define KERNEL_HEAD_R_n12(mdim) ""
#define end_kernel_k4_ncx1(k_0,k_1,k_2,k_3,n1,mdim) \
end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0)\
end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1)\
end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2)\
end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3)
#define end_kernel_k4_ncx2(k_0,k_1,k_2,k_3,n1,n2,mdim) \
end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0)\
end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1)\
end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2)\
end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3)
#define end_kernel_k4_ncx3(k_0,k_1,k_2,k_3,n1,n2,n3,mdim) \
end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0) end_acc_nc##n3##_k1m##mdim(k_0)\
end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1) end_acc_nc##n3##_k1m##mdim(k_1)\
end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2) end_acc_nc##n3##_k1m##mdim(k_2)\
end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) end_acc_nc##n3##_k1m##mdim(k_3)
#define end_kernel_k4_ncx4(k_0,k_1,k_2,k_3,n1,n2,n3,n4,mdim) \
end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0) end_acc_nc##n3##_k1m##mdim(k_0) end_acc_nc##n4##_k1m##mdim(k_0)\
end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1) end_acc_nc##n3##_k1m##mdim(k_1) end_acc_nc##n4##_k1m##mdim(k_1)\
end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2) end_acc_nc##n3##_k1m##mdim(k_2) end_acc_nc##n4##_k1m##mdim(k_2)\
end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3) end_acc_nc##n3##_k1m##mdim(k_3) end_acc_nc##n4##_k1m##mdim(k_3)
#define end_kernel_k4_ncx5(k_0,k_1,k_2,k_3,n1,n2,n3,n4,n5,mdim) \
end_load_a_k1m##mdim(k_0) end_acc_nc##n1##_k1m##mdim(k_0) end_acc_nc##n2##_k1m##mdim(k_0)\
end_acc_nc##n3##_k1m##mdim(k_0) end_acc_nc##n4##_k1m##mdim(k_0) end_acc_nc##n5##_k1m##mdim(k_0)\
end_load_a_k1m##mdim(k_1) end_acc_nc##n1##_k1m##mdim(k_1) end_acc_nc##n2##_k1m##mdim(k_1)\
end_acc_nc##n3##_k1m##mdim(k_1) end_acc_nc##n4##_k1m##mdim(k_1) end_acc_nc##n5##_k1m##mdim(k_1)\
end_load_a_k1m##mdim(k_2) end_acc_nc##n1##_k1m##mdim(k_2) end_acc_nc##n2##_k1m##mdim(k_2)\
end_acc_nc##n3##_k1m##mdim(k_2) end_acc_nc##n4##_k1m##mdim(k_2) end_acc_nc##n5##_k1m##mdim(k_2)\
end_load_a_k1m##mdim(k_3) end_acc_nc##n1##_k1m##mdim(k_3) end_acc_nc##n2##_k1m##mdim(k_3)\
end_acc_nc##n3##_k1m##mdim(k_3) end_acc_nc##n4##_k1m##mdim(k_3) end_acc_nc##n5##_k1m##mdim(k_3)
#define KERNEL_TAIL_C_n8(mdim) end_kernel_k4_ncx1(0,1,2,3,2,mdim)
#define KERNEL_TAIL_C_n12(mdim) \
end_kernel_k4_ncx2(0,1,2,3,2,3,mdim) end_kernel_k4_ncx1(4,5,6,7,3,mdim)
#define KERNEL_TAIL_C_n16(mdim) \
end_kernel_k4_ncx3(0,1,2,3,2,3,4,mdim) end_kernel_k4_ncx2(4,5,6,7,3,4,mdim) end_kernel_k4_ncx1(8,9,10,11,4,mdim)
#define KERNEL_TAIL_C_n20(mdim) \
end_kernel_k4_ncx4(0,1,2,3,2,3,4,5,mdim) end_kernel_k4_ncx3(4,5,6,7,3,4,5,mdim)\
end_kernel_k4_ncx2(8,9,10,11,4,5,mdim) end_kernel_k4_ncx1(12,13,14,15,5,mdim)
#define KERNEL_TAIL_C_n24(mdim) \
end_kernel_k4_ncx5(0,1,2,3,2,3,4,5,6,mdim) end_kernel_k4_ncx4(4,5,6,7,3,4,5,6,mdim) end_kernel_k4_ncx3(8,9,10,11,4,5,6,mdim)\
end_kernel_k4_ncx2(12,13,14,15,5,6,mdim) end_kernel_k4_ncx1(16,17,18,19,6,mdim)
#define KERNEL_TAIL_R_n4(mdim) \
end_kernel_k4_ncx1(0,1,2,3,4,mdim) end_kernel_k4_ncx1(4,5,6,7,4,mdim) end_kernel_k4_ncx1(8,9,10,11,4,mdim)
#define KERNEL_TAIL_R_n8(mdim) \
end_kernel_k4_ncx2(0,1,2,3,4,5,mdim) end_kernel_k4_ncx2(4,5,6,7,4,5,mdim) end_kernel_k4_ncx2(8,9,10,11,4,5,mdim) end_kernel_k4_ncx1(12,13,14,15,5,mdim)
#define KERNEL_TAIL_R_n12(mdim) \
end_kernel_k4_ncx3(0,1,2,3,4,5,6,mdim) end_kernel_k4_ncx3(4,5,6,7,4,5,6,mdim) end_kernel_k4_ncx3(8,9,10,11,4,5,6,mdim)\
end_kernel_k4_ncx2(12,13,14,15,5,6,mdim) end_kernel_k4_ncx1(16,17,18,19,6,mdim)
#endif
#else
#define KERNEL_HEAD_C_n8(mdim) ""
#define KERNEL_HEAD_C_n12(mdim) ""
#define KERNEL_HEAD_C_n16(mdim) ""
#define KERNEL_HEAD_C_n20(mdim) ""
#define KERNEL_HEAD_C_n24(mdim) ""
#define KERNEL_HEAD_R_n4(mdim) ""
#define KERNEL_HEAD_R_n8(mdim) ""
#define KERNEL_HEAD_R_n12(mdim) ""
#define KERNEL_TAIL_C_n8(mdim) ""
#define KERNEL_TAIL_C_n12(mdim) ""
#define KERNEL_TAIL_C_n16(mdim) ""
#define KERNEL_TAIL_C_n20(mdim) ""
#define KERNEL_TAIL_C_n24(mdim) ""
#define KERNEL_TAIL_R_n4(mdim) ""
#define KERNEL_TAIL_R_n8(mdim) ""
#define KERNEL_TAIL_R_n12(mdim) ""
#endif
#define KERNEL_HEAD_C_n1(mdim) ""
#define KERNEL_HEAD_C_n2(mdim) ""
#define KERNEL_HEAD_C_n4(mdim) ""
#define KERNEL_TAIL_C_n1(mdim) ""
#define KERNEL_TAIL_C_n2(mdim) ""
#define KERNEL_TAIL_C_n4(mdim) ""
/* m = 16 */ /* zmm8-zmm31 for accumulators, zmm1-zmm7 for temporary use, zmm0 for alpha */
#define KERNEL_k1m16n1 \
@ -15,9 +159,10 @@
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;"
#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;"
#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \
"vbroadcastsd ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\
"vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";"
#define unit_gen_kernel_k1m16n4(c1,c2,c3,c4,k_no,...) \
"vbroadcastsd "#k_no"*16 ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\
"vbroadcastsd "#k_no"*16+8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";"
#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) unit_gen_kernel_k1m16n4(c1,c2,c3,c4,0,__VA_ARGS__)
#define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;"
#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2)
@ -28,6 +173,12 @@
#define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;"
#define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
#define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;"
#define end_load_a_k1m16(k_no) "vmovsldup "#k_no"*64(%0),%%zmm4; vmovshdup "#k_no"*64(%0),%%zmm5;"
#define end_acc_nc2_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,k_no,%1,%%r12,1)
#define end_acc_nc3_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,k_no,%1,%%r12,2)
#define end_acc_nc4_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,k_no,%%r15)
#define end_acc_nc5_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,k_no,%%r15,%%r12,1)
#define end_acc_nc6_k1m16(k_no) unit_gen_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,k_no,%%r15,%%r12,2)
#define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;"
#define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;"
#define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
@ -38,11 +189,19 @@
#define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
#define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
#define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31)
#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);"
#define unit_save_m16n2(c1,c2) \
#ifdef TRMMKERNEL
#define SAVE_h_m16n1 "vmulps %%zmm8,%%zmm0,%%zmm8; vmovups %%zmm8,(%2);"
#define unit_save_m16n2(c1,c2) \
"vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\
"vmulps %%zmm4,%%zmm0,%%zmm4; vmulps %%zmm5,%%zmm0,%%zmm5;"\
"vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;"
#else
#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);"
#define unit_save_m16n2(c1,c2) \
"vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\
"vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\
"vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;"
#endif
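The TRMMKERNEL variants replace the read-modify-write vfmadd213ps with a plain vmulps: for TRMM the kernel owns its output block and must store alpha*(A*B) without loading prior C contents, which may be uninitialized. In scalar form the two save paths differ only in the accumulation (sketch, not the actual macros):

/* Scalar sketch of the two save paths selected by TRMMKERNEL above. */
void save_gemm(float *c, const float *acc, float alpha, int n) {
    for (int i = 0; i < n; i++) c[i] = alpha * acc[i] + c[i]; /* vfmadd213ps */
}
void save_trmm(float *c, const float *acc, float alpha, int n) {
    for (int i = 0; i < n; i++) c[i] = alpha * acc[i];        /* vmulps */
}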
#define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9)
#define SAVE_h_m16n4 SAVE_h_m16n2 unit_save_m16n2(%%zmm10,%%zmm11)
#define SAVE_h_m16n8 SAVE_h_m16n4 unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15)
@ -52,8 +211,9 @@
#define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31)
#define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;"
#define COMPUTE_m16(ndim) \
INIT_m16n##ndim\
"movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; xorq %%r10,%%r10;"\
INIT_m16n##ndim START_SET_PAPB(16,ndim)\
"movq %%r13,%4; movq %2,%5; xorq %%r10,%%r10;"\
KERNEL_HEAD_C_n##ndim(16)\
"cmpq $16,%4; jb "#ndim"016162f;"\
#ndim"016161:\n\t"\
"cmpq $126,%%r10; movq $126,%%r10; cmoveq %3,%%r10;"\
@ -72,28 +232,41 @@
KERNEL_k1m16n##ndim\
"leaq (%5,%3,2),%5; decq %4; jnz "#ndim"016163b;"\
#ndim"016164:\n\t"\
KERNEL_TAIL_C_n##ndim(16)\
"prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
SAVE_m16(ndim)
SAVE_m16(ndim) END_SET_PA(16)
/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
#define KERNEL_k1m8n1(b_addr) \
#define kernel_k1m8n1(b_addr) \
"vmovups (%0),%%ymm1; addq $32,%0;"\
"vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
"addq $4,"#b_addr";"
#define KERNEL_h_k1m8n2(b_addr) \
#define kernel_h_k1m8n2(b_addr) \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
"vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
#define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";"
#define KERNEL_h_k1m8n4(b_addr) \
KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
#define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";"
#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \
"vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
#define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1)
#define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";"
#define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2)
#define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";"
#define kernel_k1m8n2(b_addr) kernel_h_k1m8n2(b_addr) "addq $8,"#b_addr";"
#define kernel_h_k1m8n4(b_addr) \
kernel_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
#define kernel_k1m8n4(b_addr) kernel_h_k1m8n4(b_addr) "addq $16,"#b_addr";"
#define unit_gen_kernel_k1m8n4(c1,c2,c3,c4,k_no,...) \
"vbroadcastsd "#k_no"*16 ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
"vbroadcastsd "#k_no"*16+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) unit_gen_kernel_k1m8n4(c1,c2,c3,c4,0,__VA_ARGS__)
#define kernel_h_k1m8n8(b_addr) kernel_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1)
#define kernel_k1m8n8(b_addr) kernel_h_k1m8n8(b_addr) "addq $16,"#b_addr";"
#define kernel_h_k1m8n12(b_addr) kernel_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2)
#define kernel_k1m8n12(b_addr) kernel_h_k1m8n12(b_addr) "addq $16,"#b_addr";"
#define KERNEL_k1m8n1 kernel_k1m8n1(%1)
#define KERNEL_k1m8n2 kernel_k1m8n2(%1)
#define KERNEL_k1m8n4 kernel_k1m8n4(%1)
#define KERNEL_k1m8n8 kernel_k1m8n8(%1)
#define KERNEL_k1m8n12 kernel_k1m8n12(%1)
#define end_load_a_k1m8(k_no) "vmovsldup "#k_no"*32(%0),%%ymm1; vmovshdup "#k_no"*32(%0),%%ymm2;"
#define end_acc_nc2_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,k_no,%1,%%r12,1)
#define end_acc_nc3_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,k_no,%1,%%r12,2)
#define end_acc_nc4_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,k_no,%%r15)
#define end_acc_nc5_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,k_no,%%r15,%%r12,1)
#define end_acc_nc6_k1m8(k_no) unit_gen_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,k_no,%%r15,%%r12,2)
#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
@ -101,12 +274,21 @@
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);"
#define unit_save_m8n2(c1,c2) \
#ifdef TRMMKERNEL
#define SAVE_L_m8n1 "vmulps %%ymm4,%%ymm0,%%ymm4; vmovups %%ymm4,(%2);"
#define unit_save_m8n2(c1,c2) \
"vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\
"vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5), %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\
"vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\
"vunpcklpd %%ymm3,%%ymm2,%%ymm1; vmulps %%ymm1,%%ymm0,%%ymm1; vmovups %%ymm1,(%5);"\
"vunpckhpd %%ymm3,%%ymm2,%%ymm1; vmulps %%ymm1,%%ymm0,%%ymm1; vmovups %%ymm1,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#else
#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);"
#define unit_save_m8n2(c1,c2) \
"vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\
"vunpcklpd %%ymm3,%%ymm2,%%ymm1; vfmadd213ps (%5), %%ymm0,%%ymm1; vmovups %%ymm1,(%5);"\
"vunpckhpd %%ymm3,%%ymm2,%%ymm1; vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1; vmovups %%ymm1,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#endif
#define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
#define SAVE_L_m8n4 SAVE_L_m8n2 unit_save_m8n2(%%ymm6,%%ymm7)
#define SAVE_L_m8n8 SAVE_L_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
@ -115,53 +297,68 @@
#define SAVE_R_m8n8 SAVE_R_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
#define SAVE_R_m8n12 SAVE_R_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
#define COMPUTE_L_m8(ndim,sim) \
INIT_m8n##ndim\
"movq %%r13,%4; movq %%r14,%1;"\
#ndim""#sim"882:\n\t"\
INIT_m8n##ndim START_SET_PAPB(8,ndim)\
"movq %%r13,%4;"\
KERNEL_HEAD_C_n##ndim(8)\
"testq %4,%4; jz "#ndim""#sim"883f;"\
KERNEL_k1m8n##ndim(%1)\
"decq %4; jmp "#ndim""#sim"882b;"\
#ndim""#sim"882:\n\t"\
kernel_k1m8n##ndim(%1)\
"decq %4; jnz "#ndim""#sim"882b;"\
#ndim""#sim"883:\n\t"\
KERNEL_TAIL_C_n##ndim(8)\
SAVE_L_m8n##ndim "addq $32,%2;"
#define COMPUTE_R_m8(ndim,sim) \
"subq %%r12,%0; subq %%r12,%0;"\
INIT_m8n##ndim\
"movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
#ndim""#sim"882:\n\t"\
INIT_m8n##ndim RECOVER_PA(8)\
"movq %%r13,%4;"\
KERNEL_HEAD_R_n##ndim(8)\
"testq %4,%4; jz "#ndim""#sim"883f;"\
KERNEL_k1m8n##ndim(%%r15)\
"decq %4; jmp "#ndim""#sim"882b;"\
#ndim""#sim"882:\n\t"\
kernel_k1m8n##ndim(%%r15)\
"decq %4; jnz "#ndim""#sim"882b;"\
#ndim""#sim"883:\n\t"\
SAVE_R_m8n##ndim
#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833)
#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833)
#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833)
#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833)
#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833)
KERNEL_TAIL_R_n##ndim(8)\
SAVE_R_m8n##ndim END_SET_PA(8)
#define COMPUTE_m8_n1 COMPUTE_L_m8(1,33833) END_SET_PA(8)
#define COMPUTE_m8_n2 COMPUTE_L_m8(2,33833) END_SET_PA(8)
#define COMPUTE_m8_n4 COMPUTE_L_m8(4,33833) END_SET_PA(8)
#define COMPUTE_m8_n8 COMPUTE_L_m8(8,33833) END_SET_PA(8)
#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833) END_SET_PA(8)
#define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933)
#define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933)
#define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933)
#define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim
/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
#define KERNEL_k1m4n1(b_addr) \
#define kernel_k1m4n1(b_addr) \
"vmovups (%0),%%xmm1; addq $16,%0;"\
"vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"addq $4,"#b_addr";"
#define KERNEL_h_k1m4n2(b_addr) \
#define kernel_h_k1m4n2(b_addr) \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
"vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
#define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";"
#define KERNEL_h_k1m4n4(b_addr) \
KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
#define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";"
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
"vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
"vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
#define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1)
#define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";"
#define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2)
#define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";"
#define kernel_k1m4n2(b_addr) kernel_h_k1m4n2(b_addr) "addq $8,"#b_addr";"
#define kernel_h_k1m4n4(b_addr) \
kernel_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
#define kernel_k1m4n4(b_addr) kernel_h_k1m4n4(b_addr) "addq $16,"#b_addr";"
#define unit_gen_kernel_k1m4n4(c1,c2,c3,c4,k_no,...) \
"vmovddup "#k_no"*16 ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
"vmovddup "#k_no"*16+8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) unit_gen_kernel_k1m4n4(c1,c2,c3,c4,0,__VA_ARGS__)
#define kernel_h_k1m4n8(b_addr) kernel_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1)
#define kernel_k1m4n8(b_addr) kernel_h_k1m4n8(b_addr) "addq $16,"#b_addr";"
#define kernel_h_k1m4n12(b_addr) kernel_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2)
#define kernel_k1m4n12(b_addr) kernel_h_k1m4n12(b_addr) "addq $16,"#b_addr";"
#define KERNEL_k1m4n1 kernel_k1m4n1(%1)
#define KERNEL_k1m4n2 kernel_k1m4n2(%1)
#define KERNEL_k1m4n4 kernel_k1m4n4(%1)
#define KERNEL_k1m4n8 kernel_k1m4n8(%1)
#define KERNEL_k1m4n12 kernel_k1m4n12(%1)
#define end_load_a_k1m4(k_no) "vmovsldup "#k_no"*16(%0),%%xmm1; vmovshdup "#k_no"*16(%0),%%xmm2;"
#define end_acc_nc2_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,k_no,%1,%%r12,1)
#define end_acc_nc3_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,k_no,%1,%%r12,2)
#define end_acc_nc4_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm4,%%xmm5,%%xmm6,%%xmm7,k_no,%%r15)
#define end_acc_nc5_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,k_no,%%r15,%%r12,1)
#define end_acc_nc6_k1m4(k_no) unit_gen_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,k_no,%%r15,%%r12,2)
#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
@ -169,12 +366,21 @@
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);"
#define unit_save_m4n2(c1,c2) \
#ifdef TRMMKERNEL
#define SAVE_L_m4n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovups %%xmm4,(%2);"
#define unit_save_m4n2(c1,c2) \
"vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\
"vunpcklpd %%xmm3,%%xmm2,%%xmm1;vmulps %%xmm1,%%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\
"vunpckhpd %%xmm3,%%xmm2,%%xmm1;vmulps %%xmm1,%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#else
#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);"
#define unit_save_m4n2(c1,c2) \
"vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\
"vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5), %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\
"vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#endif
#define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
#define SAVE_L_m4n4 SAVE_L_m4n2 unit_save_m4n2(%%xmm6,%%xmm7)
#define SAVE_L_m4n8 SAVE_L_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
@ -183,29 +389,32 @@
#define SAVE_R_m4n8 SAVE_R_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
#define SAVE_R_m4n12 SAVE_R_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
#define COMPUTE_L_m4(ndim,sim) \
INIT_m4n##ndim\
"movq %%r13,%4; movq %%r14,%1;"\
#ndim""#sim"442:\n\t"\
INIT_m4n##ndim START_SET_PAPB(4,ndim)\
"movq %%r13,%4;"\
KERNEL_HEAD_C_n##ndim(4)\
"testq %4,%4; jz "#ndim""#sim"443f;"\
KERNEL_k1m4n##ndim(%1)\
"decq %4; jmp "#ndim""#sim"442b;"\
#ndim""#sim"442:\n\t"\
kernel_k1m4n##ndim(%1)\
"decq %4; jnz "#ndim""#sim"442b;"\
#ndim""#sim"443:\n\t"\
KERNEL_TAIL_C_n##ndim(4)\
SAVE_L_m4n##ndim "addq $16,%2;"
#define COMPUTE_R_m4(ndim,sim) \
"subq %%r12,%0;"\
INIT_m4n##ndim\
"movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
#ndim""#sim"442:\n\t"\
INIT_m4n##ndim RECOVER_PA(4)\
"movq %%r13,%4;"\
KERNEL_HEAD_R_n##ndim(4)\
"testq %4,%4; jz "#ndim""#sim"443f;"\
KERNEL_k1m4n##ndim(%%r15)\
"decq %4; jmp "#ndim""#sim"442b;"\
#ndim""#sim"442:\n\t"\
kernel_k1m4n##ndim(%%r15)\
"decq %4; jnz "#ndim""#sim"442b;"\
#ndim""#sim"443:\n\t"\
SAVE_R_m4n##ndim
#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855)
#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855)
#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855)
#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855)
#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855)
KERNEL_TAIL_R_n##ndim(4)\
SAVE_R_m4n##ndim END_SET_PA(4)
#define COMPUTE_m4_n1 COMPUTE_L_m4(1,55855) END_SET_PA(4)
#define COMPUTE_m4_n2 COMPUTE_L_m4(2,55855) END_SET_PA(4)
#define COMPUTE_m4_n4 COMPUTE_L_m4(4,55855) END_SET_PA(4)
#define COMPUTE_m4_n8 COMPUTE_L_m4(8,55855) END_SET_PA(4)
#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855) END_SET_PA(4)
#define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955)
#define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955)
#define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955)
@ -217,40 +426,60 @@
"vmovsd (%0),%%xmm1; addq $8,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"addq $4,%1;"
#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define KERNEL_k1m2n2 \
"vmovsd (%0),%%xmm1; addq $8,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
"addq $8,%1;"
#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
#ifdef TRMMKERNEL
#define SAVE_h_m2n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
#define SAVE_h_m2n2 SAVE_h_m2n1 "vmulps %%xmm5,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
#else
#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
#endif
#define INIT_m2n4 INIT_m2n2
#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
#define INIT_m2n16 INIT_m2n12 "vpxor %%xmm10,%%xmm10,%%xmm10; vpxor %%xmm11,%%xmm11,%%xmm11;"
#define INIT_m2n20 INIT_m2n16 "vpxor %%xmm12,%%xmm12,%%xmm12; vpxor %%xmm13,%%xmm13,%%xmm13;"
#define INIT_m2n24 INIT_m2n20 "vpxor %%xmm14,%%xmm14,%%xmm14; vpxor %%xmm15,%%xmm15,%%xmm15;"
#define unit_gen_kernel_k1m2n4(c1,c2,k_no,...) \
"vmovups "#k_no"*16("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"
#define KERNEL_h_k1m2n4 \
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;"\
"vmovups (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;" unit_gen_kernel_k1m2n4(%%xmm4,%%xmm5,0,%1)
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 "vmovups (%1,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 unit_gen_kernel_k1m2n4(%%xmm6,%%xmm7,0,%1,%%r12,1)
#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%1;"
#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 \
"vmovups (%1,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm8; vfmadd231ps %%xmm2,%%xmm3,%%xmm9; addq $16,%1;"
#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 "vmovups (%%r15),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm10; vfmadd231ps %%xmm2,%%xmm3,%%xmm11;"
#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 unit_gen_kernel_k1m2n4(%%xmm8,%%xmm9,0,%1,%%r12,2) "addq $16,%1;"
#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 unit_gen_kernel_k1m2n4(%%xmm10,%%xmm11,0,%%r15)
#define KERNEL_k1m2n16 KERNEL_h_k1m2n16 "addq $16,%%r15;"
#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 "vmovups (%%r15,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm12; vfmadd231ps %%xmm2,%%xmm3,%%xmm13;"
#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 unit_gen_kernel_k1m2n4(%%xmm12,%%xmm13,0,%%r15,%%r12,1)
#define KERNEL_k1m2n20 KERNEL_h_k1m2n20 "addq $16,%%r15;"
#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 "vmovups (%%r15,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm14; vfmadd231ps %%xmm2,%%xmm3,%%xmm15;"
#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 unit_gen_kernel_k1m2n4(%%xmm14,%%xmm15,0,%%r15,%%r12,2)
#define KERNEL_k1m2n24 KERNEL_h_k1m2n24 "addq $16,%%r15;"
#define unit_save_m2n4(c1,c2) \
#define end_load_a_k1m2(k_no) "vbroadcastss "#k_no"*8(%0),%%xmm1; vbroadcastss "#k_no"*8+4(%0),%%xmm2;"
#define end_acc_nc2_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm6,%%xmm7,k_no,%1,%%r12,1)
#define end_acc_nc3_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm8,%%xmm9,k_no,%1,%%r12,2)
#define end_acc_nc4_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm10,%%xmm11,k_no,%%r15)
#define end_acc_nc5_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm12,%%xmm13,k_no,%%r15,%%r12,1)
#define end_acc_nc6_k1m2(k_no) unit_gen_kernel_k1m2n4(%%xmm14,%%xmm15,k_no,%%r15,%%r12,2)
#ifdef TRMMKERNEL
#define unit_save_m2n4(c1,c2) \
"vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
"vmulps %%xmm1,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"\
"vmulps %%xmm2,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#else
#define unit_save_m2n4(c1,c2) \
"vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"\
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#endif
#define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
#define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
#define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)
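For orientation: unit_save_m2n4 uses vunpcklps/vunpckhps to transpose the 2x4 register tile into two-element column pairs, so each vmovsd/vmovhpd touches one column of C. A scalar sketch of what the macro computes, where acc[i][j] stands for row i, column j of the accumulator pair (illustration only):

#include <stdint.h>
static void save_m2n4(float *c, int64_t ldc, float alpha, const float acc[2][4])
{
    for (int j = 0; j < 4; ++j)
        for (int i = 0; i < 2; ++i)
#ifdef TRMMKERNEL
            c[j * ldc + i]  = alpha * acc[i][j];   /* TRMM: C overwritten  */
#else
            c[j * ldc + i] += alpha * acc[i][j];   /* GEMM: C accumulated  */
#endif
}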
@ -259,13 +488,15 @@
#define SAVE_h_m2n24 SAVE_h_m2n20 unit_save_m2n4(%%xmm14,%%xmm15)
#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $8,%2;"
#define COMPUTE_m2(ndim) \
INIT_m2n##ndim\
"movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\
INIT_m2n##ndim START_SET_PAPB(2,ndim)\
"movq %%r13,%4;"\
KERNEL_HEAD_C_n##ndim(2)\
"testq %4,%4; jz "#ndim"002022f;"\
#ndim"002021:\n\t"\
KERNEL_k1m2n##ndim "decq %4; jnz "#ndim"002021b;"\
#ndim"002022:\n\t"\
SAVE_m2(ndim)
KERNEL_TAIL_C_n##ndim(2)\
SAVE_m2(ndim) END_SET_PA(2)
/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
@ -273,15 +504,25 @@
"vmovss (%1),%%xmm3; addq $4,%1;"\
"vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
#ifdef TRMMKERNEL
#define SAVE_h_m1n1 "vmulss %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
#else
#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
#endif
#define INIT_m1n2 INIT_m1n1
#define KERNEL_k1m1n2 \
"vmovsd (%1),%%xmm3; addq $8,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define SAVE_h_m1n2 \
#ifdef TRMMKERNEL
#define SAVE_h_m1n2 \
"vmulps %%xmm4,%%xmm0,%%xmm4;"\
"vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
#else
#define SAVE_h_m1n2 \
"vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
"vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
#endif
#define INIT_m1n4 INIT_m1n2
#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
@ -300,12 +541,25 @@
#define KERNEL_k1m1n20 KERNEL_h_k1m1n20 "addq $16,%%r15;"
#define KERNEL_h_k1m1n24 KERNEL_h_k1m1n20 "vfmadd231ps (%%r15,%%r12,2),%%xmm1,%%xmm9;"
#define KERNEL_k1m1n24 KERNEL_h_k1m1n24 "addq $16,%%r15;"
#define unit_save_m1n4(c1) \
#define end_load_a_k1m1(k_no) "vbroadcastss "#k_no"*4(%0),%%xmm1;"
#define end_acc_nc2_k1m1(k_no) "vfmadd231ps "#k_no"*16(%1,%%r12,1),%%xmm1,%%xmm5;"
#define end_acc_nc3_k1m1(k_no) "vfmadd231ps "#k_no"*16(%1,%%r12,2),%%xmm1,%%xmm6;"
#define end_acc_nc4_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15),%%xmm1,%%xmm7;"
#define end_acc_nc5_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define end_acc_nc6_k1m1(k_no) "vfmadd231ps "#k_no"*16(%%r15,%%r12,2),%%xmm1,%%xmm9;"
#ifdef TRMMKERNEL
#define unit_save_m1n4(c1) \
"vmulps "#c1",%%xmm0,"#c1"; vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
"vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
"vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
#else
#define unit_save_m1n4(c1) \
"vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
"vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
"vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
"vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
"vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
#endif
#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
#define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%xmm5)
#define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%xmm6)
@ -314,58 +568,102 @@
#define SAVE_h_m1n24 SAVE_h_m1n20 unit_save_m1n4(%%xmm9)
#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $4,%2;"
#define COMPUTE_m1(ndim) \
INIT_m1n##ndim\
"movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\
INIT_m1n##ndim START_SET_PAPB(1,ndim)\
"movq %%r13,%4;"\
KERNEL_HEAD_C_n##ndim(1)\
"testq %4,%4; jz "#ndim"001012f;"\
#ndim"001011:\n\t"\
KERNEL_k1m1n##ndim "decq %4; jnz "#ndim"001011b;"\
#ndim"001012:\n\t"\
SAVE_m1(ndim)
KERNEL_TAIL_C_n##ndim(1)\
SAVE_m1(ndim) END_SET_PA(1)
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
/* %6 = "+r"(next_b), %7 = "m"(ALPHA), %8 = "m"(M) */
/* r11 = m_counter, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
/* %7 = "m"(ALPHA), %8 = "m"(M), %9 = "m"(K), %10 = "m"(off) */
#ifdef TRMMKERNEL
#if BACKWARDS == 1
#define OFFSET_TO_K "movq %9,%%r13; subq %10,%%r13;"
#else
#define OFFSET_TO_K "movq %10,%%r13;"
#endif
#else
#define OFFSET_TO_K "movq %9,%%r13;"
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
#if BACKWARDS == 1
#define START_UPDATE_OFFSET(ndim) {}
#define END_UPDATE_OFFSET(ndim) {off += (ndim);}
#else
#define START_UPDATE_OFFSET(ndim) {off += (ndim)>4 ? 4:(ndim);}
#define END_UPDATE_OFFSET(ndim) {off += (ndim)>4 ? ((ndim)-4):0;}
#endif
#else
#define START_UPDATE_OFFSET(ndim) {}
#define END_UPDATE_OFFSET(ndim) {}
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
#if BACKWARDS == 1
#define START_UPDATE_K(mdim) ""
#define END_UPDATE_K(mdim) "subq $"#mdim",%%r13;"
#else
#define START_UPDATE_K(mdim) "addq $"#mdim",%%r13;"
#define END_UPDATE_K(mdim) ""
#endif
#else
#define START_UPDATE_K(mdim) ""
#define END_UPDATE_K(mdim) ""
#endif
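/* Plain-C restatement of the TRMM bookkeeping above (illustration only; the
 * names mirror the asm registers and appear nowhere else in this file):
 *
 *   int64_t k_eff = TRMM ? (BACKWARDS ? K - off : off) : K;  // OFFSET_TO_K -> r13
 *
 *   LEFT kernels walk k_eff with the m tile:
 *     BACKWARDS == 0:  k_eff += mdim  before each tile (START_UPDATE_K)
 *     BACKWARDS == 1:  k_eff -= mdim  after  each tile (END_UPDATE_K)
 *   non-LEFT kernels advance `off` with the n panel instead:
 *     BACKWARDS == 0:  off += min(ndim,4) before, off += max(ndim-4,0) after
 *     BACKWARDS == 1:  off += ndim after the panel
 */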
#define COMPUTE(ndim) {\
next_b = b_pointer + ndim * K;\
__asm__ __volatile__(\
next_b = b_pointer + ndim * K; START_UPDATE_OFFSET(ndim)\
__asm__ __volatile__(\
"vbroadcastss %7,%%zmm0;"\
"movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\
OFFSET_TO_K "movq %9,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\
"cmpq $16,%%r11;jb 33101"#ndim"f;"\
"33109"#ndim":\n\t"\
COMPUTE_m16(ndim)\
START_UPDATE_K(16) COMPUTE_m16(ndim) END_UPDATE_K(16)\
"subq $16,%%r11;cmpq $16,%%r11;jnb 33109"#ndim"b;"\
"33101"#ndim":\n\t"\
"cmpq $8,%%r11;jb 33102"#ndim"f;"\
COMPUTE_m8(ndim)\
START_UPDATE_K(8) COMPUTE_m8(ndim) END_UPDATE_K(8)\
"subq $8,%%r11;"\
"33102"#ndim":\n\t"\
"cmpq $4,%%r11;jb 33103"#ndim"f;"\
COMPUTE_m4(ndim)\
START_UPDATE_K(4) COMPUTE_m4(ndim) END_UPDATE_K(4)\
"subq $4,%%r11;"\
"33103"#ndim":\n\t"\
"cmpq $2,%%r11;jb 33104"#ndim"f;"\
COMPUTE_m2(ndim)\
START_UPDATE_K(2) COMPUTE_m2(ndim) END_UPDATE_K(2)\
"subq $2,%%r11;"\
"33104"#ndim":\n\t"\
"testq %%r11,%%r11;jz 33105"#ndim"f;"\
COMPUTE_m1(ndim)\
START_UPDATE_K(1) COMPUTE_m1(ndim) END_UPDATE_K(1)\
"33105"#ndim":\n\t"\
"movq %%r13,%4; movq %%r14,%1; vzeroupper;"\
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b):"m"(ALPHA),"m"(M)\
:"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
"zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
"cc","memory");\
a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M;\
"movq %%r14,%1; vzeroupper;"\
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_counter),"+r"(ctemp),"+r"(next_b)\
:"m"(ALPHA),"m"(M),"m"(K),"m"(off):"r10","r11","r12","r13","r14","r15","cc","memory",\
"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\
"zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\
a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M; END_UPDATE_OFFSET(ndim)\
}
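/* Control flow of the asm block in COMPUTE(ndim), as a C sketch (the labels
 * in comments refer to the jump targets above; illustration only):
 *
 *   int64_t m_left = M;
 *   while (m_left >= 16) { COMPUTE_m16; m_left -= 16; }   // 33109 loop
 *   if (m_left >= 8)     { COMPUTE_m8;  m_left -= 8;  }   // 33101
 *   if (m_left >= 4)     { COMPUTE_m4;  m_left -= 4;  }   // 33102
 *   if (m_left >= 2)     { COMPUTE_m2;  m_left -= 2;  }   // 33103
 *   if (m_left)            COMPUTE_m1;                    // 33104
 *
 * with TRMM builds wrapping each tile in START/END_UPDATE_K as defined above.
 */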
int __attribute__ ((noinline))
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC
#ifdef TRMMKERNEL
,BLASLONG offset
#endif
)
{
if(m==0||n==0||k==0||alpha==(float)0.0) return 0;
if(m==0||n==0) return 0;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha;
int64_t M = (int64_t)m, K = (int64_t)k;
int64_t M = (int64_t)m, K = (int64_t)k, k_counter = K, off = 0;
BLASLONG n_count = n;
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
#ifdef TRMMKERNEL
#ifdef LEFT
off = offset;
#else
off = -offset;
#endif
#endif
for(;n_count>23;n_count-=24) COMPUTE(24)
for(;n_count>19;n_count-=20) COMPUTE(20)
for(;n_count>15;n_count-=16) COMPUTE(16)
@ -376,5 +674,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
if(n_count>0) COMPUTE(1)
return 0;
}
#include <immintrin.h>
#include "sgemm_direct_skylakex.c"
#ifndef TRMMKERNEL
#include <immintrin.h>
#include "sgemm_direct_skylakex.c"
#endif
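For reference, the column loop in CNAME above decomposes n greedily into panels of 24, 20, 16, 12, 8, 4, 2 and 1 (the `n_count>23`-style comparisons are simply `>= 24` and so on; only the 24-wide loop can repeat). A compact C sketch of the same dispatch, with compute_panel as a hypothetical stand-in for COMPUTE(width):

#include <stddef.h>
static void compute_panel(int width);   /* hypothetical stand-in */

static void dispatch_n(long n_count)
{
    static const int widths[] = {24, 20, 16, 12, 8, 4, 2, 1};
    for (size_t i = 0; i < sizeof widths / sizeof widths[0]; ++i)
        while (n_count >= widths[i]) {
            compute_panel(widths[i]);
            n_count -= widths[i];
        }
}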


@ -0,0 +1,424 @@
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */
/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */
/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
#define KERNEL_k1m8n1 \
"vmovups (%0),%%ymm1; addq $32,%0;"\
"vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
"addq $4,%1;"
#define KERNEL_h_k1m8n2 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
"vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;"
#define KERNEL_h_k1m8n4 \
KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff,...) \
"vbroadcastsd "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
"vbroadcastsd "#boff"+8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,%1,%%r12,4)
#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;"
#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,%1,%%r12,8)
#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;"
#define KERNEL_k2m8n4 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\
unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,%1)\
"vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\
unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,%1)\
"addq $32,%1;"
#define KERNEL_L_k1m8n6 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\
"vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"addq $16,%1;"
#define KERNEL_L_k2m8n6 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\
"vbroadcastsd (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastsd 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\
"vbroadcastsd 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastsd 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastsd 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"addq $32,%1;"
#define KERNEL_L_k1m16n6 \
"vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\
"vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"addq $16,%1;"
#define KERNEL_L_k2m16n6 \
"vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\
"vbroadcastss (%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastss 4(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastss 8(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vbroadcastss 12(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastss (%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastss 4(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\
"vbroadcastss 16(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastss 20(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastss 24(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vbroadcastss 28(%1) ,%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastss 16(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastss 20(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"addq $32,%1;"
#define KERNEL_R_k1m16n6 \
"vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8); addq $32,%0;"\
"vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"addq $16,%1;"
#define KERNEL_R_k2m16n6 \
"vmovups (%0),%%ymm1; vmovups (%0,%%r12,8),%%ymm2; prefetcht0 512(%0,%%r12,8);"\
"vbroadcastss 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastss 12(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastss (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vbroadcastss 4(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastss 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastss 12(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"vmovups 32(%0),%%ymm1; vmovups 32(%0,%%r12,8),%%ymm2; addq $64,%0;"\
"vbroadcastss 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastss 28(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastss 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vbroadcastss 20(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastss 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastss 28(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"addq $32,%1;"
#define KERNEL_R_k1m8n6 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0); addq $32,%0;"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"addq $16,%1;"
#define KERNEL_R_k2m8n6 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; prefetcht0 512(%0);"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm8; vfmadd231ps %%ymm2,%%ymm3,%%ymm9;"\
"vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; addq $64,%0;"\
"vbroadcastsd 24(%1,%%r12,4),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm10; vfmadd231ps %%ymm2,%%ymm3,%%ymm11;"\
"vbroadcastsd 16(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm12; vfmadd231ps %%ymm2,%%ymm3,%%ymm13;"\
"vbroadcastsd 24(%1,%%r12,8),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm14; vfmadd231ps %%ymm2,%%ymm3,%%ymm15;"\
"addq $32,%1;"
#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
#define unit_init_m8n4(c1,c2,c3,c4) \
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
#define INIT_m8n8 unit_init_m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
#define INIT_m8n4 INIT_m8n8
#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
#define INIT_m8n6 INIT_m8n12
#define INIT_m16n6 INIT_m8n12
#define SAVE_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);"
#define unit_save_m8n2(c1,c2) \
"vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\
"vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps (%5,%3,1),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",(%5,%3,1); leaq (%5,%3,2),%5;"
#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
#define SAVE_m8n4 "movq %2,%5;"\
"vaddps %%ymm4,%%ymm8,%%ymm4; vaddps %%ymm5,%%ymm9,%%ymm5; vaddps %%ymm6,%%ymm10,%%ymm6; vaddps %%ymm7,%%ymm11,%%ymm7;"\
unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7)
#define SAVE_m8n8 "movq %2,%5;"\
unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
#define unit_save_m16n2(c1,c2,c3,c4) \
"vfmadd213ps (%5),%%ymm0,"#c1"; vfmadd213ps 32(%5),%%ymm0,"#c2"; vmovups "#c1",(%5); vmovups "#c2",32(%5);"\
"vfmadd213ps (%5,%3,1),%%ymm0,"#c3"; vfmadd213ps 32(%5,%3,1),%%ymm0,"#c4"; vmovups "#c3",(%5,%3,1); vmovups "#c4",32(%5,%3,1); leaq (%5,%3,2),%5;"
#define SAVE_L_m16n6 "movq %2,%5;"\
unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
#define SAVE_R_m16n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\
unit_save_m16n2(%%ymm4,%%ymm5,%%ymm6,%%ymm7) unit_save_m16n2(%%ymm8,%%ymm9,%%ymm10,%%ymm11) unit_save_m16n2(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
#define SAVE_L_m8n6 "movq %2,%5;"\
"vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\
"vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\
unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9)
#define SAVE_R_m8n6 "leaq (%2,%3,4),%5; leaq (%5,%3,2),%5;"\
"vaddps %%ymm4,%%ymm10,%%ymm4; vaddps %%ymm5,%%ymm11,%%ymm5; vaddps %%ymm6,%%ymm12,%%ymm6;"\
"vaddps %%ymm7,%%ymm13,%%ymm7; vaddps %%ymm8,%%ymm14,%%ymm8; vaddps %%ymm9,%%ymm15,%%ymm9;"\
unit_save_m8n2(%%ymm4,%%ymm5) unit_save_m8n2(%%ymm6,%%ymm7) unit_save_m8n2(%%ymm8,%%ymm9)
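/* Note on the register budget above: the n=4 and n=6 paths keep two full
 * accumulator banks (INIT_m8n4 deliberately equals INIT_m8n8) so the k2
 * kernels can interleave two independent FMA chains, e.g. ymm4..ymm9 for
 * even k steps and ymm10..ymm15 for odd ones in the n6 case; the SAVE
 * macros then fold the banks with vaddps before the alpha scaling:
 *
 *   acc0 += a[2*t]   * b[2*t];      // even k step
 *   acc1 += a[2*t+1] * b[2*t+1];    // odd  k step
 *   C = alpha * (acc0 + acc1) + C;  // vaddps + vfmadd213ps at save time
 */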
/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
#define KERNEL_k1m4n1 \
"vmovups (%0),%%xmm1; addq $16,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"addq $4,%1;"
#define KERNEL_h_k1m4n2 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
"vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;"
#define KERNEL_h_k1m4n4 \
KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
"vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
"vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,4)
#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,8)
#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
#define unit_init_m4n4(c1,c2,c3,c4) \
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
#define SAVE_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);"
#define unit_save_m4n2(c1,c2) \
"vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\
"vfmadd213ps (%5),%%xmm0,"#c1"; vmovups "#c1",(%5);"\
"vfmadd213ps (%5,%3,1),%%xmm0,"#c2"; vmovups "#c2",(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7)
#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define KERNEL_k1m2n1 \
"vmovsd (%0),%%xmm1; addq $8,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"addq $4,%1;"
#define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define KERNEL_k1m2n2 \
"vmovsd (%0),%%xmm1; addq $8,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
"addq $8,%1;"
#define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
#define INIT_m2n4 INIT_m2n2
#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
#define KERNEL_k1m2n4 \
"vmovups (%1),%%xmm3; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"addq $8,%0;"
#define KERNEL_k1m2n8 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
"vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
"addq $8,%0;"
#define KERNEL_k1m2n12 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\
"vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
"vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
"addq $8,%0;"
#define unit_save_m2n4(c1,c2) \
"vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
"vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
"vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)
/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define KERNEL_k1m1n1 \
"vmovss (%1),%%xmm3; addq $4,%1;"\
"vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
#define INIT_m1n2 INIT_m1n1
#define KERNEL_k1m1n2 \
"vmovsd (%1),%%xmm3; addq $8,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define SAVE_m1n2 \
"vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
"vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
#define INIT_m1n4 INIT_m1n2
#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
#define KERNEL_k1m1n4 \
"vmovups (%1),%%xmm3; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define KERNEL_k1m1n8 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
"addq $4,%0;"
#define KERNEL_k1m1n12 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\
"vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
"addq $4,%0;"
#define unit_save_m1n4(c1) \
"vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
"vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
"vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
"vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
"vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5)
#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6)
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */
/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */
#define COMPUTE_SIMPLE(mdim,ndim) \
"movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %4,%4; jz 7"#mdim"7"#ndim"2f;"\
"7"#mdim"7"#ndim"1:\n\t"\
KERNEL_k1m##mdim##n##ndim "decq %4; jnz 7"#mdim"7"#ndim"1b;"\
"7"#mdim"7"#ndim"2:\n\t"\
SAVE_m##mdim##n##ndim "addq $"#mdim"*4,%2;"
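/* COMPUTE_SIMPLE(mdim,ndim), restated as a C sketch (illustration only):
 *
 *   k_count   = r12 >> 2;       // r12 holds k*sizeof(float); sarq $2 recovers k
 *   b_pointer = b_head_pos;     // movq %%r14,%1
 *   INIT_m{mdim}n{ndim};        // zero the accumulators
 *   while (k_count--) KERNEL_k1m{mdim}n{ndim};
 *   SAVE_m{mdim}n{ndim};
 *   c_pointer += mdim;          // "addq $mdim*4,%2" advances C by mdim floats
 */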
#define COMPUTE_m8n1 COMPUTE_SIMPLE(8,1)
#define COMPUTE_m8n2 COMPUTE_SIMPLE(8,2)
#define COMPUTE_m8n8 COMPUTE_SIMPLE(8,8)
#define COMPUTE_m8n12 COMPUTE_SIMPLE(8,12)
#define COMPUTE_m8n4 \
"movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n4\
"cmpq $8,%4; jb 78740f;"\
"78749:\n\t"\
KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4 KERNEL_k2m8n4\
"subq $8,%4; cmpq $8,%4; jnb 78749b;"\
"78740:\n\t"\
"testq %4,%4; jz 78742f;"\
"78741:\n\t"\
KERNEL_k1m8n4 "decq %4; jnz 78741b;"\
"78742:\n\t"\
SAVE_m8n4 "addq $32,%2;"
#define COMPUTE_L_m16n6 \
"movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\
"movq %%r13,%4; movq %2,%5; cmpq $16,%%r13; jb 7116762f; movq $14,%4;"\
"7116761:\n\t"\
KERNEL_L_k2m16n6 "prefetcht0 128(%1); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\
KERNEL_L_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\
KERNEL_L_k2m16n6 "prefetcht0 128(%1); prefetcht1 (%6); cmpq $198,%4; cmoveq %2,%5;"\
KERNEL_L_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7116761b;"\
"movq %2,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\
"7116762:\n\t"\
"xorq %%r15,%%r15; testq %4,%4; jz 7116764f;"\
"7116763:\n\t"\
"prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\
KERNEL_L_k1m16n6 "cmpq $6,%%r15; cmoveq %2,%5; decq %4; jnz 7116763b;"\
"7116764:\n\t"\
SAVE_L_m16n6 "addq $32,%2;"
#define COMPUTE_R_m16n6 \
"movq %%r12,%%r13; sarq $2,%%r13; movq %%r14,%1;" INIT_m16n6\
"movq %%r13,%4; leaq (%2,%3,4),%5; leaq (%5,%3,2),%5; movq %5,%%r10; cmpq $16,%%r13; jb 7216762f; movq $14,%4;"\
"7216761:\n\t"\
KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); testq $24,%4; movq $84,%%r15; cmovz %3,%%r15;"\
KERNEL_R_k2m16n6 "prefetcht1 (%5); subq $63,%5; addq %%r15,%5;"\
KERNEL_R_k2m16n6 "prefetcht0 128(%1,%%r12,8); prefetcht1 (%6); cmpq $198,%4; cmoveq %%r10,%5;"\
KERNEL_R_k2m16n6 "addq $16,%6; addq $8,%4; cmpq %4,%%r13; jnb 7216761b;"\
"movq %%r10,%5; negq %4; leaq 14(%%r13,%4,1),%4;"\
"7216762:\n\t"\
"xorq %%r15,%%r15; testq %4,%4; jz 7216764f;"\
"7216763:\n\t"\
"prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5; incq %%r15;"\
KERNEL_R_k1m16n6 "cmpq $6,%%r15; cmoveq %%r10,%5; decq %4; jnz 7216763b;"\
"7216764:\n\t"\
"prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_R_m16n6 "addq $32,%2;"
#define COMPUTE_H_m8n6 \
"movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\
"cmpq $8,%4; jb 718760f; movq %2,%5; xorq %%r15,%%r15;"\
"718769:\n\t"\
KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\
KERNEL_L_k2m8n6 KERNEL_L_k2m8n6 "prefetcht2 (%5); leaq -31(%5,%%r15,1),%5;"\
"subq $8,%4; cmpq $8,%4; jnb 718769b;"\
"718760:\n\t"\
"testq %4,%4; jz 718762f;"\
"718761:\n\t"\
KERNEL_L_k1m8n6 "decq %4; jnz 718761b;"\
"718762:\n\t"\
SAVE_L_m8n6 "negq %%r12; leaq (%0,%%r12,8),%0; negq %%r12;"
#define COMPUTE_T_m8n6(side,sim) \
"movq %%r12,%4; sarq $2,%4; movq %%r14,%1;" INIT_m8n6\
"cmpq $8,%4; jb 72"#sim"8760f;"\
"72"#sim"8769:\n\t"\
KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6 KERNEL_##side##_k2m8n6\
"subq $8,%4; cmpq $8,%4; jnb 72"#sim"8769b;"\
"72"#sim"8760:\n\t"\
"testq %4,%4; jz 72"#sim"8762f;"\
"72"#sim"8761:\n\t"\
KERNEL_##side##_k1m8n6 "decq %4; jnz 72"#sim"8761b;"\
"72"#sim"8762:\n\t"\
SAVE_##side##_m8n6 "addq $32,%2;"
#define COMPUTE_NORMAL(ndim) {\
next_b = b_pointer + ndim * K;\
__asm__ __volatile__(\
"vbroadcastss %9,%%ymm0;"\
"movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\
"cmpq $8,%%r11;jb 33101"#ndim"f;"\
"33109"#ndim":\n\t"\
COMPUTE_m8n##ndim\
"subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\
"33101"#ndim":\n\t"\
"cmpq $4,%%r11;jb 33103"#ndim"f;"\
COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\
"33103"#ndim":\n\t"\
"cmpq $2,%%r11;jb 33104"#ndim"f;"\
COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\
"33104"#ndim":\n\t"\
"testq %%r11,%%r11;jz 33105"#ndim"f;"\
COMPUTE_SIMPLE(1,ndim)\
"33105"#ndim":\n\t"\
"movq %%r14,%1; vzeroupper;"\
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\
:"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\
}
#define COMPUTE_n12 {\
next_b = b_pointer + 12 * K;\
__asm__ __volatile__(\
"vbroadcastss %9,%%ymm0;"\
"movq %8,%%r12; salq $2,%%r12; movq %1,%%r14; movq %7,%%r11;"\
"cmpq $16,%%r11;jb 3310112f;"\
COMPUTE_H_m8n6\
"3310612:\n\t"\
COMPUTE_R_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jb 3310712f;"\
COMPUTE_L_m16n6 "subq $8,%%r11; cmpq $16,%%r11;jnb 3310612b;"\
COMPUTE_T_m8n6(R,5) "subq $8,%%r11; jmp 3310212f;"\
"3310712:\n\t"\
COMPUTE_T_m8n6(L,7) "subq $8,%%r11; jmp 3310212f;"\
"3310112:\n\t"\
"cmpq $8,%%r11;jb 3310212f;"\
COMPUTE_SIMPLE(8,12) "subq $8,%%r11;"\
"3310212:\n\t"\
"cmpq $4,%%r11;jb 3310312f;"\
COMPUTE_SIMPLE(4,12) "subq $4,%%r11;"\
"3310312:\n\t"\
"cmpq $2,%%r11;jb 3310412f;"\
COMPUTE_SIMPLE(2,12) "subq $2,%%r11;"\
"3310412:\n\t"\
"testq %%r11,%%r11;jz 3310512f;"\
COMPUTE_SIMPLE(1,12)\
"3310512:\n\t"\
"movq %%r14,%1; vzeroupper;"\
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(ctemp),"+r"(next_b)\
:"m"(M),"m"(K),"m"(ALPHA):"r10","r11","r12","r13","r14","r15",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
a_pointer -= M * K; b_pointer += 12 * K; c_pointer += (LDC * 12 - M);\
}
#include "common.h"
#include <stdint.h>
int __attribute__ ((noinline))
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC){
if(m==0||n==0||k==0||alpha==(float)0.0) return 0;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);
float ALPHA = alpha;
int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0;
BLASLONG n_count = n;
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
for(;n_count>11;n_count-=12) COMPUTE_n12
for(;n_count>7;n_count-=8) COMPUTE_NORMAL(8)
for(;n_count>3;n_count-=4) COMPUTE_NORMAL(4)
for(;n_count>1;n_count-=2) COMPUTE_NORMAL(2)
if(n_count>0) COMPUTE_NORMAL(1)
return 0;
}
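The n=12 driver above leapfrogs two six-column halves down the M dimension: SAVE_L_* writes columns 0..5 at the current C pointer while SAVE_R_* starts at C + 6*ldc, and each m16n6 call overlaps its predecessor by eight rows so the A panel stays hot in cache. A simplified C sketch of the traversal for m >= 16 (left6/right6 are hypothetical stand-ins; any rows beyond the final eight fall through to the COMPUTE_SIMPLE ladder):

#include <stdint.h>
static void left6(int64_t row, int nrows);    /* cols 0..5,  hypothetical */
static void right6(int64_t row, int nrows);   /* cols 6..11, hypothetical */

static void compute_n12_wide(int64_t m)       /* assumes m >= 16 */
{
    left6(0, 8);                              /* COMPUTE_H_m8n6           */
    int64_t r = 0, half = 0;                  /* 0: right half comes next */
    while (m - r >= 16) {
        (half ? left6 : right6)(r, 16);       /* COMPUTE_R/L_m16n6        */
        r += 8; half ^= 1;
    }
    (half ? left6 : right6)(r, 8);            /* COMPUTE_T_m8n6 tail      */
}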


@ -50,7 +50,7 @@
"vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
#define KERNEL_2_k1m4n4 \
"vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
"vpermilpd $5,-64(%0),%%ymm0; vpermilpd $5,-32(%0),%%ymm1;"\
acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
@ -93,9 +93,9 @@
"movq $10,%5; movq $84,%%r15;"\
#ndim"4441:\n\t"\
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
#ndim"4442:\n\t"\

param.h

@ -1722,16 +1722,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_R xgemm_r
#define XGEMM_DEFAULT_Q 128
#define CGEMM3M_DEFAULT_UNROLL_N 8
#define CGEMM3M_DEFAULT_UNROLL_M 4
#define ZGEMM3M_DEFAULT_UNROLL_N 8
#define ZGEMM3M_DEFAULT_UNROLL_M 2
#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 4
#define ZGEMM3M_DEFAULT_UNROLL_M 4
#define CGEMM3M_DEFAULT_P 448
#define ZGEMM3M_DEFAULT_P 224
#define CGEMM3M_DEFAULT_P 320
#define ZGEMM3M_DEFAULT_P 256
#define XGEMM3M_DEFAULT_P 112
#define CGEMM3M_DEFAULT_Q 224
#define ZGEMM3M_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_Q 320
#define ZGEMM3M_DEFAULT_Q 256
#define XGEMM3M_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_R 12288
#define ZGEMM3M_DEFAULT_R 12288
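For context, the P/Q/R values in this file set the cache blocking of the GEMM loop nest (Q is the depth of the packed panels along k, P the rows of A packed per block, R the columns of B per outer pass), while the UNROLL_M/N pairs fix the register tile. A generic sketch of how such parameters drive a blocked GEMM; micro_kernel and the exact loop shape are illustrative, not OpenBLAS's actual driver:

static void micro_kernel(long ii, long jj, long kk);  /* hypothetical */

static void blocked_gemm(long m, long n, long k, long P, long Q, long R)
{
    for (long jj = 0; jj < n; jj += R)          /* R columns of B per pass    */
        for (long kk = 0; kk < k; kk += Q)      /* depth-Q slice, packed once */
            for (long ii = 0; ii < m; ii += P)  /* P rows of A sized to cache */
                micro_kernel(ii, jj, kk);       /* UNROLL_M x UNROLL_N tile   */
}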
@ -2620,7 +2620,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*FIXME: this should be using the cache size, but there is currently no easy way to
query that on ARM. So if getarch counted more than 8 cores we simply assume the host
is a big desktop or server with abundant cache rather than a phone or embedded device */
#if NUM_CORES > 8
#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180)
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
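The FIXME above still applies: getarch has no portable cache-size probe on ARM, hence the core-count heuristic. On Linux one could in principle read the size from sysfs; a hedged sketch (hypothetical helper, not part of this patch, and index3 may be absent on a given SoC):

#include <stdio.h>
/* hypothetical probe: last-level cache size of cpu0 via Linux sysfs */
static long llc_size_bytes(void)
{
    long kb = 0;
    FILE *f = fopen("/sys/devices/system/cpu/cpu0/cache/index3/size", "r");
    if (f && fscanf(f, "%ldK", &kb) == 1) { fclose(f); return kb * 1024L; }
    if (f) fclose(f);
    return -1;  /* unknown: fall back to the NUM_CORES heuristic above */
}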
@ -2705,6 +2705,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#elif defined(NEOVERSEN1)
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#else // Other/undetected ARMv8 cores
#define SGEMM_DEFAULT_UNROLL_M 16