Merge pull request #63 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-06-09 15:54:30 +02:00 committed by GitHub
commit fdd1b50263
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 127 additions and 1491 deletions

View File

@ -783,6 +783,7 @@ endif
ifeq ($(F_COMPILER), FLANG)
CCOMMON_OPT += -DF_INTERFACE_FLANG
FCOMMON_OPT += -frecursive
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@ -796,6 +797,11 @@ endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
ifeq ($(OSNAME), Linux)
ifeq ($(ARCH), x86_64)
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
endif
endif
endif
ifeq ($(F_COMPILER), G77)
@ -1270,8 +1276,11 @@ endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
ifeq ($(FLANG_VENDOR),AOCC)
override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT)
else
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
endif
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =

View File

@ -1825,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX)
##################################### Sgeev ####################################################
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
sgeev.acml : sgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1841,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX)
##################################### Dgeev ####################################################
dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dgeev.acml : dgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1858,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX)
##################################### Cgeev ####################################################
cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
cgeev.acml : cgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1875,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX)
##################################### Zgeev ####################################################
zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
zgeev.acml : zgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1891,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX)
##################################### Sgetri ####################################################
sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
sgetri.acml : sgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1907,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX)
##################################### Dgetri ####################################################
dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dgetri.acml : dgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1924,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX)
##################################### Cgetri ####################################################
cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
cgetri.acml : cgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@ -1941,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX)
##################################### Zgetri ####################################################
zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
zgetri.acml : zgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

View File

@ -170,9 +170,11 @@ int main(int argc, char *argv[]){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
#ifdef RETURN_BY_STACK
DOT (&result , &m, x, &inc_x, y, &inc_y );
#else
result = DOT (&m, x, &inc_x, y, &inc_y );
#endif
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

View File

@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
#endif
do {
while (*address) {YIELDING;};
while (*address) {YIELDING;}
#ifndef C_MSVC
__asm__ __volatile__(
@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){
#else
extern unsigned int blas_quick_divide_table[];
static __inline int blas_quickdivide(unsigned int x, unsigned int y){
static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
unsigned int result;
volatile unsigned int result;
if (y <= 1) return x;
@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
return result;
}
#endif

View File

@ -334,7 +334,7 @@ if ($link ne "") {
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/))
&& ($flags !~ /[0-9]+/)
&& ($flags !~ /^\-l$/)
) {

File diff suppressed because it is too large Load Diff

View File

@ -54,40 +54,40 @@
#define kernel_kstart_n10(mdim,updk) ""
#define kernel_kstart_n12(mdim,updk) ""
#define kernel_kend_n4(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16)
#define kernel_kend_n6(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48)
#define kernel_kend_n8(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80)
#define kernel_kend_n10(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112)
#define kernel_kend_n12(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144)
#endif
#else
#define HEAD_SET_OFF(ndim) {}
@ -129,18 +129,28 @@
#define init_update_k(mdim) ""
#define save_update_k(mdim) ""
#endif
#define KERNEL_h_k1m16n1 \
"vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\
"vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;"
#define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;"
#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\
#ifdef BROADCAST_KERNEL
#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\
"vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;"
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;"
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\
"vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\
"vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__)
"vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__)
#else
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\
"vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\
"vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__)
#define KERNEL_h_k1m16n2 \
"vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\
unit_acc_m16n2(8,9,10,11,%1)
#endif
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;"
#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1)
#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
#define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;"
@ -151,24 +161,42 @@
#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2)
#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#ifdef BROADCAST_KERNEL
#define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;"
#define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1)
#define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2)
#define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15)
#define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1)
#define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2)
#else
#define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;"
#endif
#define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15)
#define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2)
#endif
#define save_init_m16 "movq %2,%3; addq $128,%2;"
#ifdef TRMMKERNEL
#define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
#ifdef BROADCAST_KERNEL
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
"vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\
"vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\
"vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#else
#define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
#ifdef BROADCAST_KERNEL
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
"vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\
"vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\
"vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#endif
#define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11)
#define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15)
@ -206,11 +234,11 @@
#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;"
#define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15)
#define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2)
#define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15)
#define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2)
#endif
#define save_init_m8 "movq %2,%3; addq $64,%2;"
#ifdef TRMMKERNEL
@ -258,11 +286,11 @@
#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;"
#define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2)
#define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2)
#endif
#define save_init_m4 "movq %2,%3; addq $32,%2;"
#ifdef TRMMKERNEL
@ -311,11 +339,11 @@
#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;"
#define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2)
#define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2)
#endif
#define save_init_m2 "movq %2,%3; addq $16,%2;"
#ifdef TRMMKERNEL
@ -362,11 +390,11 @@
#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;"
#define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;"
#define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;"
#define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;"
#define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;"
#define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;"
#define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;"
#define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;"
#define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;"
#endif
#define save_init_m1 "movq %2,%3; addq $8,%2;"
#ifdef TRMMKERNEL

View File

@ -200,7 +200,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO ) THEN

View File

@ -161,7 +161,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN

View File

@ -163,7 +163,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R

View File

@ -161,7 +161,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R

View File

@ -163,7 +163,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R

View File

@ -161,7 +161,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R

View File

@ -201,7 +201,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO ) THEN

View File

@ -161,7 +161,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN

View File

@ -72,12 +72,13 @@ main (int argc, char *argv[])
{
int m, n, k;
int i, j, l;
int x;
int ret = 0;
int loop = 100;
char transA = 'N', transB = 'N';
float alpha = 1.0, beta = 0.0;
for (int x = 0; x <= loop; x++)
for (x = 0; x <= loop; x++)
{
m = k = n = x;
float A[m * k];
@ -86,9 +87,9 @@ main (int argc, char *argv[])
bfloat16_bits AA[m * k], BB[k * n];
float DD[m * n], CC[m * n];
for (int j = 0; j < m; j++)
for (j = 0; j < m; j++)
{
for (int i = 0; i < m; i++)
for (i = 0; i < m; i++)
{
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;