Merge pull request #479 from wernsaar/develop

workaround for sandybridge zgemm kernel
This commit is contained in:
Zhang Xianyi 2014-12-23 00:59:41 +08:00
commit eb738148fe
34 changed files with 131 additions and 98 deletions

View File

@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
# ACML custom
ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
# ACML 6.1 custom
ACML=/home/saar/acml6.1/gfortran64_mp/lib
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
# Atlas Ubuntu
#ATLAS=/usr/lib/atlas-base

View File

@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){
}
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT result;
@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
FLOAT wkopt[4];
@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char trans='N';
blasint m, i, j;
blasint m, n, i, j;
int loops = 1;
int has_param_n=0;
int l;
char *p;
@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){
if ( p != NULL )
loops = atoi(p);
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
has_param_n=1;
}
#ifdef linux
srandom(getpid());
@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
if ( has_param_n == 1 && n <= m )
n=n;
else
n=m;
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
for (l=0; l<loops; l++)
{
@ -189,7 +202,7 @@ int MAIN__(int argc, char *argv[]){
gettimeofday( &start, (struct timezone *)0);
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
@ -202,11 +215,11 @@ int MAIN__(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a,*work;
FLOAT wkopt[4];
@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMINCOPY = zgemm_ncopy_1.S
ZGEMMITCOPY = zgemm_tcopy_1.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S

View File

@ -1092,18 +1092,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
vxorpd %xmm4 , %xmm4 , %xmm4
vxorpd %xmm5 , %xmm5 , %xmm5
vxorpd %ymm4 , %ymm4 , %ymm4
vxorpd %ymm5 , %ymm5 , %ymm5
vxorpd %ymm6 , %ymm6 , %ymm6
vxorpd %ymm7 , %ymm7 , %ymm7
.endm
/*
 * KERNEL4x1: unrolled inner-product step for a 4-wide by 1 tile.
 *
 * One expansion consumes 8 scalars from BO and 8 four-element vectors
 * (32 elements) from AO, then advances BO by 8*SIZE and AO by 32*SIZE.
 * Each scalar is broadcast to all four lanes and multiplied with the
 * matching vector from AO; products are accumulated into %ymm4..%ymm7
 * (two FMAs per accumulator). The macro only accumulates: it does not
 * clear, combine, or store %ymm4..%ymm7.
 *
 * Clobbers %ymm0..%ymm3 as broadcast temporaries.
 *
 * NOTE(review): AO, BO and SIZE are defined outside this macro; presumably
 * they are the packed-A pointer, packed-B pointer and element size in
 * bytes - confirm against the file's definitions. The negative starting
 * offsets (-16*SIZE on AO, -12*SIZE on BO) suggest both pointers are
 * biased by the caller - confirm.
 */
.macro KERNEL4x1
/* Broadcast the first four B scalars (offsets -12..-9) into ymm0..ymm3. */
vbroadcastsd -12 * SIZE(BO), %ymm0
vbroadcastsd -11 * SIZE(BO), %ymm1
vbroadcastsd -10 * SIZE(BO), %ymm2
vbroadcastsd -9 * SIZE(BO), %ymm3
/* ymm4 += ymm0 * A[-16..-13], ymm5 += ymm1 * A[-12..-9] (AT&T operand order). */
vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
/* ymm0/ymm1 are free again: reload them with B scalars 5 and 6 while the
   remaining FMAs of the first group are still to be issued. */
vbroadcastsd -8 * SIZE(BO), %ymm0
vbroadcastsd -7 * SIZE(BO), %ymm1
vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
/* Likewise reuse ymm2/ymm3 for B scalars 7 and 8. */
vbroadcastsd -6 * SIZE(BO), %ymm2
vbroadcastsd -5 * SIZE(BO), %ymm3
/* Second group of four FMAs, A offsets 0..15, same accumulators. */
vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
/* Step past the 8 B scalars and 8 x 4 A elements consumed above. */
addq $ 8 *SIZE, BO
addq $ 32*SIZE, AO
.endm
.macro KERNEL4x1_SUB
vmovddup -12 * SIZE(BO), %xmm2
vmovups -16 * SIZE(AO), %xmm0
vmovups -14 * SIZE(AO), %xmm1
vfmadd231pd %xmm0 ,%xmm2 , %xmm4
vfmadd231pd %xmm1 ,%xmm2 , %xmm5
vbroadcastsd -12 * SIZE(BO), %ymm2
vmovups -16 * SIZE(AO), %ymm0
vfmadd231pd %ymm0 ,%ymm2 , %ymm4
addq $ 1*SIZE, BO
addq $ 4*SIZE, AO
@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x1
vmovddup ALPHA, %xmm0
vbroadcastsd ALPHA, %ymm0
vmulpd %xmm0 , %xmm4 , %xmm4
vmulpd %xmm0 , %xmm5 , %xmm5
vaddpd %ymm4,%ymm5, %ymm4
vaddpd %ymm6,%ymm7, %ymm6
vaddpd %ymm4,%ymm6, %ymm4
vmulpd %ymm0 , %ymm4 , %ymm4
#if !defined(TRMMKERNEL)
vaddpd (CO1) , %xmm4, %xmm4
vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
vaddpd (CO1) , %ymm4, %ymm4
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm5 , 2 * SIZE(CO1)
vmovups %ymm4 , (CO1)
addq $ 4*SIZE, CO1
.endm
@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1
dec %rax
jne .L1_12
@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1
dec %rax
jne .L1_12

View File

@ -120,7 +120,7 @@
REAL RZERO
PARAMETER ( RZERO = 0.0 )
INTEGER NMAX, INCMAX
PARAMETER ( NMAX = 65, INCMAX = 2 )
PARAMETER ( NMAX = 128, INCMAX = 2 )
INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
$ NALMAX = 7, NBEMAX = 7 )

View File

@ -102,7 +102,7 @@
REAL RZERO
PARAMETER ( RZERO = 0.0 )
INTEGER NMAX
PARAMETER ( NMAX = 65 )
PARAMETER ( NMAX = 128 )
INTEGER NIDMAX, NALMAX, NBEMAX
PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
* .. Local Scalars ..

View File

@ -117,7 +117,7 @@
DOUBLE PRECISION ZERO, ONE
PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
INTEGER NMAX, INCMAX
PARAMETER ( NMAX = 65, INCMAX = 2 )
PARAMETER ( NMAX = 128, INCMAX = 2 )
INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
$ NALMAX = 7, NBEMAX = 7 )

View File

@ -97,7 +97,7 @@
DOUBLE PRECISION ZERO, ONE
PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
INTEGER NMAX
PARAMETER ( NMAX = 65 )
PARAMETER ( NMAX = 128 )
INTEGER NIDMAX, NALMAX, NBEMAX
PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
* .. Local Scalars ..

View File

@ -117,7 +117,7 @@
REAL ZERO, ONE
PARAMETER ( ZERO = 0.0, ONE = 1.0 )
INTEGER NMAX, INCMAX
PARAMETER ( NMAX = 65, INCMAX = 2 )
PARAMETER ( NMAX = 128, INCMAX = 2 )
INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
$ NALMAX = 7, NBEMAX = 7 )

View File

@ -97,7 +97,7 @@
REAL ZERO, ONE
PARAMETER ( ZERO = 0.0, ONE = 1.0 )
INTEGER NMAX
PARAMETER ( NMAX = 65 )
PARAMETER ( NMAX = 128 )
INTEGER NIDMAX, NALMAX, NBEMAX
PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
* .. Local Scalars ..

View File

@ -121,7 +121,7 @@
DOUBLE PRECISION RZERO
PARAMETER ( RZERO = 0.0D0 )
INTEGER NMAX, INCMAX
PARAMETER ( NMAX = 65, INCMAX = 2 )
PARAMETER ( NMAX = 128, INCMAX = 2 )
INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX
PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7,
$ NALMAX = 7, NBEMAX = 7 )

View File

@ -104,7 +104,7 @@
DOUBLE PRECISION RZERO
PARAMETER ( RZERO = 0.0D0 )
INTEGER NMAX
PARAMETER ( NMAX = 65 )
PARAMETER ( NMAX = 128 )
INTEGER NIDMAX, NALMAX, NBEMAX
PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
* .. Local Scalars ..

View File

@ -1,11 +1,11 @@
SEP: Data file for testing Symmetric Eigenvalue Problem routines
6 Number of values of N
0 1 2 3 5 20 Values of N (dimension)
8 Number of values of N
0 1 2 3 5 19 20 21 Values of N (dimension)
5 Number of values of NB
1 3 3 3 10 Values of NB (blocksize)
2 2 2 2 2 Values of NBMIN (minimum blocksize)
1 0 5 9 1 Values of NX (crossover point)
60.0 Threshold value
160.0 Threshold value
T Put T to test the LAPACK routines
T Put T to test the driver routines
T Put T to test the error exits

View File

@ -1129,7 +1129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 4