From 200296b0f4c9f070632d29d4ed8ebb738914a2dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 May 2020 13:23:51 +0200 Subject: [PATCH 01/11] remove libomp from link list only for pgfortran at least the AMD (aocc) flavor of flang wants to link to a (real or dummy) libomp by default --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index d702044cc..fd06a6516 100644 --- a/f_check +++ b/f_check @@ -334,7 +334,7 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/) + && ($vendor =~ /PGI/ && $flags !~ /omp/) && ($flags !~ /[0-9]+/) && ($flags !~ /^\-l$/) ) { From 6e270f91ec9bd610678b4d4a9026a653a402d9ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 May 2020 13:29:10 +0200 Subject: [PATCH 02/11] add support for RETURN_BY_STACK semantics, e.g. clang --- benchmark/zdot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmark/zdot.c b/benchmark/zdot.c index ed9d4d2e8..136135c9c 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } gettimeofday( &start, (struct timezone *)0); - +#ifdef RETURN_BY_STACK + DOT (&result , &m, x, &inc_x, y, &inc_y ); +#else result = DOT (&m, x, &inc_x, y, &inc_y ); - +#endif gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; From ced49466f05d4e96b46d377b0a1374f4a72aafe8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 May 2020 13:35:51 +0200 Subject: [PATCH 03/11] Use the fortran compiler to link LAPACK-related benchmarks to fix linking problems with (at least) the AMD version of flang that creates dependencies on more than just the fortran runtime. --- benchmark/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 53f422be4..2f70ceaf3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1825,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) ##################################### Sgeev #################################################### sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgeev.acml : sgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1841,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) ##################################### Dgeev #################################################### dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgeev.acml : dgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1858,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) ##################################### Cgeev #################################################### cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgeev.acml : cgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1875,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) ##################################### Zgeev #################################################### zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgeev.acml : zgeev.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1891,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) ##################################### Sgetri #################################################### sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm sgetri.acml : sgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1907,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) ##################################### Dgetri #################################################### dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm dgetri.acml : dgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1924,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX) ##################################### Cgetri #################################################### cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm cgetri.acml : cgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) @@ -1941,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) ##################################### Zgetri #################################################### zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm zgetri.acml : zgetri.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) From 86552bf4c74708fb53dd69253de8ef7dd948170a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 31 May 2020 15:22:12 +0200 Subject: [PATCH 04/11] Update f_check --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index fd06a6516..17d863224 100644 --- a/f_check +++ b/f_check @@ -334,7 +334,7 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($vendor =~ /PGI/ && $flags !~ /omp/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) && ($flags !~ /[0-9]+/) && ($flags !~ /^\-l$/) ) { From 0464e662ad1257c9624170cc332ab8edc5906acb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 5 Jun 2020 10:03:36 +0200 Subject: [PATCH 05/11] make blas_quickdivide unsigned and guard against miscompilation --- common_x86_64.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 0247674cd..15d0c30aa 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ #endif do { - while (*address) {YIELDING;}; + while (*address) {YIELDING;} #ifndef C_MSVC __asm__ __volatile__( @@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ #else extern unsigned int blas_quick_divide_table[]; -static __inline int blas_quickdivide(unsigned int x, unsigned int y){ +static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ - unsigned int result; + volatile unsigned int result; if (y <= 1) return x; @@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); - return result; } #endif From 7f60fb6b91e1b9d4af39ae7b05717aea374bcee5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 5 Jun 2020 10:04:16 +0200 Subject: [PATCH 06/11] Delete spurious copy of common_param.h --- kernel/common_param.h | 1403 ----------------------------------------- 1 file changed, 1403 deletions(-) delete mode 100644 kernel/common_param.h diff --git a/kernel/common_param.h b/kernel/common_param.h deleted file mode 100644 index 29bb65e5c..000000000 --- a/kernel/common_param.h +++ /dev/null @@ -1,1403 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#ifndef COMMON_PARAM_H -#define COMMON_PARAM_H - -#ifndef ASSEMBLER - -#ifdef DYNAMIC_ARCH - -typedef struct { - int dtb_entries; - int offsetA, offsetB, align; - -#if 1 - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; - - float (*shamax_k) (BLASLONG, float *, BLASLONG); - float (*shamin_k) (BLASLONG, float *, BLASLONG); - float (*shmax_k) (BLASLONG, float *, BLASLONG); - float (*shmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); - - float (*shnrm2_k) (BLASLONG, float *, BLASLONG); - float (*shasum_k) (BLASLONG, float *, BLASLONG); - float (*shsum_k) (BLASLONG, float *, BLASLONG); - int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - - int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - -#endif - int sgemm_p, sgemm_q, sgemm_r; - int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; - - int exclusive_cache; - - float (*samax_k) (BLASLONG, float *, BLASLONG); - float (*samin_k) (BLASLONG, float *, BLASLONG); - float (*smax_k) (BLASLONG, float *, BLASLONG); - float (*smin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - - float (*snrm2_k) (BLASLONG, float *, BLASLONG); - float (*sasum_k) (BLASLONG, float *, BLASLONG); - float (*ssum_k) (BLASLONG, float *, BLASLONG); - int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); - int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - - int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int dgemm_p, dgemm_q, dgemm_r; - int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; - - double (*damax_k) (BLASLONG, double *, BLASLONG); - double (*damin_k) (BLASLONG, double *, BLASLONG); - double (*dmax_k) (BLASLONG, double *, BLASLONG); - double (*dmin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); - - double (*dnrm2_k) (BLASLONG, double *, BLASLONG); - double (*dasum_k) (BLASLONG, double *, BLASLONG); - double (*dsum_k) (BLASLONG, double *, BLASLONG); - int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); - int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int qgemm_p, qgemm_q, qgemm_r; - int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; - - xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); - - xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*qtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - int cgemm_p, cgemm_q, cgemm_r; - int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - - float (*camax_k) (BLASLONG, float *, BLASLONG); - float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); - - float (*cnrm2_k) (BLASLONG, float *, BLASLONG); - float (*casum_k) (BLASLONG, float *, BLASLONG); - float (*csum_k) (BLASLONG, float *, BLASLONG); - int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - - int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int cgemm3m_p, cgemm3m_q, cgemm3m_r; - int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn; - - int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); - - int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - - int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); - - int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - - int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); - - int zgemm_p, zgemm_q, zgemm_r; - int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; - - double (*zamax_k) (BLASLONG, double *, BLASLONG); - double (*zamin_k) (BLASLONG, double *, BLASLONG); -BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); -BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); - - double (*znrm2_k) (BLASLONG, double *, BLASLONG); - double (*zasum_k) (BLASLONG, double *, BLASLONG); - double (*zsum_k) (BLASLONG, double *, BLASLONG); - int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - - int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - - int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); - - int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - - int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int zgemm3m_p, zgemm3m_q, zgemm3m_r; - int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn; - - int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); - - int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - - int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); - - int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - - int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - - int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); - int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - -#ifdef EXPRECISION - - int xgemm_p, xgemm_q, xgemm_r; - int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; - - xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); -BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); - - xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); - int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); - - int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_u) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); - - int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); - - int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - - int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int xgemm3m_p, xgemm3m_q, xgemm3m_r; - int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn; - - int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); - - int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - - int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - - int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - - int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); - -#endif - - - void (*init)(void); - - int snum_opt, dnum_opt, qnum_opt; - - int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); - int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); - int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); - int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); - - int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); - - int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); - - int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); - - int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); - - int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); - int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); - - int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); - int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); - - int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); - - int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); - - int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); - int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); - int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); - int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - -} gotoblas_t; - -extern gotoblas_t *gotoblas; - -#define DTB_ENTRIES gotoblas -> dtb_entries -#define GEMM_OFFSET_A gotoblas -> offsetA -#define GEMM_OFFSET_B gotoblas -> offsetB -#define GEMM_ALIGN gotoblas -> align - -#define HAVE_EX_L2 gotoblas -> exclusive_cache - -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn - -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn - -#define DGEMM_P gotoblas -> dgemm_p -#define DGEMM_Q gotoblas -> dgemm_q -#define DGEMM_R gotoblas -> dgemm_r -#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m -#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n -#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn - -#define QGEMM_P gotoblas -> qgemm_p -#define QGEMM_Q gotoblas -> qgemm_q -#define QGEMM_R gotoblas -> qgemm_r -#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m -#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n -#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn - -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn - -#define ZGEMM_P gotoblas -> zgemm_p -#define ZGEMM_Q gotoblas -> zgemm_q -#define ZGEMM_R gotoblas -> zgemm_r -#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m -#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n -#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn - -#define XGEMM_P gotoblas -> xgemm_p -#define XGEMM_Q gotoblas -> xgemm_q -#define XGEMM_R gotoblas -> xgemm_r -#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m -#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n -#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn - -#define CGEMM3M_P gotoblas -> cgemm3m_p -#define CGEMM3M_Q gotoblas -> cgemm3m_q -#define CGEMM3M_R gotoblas -> cgemm3m_r -#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m -#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n -#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn - -#define ZGEMM3M_P gotoblas -> zgemm3m_p -#define ZGEMM3M_Q gotoblas -> zgemm3m_q -#define ZGEMM3M_R gotoblas -> zgemm3m_r -#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m -#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n -#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn - -#define XGEMM3M_P gotoblas -> xgemm3m_p -#define XGEMM3M_Q gotoblas -> xgemm3m_q -#define XGEMM3M_R gotoblas -> xgemm3m_r -#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m -#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n -#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn - -#else - -#define DTB_ENTRIES DTB_DEFAULT_ENTRIES - -#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A -#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B -#define GEMM_ALIGN GEMM_DEFAULT_ALIGN - -#ifdef HAVE_EXCLUSIVE_CACHE -#define HAVE_EX_L2 1 -#else -#define HAVE_EX_L2 0 -#endif - -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN -#else -#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) -#endif - -#define SGEMM_P SGEMM_DEFAULT_P -#define SGEMM_Q SGEMM_DEFAULT_Q -#define SGEMM_R SGEMM_DEFAULT_R -#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#ifdef SGEMM_DEFAULT_UNROLL_MN -#define SGEMM_UNROLL_MN SGEMM_DEFAULT_UNROLL_MN -#else -#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) -#endif - -#define DGEMM_P DGEMM_DEFAULT_P -#define DGEMM_Q DGEMM_DEFAULT_Q -#define DGEMM_R DGEMM_DEFAULT_R -#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#ifdef DGEMM_DEFAULT_UNROLL_MN -#define DGEMM_UNROLL_MN DGEMM_DEFAULT_UNROLL_MN -#else -#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) -#endif - -#define QGEMM_P QGEMM_DEFAULT_P -#define QGEMM_Q QGEMM_DEFAULT_Q -#define QGEMM_R QGEMM_DEFAULT_R -#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - -#define CGEMM_P CGEMM_DEFAULT_P -#define CGEMM_Q CGEMM_DEFAULT_Q -#define CGEMM_R CGEMM_DEFAULT_R -#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#ifdef CGEMM_DEFAULT_UNROLL_MN -#define CGEMM_UNROLL_MN CGEMM_DEFAULT_UNROLL_MN -#else -#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) -#endif - -#define ZGEMM_P ZGEMM_DEFAULT_P -#define ZGEMM_Q ZGEMM_DEFAULT_Q -#define ZGEMM_R ZGEMM_DEFAULT_R -#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#ifdef ZGEMM_DEFAULT_UNROLL_MN -#define ZGEMM_UNROLL_MN ZGEMM_DEFAULT_UNROLL_MN -#else -#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) -#endif - -#define XGEMM_P XGEMM_DEFAULT_P -#define XGEMM_Q XGEMM_DEFAULT_Q -#define XGEMM_R XGEMM_DEFAULT_R -#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) - -#ifdef CGEMM3M_DEFAULT_UNROLL_N - -#define CGEMM3M_P CGEMM3M_DEFAULT_P -#define CGEMM3M_Q CGEMM3M_DEFAULT_Q -#define CGEMM3M_R CGEMM3M_DEFAULT_R -#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N)) - -#else - -#define CGEMM3M_P SGEMM_DEFAULT_P -#define CGEMM3M_Q SGEMM_DEFAULT_Q -#define CGEMM3M_R SGEMM_DEFAULT_R -#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) - -#endif - - -#ifdef ZGEMM3M_DEFAULT_UNROLL_N - -#define ZGEMM3M_P ZGEMM3M_DEFAULT_P -#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q -#define ZGEMM3M_R ZGEMM3M_DEFAULT_R -#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#else - -#define ZGEMM3M_P DGEMM_DEFAULT_P -#define ZGEMM3M_Q DGEMM_DEFAULT_Q -#define ZGEMM3M_R DGEMM_DEFAULT_R -#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) - -#endif - -#define XGEMM3M_P QGEMM_DEFAULT_P -#define XGEMM3M_Q QGEMM_DEFAULT_Q -#define XGEMM3M_R QGEMM_DEFAULT_R -#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) - - -#endif -#endif - -#ifndef COMPLEX -#if defined(XDOUBLE) -#define GEMM_P QGEMM_P -#define GEMM_Q QGEMM_Q -#define GEMM_R QGEMM_R -#define GEMM_UNROLL_M QGEMM_UNROLL_M -#define GEMM_UNROLL_N QGEMM_UNROLL_N -#define GEMM_UNROLL_MN QGEMM_UNROLL_MN -#define GEMM_DEFAULT_P QGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R QGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P DGEMM_P -#define GEMM_Q DGEMM_Q -#define GEMM_R DGEMM_R -#define GEMM_UNROLL_M DGEMM_UNROLL_M -#define GEMM_UNROLL_N DGEMM_UNROLL_N -#define GEMM_UNROLL_MN DGEMM_UNROLL_MN -#define GEMM_DEFAULT_P DGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R DGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P SGEMM_P -#define GEMM_Q SGEMM_Q -#define GEMM_R SGEMM_R -#define GEMM_UNROLL_M SGEMM_UNROLL_M -#define GEMM_UNROLL_N SGEMM_UNROLL_N -#define GEMM_UNROLL_MN SGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N -#endif -#else -#if defined(XDOUBLE) -#define GEMM_P XGEMM_P -#define GEMM_Q XGEMM_Q -#define GEMM_R XGEMM_R -#define GEMM_UNROLL_M XGEMM_UNROLL_M -#define GEMM_UNROLL_N XGEMM_UNROLL_N -#define GEMM_UNROLL_MN XGEMM_UNROLL_MN -#define GEMM_DEFAULT_P XGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R XGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M XGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N -#elif defined(DOUBLE) -#define GEMM_P ZGEMM_P -#define GEMM_Q ZGEMM_Q -#define GEMM_R ZGEMM_R -#define GEMM_UNROLL_M ZGEMM_UNROLL_M -#define GEMM_UNROLL_N ZGEMM_UNROLL_N -#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN -#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N -#else -#define GEMM_P CGEMM_P -#define GEMM_Q CGEMM_Q -#define GEMM_R CGEMM_R -#define GEMM_UNROLL_M CGEMM_UNROLL_M -#define GEMM_UNROLL_N CGEMM_UNROLL_N -#define GEMM_UNROLL_MN CGEMM_UNROLL_MN -#define GEMM_DEFAULT_P CGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R CGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N -#endif -#endif - -#ifdef XDOUBLE -#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N -#elif defined(DOUBLE) -#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N -#else -#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M -#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N -#endif - - -#ifndef QGEMM_DEFAULT_UNROLL_M -#define QGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef QGEMM_DEFAULT_UNROLL_N -#define QGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_M -#define XGEMM_DEFAULT_UNROLL_M 2 -#endif - -#ifndef XGEMM_DEFAULT_UNROLL_N -#define XGEMM_DEFAULT_UNROLL_N 2 -#endif - -#ifndef GEMM_THREAD -#define GEMM_THREAD gemm_thread_n -#endif - -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) -#endif - -#ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) -#endif - -#ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) -#endif - -#ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) -#endif - -#ifndef SNUMOPT -#define SNUMOPT 2 -#endif - -#ifndef DNUMOPT -#define DNUMOPT 2 -#endif - -#ifndef QNUMOPT -#define QNUMOPT 1 -#endif - -#ifndef GEMM3M_P -#ifdef XDOUBLE -#define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) -#define GEMM3M_P ZGEMM3M_P -#else -#define GEMM3M_P CGEMM3M_P -#endif -#endif - -#ifndef GEMM3M_Q -#ifdef XDOUBLE -#define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) -#define GEMM3M_Q ZGEMM3M_Q -#else -#define GEMM3M_Q CGEMM3M_Q -#endif -#endif - -#ifndef GEMM3M_R -#ifdef XDOUBLE -#define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) -#define GEMM3M_R ZGEMM3M_R -#else -#define GEMM3M_R CGEMM3M_R -#endif -#endif - - -#endif From 28915eed726404bd14ed2828d45fe5293c55603e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 5 Jun 2020 10:05:34 +0200 Subject: [PATCH 07/11] Cosmetic fixes for non-C99 compilers --- test/compare_sgemm_shgemm.c | 65 +++++++++---------------------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index 7e254f844..d37ae6851 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -46,83 +46,50 @@ typedef union } bits; } bfloat16_bits; -typedef union -{ - float v; - struct - { - uint32_t m:23; - uint32_t e:8; - uint32_t s:1; - } bits; -} float32_bits; - -float -float16to32 (bfloat16_bits f16) -{ - float32_bits f32; - f32.bits.s = f16.bits.s; - f32.bits.e = f16.bits.e; - f32.bits.m = (uint32_t) f16.bits.m << 16; - return f32.v; -} - int main (int argc, char *argv[]) { int m, n, k; int i, j, l; + int x; int ret = 0; int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; + char transa = 'N'; + char transb = 'N'; - for (int x = 0; x <= loop; x++) + for (x = 0; x <= loop; x++) { m = k = n = x; float A[m * k]; float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float DD[m * n], CC[m * n]; + float CC[m * n]; - for (int j = 0; j < m; j++) + for (j = 0; j < m; j++) { - for (int i = 0; i < m; i++) + for (i = 0; i < m; i++) { - A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; - DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + &m, BB, &k, &beta, CC, &m); + for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) - ret++; - if (transA == 'N' && transB == 'N') - { - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - { - DD[i * m + j] += - float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); - } - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (CC[i * m + j] != DD[i * m + j]) - ret++; - } + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) + ret++; } if (ret != 0) fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); From 0e3ac4a06bc3cce26d593f5b8acad20a6121d1ed Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 6 Jun 2020 14:56:57 +0800 Subject: [PATCH 08/11] Add files via upload --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 150 ++++++++++++--------- 1 file changed, 89 insertions(+), 61 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 416ace59b..9f2bf24e2 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -54,40 +54,40 @@ #define kernel_kstart_n10(mdim,updk) "" #define kernel_kstart_n12(mdim,updk) "" #define kernel_kend_n4(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) #define kernel_kend_n6(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) #define kernel_kend_n8(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) #define kernel_kend_n10(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) #define kernel_kend_n12(mdim) "xorq %3,%3;"\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\ - loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\ - loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\ - loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\ - loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\ - loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152) + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\ + loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\ + loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\ + loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\ + loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\ + loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144) #endif #else #define HEAD_SET_OFF(ndim) {} @@ -129,18 +129,28 @@ #define init_update_k(mdim) "" #define save_update_k(mdim) "" #endif - + #define KERNEL_h_k1m16n1 \ "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\ "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;" #define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;" -#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ +#ifdef BROADCAST_KERNEL + #define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\ "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;" -#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" -#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\ + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\ - "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" -#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__) + "vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) +#else + #define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\ + "vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\ + "vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";" + #define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__) + #define KERNEL_h_k1m16n2 \ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ + unit_acc_m16n2(8,9,10,11,%1) +#endif +#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) #define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;" #define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;" @@ -151,24 +161,42 @@ #define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2) #define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) + #ifdef BROADCAST_KERNEL #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;" - #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1) - #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2) - #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15) - #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1) - #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2) + #else + #define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;" + #endif + #define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15) + #define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2) #endif #define save_init_m16 "movq %2,%3; addq $128,%2;" #ifdef TRMMKERNEL #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #else #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;" + #ifdef BROADCAST_KERNEL #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\ "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;" + #else + #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\ + "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\ + "vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\ + "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;" + #endif #endif #define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11) #define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15) @@ -206,11 +234,11 @@ #define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;" - #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15) - #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15) + #define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2) #endif #define save_init_m8 "movq %2,%3; addq $64,%2;" #ifdef TRMMKERNEL @@ -258,11 +286,11 @@ #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" - #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m4 "movq %2,%3; addq $32,%2;" #ifdef TRMMKERNEL @@ -311,11 +339,11 @@ #define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" - #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) - #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) - #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15) - #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) - #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) + #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) + #define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2) + #define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15) + #define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1) + #define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2) #endif #define save_init_m2 "movq %2,%3; addq $16,%2;" #ifdef TRMMKERNEL @@ -362,11 +390,11 @@ #define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" - #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" - #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" - #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" - #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" - #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" + #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" + #define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;" + #define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;" + #define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;" + #define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;" #endif #define save_init_m1 "movq %2,%3; addq $8,%2;" #ifdef TRMMKERNEL From 13c28889a2c9a29ac781e51be747782a71cdaf50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jun 2020 15:22:27 +0200 Subject: [PATCH 09/11] Update "cosmetic fixes for non-C99 compilers" --- test/compare_sgemm_shgemm.c | 58 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c index d37ae6851..57aee7b8f 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_shgemm.c @@ -46,6 +46,27 @@ typedef union } bits; } bfloat16_bits; +typedef union +{ + float v; + struct + { + uint32_t m:23; + uint32_t e:8; + uint32_t s:1; + } bits; +} float32_bits; + +float +float16to32 (bfloat16_bits f16) +{ + float32_bits f32; + f32.bits.s = f16.bits.s; + f32.bits.e = f16.bits.e; + f32.bits.m = (uint32_t) f16.bits.m << 16; + return f32.v; +} + int main (int argc, char *argv[]) { @@ -56,8 +77,6 @@ main (int argc, char *argv[]) int loop = 100; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - char transa = 'N'; - char transb = 'N'; for (x = 0; x <= loop; x++) { @@ -66,30 +85,45 @@ main (int argc, char *argv[]) float B[k * n]; float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; - float CC[m * n]; + float DD[m * n], CC[m * n]; for (j = 0; j < m; j++) { for (i = 0; i < m; i++) { - A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5; + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; CC[j * k + i] = 0; + DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, - &m, B, &k, &beta, C, &m); + &m, B, &k, &beta, C, &m); SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); - + &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) - for (j = 0; j < m; j++) - for (l = 0; l < k; l++) - if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0) - ret++; + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; + if (transA == 'N' && transB == 'N') + { + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + { + DD[i * m + j] += + float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]); + } + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + for (l = 0; l < k; l++) + if (CC[i * m + j] != DD[i * m + j]) + ret++; + } } if (ret != 0) fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); From 4e28dc6353f9705d71bb1aa1bc0ae84dd1d2610b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jun 2020 00:05:02 +0200 Subject: [PATCH 10/11] Use only -O1 with AMD AOCC version of flang to prevent miscompilation of LAPACK codes and tests on Ryzen --- Makefile.system | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 547babaaa..252c816a9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -783,6 +783,7 @@ endif ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG +FCOMMON_OPT += -frecursive ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -796,6 +797,11 @@ endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -fopenmp endif +ifeq ($(OSNAME), Linux) +ifeq ($(ARCH), x86_64) +FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +endif +endif endif ifeq ($(F_COMPILER), G77) @@ -1270,8 +1276,11 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) - +ifeq ($(FLANG_VENDOR),AOCC) +override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) +else override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) +endif override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = From 522aaf53bfdc759cddfec6e25a701907702e9f7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jun 2020 14:30:20 +0200 Subject: [PATCH 11/11] Break out of potentially infinite rescaling loop in LAPACK xLARGV/xLARTG/xLARTGP Reference-LAPACK issue 411 --- lapack-netlib/SRC/clargv.f | 2 +- lapack-netlib/SRC/clartg.f | 2 +- lapack-netlib/SRC/dlartg.f | 2 +- lapack-netlib/SRC/dlartgp.f | 2 +- lapack-netlib/SRC/slartg.f | 2 +- lapack-netlib/SRC/slartgp.f | 2 +- lapack-netlib/SRC/zlargv.f | 2 +- lapack-netlib/SRC/zlartg.f | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/clargv.f b/lapack-netlib/SRC/clargv.f index ba53cae6f..36c5108df 100644 --- a/lapack-netlib/SRC/clargv.f +++ b/lapack-netlib/SRC/clargv.f @@ -200,7 +200,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/clartg.f b/lapack-netlib/SRC/clartg.f index da9a1cdef..baa68b657 100644 --- a/lapack-netlib/SRC/clartg.f +++ b/lapack-netlib/SRC/clartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN diff --git a/lapack-netlib/SRC/dlartg.f b/lapack-netlib/SRC/dlartg.f index 1c7c46f63..dc49986a0 100644 --- a/lapack-netlib/SRC/dlartg.f +++ b/lapack-netlib/SRC/dlartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/dlartgp.f b/lapack-netlib/SRC/dlartgp.f index 0cb0d2d13..334e416e8 100644 --- a/lapack-netlib/SRC/dlartgp.f +++ b/lapack-netlib/SRC/dlartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartg.f b/lapack-netlib/SRC/slartg.f index 784d4bc36..307c9c83a 100644 --- a/lapack-netlib/SRC/slartg.f +++ b/lapack-netlib/SRC/slartg.f @@ -163,7 +163,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/slartgp.f b/lapack-netlib/SRC/slartgp.f index ad76c94b4..f8be5f52b 100644 --- a/lapack-netlib/SRC/slartgp.f +++ b/lapack-netlib/SRC/slartgp.f @@ -161,7 +161,7 @@ F1 = F1*SAFMN2 G1 = G1*SAFMN2 SCALE = MAX( ABS( F1 ), ABS( G1 ) ) - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20) $ GO TO 10 R = SQRT( F1**2+G1**2 ) CS = F1 / R diff --git a/lapack-netlib/SRC/zlargv.f b/lapack-netlib/SRC/zlargv.f index 1e17983d5..f83ca1851 100644 --- a/lapack-netlib/SRC/zlargv.f +++ b/lapack-netlib/SRC/zlargv.f @@ -201,7 +201,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO ) THEN diff --git a/lapack-netlib/SRC/zlartg.f b/lapack-netlib/SRC/zlartg.f index 8989bb896..894b4ded0 100644 --- a/lapack-netlib/SRC/zlartg.f +++ b/lapack-netlib/SRC/zlartg.f @@ -161,7 +161,7 @@ FS = FS*SAFMN2 GS = GS*SAFMN2 SCALE = SCALE*SAFMN2 - IF( SCALE.GE.SAFMX2 ) + IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 ) $ GO TO 10 ELSE IF( SCALE.LE.SAFMN2 ) THEN IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN