diff --git a/CMakeLists.txt b/CMakeLists.txt index 35fd830ee..d59290c90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers) ####### option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) +option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) + option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) @@ -398,15 +400,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") endif() + if (${BUILD_LAPACK_DEPRECATED}) + set (BLD 1) + else () + set (BLD 0) + endif() + if (${BUILD_BFLOAT16}) + set (BBF16 1) + else () + set (BBF16 0) + endif() + if (${BUILD_SINGLE}) + set (BS 1) + else () + set (BS 0) + endif() + if (${BUILD_DOUBLE}) + set (BD 1) + else () + set (BD 0) + endif() + if (${BUILD_COMPLEX}) + set (BC 1) + else () + set (BC 0) + endif() + if (${BUILD_COMPLEX16}) + set (BZ 1) + else () + set (BZ 0) + endif() if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v 
--redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) else() add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f5e9dda91..71df13634 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -23,6 +23,9 @@ * Optimization on AMD Piledriver * Optimization on Intel Haswell +* Chris Sidebottom + * Optimizations and other improvements targeting AArch64 + ## Previous Developers * Zaheer Chothia @@ -212,4 +215,4 @@ In chronological order: * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. 
* Pablo Romero - * [2022-08] Fix building from sources for QNX \ No newline at end of file + * [2022-08] Fix building from sources for QNX diff --git a/Makefile b/Makefile index 144b3400d..3c4b8948a 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test .PHONY : all libs netlib $(RELA) test ctest shared install -.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test +.NOTPARALLEL : shared -all :: libs netlib $(RELA) tests shared +all :: tests @echo @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" @echo @@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif -tests : libs netlib $(RELA) shared +tests : shared ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS diff --git a/c_check b/c_check index e8f90e18a..9be152b12 100755 --- a/c_check +++ b/c_check @@ -35,9 +35,12 @@ if [ "`dirname \"$compiler_name\"`" != '.' 
]; then cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/" fi -bn=`basename $compiler_name` +bn=`basename "$compiler_name"` + case "$bn" in - *-*) cross_suffix="$cross_suffix${bn%-*}-" + *-*) if [ "$bn" != '-' ]; then + cross_suffix="$cross_suffix${bn%-*}-" + fi esac compiler="" diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 45dda8686..544e226ab 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -436,6 +436,7 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) @@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.f) @@ -930,6 +932,7 @@ if(USE_XBLAS) set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) endif() +if(BUILD_LAPACK_DEPRECATED) list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) @@ -943,6 +946,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) message(STATUS "Building deprecated routines") +endif() set(DSLASRC spotrs.c) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 3a9352197..be6a286fe 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -70,8 +70,6 @@ set(CSRC lapacke_cgeqlf_work.c lapacke_cgeqp3.c lapacke_cgeqp3_work.c - lapacke_cgeqpf.c - lapacke_cgeqpf_work.c lapacke_cgeqr.c lapacke_cgeqr_work.c lapacke_cgeqr2.c @@
-144,12 +142,8 @@ set(CSRC lapacke_cggqrf_work.c lapacke_cggrqf.c lapacke_cggrqf_work.c - lapacke_cggsvd.c - lapacke_cggsvd_work.c lapacke_cggsvd3.c lapacke_cggsvd3_work.c - lapacke_cggsvp.c - lapacke_cggsvp_work.c lapacke_cggsvp3.c lapacke_cggsvp3_work.c lapacke_cgtcon.c @@ -695,8 +689,6 @@ set(DSRC lapacke_dgeqlf_work.c lapacke_dgeqp3.c lapacke_dgeqp3_work.c - lapacke_dgeqpf.c - lapacke_dgeqpf_work.c lapacke_dgeqr.c lapacke_dgeqr_work.c lapacke_dgeqr2.c @@ -771,12 +763,8 @@ set(DSRC lapacke_dggqrf_work.c lapacke_dggrqf.c lapacke_dggrqf_work.c - lapacke_dggsvd.c - lapacke_dggsvd_work.c lapacke_dggsvd3.c lapacke_dggsvd3_work.c - lapacke_dggsvp.c - lapacke_dggsvp_work.c lapacke_dggsvp3.c lapacke_dggsvp3_work.c lapacke_dgtcon.c @@ -1275,8 +1263,6 @@ set(SSRC lapacke_sgeqlf_work.c lapacke_sgeqp3.c lapacke_sgeqp3_work.c - lapacke_sgeqpf.c - lapacke_sgeqpf_work.c lapacke_sgeqr.c lapacke_sgeqr_work.c lapacke_sgeqr2.c @@ -1351,12 +1337,8 @@ set(SSRC lapacke_sggqrf_work.c lapacke_sggrqf.c lapacke_sggrqf_work.c - lapacke_sggsvd.c - lapacke_sggsvd_work.c lapacke_sggsvd3.c lapacke_sggsvd3_work.c - lapacke_sggsvp.c - lapacke_sggsvp_work.c lapacke_sggsvp3.c lapacke_sggsvp3_work.c lapacke_sgtcon.c @@ -1849,8 +1831,6 @@ set(ZSRC lapacke_zgeqlf_work.c lapacke_zgeqp3.c lapacke_zgeqp3_work.c - lapacke_zgeqpf.c - lapacke_zgeqpf_work.c lapacke_zgeqr.c lapacke_zgeqr_work.c lapacke_zgeqr2.c @@ -1925,12 +1905,8 @@ set(ZSRC lapacke_zggqrf_work.c lapacke_zggrqf.c lapacke_zggrqf_work.c - lapacke_zggsvd.c - lapacke_zggsvd_work.c lapacke_zggsvd3.c lapacke_zggsvd3_work.c - lapacke_zggsvp.c - lapacke_zggsvp_work.c lapacke_zggsvp3.c lapacke_zggsvp3_work.c lapacke_zgtcon.c @@ -2401,6 +2377,12 @@ set(ZSRC lapacke_csyr_work.c lapacke_ilaver.c ) +if (BUILD_LAPACK_DEPRECATED) +list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) +list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c
lapacke_dggsvp.c lapacke_dggsvp_work.c) +list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) +list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) +endif() set(SRCX lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 1080ea974..809f48e95 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -268,7 +268,8 @@ int detect(void) #else #ifdef __APPLE__ sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; + if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1 + if (value == 3660830781) return CPU_VORTEX; //A15/M2 #endif return CPU_ARMV8; #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index ad13a8c8c..69cbba90e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1936,7 +1936,8 @@ static char *corename[] = { "ZEN", "SKYLAKEX", "DHYANA", - "COOPERLAKE" + "COOPERLAKE", + "SAPPHIRERAPIDS", }; static char *corename_lower[] = { @@ -1970,7 +1971,8 @@ static char *corename_lower[] = { "zen", "skylakex", "dhyana", - "cooperlake" + "cooperlake", + "sapphirerapids", }; @@ -2276,16 +2278,18 @@ int get_coretype(void){ return CORE_NEHALEM; } if (model == 15) { // Sapphire Rapids + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; if(support_avx512_bf16()) - return CPUTYPE_COOPERLAKE; + return CORE_COOPERLAKE; if(support_avx512()) - return CPUTYPE_SKYLAKEX; + return CORE_SKYLAKEX; if(support_avx2()) - return CPUTYPE_HASWELL; + return CORE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; + return CORE_NEHALEM; } break; diff --git a/exports/gensymbol b/exports/gensymbol index 5823c0b3b..b584167a4 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -21,7 +21,7 @@ blasobjsc=" chbmv
chemm chemv cher2 cher2k cher cherk scabs1 scamax chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm - ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum" + ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt" blasobjsd=" damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm @@ -29,7 +29,7 @@ blasobjsd=" dscal dsdot dspmv dspr2 dimatcopy domatcopy dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv - idamax idamin idmax idmin dgeadd dsum" + idamax idamin idmax idmin dgeadd dsum dgemmt" blasobjss=" isamax isamin ismax ismin @@ -38,7 +38,7 @@ blasobjss=" smax smin snrm2 simatcopy somatcopy srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv - strmm strmv strsm strsv sgeadd ssum" + strmm strmv strsm strsv sgeadd ssum sgemmt" blasobjsz=" izamax izamin @@ -48,7 +48,7 @@ blasobjsz=" zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv zomatcopy zimatcopy dzamax dzamin dzasum dznrm2 - zgeadd dzsum" + zgeadd dzsum zgemmt" blasobjs="lsame xerbla" bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" @@ -58,7 +58,7 @@ cblasobjsc=" cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv - cblas_scnrm2 cblas_scasum + cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy " cblasobjsd=" @@ -67,7 +67,7 @@ cblasobjsd=" cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv 
cblas_dtpsv - cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd + cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy " @@ -78,7 +78,7 @@ cblasobjss=" cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm - cblas_strsv cblas_sgeadd + cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy " @@ -89,7 +89,7 @@ cblasobjsz=" cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub - cblas_zaxpby cblas_zgeadd + cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy " diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index e38a3cc89..dd79e924d 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -21,7 +21,7 @@ chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); @blasobjsd = ( damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, @@ -29,7 +29,7 @@ dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, - idamax,idamin,idmax,idmin,dgeadd,dsum); + idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); @blasobjss = ( isamax,isamin,ismax,ismin, @@ -38,7 
+38,7 @@ smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv, sgeadd,ssum); + strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); @blasobjsz = ( izamax,izamin,, @@ -48,7 +48,7 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, - zgeadd, dzsum); + zgeadd, dzsum, zgemmt); @blasobjs = (lsame, xerbla); @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @@ -60,7 +60,7 @@ cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_scnrm2, cblas_scasum, cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy - ); + ,cblas_cgemmt); @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, @@ -69,7 +69,7 @@ cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy - ); + ,cblas_dgemmt); @cblasobjss = ( cblas_sasum, cblas_saxpy, cblas_saxpby, @@ -80,7 +80,7 @@ cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_sgeadd, cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy - ); + ,cblas_sgemmt); @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, @@ -90,7 +90,7 @@ cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_zaxpby, cblas_zgeadd, cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax,
cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy -); + ,cblas_zgemmt); @cblasobjs = ( cblas_xerbla ); diff --git a/getarch.c b/getarch.c index 937a8db68..87384c084 100644 --- a/getarch.c +++ b/getarch.c @@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n"); #ifdef MAKE_NB_JOBS #if MAKE_NB_JOBS > 0 - printf("MAKE += -j %d\n", MAKE_NB_JOBS); + printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS); #else // Let make use parent -j argument or -j1 if there // is no make parent #endif #elif NO_PARALLEL_MAKE==1 - printf("MAKE += -j 1\n"); + printf("MAKEFLAGS += -j 1\n"); #else - printf("MAKE += -j %d\n", get_num_cores()); + printf("MAKEFLAGS += -j %d\n", get_num_cores()); #endif break; diff --git a/interface/gemmt.c b/interface/gemmt.c index 3eed1dfe4..d35406411 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -35,29 +35,26 @@ #include #include #include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE -#define ERROR_NAME "QGEMT " +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMT " +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMT " +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMT " +#define ERROR_NAME "SGEMMT " #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifdef XDOUBLE -#define ERROR_NAME "XGEMT " +#define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "ZGEMT " +#define ERROR_NAME "ZGEMMT " #else -#define ERROR_NAME "CGEMT " +#define ERROR_NAME "CGEMMT " #endif #endif @@ -68,13 +65,13 @@ #ifndef CBLAS void NAME(char *UPLO, char *TRANSA, char *TRANSB, - blasint * M, blasint * N, blasint * K, + blasint * M, blasint * K, FLOAT * Alpha, IFLOAT * a, blasint * ldA, IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) { - blasint m, n, k; + blasint m, k; blasint lda, ldb, ldc; int transa, transb, uplo; blasint info; @@ -92,7 +89,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, PRINT_DEBUG_NAME;
m = *M; - n = *N; k = *K; #if defined(COMPLEX) @@ -167,8 +163,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, info = 13; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) info = 3; if (transb < 0) @@ -184,7 +178,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, - blasint N, blasint k, + blasint k, #ifndef COMPLEX FLOAT alpha, IFLOAT * A, blasint LDA, @@ -205,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int transa, transb, uplo; blasint info; - blasint m, n, lda, ldb; + blasint m = M, lda, ldb; FLOAT *a, *b; XFLOAT *buffer; @@ -248,9 +242,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, transb = 3; #endif - m = M; - n = N; - a = (void *)A; b = (void *)B; lda = LDA; @@ -262,8 +253,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = 13; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) info = 3; if (transb < 0) @@ -273,8 +262,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } if (order == CblasRowMajor) { - m = N; - n = M; a = (void *)B; b = (void *)A; @@ -319,8 +306,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = 13; if (k < 0) info = 5; - if (n < 0) - info = 4; if (m < 0) info = 3; if (transb < 0) @@ -407,37 +392,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif - if ((m == 0) || (n == 0)) + if ((m == 0) ) return; IDEBUG_START; - FUNCTION_PROFILE_START(); - const blasint incb = (transb == 0) ?
1 : ldb; if (uplo == 1) { - for (i = 0; i < n; i++) { - j = n - i; + for (i = 0; i < m; i++) { + j = m - i; l = j; #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; if (transa) { - l = k; aa = a + lda * i * 2; - bb = b + i * 2; } + if (transb) + bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; if (transa) { - l = k; aa = a + lda * i; - bb = b + i; } + if (transb) + bb = b + i; cc = c + i * ldc + i; #endif @@ -458,8 +441,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ -479,20 +460,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); #else + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!transa) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); } #endif @@ -501,21 +496,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } } else { - for (i = 0; i < n; i++) { + for (i = 0; i < m; i++) { j = i + 1; l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transa) { - l = k; + if (transb) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transa) { - l = k; + if (transb) { bb = b + i; } cc = c + i * ldc; @@ -537,8 +530,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif IDEBUG_START; - FUNCTION_PROFILE_START(); - buffer_size = j + k + 128 / sizeof(FLOAT); #ifdef WINDOWS_ABI buffer_size += 160 / sizeof(FLOAT); @@ 
-558,30 +549,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); #else + if (!transa) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); + else + (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); #endif #ifdef SMP } else { - + if (!transa) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); - + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); } #endif STACK_FREE(buffer); } } - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, - args.m * args.k + args.k * args.n + - args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END;