Merge branch 'xianyi:develop' into cirrusjobs
This commit is contained in:
commit
0724df404c
|
@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers)
|
||||||
#######
|
#######
|
||||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
||||||
|
|
||||||
|
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
|
||||||
|
|
||||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||||
|
|
||||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||||
|
@ -398,15 +400,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
||||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# Translate the CMake build switches into literal 1/0 values that are handed
# to the gensymbol script as positional arguments by the custom commands below.
# NOTE(review): presumably gensymbol compares these against 0/1 and does not
# understand CMake's ON/OFF spelling - confirm against exports/gensymbol.
#
# The options are tested by *name* (if(VAR)) rather than by expanded value
# (if(${VAR})): the expanded form collapses to an empty condition when the
# option is unset and re-interprets a non-constant value as a variable name.

# Deprecated LAPACK routines
if(BUILD_LAPACK_DEPRECATED)
  set(BLD 1)
else()
  set(BLD 0)
endif()

# bfloat16 kernels
if(BUILD_BFLOAT16)
  set(BBF16 1)
else()
  set(BBF16 0)
endif()

# Single precision real
if(BUILD_SINGLE)
  set(BS 1)
else()
  set(BS 0)
endif()

# Double precision real
if(BUILD_DOUBLE)
  set(BD 1)
else()
  set(BD 0)
endif()

# Single precision complex
if(BUILD_COMPLEX)
  set(BC 1)
else()
  set(BC 0)
endif()

# Double precision complex
if(BUILD_COMPLEX16)
  set(BZ 1)
else()
  set(BZ 0)
endif()
|
||||||
if (NOT USE_PERL)
|
if (NOT USE_PERL)
|
||||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||||
COMMENT "renaming symbols"
|
COMMENT "renaming symbols"
|
||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||||
COMMENT "renaming symbols"
|
COMMENT "renaming symbols"
|
||||||
)
|
)
|
||||||
|
|
|
@ -23,6 +23,9 @@
|
||||||
* Optimization on AMD Piledriver
|
* Optimization on AMD Piledriver
|
||||||
* Optimization on Intel Haswell
|
* Optimization on Intel Haswell
|
||||||
|
|
||||||
|
* Chris Sidebottom <chris.sidebottom@arm.com>
|
||||||
|
* Optimizations and other improvements targeting AArch64
|
||||||
|
|
||||||
## Previous Developers
|
## Previous Developers
|
||||||
|
|
||||||
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
||||||
|
@ -212,4 +215,4 @@ In chronological order:
|
||||||
* [2022-03] Support RISC-V Vector Intrinsic 1.0 version.
|
* [2022-03] Support RISC-V Vector Intrinsic 1.0 version.
|
||||||
|
|
||||||
* Pablo Romero <https://github.com/pablorcum>
|
* Pablo Romero <https://github.com/pablorcum>
|
||||||
* [2022-08] Fix building from sources for QNX
|
* [2022-08] Fix building from sources for QNX
|
||||||
|
|
6
Makefile
6
Makefile
|
@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
||||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||||
|
|
||||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
.NOTPARALLEL : shared
|
||||||
|
|
||||||
all :: libs netlib $(RELA) tests shared
|
all :: tests
|
||||||
@echo
|
@echo
|
||||||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
||||||
@echo
|
@echo
|
||||||
|
@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
tests : libs netlib $(RELA) shared
|
tests : shared
|
||||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
touch $(LIBNAME)
|
touch $(LIBNAME)
|
||||||
ifndef NO_FBLAS
|
ifndef NO_FBLAS
|
||||||
|
|
7
c_check
7
c_check
|
@ -35,9 +35,12 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then
|
||||||
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
|
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
bn=`basename $compiler_name`
|
bn=`basename \"$compiler_name\"`
|
||||||
|
|
||||||
case "$bn" in
|
case "$bn" in
|
||||||
*-*) cross_suffix="$cross_suffix${bn%-*}-"
|
*-*) if [ "$bn" != '-' ]; then
|
||||||
|
cross_suffix="$cross_suffix${bn%-*}-"
|
||||||
|
fi
|
||||||
esac
|
esac
|
||||||
|
|
||||||
compiler=""
|
compiler=""
|
||||||
|
|
|
@ -436,6 +436,7 @@ if(USE_XBLAS)
|
||||||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(BUILD_LAPACK_DEPRECATED)
|
||||||
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
|
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
|
||||||
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
|
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
|
||||||
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
|
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
|
||||||
|
@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
|
||||||
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
|
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
|
||||||
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
|
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
|
||||||
message(STATUS "Building deprecated routines")
|
message(STATUS "Building deprecated routines")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(DSLASRC spotrs.f)
|
set(DSLASRC spotrs.f)
|
||||||
|
|
||||||
|
@ -930,6 +932,7 @@ if(USE_XBLAS)
|
||||||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(BUILD_LAPACK_DEPRECATED)
|
||||||
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
||||||
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
||||||
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
||||||
|
@ -943,6 +946,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
|
||||||
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
||||||
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
||||||
message(STATUS "Building deprecated routines")
|
message(STATUS "Building deprecated routines")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(DSLASRC spotrs.c)
|
set(DSLASRC spotrs.c)
|
||||||
|
|
||||||
|
|
|
@ -70,8 +70,6 @@ set(CSRC
|
||||||
lapacke_cgeqlf_work.c
|
lapacke_cgeqlf_work.c
|
||||||
lapacke_cgeqp3.c
|
lapacke_cgeqp3.c
|
||||||
lapacke_cgeqp3_work.c
|
lapacke_cgeqp3_work.c
|
||||||
lapacke_cgeqpf.c
|
|
||||||
lapacke_cgeqpf_work.c
|
|
||||||
lapacke_cgeqr.c
|
lapacke_cgeqr.c
|
||||||
lapacke_cgeqr_work.c
|
lapacke_cgeqr_work.c
|
||||||
lapacke_cgeqr2.c
|
lapacke_cgeqr2.c
|
||||||
|
@ -144,12 +142,8 @@ set(CSRC
|
||||||
lapacke_cggqrf_work.c
|
lapacke_cggqrf_work.c
|
||||||
lapacke_cggrqf.c
|
lapacke_cggrqf.c
|
||||||
lapacke_cggrqf_work.c
|
lapacke_cggrqf_work.c
|
||||||
lapacke_cggsvd.c
|
|
||||||
lapacke_cggsvd_work.c
|
|
||||||
lapacke_cggsvd3.c
|
lapacke_cggsvd3.c
|
||||||
lapacke_cggsvd3_work.c
|
lapacke_cggsvd3_work.c
|
||||||
lapacke_cggsvp.c
|
|
||||||
lapacke_cggsvp_work.c
|
|
||||||
lapacke_cggsvp3.c
|
lapacke_cggsvp3.c
|
||||||
lapacke_cggsvp3_work.c
|
lapacke_cggsvp3_work.c
|
||||||
lapacke_cgtcon.c
|
lapacke_cgtcon.c
|
||||||
|
@ -695,8 +689,6 @@ set(DSRC
|
||||||
lapacke_dgeqlf_work.c
|
lapacke_dgeqlf_work.c
|
||||||
lapacke_dgeqp3.c
|
lapacke_dgeqp3.c
|
||||||
lapacke_dgeqp3_work.c
|
lapacke_dgeqp3_work.c
|
||||||
lapacke_dgeqpf.c
|
|
||||||
lapacke_dgeqpf_work.c
|
|
||||||
lapacke_dgeqr.c
|
lapacke_dgeqr.c
|
||||||
lapacke_dgeqr_work.c
|
lapacke_dgeqr_work.c
|
||||||
lapacke_dgeqr2.c
|
lapacke_dgeqr2.c
|
||||||
|
@ -771,12 +763,8 @@ set(DSRC
|
||||||
lapacke_dggqrf_work.c
|
lapacke_dggqrf_work.c
|
||||||
lapacke_dggrqf.c
|
lapacke_dggrqf.c
|
||||||
lapacke_dggrqf_work.c
|
lapacke_dggrqf_work.c
|
||||||
lapacke_dggsvd.c
|
|
||||||
lapacke_dggsvd_work.c
|
|
||||||
lapacke_dggsvd3.c
|
lapacke_dggsvd3.c
|
||||||
lapacke_dggsvd3_work.c
|
lapacke_dggsvd3_work.c
|
||||||
lapacke_dggsvp.c
|
|
||||||
lapacke_dggsvp_work.c
|
|
||||||
lapacke_dggsvp3.c
|
lapacke_dggsvp3.c
|
||||||
lapacke_dggsvp3_work.c
|
lapacke_dggsvp3_work.c
|
||||||
lapacke_dgtcon.c
|
lapacke_dgtcon.c
|
||||||
|
@ -1275,8 +1263,6 @@ set(SSRC
|
||||||
lapacke_sgeqlf_work.c
|
lapacke_sgeqlf_work.c
|
||||||
lapacke_sgeqp3.c
|
lapacke_sgeqp3.c
|
||||||
lapacke_sgeqp3_work.c
|
lapacke_sgeqp3_work.c
|
||||||
lapacke_sgeqpf.c
|
|
||||||
lapacke_sgeqpf_work.c
|
|
||||||
lapacke_sgeqr.c
|
lapacke_sgeqr.c
|
||||||
lapacke_sgeqr_work.c
|
lapacke_sgeqr_work.c
|
||||||
lapacke_sgeqr2.c
|
lapacke_sgeqr2.c
|
||||||
|
@ -1351,12 +1337,8 @@ set(SSRC
|
||||||
lapacke_sggqrf_work.c
|
lapacke_sggqrf_work.c
|
||||||
lapacke_sggrqf.c
|
lapacke_sggrqf.c
|
||||||
lapacke_sggrqf_work.c
|
lapacke_sggrqf_work.c
|
||||||
lapacke_sggsvd.c
|
|
||||||
lapacke_sggsvd_work.c
|
|
||||||
lapacke_sggsvd3.c
|
lapacke_sggsvd3.c
|
||||||
lapacke_sggsvd3_work.c
|
lapacke_sggsvd3_work.c
|
||||||
lapacke_sggsvp.c
|
|
||||||
lapacke_sggsvp_work.c
|
|
||||||
lapacke_sggsvp3.c
|
lapacke_sggsvp3.c
|
||||||
lapacke_sggsvp3_work.c
|
lapacke_sggsvp3_work.c
|
||||||
lapacke_sgtcon.c
|
lapacke_sgtcon.c
|
||||||
|
@ -1849,8 +1831,6 @@ set(ZSRC
|
||||||
lapacke_zgeqlf_work.c
|
lapacke_zgeqlf_work.c
|
||||||
lapacke_zgeqp3.c
|
lapacke_zgeqp3.c
|
||||||
lapacke_zgeqp3_work.c
|
lapacke_zgeqp3_work.c
|
||||||
lapacke_zgeqpf.c
|
|
||||||
lapacke_zgeqpf_work.c
|
|
||||||
lapacke_zgeqr.c
|
lapacke_zgeqr.c
|
||||||
lapacke_zgeqr_work.c
|
lapacke_zgeqr_work.c
|
||||||
lapacke_zgeqr2.c
|
lapacke_zgeqr2.c
|
||||||
|
@ -1925,12 +1905,8 @@ set(ZSRC
|
||||||
lapacke_zggqrf_work.c
|
lapacke_zggqrf_work.c
|
||||||
lapacke_zggrqf.c
|
lapacke_zggrqf.c
|
||||||
lapacke_zggrqf_work.c
|
lapacke_zggrqf_work.c
|
||||||
lapacke_zggsvd.c
|
|
||||||
lapacke_zggsvd_work.c
|
|
||||||
lapacke_zggsvd3.c
|
lapacke_zggsvd3.c
|
||||||
lapacke_zggsvd3_work.c
|
lapacke_zggsvd3_work.c
|
||||||
lapacke_zggsvp.c
|
|
||||||
lapacke_zggsvp_work.c
|
|
||||||
lapacke_zggsvp3.c
|
lapacke_zggsvp3.c
|
||||||
lapacke_zggsvp3_work.c
|
lapacke_zggsvp3_work.c
|
||||||
lapacke_zgtcon.c
|
lapacke_zgtcon.c
|
||||||
|
@ -2401,6 +2377,12 @@ set(ZSRC
|
||||||
lapacke_csyr_work.c
|
lapacke_csyr_work.c
|
||||||
lapacke_ilaver.c
|
lapacke_ilaver.c
|
||||||
)
|
)
|
||||||
|
# LAPACKE wrappers for deprecated LAPACK routines (?geqpf, ?ggsvd, ?ggsvp and
# their _work variants) are only compiled when BUILD_LAPACK_DEPRECATED is on.
#
# The files are appended to the per-precision source lists defined earlier in
# this file (SSRC, DSRC, CSRC, ZSRC). The previous form,
# "set(SRCS $SRCS ...)", stored the literal text "$SRCS" (CMake variable
# references need ${...}) in a list named SRCS that is not one of those lists.
# NOTE(review): assumes SSRC/DSRC/CSRC/ZSRC are the lists consumed when the
# LAPACKE target sources are assembled - confirm against the rest of the file.
if(BUILD_LAPACK_DEPRECATED)
  list(APPEND SSRC
    lapacke_sgeqpf.c lapacke_sgeqpf_work.c
    lapacke_sggsvd.c lapacke_sggsvd_work.c
    lapacke_sggsvp.c lapacke_sggsvp_work.c)
  list(APPEND DSRC
    lapacke_dgeqpf.c lapacke_dgeqpf_work.c
    lapacke_dggsvd.c lapacke_dggsvd_work.c
    lapacke_dggsvp.c lapacke_dggsvp_work.c)
  list(APPEND CSRC
    lapacke_cgeqpf.c lapacke_cgeqpf_work.c
    lapacke_cggsvd.c lapacke_cggsvd_work.c
    lapacke_cggsvp.c lapacke_cggsvp_work.c)
  list(APPEND ZSRC
    lapacke_zgeqpf.c lapacke_zgeqpf_work.c
    lapacke_zggsvd.c lapacke_zggsvd_work.c
    lapacke_zggsvp.c lapacke_zggsvp_work.c)
endif()
|
||||||
|
|
||||||
set(SRCX
|
set(SRCX
|
||||||
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
|
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
|
||||||
|
|
|
@ -268,7 +268,8 @@ int detect(void)
|
||||||
#else
|
#else
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
||||||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
|
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1
|
||||||
|
if (value == 3660830781) return CPU_VORTEX; //A15/M2
|
||||||
#endif
|
#endif
|
||||||
return CPU_ARMV8;
|
return CPU_ARMV8;
|
||||||
#endif
|
#endif
|
||||||
|
|
18
cpuid_x86.c
18
cpuid_x86.c
|
@ -1936,7 +1936,8 @@ static char *corename[] = {
|
||||||
"ZEN",
|
"ZEN",
|
||||||
"SKYLAKEX",
|
"SKYLAKEX",
|
||||||
"DHYANA",
|
"DHYANA",
|
||||||
"COOPERLAKE"
|
"COOPERLAKE",
|
||||||
|
"SAPPHIRERAPIDS",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1970,7 +1971,8 @@ static char *corename_lower[] = {
|
||||||
"zen",
|
"zen",
|
||||||
"skylakex",
|
"skylakex",
|
||||||
"dhyana",
|
"dhyana",
|
||||||
"cooperlake"
|
"cooperlake",
|
||||||
|
"sapphirerapids",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -2276,16 +2278,18 @@ int get_coretype(void){
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
if (model == 15) { // Sapphire Rapids
|
if (model == 15) { // Sapphire Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return CORE_SAPPHIRERAPIDS;
|
||||||
if(support_avx512_bf16())
|
if(support_avx512_bf16())
|
||||||
return CPUTYPE_COOPERLAKE;
|
return CORE_COOPERLAKE;
|
||||||
if(support_avx512())
|
if(support_avx512())
|
||||||
return CPUTYPE_SKYLAKEX;
|
return CORE_SKYLAKEX;
|
||||||
if(support_avx2())
|
if(support_avx2())
|
||||||
return CPUTYPE_HASWELL;
|
return CORE_HASWELL;
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CORE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ blasobjsc="
|
||||||
chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
|
chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
|
||||||
chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
|
chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
|
||||||
csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
|
csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
|
||||||
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum"
|
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
|
||||||
|
|
||||||
blasobjsd="
|
blasobjsd="
|
||||||
damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
|
damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
|
||||||
|
@ -29,7 +29,7 @@ blasobjsd="
|
||||||
dscal dsdot dspmv dspr2 dimatcopy domatcopy
|
dscal dsdot dspmv dspr2 dimatcopy domatcopy
|
||||||
dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
|
dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
|
||||||
dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
|
dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
|
||||||
idamax idamin idmax idmin dgeadd dsum"
|
idamax idamin idmax idmin dgeadd dsum dgemmt"
|
||||||
|
|
||||||
blasobjss="
|
blasobjss="
|
||||||
isamax isamin ismax ismin
|
isamax isamin ismax ismin
|
||||||
|
@ -38,7 +38,7 @@ blasobjss="
|
||||||
smax smin snrm2 simatcopy somatcopy
|
smax smin snrm2 simatcopy somatcopy
|
||||||
srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
|
srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
|
||||||
ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
|
ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
|
||||||
strmm strmv strsm strsv sgeadd ssum"
|
strmm strmv strsm strsv sgeadd ssum sgemmt"
|
||||||
|
|
||||||
blasobjsz="
|
blasobjsz="
|
||||||
izamax izamin
|
izamax izamin
|
||||||
|
@ -48,7 +48,7 @@ blasobjsz="
|
||||||
zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
|
zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
|
||||||
ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
|
ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
|
||||||
zomatcopy zimatcopy dzamax dzamin dzasum dznrm2
|
zomatcopy zimatcopy dzamax dzamin dzasum dznrm2
|
||||||
zgeadd dzsum"
|
zgeadd dzsum zgemmt"
|
||||||
|
|
||||||
blasobjs="lsame xerbla"
|
blasobjs="lsame xerbla"
|
||||||
bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
|
bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
|
||||||
|
@ -58,7 +58,7 @@ cblasobjsc="
|
||||||
cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
|
cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
|
||||||
cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
|
cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
|
||||||
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
||||||
cblas_scnrm2 cblas_scasum
|
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
||||||
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
||||||
"
|
"
|
||||||
cblasobjsd="
|
cblasobjsd="
|
||||||
|
@ -67,7 +67,7 @@ cblasobjsd="
|
||||||
cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
|
cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
|
||||||
cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
|
cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
|
||||||
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
||||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd
|
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
||||||
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
||||||
"
|
"
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ cblasobjss="
|
||||||
cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
|
cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
|
||||||
cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
|
cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
|
||||||
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
||||||
cblas_strsv cblas_sgeadd
|
cblas_strsv cblas_sgeadd cblas_sgemmt
|
||||||
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
||||||
"
|
"
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ cblasobjsz="
|
||||||
cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
|
cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
|
||||||
cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
|
cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
|
||||||
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
||||||
cblas_zaxpby cblas_zgeadd
|
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
||||||
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
||||||
"
|
"
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
|
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
|
||||||
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
|
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
|
||||||
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
|
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
|
||||||
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum);
|
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
|
||||||
|
|
||||||
@blasobjsd = (
|
@blasobjsd = (
|
||||||
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
|
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
|
||||||
|
@ -29,7 +29,7 @@
|
||||||
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
|
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
|
||||||
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
|
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
|
||||||
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
|
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
|
||||||
idamax,idamin,idmax,idmin,dgeadd,dsum);
|
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
|
||||||
|
|
||||||
@blasobjss = (
|
@blasobjss = (
|
||||||
isamax,isamin,ismax,ismin,
|
isamax,isamin,ismax,ismin,
|
||||||
|
@ -38,7 +38,7 @@
|
||||||
smax,smin,snrm2,simatcopy,somatcopy,
|
smax,smin,snrm2,simatcopy,somatcopy,
|
||||||
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
|
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
|
||||||
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
|
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
|
||||||
strmm,strmv,strsm,strsv, sgeadd,ssum);
|
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
|
||||||
|
|
||||||
@blasobjsz = (
|
@blasobjsz = (
|
||||||
izamax,izamin,,
|
izamax,izamin,,
|
||||||
|
@ -48,7 +48,7 @@
|
||||||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
||||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
||||||
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
|
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
|
||||||
zgeadd, dzsum);
|
zgeadd, dzsum, zgemmt);
|
||||||
|
|
||||||
@blasobjs = (lsame, xerbla);
|
@blasobjs = (lsame, xerbla);
|
||||||
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
||||||
|
@ -60,7 +60,7 @@
|
||||||
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
|
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
|
||||||
cblas_scnrm2, cblas_scasum,
|
cblas_scnrm2, cblas_scasum,
|
||||||
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
|
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
|
||||||
);
|
cblas_cgemmt);
|
||||||
@cblasobjsd = (
|
@cblasobjsd = (
|
||||||
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
|
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
|
||||||
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
|
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
|
||||||
|
@ -69,7 +69,7 @@
|
||||||
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
|
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
|
||||||
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
|
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
|
||||||
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
|
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
|
||||||
);
|
cblas_dgemmt);
|
||||||
|
|
||||||
@cblasobjss = (
|
@cblasobjss = (
|
||||||
cblas_sasum, cblas_saxpy, cblas_saxpby,
|
cblas_sasum, cblas_saxpy, cblas_saxpby,
|
||||||
|
@ -80,7 +80,7 @@
|
||||||
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
|
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
|
||||||
cblas_strsv, cblas_sgeadd,
|
cblas_strsv, cblas_sgeadd,
|
||||||
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
|
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
|
||||||
);
|
cblas_sgemmt);
|
||||||
@cblasobjsz = (
|
@cblasobjsz = (
|
||||||
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
|
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
|
||||||
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
|
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
|
||||||
|
@ -90,7 +90,7 @@
|
||||||
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
|
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
|
||||||
cblas_zaxpby, cblas_zgeadd,
|
cblas_zaxpby, cblas_zgeadd,
|
||||||
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
|
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
|
||||||
);
|
cblas_zgemmt);
|
||||||
|
|
||||||
@cblasobjs = ( cblas_xerbla );
|
@cblasobjs = ( cblas_xerbla );
|
||||||
|
|
||||||
|
|
|
@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n");
|
||||||
|
|
||||||
#ifdef MAKE_NB_JOBS
|
#ifdef MAKE_NB_JOBS
|
||||||
#if MAKE_NB_JOBS > 0
|
#if MAKE_NB_JOBS > 0
|
||||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS);
|
||||||
#else
|
#else
|
||||||
// Let make use parent -j argument or -j1 if there
|
// Let make use parent -j argument or -j1 if there
|
||||||
// is no make parent
|
// is no make parent
|
||||||
#endif
|
#endif
|
||||||
#elif NO_PARALLEL_MAKE==1
|
#elif NO_PARALLEL_MAKE==1
|
||||||
printf("MAKE += -j 1\n");
|
printf("MAKEFLAGS += -j 1\n");
|
||||||
#else
|
#else
|
||||||
printf("MAKE += -j %d\n", get_num_cores());
|
printf("MAKEFLAGS += -j %d\n", get_num_cores());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -35,29 +35,26 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#ifdef FUNCTION_PROFILE
|
|
||||||
#include "functable.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#define SMP_THRESHOLD_MIN 65536.0
|
#define SMP_THRESHOLD_MIN 65536.0
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "QGEMT "
|
#define ERROR_NAME "QGEMMT "
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
#define ERROR_NAME "DGEMT "
|
#define ERROR_NAME "DGEMMT "
|
||||||
#elif defined(BFLOAT16)
|
#elif defined(BFLOAT16)
|
||||||
#define ERROR_NAME "SBGEMT "
|
#define ERROR_NAME "SBGEMMT "
|
||||||
#else
|
#else
|
||||||
#define ERROR_NAME "SGEMT "
|
#define ERROR_NAME "SGEMMT "
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#define SMP_THRESHOLD_MIN 8192.0
|
#define SMP_THRESHOLD_MIN 8192.0
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "XGEMT "
|
#define ERROR_NAME "XGEMMT "
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
#define ERROR_NAME "ZGEMT "
|
#define ERROR_NAME "ZGEMMT "
|
||||||
#else
|
#else
|
||||||
#define ERROR_NAME "CGEMT "
|
#define ERROR_NAME "CGEMMT "
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -68,13 +65,13 @@
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
blasint * M, blasint * N, blasint * K,
|
blasint * M, blasint * K,
|
||||||
FLOAT * Alpha,
|
FLOAT * Alpha,
|
||||||
IFLOAT * a, blasint * ldA,
|
IFLOAT * a, blasint * ldA,
|
||||||
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||||
{
|
{
|
||||||
|
|
||||||
blasint m, n, k;
|
blasint m, k;
|
||||||
blasint lda, ldb, ldc;
|
blasint lda, ldb, ldc;
|
||||||
int transa, transb, uplo;
|
int transa, transb, uplo;
|
||||||
blasint info;
|
blasint info;
|
||||||
|
@ -92,7 +89,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
PRINT_DEBUG_NAME;
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
m = *M;
|
m = *M;
|
||||||
n = *N;
|
|
||||||
k = *K;
|
k = *K;
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
|
@ -167,8 +163,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
info = 13;
|
info = 13;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (n < 0)
|
|
||||||
info = 4;
|
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 3;
|
info = 3;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
|
@ -184,7 +178,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
|
|
||||||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
|
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
|
||||||
blasint N, blasint k,
|
blasint k,
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
FLOAT alpha,
|
FLOAT alpha,
|
||||||
IFLOAT * A, blasint LDA,
|
IFLOAT * A, blasint LDA,
|
||||||
|
@ -205,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
int transa, transb, uplo;
|
int transa, transb, uplo;
|
||||||
blasint info;
|
blasint info;
|
||||||
blasint m, n, lda, ldb;
|
blasint m, lda, ldb;
|
||||||
FLOAT *a, *b;
|
FLOAT *a, *b;
|
||||||
XFLOAT *buffer;
|
XFLOAT *buffer;
|
||||||
|
|
||||||
|
@ -248,9 +242,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
transb = 3;
|
transb = 3;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
m = M;
|
|
||||||
n = N;
|
|
||||||
|
|
||||||
a = (void *)A;
|
a = (void *)A;
|
||||||
b = (void *)B;
|
b = (void *)B;
|
||||||
lda = LDA;
|
lda = LDA;
|
||||||
|
@ -262,8 +253,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
info = 13;
|
info = 13;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (n < 0)
|
|
||||||
info = 4;
|
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 3;
|
info = 3;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
|
@ -273,8 +262,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (order == CblasRowMajor) {
|
if (order == CblasRowMajor) {
|
||||||
m = N;
|
|
||||||
n = M;
|
|
||||||
|
|
||||||
a = (void *)B;
|
a = (void *)B;
|
||||||
b = (void *)A;
|
b = (void *)A;
|
||||||
|
@ -319,8 +306,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
info = 13;
|
info = 13;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (n < 0)
|
|
||||||
info = 4;
|
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 3;
|
info = 3;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
|
@ -407,37 +392,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if ((m == 0) || (n == 0))
|
if ((m == 0) )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
const blasint incb = (transb == 0) ? 1 : ldb;
|
||||||
|
|
||||||
if (uplo == 1) {
|
if (uplo == 1) {
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < m; i++) {
|
||||||
j = n - i;
|
j = m - i;
|
||||||
|
|
||||||
l = j;
|
l = j;
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
aa = a + i * 2;
|
aa = a + i * 2;
|
||||||
bb = b + i * ldb * 2;
|
bb = b + i * ldb * 2;
|
||||||
if (transa) {
|
if (transa) {
|
||||||
l = k;
|
|
||||||
aa = a + lda * i * 2;
|
aa = a + lda * i * 2;
|
||||||
bb = b + i * 2;
|
|
||||||
}
|
}
|
||||||
|
if (transb)
|
||||||
|
bb = b + i * 2;
|
||||||
cc = c + i * 2 * ldc + i * 2;
|
cc = c + i * 2 * ldc + i * 2;
|
||||||
#else
|
#else
|
||||||
aa = a + i;
|
aa = a + i;
|
||||||
bb = b + i * ldb;
|
bb = b + i * ldb;
|
||||||
if (transa) {
|
if (transa) {
|
||||||
l = k;
|
|
||||||
aa = a + lda * i;
|
aa = a + lda * i;
|
||||||
bb = b + i;
|
|
||||||
}
|
}
|
||||||
|
if (transb)
|
||||||
|
bb = b + i;
|
||||||
cc = c + i * ldc + i;
|
cc = c + i * ldc + i;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -458,8 +441,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
buffer_size += 160 / sizeof(FLOAT);
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
|
@ -479,20 +460,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||||
aa, lda, bb, incb, cc, 1,
|
aa, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
|
||||||
|
aa, lda, bb, incb, cc, 1,
|
||||||
|
buffer);
|
||||||
#else
|
#else
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||||
bb, incb, cc, 1, buffer);
|
bb, incb, cc, 1, buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
|
||||||
|
bb, incb, cc, 1, buffer);
|
||||||
#endif
|
#endif
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
} else {
|
} else {
|
||||||
|
if (!transa)
|
||||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||||
lda, bb, incb, cc,
|
lda, bb, incb, cc,
|
||||||
1, buffer,
|
1, buffer,
|
||||||
nthreads);
|
nthreads);
|
||||||
|
else
|
||||||
|
(gemv_thread[(int)transa]) (k, j, alpha, aa,
|
||||||
|
lda, bb, incb, cc,
|
||||||
|
1, buffer,
|
||||||
|
nthreads);
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -501,21 +496,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < m; i++) {
|
||||||
j = i + 1;
|
j = i + 1;
|
||||||
|
|
||||||
l = j;
|
l = j;
|
||||||
#if defined COMPLEX
|
#if defined COMPLEX
|
||||||
bb = b + i * ldb * 2;
|
bb = b + i * ldb * 2;
|
||||||
if (transa) {
|
if (transb) {
|
||||||
l = k;
|
|
||||||
bb = b + i * 2;
|
bb = b + i * 2;
|
||||||
}
|
}
|
||||||
cc = c + i * 2 * ldc;
|
cc = c + i * 2 * ldc;
|
||||||
#else
|
#else
|
||||||
bb = b + i * ldb;
|
bb = b + i * ldb;
|
||||||
if (transa) {
|
if (transb) {
|
||||||
l = k;
|
|
||||||
bb = b + i;
|
bb = b + i;
|
||||||
}
|
}
|
||||||
cc = c + i * ldc;
|
cc = c + i * ldc;
|
||||||
|
@ -537,8 +530,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
buffer_size += 160 / sizeof(FLOAT);
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
|
@ -558,30 +549,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||||
a, lda, bb, incb, cc, 1,
|
a, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
|
||||||
|
a, lda, bb, incb, cc, 1,
|
||||||
|
buffer);
|
||||||
#else
|
#else
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||||
incb, cc, 1, buffer);
|
incb, cc, 1, buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
|
||||||
|
incb, cc, 1, buffer);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
} else {
|
} else {
|
||||||
|
if (!transa)
|
||||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||||
bb, incb, cc, 1,
|
bb, incb, cc, 1,
|
||||||
buffer, nthreads);
|
buffer, nthreads);
|
||||||
|
else
|
||||||
|
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
|
||||||
|
bb, incb, cc, 1,
|
||||||
|
buffer, nthreads);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
STACK_FREE(buffer);
|
STACK_FREE(buffer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
|
|
||||||
args.m * args.k + args.k * args.n +
|
|
||||||
args.m * args.n, 2 * args.m * args.n * args.k);
|
|
||||||
|
|
||||||
IDEBUG_END;
|
IDEBUG_END;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue