Merge branch 'develop'
This commit is contained in:
commit
d0c51c4de9
|
@ -121,5 +121,11 @@ In chronological order:
|
||||||
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
|
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
|
||||||
ARMv8 support.
|
ARMv8 support.
|
||||||
|
|
||||||
|
* Dan Kortschak
|
||||||
|
* [2015-01-07] Added test for drotmg bug #484.
|
||||||
|
|
||||||
|
* Ton van den Heuvel <https://github.com/ton>
|
||||||
|
* [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity().
|
||||||
|
|
||||||
* [Your name or handle] <[email or website]>
|
* [Your name or handle] <[email or website]>
|
||||||
* [Date] [Brief summary of your changes]
|
* [Date] [Brief summary of your changes]
|
||||||
|
|
|
@ -1,4 +1,24 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.2.14
|
||||||
|
24-Mar-2015
|
||||||
|
common:
|
||||||
|
* Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.)
|
||||||
|
* Improve ger and gemv for small matrices by stack allocation.
|
||||||
|
e.g. make MAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.)
|
||||||
|
* Introduce openblas_get_num_threads and openblas_get_num_procs.
|
||||||
|
(#497. Thanks, Erik Schnetter.)
|
||||||
|
* Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.)
|
||||||
|
* Fix c/zsyr bug with negative incx. (#492.)
|
||||||
|
* Fix race condition during shutdown causing a crash in
|
||||||
|
gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.)
|
||||||
|
|
||||||
|
x86/x86-64:
|
||||||
|
* Support AMD Steamroller.
|
||||||
|
|
||||||
|
ARM:
|
||||||
|
* Add Cortex-A9 and Cortex-A15 targets.
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.2.13
|
Version 0.2.13
|
||||||
3-Dec-2014
|
3-Dec-2014
|
||||||
|
|
|
@ -9,10 +9,10 @@
|
||||||
|
|
||||||
If you want to allocate 64 large pages,
|
If you want to allocate 64 large pages,
|
||||||
|
|
||||||
$shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset
|
$shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset
|
||||||
$shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page
|
$shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page
|
||||||
$shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number
|
$shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number
|
||||||
$shell> echo 3355443200 > /pros/sys/kernel/shmall
|
$shell> echo 3355443200 > /proc/sys/kernel/shmall
|
||||||
|
|
||||||
Also may add a few lines into /etc/security/limits.conf file.
|
Also may add a few lines into /etc/security/limits.conf file.
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,8 @@
|
||||||
|
# ifeq logical or
|
||||||
|
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
|
||||||
|
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||||
|
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), ARMV7)
|
ifeq ($(CORE), ARMV7)
|
||||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||||
|
|
|
@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
|
||||||
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
|
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
|
||||||
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
|
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
|
||||||
OPENBLAS_BUILD_DIR := $(CURDIR)
|
OPENBLAS_BUILD_DIR := $(CURDIR)
|
||||||
OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake
|
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
|
||||||
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
||||||
|
|
||||||
.PHONY : install
|
.PHONY : install
|
||||||
|
@ -46,11 +46,11 @@ ifndef NO_CBLAS
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef NO_LAPACKE
|
ifndef NO_LAPACKE
|
||||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||||
endif
|
endif
|
||||||
|
|
||||||
#for install static library
|
#for install static library
|
||||||
|
@ -95,7 +95,8 @@ endif
|
||||||
endif
|
endif
|
||||||
#Generating OpenBLASConfig.cmake
|
#Generating OpenBLASConfig.cmake
|
||||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||||
|
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||||
ifndef NO_SHARED
|
ifndef NO_SHARED
|
||||||
#ifeq logical or
|
#ifeq logical or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.2.13
|
VERSION = 0.2.14
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -159,6 +159,19 @@ COMMON_PROF = -pg
|
||||||
# Build Debug version
|
# Build Debug version
|
||||||
# DEBUG = 1
|
# DEBUG = 1
|
||||||
|
|
||||||
|
# Improve GEMV and GER for small matrices by stack allocation.
|
||||||
|
# For details, https://github.com/xianyi/OpenBLAS/pull/482
|
||||||
|
#
|
||||||
|
# MAX_STACK_ALLOC=2048
|
||||||
|
|
||||||
|
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||||
|
# Avoid conflicts with other BLAS libraries, especially when using
|
||||||
|
# 64 bit integer interfaces in OpenBLAS.
|
||||||
|
# For details, https://github.com/xianyi/OpenBLAS/pull/459
|
||||||
|
#
|
||||||
|
# SYMBOLPREFIX=
|
||||||
|
# SYMBOLSUFFIX=
|
||||||
|
|
||||||
#
|
#
|
||||||
# End of user configuration
|
# End of user configuration
|
||||||
#
|
#
|
||||||
|
|
|
@ -61,6 +61,9 @@ endif
|
||||||
ifeq ($(TARGET), PILEDRIVER)
|
ifeq ($(TARGET), PILEDRIVER)
|
||||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET), STEAMROLLER)
|
||||||
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,6 +88,9 @@ endif
|
||||||
ifeq ($(TARGET_CORE), PILEDRIVER)
|
ifeq ($(TARGET_CORE), PILEDRIVER)
|
||||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET_CORE), STEAMROLLER)
|
||||||
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -305,6 +311,10 @@ ifdef SANITY_CHECK
|
||||||
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef MAX_STACK_ALLOC
|
||||||
|
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# Architecture dependent settings
|
# Architecture dependent settings
|
||||||
#
|
#
|
||||||
|
@ -354,6 +364,12 @@ endif
|
||||||
|
|
||||||
|
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
|
||||||
|
#check
|
||||||
|
ifeq ($(USE_THREAD), 0)
|
||||||
|
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
|
||||||
|
endif
|
||||||
|
|
||||||
# ifeq logical or. GCC or LSB
|
# ifeq logical or. GCC or LSB
|
||||||
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
||||||
CCOMMON_OPT += -fopenmp
|
CCOMMON_OPT += -fopenmp
|
||||||
|
@ -392,7 +408,7 @@ endif
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER
|
||||||
endif
|
endif
|
||||||
ifneq ($(NO_AVX2), 1)
|
ifneq ($(NO_AVX2), 1)
|
||||||
DYNAMIC_CORE += HASWELL
|
DYNAMIC_CORE += HASWELL
|
||||||
|
|
|
@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
||||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||||
|
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||||
|
|
||||||
#### MIPS64:
|
#### MIPS64:
|
||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||||
|
|
|
@ -32,6 +32,7 @@ ISTANBUL
|
||||||
BOBCAT
|
BOBCAT
|
||||||
BULLDOZER
|
BULLDOZER
|
||||||
PILEDRIVER
|
PILEDRIVER
|
||||||
|
STEAMROLLER
|
||||||
|
|
||||||
c)VIA CPU:
|
c)VIA CPU:
|
||||||
SSE_GENERIC
|
SSE_GENERIC
|
||||||
|
@ -62,6 +63,11 @@ SPARC
|
||||||
SPARCV7
|
SPARCV7
|
||||||
|
|
||||||
6.ARM CPU:
|
6.ARM CPU:
|
||||||
|
CORTEXA15
|
||||||
|
CORTEXA9
|
||||||
ARMV7
|
ARMV7
|
||||||
ARMV6
|
ARMV6
|
||||||
ARMV5
|
ARMV5
|
||||||
|
|
||||||
|
7.ARM 64-bit CPU:
|
||||||
|
ARMV8
|
||||||
|
|
|
@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system
|
||||||
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
||||||
|
|
||||||
# ACML custom
|
# ACML custom
|
||||||
ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
|
#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
|
||||||
LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
||||||
|
|
||||||
|
# ACML 6.1 custom
|
||||||
|
ACML=/home/werner/project/acml6.1/gfortran64_mp/lib
|
||||||
|
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
|
||||||
|
|
||||||
|
|
||||||
# Atlas Ubuntu
|
# Atlas Ubuntu
|
||||||
#ATLAS=/usr/lib/atlas-base
|
#ATLAS=/usr/lib/atlas-base
|
||||||
|
|
|
@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *x, *y;
|
FLOAT *x, *y;
|
||||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||||
|
@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
char *trans[] = {"T", "N"};
|
char *trans[] = {"T", "N"};
|
||||||
|
@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *x, *y;
|
FLOAT *x, *y;
|
||||||
FLOAT result;
|
FLOAT result;
|
||||||
|
@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
|
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
|
||||||
FLOAT wkopt[4];
|
FLOAT wkopt[4];
|
||||||
|
@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
FLOAT *a, *b, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
FLOAT beta [] = {1.0, 1.0};
|
FLOAT beta [] = {1.0, 1.0};
|
||||||
char trans='N';
|
char trans='N';
|
||||||
blasint m, i, j;
|
blasint m, n, i, j;
|
||||||
int loops = 1;
|
int loops = 1;
|
||||||
|
int has_param_n=0;
|
||||||
int l;
|
int l;
|
||||||
char *p;
|
char *p;
|
||||||
|
|
||||||
|
@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){
|
||||||
if ( p != NULL )
|
if ( p != NULL )
|
||||||
loops = atoi(p);
|
loops = atoi(p);
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_PARAM_N"))) {
|
||||||
|
n = atoi(p);
|
||||||
|
has_param_n=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef linux
|
#ifdef linux
|
||||||
srandom(getpid());
|
srandom(getpid());
|
||||||
|
@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){
|
||||||
|
|
||||||
timeg=0;
|
timeg=0;
|
||||||
|
|
||||||
fprintf(stderr, " %6d : ", (int)m);
|
if ( has_param_n == 1 && n <= m )
|
||||||
|
n=n;
|
||||||
|
else
|
||||||
|
n=m;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
|
||||||
|
|
||||||
for (l=0; l<loops; l++)
|
for (l=0; l<loops; l++)
|
||||||
{
|
{
|
||||||
|
@ -189,7 +202,7 @@ int MAIN__(int argc, char *argv[]){
|
||||||
|
|
||||||
gettimeofday( &start, (struct timezone *)0);
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
|
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
|
||||||
|
|
||||||
gettimeofday( &stop, (struct timezone *)0);
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
|
@ -202,11 +215,11 @@ int MAIN__(int argc, char *argv[]){
|
||||||
timeg /= loops;
|
timeg /= loops;
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
" %10.2f MFlops\n",
|
" %10.2f MFlops\n",
|
||||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
|
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
FLOAT *a, *b, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *x, *y;
|
FLOAT *a, *x, *y;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *x, *y;
|
FLOAT *a, *x, *y;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
||||||
|
|
|
@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a,*work;
|
FLOAT *a,*work;
|
||||||
FLOAT wkopt[4];
|
FLOAT wkopt[4];
|
||||||
|
@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
FLOAT *a, *b, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *x, *y;
|
FLOAT *a, *x, *y;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
FLOAT *a, *b, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *c;
|
FLOAT *a, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b;
|
FLOAT *a, *b;
|
||||||
blasint *ipiv;
|
blasint *ipiv;
|
||||||
|
@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
char *trans[] = {"T", "N"};
|
char *trans[] = {"T", "N"};
|
||||||
|
@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
FLOAT *a, *b, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *x, *y;
|
FLOAT *a, *x, *y;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b, *c;
|
FLOAT *a, *b, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *c;
|
FLOAT *a, *c;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b;
|
FLOAT *a, *b;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int MAIN__(int argc, char *argv[]){
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *b;
|
FLOAT *a, *b;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||||
|
|
4
c_check
4
c_check
|
@ -81,6 +81,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) {
|
||||||
$defined = 1;
|
$defined = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
||||||
|
$defined = 1;
|
||||||
|
}
|
||||||
|
|
||||||
if ($architecture eq "alpha") {
|
if ($architecture eq "alpha") {
|
||||||
$defined = 1;
|
$defined = 1;
|
||||||
$binary = 64;
|
$binary = 64;
|
||||||
|
|
16
cblas.h
16
cblas.h
|
@ -13,6 +13,12 @@ extern "C" {
|
||||||
void openblas_set_num_threads(int num_threads);
|
void openblas_set_num_threads(int num_threads);
|
||||||
void goto_set_num_threads(int num_threads);
|
void goto_set_num_threads(int num_threads);
|
||||||
|
|
||||||
|
/*Get the number of threads on runtime.*/
|
||||||
|
int openblas_get_num_threads(void);
|
||||||
|
|
||||||
|
/*Get the number of physical processors (cores).*/
|
||||||
|
int openblas_get_num_procs(void);
|
||||||
|
|
||||||
/*Get the build configure on runtime.*/
|
/*Get the build configure on runtime.*/
|
||||||
char* openblas_get_config(void);
|
char* openblas_get_config(void);
|
||||||
|
|
||||||
|
@ -341,6 +347,16 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
|
||||||
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
|
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
|
||||||
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
|
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
|
||||||
|
|
||||||
|
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
|
||||||
|
float *c, OPENBLAS_CONST blasint cldc);
|
||||||
|
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
|
||||||
|
double *c, OPENBLAS_CONST blasint cldc);
|
||||||
|
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
|
||||||
|
float *c, OPENBLAS_CONST blasint cldc);
|
||||||
|
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
|
||||||
|
double *c, OPENBLAS_CONST blasint cldc);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif /* __cplusplus */
|
#endif /* __cplusplus */
|
||||||
|
|
|
@ -13,6 +13,12 @@ extern "C" {
|
||||||
void openblas_set_num_threads(int num_threads);
|
void openblas_set_num_threads(int num_threads);
|
||||||
void goto_set_num_threads(int num_threads);
|
void goto_set_num_threads(int num_threads);
|
||||||
|
|
||||||
|
/*Get the number of threads on runtime.*/
|
||||||
|
int openblas_get_num_threads(void);
|
||||||
|
|
||||||
|
/*Get the number of physical processors (cores).*/
|
||||||
|
int openblas_get_num_procs(void);
|
||||||
|
|
||||||
/*Get the build configure on runtime.*/
|
/*Get the build configure on runtime.*/
|
||||||
char* openblas_get_config(void);
|
char* openblas_get_config(void);
|
||||||
|
|
||||||
|
@ -327,6 +333,16 @@ void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, bl
|
||||||
blasint clda, blasint cldb);
|
blasint clda, blasint cldb);
|
||||||
void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
|
void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
|
||||||
blasint clda, blasint cldb);
|
blasint clda, blasint cldb);
|
||||||
|
|
||||||
|
void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta,
|
||||||
|
float *c, blasint cldc);
|
||||||
|
void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta,
|
||||||
|
double *c, blasint cldc);
|
||||||
|
void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta,
|
||||||
|
float *c, blasint cldc);
|
||||||
|
void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta,
|
||||||
|
double *c, blasint cldc);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif /* __cplusplus */
|
#endif /* __cplusplus */
|
||||||
|
|
8
common.h
8
common.h
|
@ -327,6 +327,14 @@ typedef int blasint;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
#ifdef STEAMROLLER
|
||||||
|
#ifndef YIELDING
|
||||||
|
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef YIELDING
|
#ifndef YIELDING
|
||||||
#define YIELDING sched_yield()
|
#define YIELDING sched_yield()
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -220,6 +220,7 @@
|
||||||
#define COMATCOPY_K_CTC comatcopy_k_ctc
|
#define COMATCOPY_K_CTC comatcopy_k_ctc
|
||||||
#define COMATCOPY_K_RTC comatcopy_k_rtc
|
#define COMATCOPY_K_RTC comatcopy_k_rtc
|
||||||
|
|
||||||
|
#define CGEADD_K cgeadd_k
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -402,6 +403,7 @@
|
||||||
#define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc
|
#define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc
|
||||||
#define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc
|
#define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc
|
||||||
#define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc
|
#define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc
|
||||||
|
#define CGEADD_K gotoblas -> cgeadd_k
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -149,6 +149,7 @@
|
||||||
#define DOMATCOPY_K_RN domatcopy_k_rn
|
#define DOMATCOPY_K_RN domatcopy_k_rn
|
||||||
#define DOMATCOPY_K_CT domatcopy_k_ct
|
#define DOMATCOPY_K_CT domatcopy_k_ct
|
||||||
#define DOMATCOPY_K_RT domatcopy_k_rt
|
#define DOMATCOPY_K_RT domatcopy_k_rt
|
||||||
|
#define DGEADD_K dgeadd_k
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -267,6 +268,8 @@
|
||||||
#define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct
|
#define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct
|
||||||
#define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt
|
#define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt
|
||||||
|
|
||||||
|
#define DGEADD_K gotoblas -> dgeadd_k
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define DGEMM_NN dgemm_nn
|
#define DGEMM_NN dgemm_nn
|
||||||
|
|
|
@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do
|
||||||
void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
|
void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
|
||||||
void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
|
void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
|
||||||
|
|
||||||
|
void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
|
||||||
|
void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
|
||||||
|
void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
|
||||||
|
void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1762,6 +1762,11 @@ int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, dou
|
||||||
int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
|
int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
|
||||||
int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
|
int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
|
||||||
|
|
||||||
|
int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG);
|
||||||
|
int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG);
|
||||||
|
int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG);
|
||||||
|
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __CUDACC__
|
#ifdef __CUDACC__
|
||||||
}
|
}
|
||||||
|
|
|
@ -634,7 +634,7 @@
|
||||||
#define OMATCOPY_K_RN DOMATCOPY_K_RN
|
#define OMATCOPY_K_RN DOMATCOPY_K_RN
|
||||||
#define OMATCOPY_K_CT DOMATCOPY_K_CT
|
#define OMATCOPY_K_CT DOMATCOPY_K_CT
|
||||||
#define OMATCOPY_K_RT DOMATCOPY_K_RT
|
#define OMATCOPY_K_RT DOMATCOPY_K_RT
|
||||||
|
#define GEADD_K DGEADD_K
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define AMAX_K SAMAX_K
|
#define AMAX_K SAMAX_K
|
||||||
|
@ -932,6 +932,7 @@
|
||||||
#define OMATCOPY_K_CT SOMATCOPY_K_CT
|
#define OMATCOPY_K_CT SOMATCOPY_K_CT
|
||||||
#define OMATCOPY_K_RT SOMATCOPY_K_RT
|
#define OMATCOPY_K_RT SOMATCOPY_K_RT
|
||||||
|
|
||||||
|
#define GEADD_K SGEADD_K
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -1746,6 +1747,7 @@
|
||||||
#define OMATCOPY_K_RNC ZOMATCOPY_K_RNC
|
#define OMATCOPY_K_RNC ZOMATCOPY_K_RNC
|
||||||
#define OMATCOPY_K_CTC ZOMATCOPY_K_CTC
|
#define OMATCOPY_K_CTC ZOMATCOPY_K_CTC
|
||||||
#define OMATCOPY_K_RTC ZOMATCOPY_K_RTC
|
#define OMATCOPY_K_RTC ZOMATCOPY_K_RTC
|
||||||
|
#define GEADD_K ZGEADD_K
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -2159,6 +2161,8 @@
|
||||||
#define OMATCOPY_K_CTC COMATCOPY_K_CTC
|
#define OMATCOPY_K_CTC COMATCOPY_K_CTC
|
||||||
#define OMATCOPY_K_RTC COMATCOPY_K_RTC
|
#define OMATCOPY_K_RTC COMATCOPY_K_RTC
|
||||||
|
|
||||||
|
#define GEADD_K CGEADD_K
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -855,6 +855,10 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||||
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||||
|
|
||||||
|
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
|
||||||
|
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
|
||||||
|
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
|
||||||
|
int (*zgeadd_k) (BLASLONG, BLASLONG, float, double, double *, BLASLONG, double, double, double *, BLASLONG);
|
||||||
|
|
||||||
} gotoblas_t;
|
} gotoblas_t;
|
||||||
|
|
||||||
|
|
|
@ -153,6 +153,7 @@
|
||||||
#define SOMATCOPY_K_CT somatcopy_k_ct
|
#define SOMATCOPY_K_CT somatcopy_k_ct
|
||||||
#define SOMATCOPY_K_RT somatcopy_k_rt
|
#define SOMATCOPY_K_RT somatcopy_k_rt
|
||||||
|
|
||||||
|
#define SGEADD_K sgeadd_k
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -274,6 +275,7 @@
|
||||||
#define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct
|
#define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct
|
||||||
#define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt
|
#define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt
|
||||||
|
|
||||||
|
#define SGEADD_K gotoblas -> sgeadd_k
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#define MMXSTORE movd
|
#define MMXSTORE movd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
|
||||||
//Enable some optimization for barcelona.
|
//Enable some optimization for barcelona.
|
||||||
#define BARCELONA_OPTIMIZATION
|
#define BARCELONA_OPTIMIZATION
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
|
|
||||||
#ifdef ASSEMBLER
|
#ifdef ASSEMBLER
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
|
||||||
//Enable some optimization for barcelona.
|
//Enable some optimization for barcelona.
|
||||||
#define BARCELONA_OPTIMIZATION
|
#define BARCELONA_OPTIMIZATION
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -220,6 +220,7 @@
|
||||||
#define ZOMATCOPY_K_CTC zomatcopy_k_ctc
|
#define ZOMATCOPY_K_CTC zomatcopy_k_ctc
|
||||||
#define ZOMATCOPY_K_RTC zomatcopy_k_rtc
|
#define ZOMATCOPY_K_RTC zomatcopy_k_rtc
|
||||||
|
|
||||||
|
#define ZGEADD_K zgeadd_k
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -403,6 +404,8 @@
|
||||||
#define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc
|
#define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc
|
||||||
#define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc
|
#define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc
|
||||||
|
|
||||||
|
#define ZGEADD_K gotoblas -> zgeadd_k
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define ZGEMM_NN zgemm_nn
|
#define ZGEMM_NN zgemm_nn
|
||||||
|
|
2
cpuid.h
2
cpuid.h
|
@ -108,6 +108,7 @@
|
||||||
#define CORE_BULLDOZER 22
|
#define CORE_BULLDOZER 22
|
||||||
#define CORE_PILEDRIVER 23
|
#define CORE_PILEDRIVER 23
|
||||||
#define CORE_HASWELL 24
|
#define CORE_HASWELL 24
|
||||||
|
#define CORE_STEAMROLLER 25
|
||||||
|
|
||||||
#define HAVE_SSE (1 << 0)
|
#define HAVE_SSE (1 << 0)
|
||||||
#define HAVE_SSE2 (1 << 1)
|
#define HAVE_SSE2 (1 << 1)
|
||||||
|
@ -201,5 +202,6 @@ typedef struct {
|
||||||
#define CPUTYPE_BULLDOZER 46
|
#define CPUTYPE_BULLDOZER 46
|
||||||
#define CPUTYPE_PILEDRIVER 47
|
#define CPUTYPE_PILEDRIVER 47
|
||||||
#define CPUTYPE_HASWELL 48
|
#define CPUTYPE_HASWELL 48
|
||||||
|
#define CPUTYPE_STEAMROLLER 49
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
95
cpuid_arm.c
95
cpuid_arm.c
|
@ -30,16 +30,27 @@
|
||||||
#define CPU_UNKNOWN 0
|
#define CPU_UNKNOWN 0
|
||||||
#define CPU_ARMV6 1
|
#define CPU_ARMV6 1
|
||||||
#define CPU_ARMV7 2
|
#define CPU_ARMV7 2
|
||||||
#define CPU_CORTEXA15 3
|
#define CPU_CORTEXA9 3
|
||||||
|
#define CPU_CORTEXA15 4
|
||||||
|
|
||||||
/*
 * Upper-case CPU names, indexed by the CPU_* constants
 * (CPU_UNKNOWN = 0 ... CPU_CORTEXA15 = 4).
 * get_subarchitecture() prints the selected entry verbatim, so entry 0
 * has to be spelled "UNKNOWN".
 */
static char *cpuname[] = {
  "UNKNOWN",
  "ARMV6",
  "ARMV7",
  "CORTEXA9",
  "CORTEXA15"
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Lower-case CPU names, indexed by the CPU_* constants
   (CPU_UNKNOWN = 0 ... CPU_CORTEXA15 = 4); get_libname() prints the
   entry selected by detect(). */
static char *cpuname_lower[] = {
|
||||||
|
"unknown",
|
||||||
|
"armv6",
|
||||||
|
"armv7",
|
||||||
|
"cortexa9",
|
||||||
|
"cortexa15"
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
int get_feature(char *search)
|
int get_feature(char *search)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -85,6 +96,29 @@ int detect(void)
|
||||||
char buffer[512], *p;
|
char buffer[512], *p;
|
||||||
p = (char *) NULL ;
|
p = (char *) NULL ;
|
||||||
|
|
||||||
|
infile = fopen("/proc/cpuinfo", "r");
|
||||||
|
while (fgets(buffer, sizeof(buffer), infile))
|
||||||
|
{
|
||||||
|
|
||||||
|
if (!strncmp("CPU part", buffer, 8))
|
||||||
|
{
|
||||||
|
p = strchr(buffer, ':') + 2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(infile);
|
||||||
|
if(p != NULL) {
|
||||||
|
if (strstr(p, "0xc09")) {
|
||||||
|
return CPU_CORTEXA9;
|
||||||
|
}
|
||||||
|
if (strstr(p, "0xc0f")) {
|
||||||
|
return CPU_CORTEXA15;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
p = (char *) NULL ;
|
||||||
infile = fopen("/proc/cpuinfo", "r");
|
infile = fopen("/proc/cpuinfo", "r");
|
||||||
|
|
||||||
while (fgets(buffer, sizeof(buffer), infile))
|
while (fgets(buffer, sizeof(buffer), infile))
|
||||||
|
@ -142,21 +176,7 @@ void get_architecture(void)
|
||||||
void get_subarchitecture(void)
|
void get_subarchitecture(void)
|
||||||
{
|
{
|
||||||
int d = detect();
|
int d = detect();
|
||||||
switch (d)
|
printf("%s", cpuname[d]);
|
||||||
{
|
|
||||||
|
|
||||||
case CPU_ARMV7:
|
|
||||||
printf("ARMV7");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case CPU_ARMV6:
|
|
||||||
printf("ARMV6");
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
printf("UNKNOWN");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_subdirname(void)
|
void get_subdirname(void)
|
||||||
|
@ -170,6 +190,36 @@ void get_cpuconfig(void)
|
||||||
int d = detect();
|
int d = detect();
|
||||||
switch (d)
|
switch (d)
|
||||||
{
|
{
|
||||||
|
case CPU_CORTEXA9:
|
||||||
|
printf("#define CORTEXA9\n");
|
||||||
|
printf("#define HAVE_VFP\n");
|
||||||
|
printf("#define HAVE_VFPV3\n");
|
||||||
|
if ( get_feature("neon")) printf("#define HAVE_NEON\n");
|
||||||
|
if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 32\n");
|
||||||
|
printf("#define L2_SIZE 1048576\n");
|
||||||
|
printf("#define L2_LINESIZE 32\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 128\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CPU_CORTEXA15:
|
||||||
|
printf("#define CORTEXA15\n");
|
||||||
|
printf("#define HAVE_VFP\n");
|
||||||
|
printf("#define HAVE_VFPV3\n");
|
||||||
|
if ( get_feature("neon")) printf("#define HAVE_NEON\n");
|
||||||
|
if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 32\n");
|
||||||
|
printf("#define L2_SIZE 1048576\n");
|
||||||
|
printf("#define L2_LINESIZE 32\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 128\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
|
||||||
case CPU_ARMV7:
|
case CPU_ARMV7:
|
||||||
printf("#define ARMV7\n");
|
printf("#define ARMV7\n");
|
||||||
|
@ -206,18 +256,7 @@ void get_libname(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
int d = detect();
|
int d = detect();
|
||||||
switch (d)
|
printf("%s", cpuname_lower[d]);
|
||||||
{
|
|
||||||
|
|
||||||
case CPU_ARMV7:
|
|
||||||
printf("armv7\n");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case CPU_ARMV6:
|
|
||||||
printf("armv6\n");
|
|
||||||
break;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
18
cpuid_x86.c
18
cpuid_x86.c
|
@ -1162,6 +1162,12 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_PILEDRIVER;
|
return CPUTYPE_PILEDRIVER;
|
||||||
else
|
else
|
||||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
|
case 0:
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_STEAMROLLER;
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 5:
|
case 5:
|
||||||
|
@ -1290,6 +1296,7 @@ static char *cpuname[] = {
|
||||||
"BULLDOZER",
|
"BULLDOZER",
|
||||||
"PILEDRIVER",
|
"PILEDRIVER",
|
||||||
"HASWELL",
|
"HASWELL",
|
||||||
|
"STEAMROLLER",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1341,6 +1348,7 @@ static char *lowercpuname[] = {
|
||||||
"bulldozer",
|
"bulldozer",
|
||||||
"piledriver",
|
"piledriver",
|
||||||
"haswell",
|
"haswell",
|
||||||
|
"steamroller",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
@ -1369,6 +1377,7 @@ static char *corename[] = {
|
||||||
"BULLDOZER",
|
"BULLDOZER",
|
||||||
"PILEDRIVER",
|
"PILEDRIVER",
|
||||||
"HASWELL",
|
"HASWELL",
|
||||||
|
"STEAMROLLER",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1397,6 +1406,7 @@ static char *corename_lower[] = {
|
||||||
"bulldozer",
|
"bulldozer",
|
||||||
"piledriver",
|
"piledriver",
|
||||||
"haswell",
|
"haswell",
|
||||||
|
"steamroller",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1562,7 +1572,15 @@ int get_coretype(void){
|
||||||
return CORE_PILEDRIVER;
|
return CORE_PILEDRIVER;
|
||||||
else
|
else
|
||||||
return CORE_BARCELONA; //OS don't support AVX.
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
|
|
||||||
|
case 0:
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_STEAMROLLER;
|
||||||
|
else
|
||||||
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}else return CORE_BARCELONA;
|
}else return CORE_BARCELONA;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
TOPDIR = ../..
|
TOPDIR = ../..
|
||||||
include ../../Makefile.system
|
include ../../Makefile.system
|
||||||
|
|
||||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||||
|
|
||||||
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||||
|
|
||||||
|
@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
|
||||||
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT;
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
extern gotoblas_t gotoblas_BULLDOZER;
|
extern gotoblas_t gotoblas_BULLDOZER;
|
||||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||||
|
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||||
#ifdef NO_AVX2
|
#ifdef NO_AVX2
|
||||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||||
#else
|
#else
|
||||||
|
@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL;
|
||||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||||
|
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){
|
||||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
|
}else if(model == 0){
|
||||||
|
//AMD STEAMROLLER
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_STEAMROLLER;
|
||||||
|
else{
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||||
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
return &gotoblas_BARCELONA;
|
return &gotoblas_BARCELONA;
|
||||||
}
|
}
|
||||||
|
@ -315,6 +327,7 @@ static char *corename[] = {
|
||||||
"Bulldozer",
|
"Bulldozer",
|
||||||
"Piledriver",
|
"Piledriver",
|
||||||
"Haswell",
|
"Haswell",
|
||||||
|
"Steamroller",
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -339,6 +352,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||||
|
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||||
|
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
@ -349,9 +363,9 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||||
int i ;
|
int i ;
|
||||||
int found = -1;
|
int found = -1;
|
||||||
char message[128];
|
char message[128];
|
||||||
char mname[20];
|
//char mname[20];
|
||||||
|
|
||||||
for ( i=1 ; i <= 20; i++)
|
for ( i=1 ; i <= 21; i++)
|
||||||
{
|
{
|
||||||
if (!strncasecmp(coretype,corename[i],20))
|
if (!strncasecmp(coretype,corename[i],20))
|
||||||
{
|
{
|
||||||
|
@ -361,8 +375,8 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||||
}
|
}
|
||||||
if (found < 0)
|
if (found < 0)
|
||||||
{
|
{
|
||||||
strncpy(mname,coretype,20);
|
//strncpy(mname,coretype,20);
|
||||||
sprintf(message, "Core not found: %s\n",mname);
|
snprintf(message, 128, "Core not found: %s\n",coretype);
|
||||||
openblas_warning(1, message);
|
openblas_warning(1, message);
|
||||||
return(NULL);
|
return(NULL);
|
||||||
}
|
}
|
||||||
|
@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||||
switch (found)
|
switch (found)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
case 21: return (&gotoblas_STEAMROLLER);
|
||||||
case 20: return (&gotoblas_HASWELL);
|
case 20: return (&gotoblas_HASWELL);
|
||||||
case 19: return (&gotoblas_PILEDRIVER);
|
case 19: return (&gotoblas_PILEDRIVER);
|
||||||
case 18: return (&gotoblas_BULLDOZER);
|
case 18: return (&gotoblas_BULLDOZER);
|
||||||
|
|
|
@ -241,6 +241,7 @@ void set_stack_limit(int limitMB){
|
||||||
*/
|
*/
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
OpenBLAS uses the numbers of CPU cores in multithreading.
|
OpenBLAS uses the numbers of CPU cores in multithreading.
|
||||||
It can be set by openblas_set_num_threads(int num_threads);
|
It can be set by openblas_set_num_threads(int num_threads);
|
||||||
|
@ -323,6 +324,23 @@ int blas_get_cpu_number(void){
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* Processor-count query: returns 1 when the library is built without SMP,
   otherwise forwards to get_num_procs(). */
int openblas_get_num_procs(void) {
|
||||||
|
#ifndef SMP
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return get_num_procs();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Thread-count query: returns 1 when the library is built without SMP,
   otherwise forwards to blas_get_cpu_number(). */
int openblas_get_num_threads(void) {
|
||||||
|
#ifndef SMP
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return blas_get_cpu_number();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
struct release_t {
|
struct release_t {
|
||||||
void *address;
|
void *address;
|
||||||
void (*func)(struct release_t *);
|
void (*func)(struct release_t *);
|
||||||
|
@ -1335,6 +1353,8 @@ void DESTRUCTOR gotoblas_quit(void) {
|
||||||
|
|
||||||
if (gotoblas_initialized == 0) return;
|
if (gotoblas_initialized == 0) return;
|
||||||
|
|
||||||
|
blas_shutdown();
|
||||||
|
|
||||||
#ifdef PROFILE
|
#ifdef PROFILE
|
||||||
moncontrol (0);
|
moncontrol (0);
|
||||||
#endif
|
#endif
|
||||||
|
@ -1356,8 +1376,6 @@ void DESTRUCTOR gotoblas_quit(void) {
|
||||||
#ifdef PROFILE
|
#ifdef PROFILE
|
||||||
moncontrol (1);
|
moncontrol (1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_shutdown();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011-2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
extern int openblas_get_num_procs(void);
|
||||||
|
|
||||||
|
/* Trailing-underscore entry point that simply forwards to
   openblas_get_num_procs().
   NOTE(review): presumably the Fortran-style symbol variant -- confirm
   against the exported-symbol list. */
int openblas_get_num_procs_(void) {
|
||||||
|
return openblas_get_num_procs();
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011-2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
extern int openblas_get_num_threads(void);
|
||||||
|
|
||||||
|
/* Trailing-underscore entry point that simply forwards to
   openblas_get_num_threads().
   NOTE(review): presumably the Fortran-style symbol variant -- confirm
   against the exported-symbol list. */
int openblas_get_num_threads_(void) {
|
||||||
|
return openblas_get_num_threads();
|
||||||
|
}
|
|
@ -166,7 +166,7 @@ int get_L2_size(void){
|
||||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||||
defined(PILEDRIVER) || defined(HASWELL)
|
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
|
||||||
|
|
||||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||||
|
|
||||||
|
@ -251,7 +251,7 @@ void blas_set_parameter(void){
|
||||||
|
|
||||||
env_var_t p;
|
env_var_t p;
|
||||||
int factor;
|
int factor;
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||||
int size = 16;
|
int size = 16;
|
||||||
#else
|
#else
|
||||||
int size = get_L2_size();
|
int size = get_L2_size();
|
||||||
|
|
|
@ -100,7 +100,12 @@ else
|
||||||
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||||
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
|
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(NOFORTRAN), 2)
|
||||||
|
#only build cblas without Fortran
|
||||||
|
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
|
else
|
||||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
|
endif
|
||||||
|
|
||||||
dllinit.$(SUFFIX) : dllinit.c
|
dllinit.$(SUFFIX) : dllinit.c
|
||||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||||
|
|
|
@ -23,7 +23,8 @@
|
||||||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
||||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv,
|
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv,
|
||||||
xerbla,
|
xerbla,
|
||||||
saxpby,daxpby,caxpby,zaxpby
|
saxpby,daxpby,caxpby,zaxpby,
|
||||||
|
sgeadd,dgeadd,cgeadd,zgeadd,
|
||||||
);
|
);
|
||||||
|
|
||||||
@cblasobjs = (
|
@cblasobjs = (
|
||||||
|
@ -55,6 +56,7 @@
|
||||||
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
|
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
|
||||||
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
|
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
|
||||||
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
|
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
|
||||||
|
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd
|
||||||
);
|
);
|
||||||
|
|
||||||
@exblasobjs = (
|
@exblasobjs = (
|
||||||
|
@ -81,7 +83,10 @@
|
||||||
|
|
||||||
#both underscore and no underscore
|
#both underscore and no underscore
|
||||||
@misc_common_objs = (
|
@misc_common_objs = (
|
||||||
openblas_set_num_threads, openblas_get_parallel,
|
openblas_get_parallel,
|
||||||
|
openblas_get_num_procs,
|
||||||
|
openblas_set_num_threads,
|
||||||
|
openblas_get_num_threads,
|
||||||
);
|
);
|
||||||
|
|
||||||
@misc_no_underscore_objs = (
|
@misc_no_underscore_objs = (
|
||||||
|
|
47
getarch.c
47
getarch.c
|
@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "PILEDRIVER"
|
#define CORENAME "PILEDRIVER"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined (FORCE_STEAMROLLER)
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "STEAMROLLER"
|
||||||
|
#define ARCHCONFIG "-DSTEAMROLLER " \
|
||||||
|
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
|
||||||
|
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
|
||||||
|
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
|
||||||
|
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
|
||||||
|
#define LIBNAME "steamroller"
|
||||||
|
#define CORENAME "STEAMROLLER"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef FORCE_SSE_GENERIC
|
#ifdef FORCE_SSE_GENERIC
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
@ -710,6 +727,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#else
|
#else
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_CORTEXA9
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "ARM"
|
||||||
|
#define SUBARCHITECTURE "CORTEXA9"
|
||||||
|
#define SUBDIRNAME "arm"
|
||||||
|
#define ARCHCONFIG "-DCORTEXA9 " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||||
|
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
|
||||||
|
"-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||||
|
#define LIBNAME "cortexa9"
|
||||||
|
#define CORENAME "CORTEXA9"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_CORTEXA15
|
||||||
|
#define FORCE
|
||||||
|
#define ARCHITECTURE "ARM"
|
||||||
|
#define SUBARCHITECTURE "CORTEXA15"
|
||||||
|
#define SUBDIRNAME "arm"
|
||||||
|
#define ARCHCONFIG "-DCORTEXA15 " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||||
|
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
|
||||||
|
"-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||||
|
#define LIBNAME "cortexa15"
|
||||||
|
#define CORENAME "CORTEXA15"
|
||||||
|
#else
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_ARMV6
|
#ifdef FORCE_ARMV6
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define ARCHITECTURE "ARM"
|
#define ARCHITECTURE "ARM"
|
||||||
|
|
|
@ -43,7 +43,8 @@ SBLAS2OBJS = \
|
||||||
SBLAS3OBJS = \
|
SBLAS3OBJS = \
|
||||||
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
|
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
|
||||||
strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
|
strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
|
||||||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)
|
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
||||||
|
sgeadd.$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
DBLAS1OBJS = \
|
DBLAS1OBJS = \
|
||||||
|
@ -68,7 +69,8 @@ DBLAS2OBJS = \
|
||||||
DBLAS3OBJS = \
|
DBLAS3OBJS = \
|
||||||
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
|
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
|
||||||
dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
|
dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
|
||||||
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)
|
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\
|
||||||
|
dgeadd.$(SUFFIX)
|
||||||
|
|
||||||
CBLAS1OBJS = \
|
CBLAS1OBJS = \
|
||||||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||||
|
@ -96,7 +98,8 @@ CBLAS3OBJS = \
|
||||||
cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \
|
cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \
|
||||||
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
|
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
|
||||||
chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
|
chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
|
||||||
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)
|
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\
|
||||||
|
cgeadd.$(SUFFIX)
|
||||||
|
|
||||||
ZBLAS1OBJS = \
|
ZBLAS1OBJS = \
|
||||||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||||
|
@ -124,7 +127,8 @@ ZBLAS3OBJS = \
|
||||||
zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \
|
zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \
|
||||||
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
|
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
|
||||||
zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
|
zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
|
||||||
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)
|
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\
|
||||||
|
zgeadd.$(SUFFIX)
|
||||||
|
|
||||||
ifeq ($(SUPPORT_GEMM3M), 1)
|
ifeq ($(SUPPORT_GEMM3M), 1)
|
||||||
|
|
||||||
|
@ -269,7 +273,8 @@ CSBLAS2OBJS = \
|
||||||
|
|
||||||
CSBLAS3OBJS = \
|
CSBLAS3OBJS = \
|
||||||
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
|
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
|
||||||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)
|
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
||||||
|
cblas_sgeadd.$(SUFFIX)
|
||||||
|
|
||||||
CDBLAS1OBJS = \
|
CDBLAS1OBJS = \
|
||||||
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||||
|
@ -285,7 +290,8 @@ CDBLAS2OBJS = \
|
||||||
|
|
||||||
CDBLAS3OBJS += \
|
CDBLAS3OBJS += \
|
||||||
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
|
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
|
||||||
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX)
|
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \
|
||||||
|
cblas_dgeadd.$(SUFFIX)
|
||||||
|
|
||||||
CCBLAS1OBJS = \
|
CCBLAS1OBJS = \
|
||||||
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||||
|
@ -308,7 +314,9 @@ CCBLAS3OBJS = \
|
||||||
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
|
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
|
||||||
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
|
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
|
||||||
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
|
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
|
||||||
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)
|
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
|
||||||
|
cblas_cgeadd.$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
CZBLAS1OBJS = \
|
CZBLAS1OBJS = \
|
||||||
|
@ -332,7 +340,9 @@ CZBLAS3OBJS = \
|
||||||
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
|
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
|
||||||
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
|
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
|
||||||
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
|
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
|
||||||
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX)
|
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
|
||||||
|
cblas_zgeadd.$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
ifeq ($(SUPPORT_GEMM3M), 1)
|
ifeq ($(SUPPORT_GEMM3M), 1)
|
||||||
|
|
||||||
|
@ -2103,4 +2113,27 @@ zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c
|
||||||
cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c
|
cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c
|
||||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
|
sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c
|
||||||
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c
|
||||||
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c
|
||||||
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c
|
||||||
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c
|
||||||
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c
|
||||||
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c
|
||||||
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
|
||||||
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,148 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
#ifdef FUNCTION_PROFILE
|
||||||
|
#include "functable.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
#define ERROR_NAME "DGEADD "
|
||||||
|
#else
|
||||||
|
#define ERROR_NAME "SGEADD "
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef CBLAS
|
||||||
|
|
||||||
|
void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
||||||
|
FLOAT *BETA, FLOAT *c, blasint *LDC)
|
||||||
|
{
|
||||||
|
|
||||||
|
blasint m = *M;
|
||||||
|
blasint n = *N;
|
||||||
|
blasint lda = *LDA;
|
||||||
|
blasint ldc = *LDC;
|
||||||
|
FLOAT alpha = *ALPHA;
|
||||||
|
FLOAT beta = *BETA;
|
||||||
|
|
||||||
|
blasint info;
|
||||||
|
|
||||||
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
|
info = 0;
|
||||||
|
|
||||||
|
|
||||||
|
if (lda < MAX(1, m)) info = 6;
|
||||||
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
|
||||||
|
if (n < 0) info = 2;
|
||||||
|
if (m < 0) info = 1;
|
||||||
|
|
||||||
|
if (info != 0){
|
||||||
|
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta,
|
||||||
|
FLOAT *c, blasint ldc)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
void CNAME(enum CBLAS_ORDER order,
|
||||||
|
blasint m, blasint n,
|
||||||
|
FLOAT alpha,
|
||||||
|
FLOAT *a, blasint lda,
|
||||||
|
FLOAT beta,
|
||||||
|
FLOAT *c, blasint ldc){ */
|
||||||
|
|
||||||
|
blasint info, t;
|
||||||
|
|
||||||
|
PRINT_DEBUG_CNAME;
|
||||||
|
|
||||||
|
info = 0;
|
||||||
|
|
||||||
|
if (order == CblasColMajor) {
|
||||||
|
|
||||||
|
info = -1;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
if (lda < MAX(1, m)) info = 5;
|
||||||
|
if (n < 0) info = 2;
|
||||||
|
if (m < 0) info = 1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (order == CblasRowMajor) {
|
||||||
|
info = -1;
|
||||||
|
|
||||||
|
t = n;
|
||||||
|
n = m;
|
||||||
|
m = t;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
if (lda < MAX(1, m)) info = 5;
|
||||||
|
if (n < 0) info = 2;
|
||||||
|
if (m < 0) info = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (info >= 0) {
|
||||||
|
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ((m==0) || (n==0)) return;
|
||||||
|
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
|
|
||||||
|
GEADD_K(m,n,alpha, a, lda, beta, c, ldc);
|
||||||
|
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);
|
||||||
|
|
||||||
|
IDEBUG_END;
|
||||||
|
|
||||||
|
return;
|
||||||
|
|
||||||
|
}
|
|
@ -208,7 +208,20 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incx < 0) x -= (lenx - 1) * incx;
|
if (incx < 0) x -= (lenx - 1) * incx;
|
||||||
if (incy < 0) y -= (leny - 1) * incy;
|
if (incy < 0) y -= (leny - 1) * incy;
|
||||||
|
|
||||||
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
// make it volatile because some gemv implementation (ex: dgemv_n.S)
|
||||||
|
// do not restore all register
|
||||||
|
volatile int stack_alloc_size = m + n;
|
||||||
|
if(stack_alloc_size < 128)
|
||||||
|
//dgemv_n.S require a 128 bytes buffer
|
||||||
|
stack_alloc_size = 128;
|
||||||
|
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||||
|
stack_alloc_size = 0;
|
||||||
|
FLOAT stack_buffer[stack_alloc_size];
|
||||||
|
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#else
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
|
@ -237,6 +250,9 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
if(!stack_alloc_size)
|
||||||
|
#endif
|
||||||
blas_memory_free(buffer);
|
blas_memory_free(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||||
|
|
|
@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
if (incy < 0) y -= (n - 1) * incy;
|
if (incy < 0) y -= (n - 1) * incy;
|
||||||
if (incx < 0) x -= (m - 1) * incx;
|
if (incx < 0) x -= (m - 1) * incx;
|
||||||
|
|
||||||
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
volatile int stack_alloc_size = m;
|
||||||
|
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||||
|
stack_alloc_size = 0;
|
||||||
|
FLOAT stack_buffer[stack_alloc_size];
|
||||||
|
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#else
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef SMPTEST
|
#ifdef SMPTEST
|
||||||
nthreads = num_cpu_avail(2);
|
nthreads = num_cpu_avail(2);
|
||||||
|
@ -190,6 +198,9 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef MAX_STACK_ALLOC
|
||||||
|
if(!stack_alloc_size)
|
||||||
|
#endif
|
||||||
blas_memory_free(buffer);
|
blas_memory_free(buffer);
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||||
|
|
|
@ -0,0 +1,146 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
#ifdef FUNCTION_PROFILE
|
||||||
|
#include "functable.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
#define ERROR_NAME "ZGEADD "
|
||||||
|
#else
|
||||||
|
#define ERROR_NAME "CGEADD "
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef CBLAS
|
||||||
|
|
||||||
|
void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
||||||
|
FLOAT *BETA, FLOAT *c, blasint *LDC)
|
||||||
|
{
|
||||||
|
|
||||||
|
blasint m = *M;
|
||||||
|
blasint n = *N;
|
||||||
|
blasint lda = *LDA;
|
||||||
|
blasint ldc = *LDC;
|
||||||
|
|
||||||
|
blasint info;
|
||||||
|
|
||||||
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
|
info = 0;
|
||||||
|
|
||||||
|
|
||||||
|
if (lda < MAX(1, m)) info = 6;
|
||||||
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
|
||||||
|
if (n < 0) info = 2;
|
||||||
|
if (m < 0) info = 1;
|
||||||
|
|
||||||
|
if (info != 0){
|
||||||
|
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA,
|
||||||
|
FLOAT *c, blasint ldc)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
void CNAME(enum CBLAS_ORDER order,
|
||||||
|
blasint m, blasint n,
|
||||||
|
FLOAT alpha,
|
||||||
|
FLOAT *a, blasint lda,
|
||||||
|
FLOAT beta,
|
||||||
|
FLOAT *c, blasint ldc){ */
|
||||||
|
|
||||||
|
blasint info, t;
|
||||||
|
|
||||||
|
PRINT_DEBUG_CNAME;
|
||||||
|
|
||||||
|
info = 0;
|
||||||
|
|
||||||
|
if (order == CblasColMajor) {
|
||||||
|
|
||||||
|
info = -1;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
if (lda < MAX(1, m)) info = 5;
|
||||||
|
if (n < 0) info = 2;
|
||||||
|
if (m < 0) info = 1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (order == CblasRowMajor) {
|
||||||
|
info = -1;
|
||||||
|
|
||||||
|
t = n;
|
||||||
|
n = m;
|
||||||
|
m = t;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
if (lda < MAX(1, m)) info = 5;
|
||||||
|
if (n < 0) info = 2;
|
||||||
|
if (m < 0) info = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (info >= 0) {
|
||||||
|
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ((m==0) || (n==0)) return;
|
||||||
|
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
|
|
||||||
|
GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc);
|
||||||
|
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);
|
||||||
|
|
||||||
|
IDEBUG_END;
|
||||||
|
|
||||||
|
return;
|
||||||
|
|
||||||
|
}
|
|
@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
if (incx < 0 ) x -= (n - 1) * incx;
|
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
|
||||||
|
|
|
@ -329,23 +329,27 @@ endif
|
||||||
###### BLAS extensions #####
|
###### BLAS extensions #####
|
||||||
SBLASOBJS += \
|
SBLASOBJS += \
|
||||||
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||||
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX)
|
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||||
|
sgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DBLASOBJS += \
|
DBLASOBJS += \
|
||||||
domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||||
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX)
|
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||||
|
dgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CBLASOBJS += \
|
CBLASOBJS += \
|
||||||
comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||||
comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||||
comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
||||||
comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
|
comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||||
|
cgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZBLASOBJS += \
|
ZBLASOBJS += \
|
||||||
zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||||
zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||||
zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
||||||
zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
|
zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||||
|
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||||
|
@ -3440,3 +3444,31 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC)
|
||||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
|
||||||
|
|
||||||
|
|
||||||
|
ifndef SGEADD_K
|
||||||
|
SGEADD_K = ../generic/geadd.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K)
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
|
||||||
|
|
||||||
|
# Double-precision real geadd kernel. Falls back to the generic C
# implementation unless the target's KERNEL file sets DGEADD_K.
ifndef DGEADD_K
DGEADD_K = ../generic/geadd.c
endif

# The prerequisite must be DGEADD_K (not SGEADD_K); otherwise a
# target-specific DGEADD_K override would be ignored and an SGEADD_K
# override would be compiled as the double-precision kernel.
$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
|
||||||
|
|
||||||
|
ifndef CGEADD_K
|
||||||
|
CGEADD_K = ../generic/zgeadd.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K)
|
||||||
|
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@
|
||||||
|
|
||||||
|
ifndef ZGEADD_K
|
||||||
|
ZGEADD_K = ../generic/zgeadd.c
|
||||||
|
endif
|
||||||
|
|
||||||
|
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
|
||||||
|
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
include $(KERNELDIR)/KERNEL.ARMV7
|
|
@ -0,0 +1 @@
|
||||||
|
include $(KERNELDIR)/KERNEL.ARMV7
|
|
@ -0,0 +1,64 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Generic real geadd kernel.
 *
 * Walks `cols` columns of a and b, advancing by lda and ldb elements
 * respectively, and updates each column of b in place over `rows`
 * contiguous elements:
 *   - alpha == 0: only SCAL_K(beta) is applied to the column of b;
 *                 a is never passed to a kernel;
 *   - otherwise:  AXPBY_K(alpha, a-column, beta, b-column).
 *
 * Always returns 0; non-positive rows or cols is a no-op.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb)
{
	FLOAT *src = a;
	FLOAT *dst = b;
	BLASLONG col;

	if (rows <= 0 || cols <= 0) {
		return 0;
	}

	if (alpha == 0.0) {
		/* The a operand does not contribute: just scale b column by column. */
		for (col = 0; col < cols; col++, dst += ldb) {
			SCAL_K(rows, 0, 0, beta, dst, 1, NULL, 0, NULL, 0);
		}
	} else {
		for (col = 0; col < cols; col++, src += lda, dst += ldb) {
			AXPBY_K(rows, alpha, src, 1, beta, dst, 1);
		}
	}

	return 0;
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Column-by-column update of the matrix at `b` using the matrix at `a`.
 *
 * rows, cols     : extent of the update; the call returns 0 without touching
 *                  either matrix when rows <= 0 or cols <= 0.
 * alphar, alphai : scalar applied to `a`, passed as two FLOATs
 *                  (presumably the real and imaginary parts -- TODO confirm).
 * a, lda         : source matrix and its column stride.
 * betar, betai   : scalar applied to `b`, passed as two FLOATs.
 * b, ldb         : destination matrix (updated in place) and its column stride.
 *
 * Each of the `cols` columns is handed to a vector kernel with unit
 * increment: SCAL_K on `b` alone when both alpha components are zero,
 * AXPBY_K on `a` and `b` otherwise.
 * NOTE(review): presumably b := alpha*a + beta*b (geadd-style); the exact
 * semantics live in SCAL_K / AXPBY_K, which are defined elsewhere.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alphar, FLOAT alphai, FLOAT *a, BLASLONG lda, FLOAT betar, FLOAT betai , FLOAT *b, BLASLONG ldb)
|
||||||
|
{
|
||||||
|
BLASLONG i;
|
||||||
|
FLOAT *aptr,*bptr;
|
||||||
|
|
||||||
|
/* Nothing to do for an empty matrix. */
if ( rows <= 0 ) return(0);
|
||||||
|
if ( cols <= 0 ) return(0);
|
||||||
|
|
||||||
|
|
||||||
|
aptr = a;
|
||||||
|
bptr = b;
|
||||||
|
/* Strides are doubled before being used as FLOAT-pointer offsets
   (presumably each element occupies two FLOATs -- TODO confirm). */
lda *= 2;
|
||||||
|
ldb *= 2;
|
||||||
|
|
||||||
|
/* alpha == 0: `a` is never read; only SCAL_K is applied to each column of `b`. */
if ( alphar == 0.0 && alphai == 0.0 )
|
||||||
|
{
|
||||||
|
for ( i=0; i<cols ; i++ )
|
||||||
|
{
|
||||||
|
SCAL_K(rows, 0,0, betar, betai, bptr, 1, NULL, 0,NULL,0);
|
||||||
|
bptr+=ldb;
|
||||||
|
}
|
||||||
|
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* General case: one AXPBY_K call per column, advancing both column pointers. */
for (i = 0; i < cols; i++) {
|
||||||
|
AXPBY_K(rows, alphar, alphai, aptr, 1, betar, betai, bptr, 1);
|
||||||
|
aptr += lda;
|
||||||
|
|
||||||
|
bptr += ldb;
|
||||||
|
}
|
||||||
|
return(0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -548,8 +548,9 @@ gotoblas_t TABLE_NAME = {
|
||||||
comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
|
comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
|
||||||
comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
|
comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
|
||||||
zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
|
zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
|
||||||
zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS
|
zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS,
|
||||||
|
|
||||||
|
sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -941,6 +942,23 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef STEAMROLLER
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "Steamroller\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef NANO
|
#ifdef NANO
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
|
@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
|
||||||
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
|
ZGEMMINCOPY = zgemm_ncopy_1.S
|
||||||
ZGEMMINCOPY =
|
ZGEMMITCOPY = zgemm_tcopy_1.S
|
||||||
ZGEMMITCOPY =
|
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
ZGEMMINCOPYOBJ =
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMITCOPYOBJ =
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
|
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
|
||||||
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
|
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
|
||||||
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
||||||
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
DAXPYKERNEL = daxpy.c
|
||||||
|
CAXPYKERNEL = caxpy.c
|
||||||
|
ZAXPYKERNEL = zaxpy.c
|
||||||
|
|
||||||
|
SDOTKERNEL = sdot.c
|
||||||
|
DDOTKERNEL = ddot.c
|
||||||
|
|
||||||
|
DSYMV_U_KERNEL = dsymv_U.c
|
||||||
|
DSYMV_L_KERNEL = dsymv_L.c
|
||||||
|
SSYMV_U_KERNEL = ssymv_U.c
|
||||||
|
SSYMV_L_KERNEL = ssymv_L.c
|
||||||
|
|
||||||
|
SGEMVNKERNEL = sgemv_n_4.c
|
||||||
|
SGEMVTKERNEL = sgemv_t_4.c
|
||||||
|
|
||||||
|
DGEMVNKERNEL = dgemv_n_4.c
|
||||||
|
DGEMVTKERNEL = dgemv_t_4.c
|
||||||
|
|
||||||
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t_4.c
|
||||||
|
|
||||||
|
DCOPYKERNEL = dcopy_bulldozer.S
|
||||||
|
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
|
||||||
|
ZGEMMINCOPY =
|
||||||
|
ZGEMMITCOPY =
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
|
||||||
|
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "caxpy_microk_bulldozer-2.c"
|
#include "caxpy_microk_bulldozer-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
|
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"prefetcht0 768(%2,%0,4) \n\t"
|
"prefetcht0 768(%2,%0,4) \n\t"
|
||||||
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
|
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
|
||||||
|
@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
"addq $16, %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
|
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
|
||||||
|
|
||||||
"cmpq $0 , %1 \n\t"
|
"cmpq $0 , %1 \n\t"
|
||||||
"je .L01END%= \n\t"
|
"je 2f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"prefetcht0 320(%4,%0,4) \n\t"
|
"prefetcht0 320(%4,%0,4) \n\t"
|
||||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||||
|
@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
"addq $16, %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L01END%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $4, %8 \n\t"
|
"cmpq $4, %8 \n\t"
|
||||||
"jne .L02END%= \n\t"
|
"jne 3f \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||||
|
@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
|
|
||||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||||
|
|
||||||
".L02END%=: \n\t"
|
"3: \n\t"
|
||||||
"vzeroupper \n\t"
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
|
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
|
||||||
|
|
||||||
"cmpq $0 , %1 \n\t"
|
"cmpq $0 , %1 \n\t"
|
||||||
"je .L01END%= \n\t"
|
"je 2f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"prefetcht0 320(%4,%0,4) \n\t"
|
"prefetcht0 320(%4,%0,4) \n\t"
|
||||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||||
|
@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
"addq $16, %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L01END%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $4, %6 \n\t"
|
"cmpq $4, %6 \n\t"
|
||||||
"jne .L02END%= \n\t"
|
"jne 3f \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||||
|
@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
|
|
||||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||||
|
|
||||||
".L02END%=: \n\t"
|
"3: \n\t"
|
||||||
"vzeroupper \n\t"
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||||
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
|
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
|
||||||
|
|
||||||
"cmpq $0 , %1 \n\t"
|
"cmpq $0 , %1 \n\t"
|
||||||
"je .L01END%= \n\t"
|
"je 2f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"prefetcht0 320(%4,%0,4) \n\t"
|
"prefetcht0 320(%4,%0,4) \n\t"
|
||||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||||
|
@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||||
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
||||||
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L01END%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $4, %5 \n\t"
|
"cmpq $4, %5 \n\t"
|
||||||
"jne .L02END%= \n\t"
|
"jne 3f \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||||
|
|
||||||
|
@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||||
|
|
||||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||||
|
|
||||||
".L02END%=: \n\t"
|
"3: \n\t"
|
||||||
"vzeroupper \n\t"
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
||||||
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
|
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
|
||||||
|
|
||||||
"cmpq $0 , %1 \n\t"
|
"cmpq $0 , %1 \n\t"
|
||||||
"je .L01END%= \n\t"
|
"je 2f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
|
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
|
||||||
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
|
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
|
||||||
|
|
||||||
|
@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
||||||
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
||||||
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L01END%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $4, %6 \n\t"
|
"cmpq $4, %6 \n\t"
|
||||||
"jne .L02END%= \n\t"
|
"jne 3f \n\t"
|
||||||
|
|
||||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
|
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
|
||||||
|
|
||||||
|
@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
||||||
|
|
||||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||||
|
|
||||||
".L02END%=: \n\t"
|
"3: \n\t"
|
||||||
"vzeroupper \n\t"
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
|
|
@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
|
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L08LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||||
|
@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L08LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L08END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"prefetcht0 192(%4,%0,4) \n\t"
|
"prefetcht0 192(%4,%0,4) \n\t"
|
||||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||||
"prefetcht0 192(%5,%0,4) \n\t"
|
"prefetcht0 192(%5,%0,4) \n\t"
|
||||||
|
@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
"addq $16 , %0 \n\t"
|
"addq $16 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L08END%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
|
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
|
||||||
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
|
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
|
||||||
|
@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
|
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L08LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||||
|
@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L08LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L08END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"prefetcht0 192(%4,%0,4) \n\t"
|
"prefetcht0 192(%4,%0,4) \n\t"
|
||||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||||
"prefetcht0 192(%5,%0,4) \n\t"
|
"prefetcht0 192(%5,%0,4) \n\t"
|
||||||
|
@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
"addq $16 , %0 \n\t"
|
"addq $16 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L08END%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
|
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
|
||||||
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
|
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
|
||||||
|
@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L08LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||||
|
|
||||||
|
@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L08LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L08END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"prefetcht0 192(%4,%0,4) \n\t"
|
"prefetcht0 192(%4,%0,4) \n\t"
|
||||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||||
|
|
||||||
|
@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||||
|
|
||||||
"addq $16 , %0 \n\t"
|
"addq $16 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L08END%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
|
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
|
||||||
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
|
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "daxpy_microk_nehalem-2.c"
|
#include "daxpy_microk_nehalem-2.c"
|
||||||
#elif defined(BULLDOZER)
|
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "daxpy_microk_bulldozer-2.c"
|
#include "daxpy_microk_bulldozer-2.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
"vmovddup (%4), %%xmm0 \n\t" // alpha
|
"vmovddup (%4), %%xmm0 \n\t" // alpha
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"prefetcht0 768(%3,%0,8) \n\t"
|
"prefetcht0 768(%3,%0,8) \n\t"
|
||||||
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||||
|
@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
"shufpd $0, %%xmm0, %%xmm0 \n\t"
|
"shufpd $0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
// "prefetcht0 192(%2,%0,8) \n\t"
|
// "prefetcht0 192(%2,%0,8) \n\t"
|
||||||
// "prefetcht0 192(%3,%0,8) \n\t"
|
// "prefetcht0 192(%3,%0,8) \n\t"
|
||||||
|
|
||||||
|
@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "ddot_microk_bulldozer-2.c"
|
#include "ddot_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "ddot_microk_nehalem-2.c"
|
#include "ddot_microk_nehalem-2.c"
|
||||||
|
|
|
@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
|
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||||
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
|
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
|
||||||
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
|
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
|
||||||
|
@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
|
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||||
|
|
|
@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
"xorpd %%xmm7, %%xmm7 \n\t"
|
"xorpd %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||||
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
|
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
|
||||||
|
@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"addpd %%xmm5, %%xmm4 \n\t"
|
"addpd %%xmm5, %%xmm4 \n\t"
|
||||||
"addpd %%xmm7, %%xmm6 \n\t"
|
"addpd %%xmm7, %%xmm6 \n\t"
|
||||||
|
|
|
@ -1092,18 +1092,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro INIT4x1
|
.macro INIT4x1
|
||||||
|
|
||||||
vxorpd %xmm4 , %xmm4 , %xmm4
|
vxorpd %ymm4 , %ymm4 , %ymm4
|
||||||
vxorpd %xmm5 , %xmm5 , %xmm5
|
vxorpd %ymm5 , %ymm5 , %ymm5
|
||||||
|
vxorpd %ymm6 , %ymm6 , %ymm6
|
||||||
|
vxorpd %ymm7 , %ymm7 , %ymm7
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL4x1
|
||||||
|
|
||||||
|
vbroadcastsd -12 * SIZE(BO), %ymm0
|
||||||
|
vbroadcastsd -11 * SIZE(BO), %ymm1
|
||||||
|
vbroadcastsd -10 * SIZE(BO), %ymm2
|
||||||
|
vbroadcastsd -9 * SIZE(BO), %ymm3
|
||||||
|
|
||||||
|
vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
|
||||||
|
vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
|
||||||
|
|
||||||
|
vbroadcastsd -8 * SIZE(BO), %ymm0
|
||||||
|
vbroadcastsd -7 * SIZE(BO), %ymm1
|
||||||
|
|
||||||
|
vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
|
||||||
|
vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
|
||||||
|
|
||||||
|
vbroadcastsd -6 * SIZE(BO), %ymm2
|
||||||
|
vbroadcastsd -5 * SIZE(BO), %ymm3
|
||||||
|
|
||||||
|
vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
|
||||||
|
vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
|
||||||
|
vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
|
||||||
|
vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
|
||||||
|
|
||||||
|
addq $ 8 *SIZE, BO
|
||||||
|
addq $ 32*SIZE, AO
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
.macro KERNEL4x1_SUB
|
.macro KERNEL4x1_SUB
|
||||||
vmovddup -12 * SIZE(BO), %xmm2
|
vbroadcastsd -12 * SIZE(BO), %ymm2
|
||||||
vmovups -16 * SIZE(AO), %xmm0
|
vmovups -16 * SIZE(AO), %ymm0
|
||||||
vmovups -14 * SIZE(AO), %xmm1
|
vfmadd231pd %ymm0 ,%ymm2 , %ymm4
|
||||||
vfmadd231pd %xmm0 ,%xmm2 , %xmm4
|
|
||||||
vfmadd231pd %xmm1 ,%xmm2 , %xmm5
|
|
||||||
addq $ 1*SIZE, BO
|
addq $ 1*SIZE, BO
|
||||||
addq $ 4*SIZE, AO
|
addq $ 4*SIZE, AO
|
||||||
|
|
||||||
|
@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro SAVE4x1
|
.macro SAVE4x1
|
||||||
|
|
||||||
vmovddup ALPHA, %xmm0
|
vbroadcastsd ALPHA, %ymm0
|
||||||
|
|
||||||
vmulpd %xmm0 , %xmm4 , %xmm4
|
vaddpd %ymm4,%ymm5, %ymm4
|
||||||
vmulpd %xmm0 , %xmm5 , %xmm5
|
vaddpd %ymm6,%ymm7, %ymm6
|
||||||
|
vaddpd %ymm4,%ymm6, %ymm4
|
||||||
|
|
||||||
|
vmulpd %ymm0 , %ymm4 , %ymm4
|
||||||
|
|
||||||
|
|
||||||
#if !defined(TRMMKERNEL)
|
#if !defined(TRMMKERNEL)
|
||||||
|
|
||||||
vaddpd (CO1) , %xmm4, %xmm4
|
vaddpd (CO1) , %ymm4, %ymm4
|
||||||
vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
vmovups %xmm4 , (CO1)
|
vmovups %ymm4 , (CO1)
|
||||||
vmovups %xmm5 , 2 * SIZE(CO1)
|
|
||||||
|
|
||||||
addq $ 4*SIZE, CO1
|
addq $ 4*SIZE, CO1
|
||||||
.endm
|
.endm
|
||||||
|
@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.L1_12:
|
.L1_12:
|
||||||
|
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
|
|
||||||
dec %rax
|
dec %rax
|
||||||
jne .L1_12
|
jne .L1_12
|
||||||
|
@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.L1_12:
|
.L1_12:
|
||||||
|
|
||||||
KERNEL4x1_SUB
|
KERNEL4x1
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
KERNEL4x1_SUB
|
|
||||||
|
|
||||||
dec %rax
|
dec %rax
|
||||||
jne .L1_12
|
jne .L1_12
|
||||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(NEHALEM)
|
#if defined(NEHALEM)
|
||||||
#include "dgemv_n_microk_nehalem-4.c"
|
#include "dgemv_n_microk_nehalem-4.c"
|
||||||
#elif defined(HASWELL)
|
#elif defined(HASWELL) || defined(STEAMROLLER)
|
||||||
#include "dgemv_n_microk_haswell-4.c"
|
#include "dgemv_n_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"shufpd $0, %%xmm13, %%xmm13 \n\t"
|
"shufpd $0, %%xmm13, %%xmm13 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
||||||
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
|
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
|
||||||
|
|
||||||
|
@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||||
"shufpd $0, %%xmm12, %%xmm12 \n\t"
|
"shufpd $0, %%xmm12, %%xmm12 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
|
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
|
||||||
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
|
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
|
||||||
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
||||||
|
@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L8LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
|
@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L8LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L16END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
|
@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L16END%=: \n\t"
|
"3: \n\t"
|
||||||
"vzeroupper \n\t"
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L8LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
|
@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L8LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L8END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||||
|
@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
"addq $8 , %0 \n\t"
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L8END%=: \n\t"
|
"3: \n\t"
|
||||||
"vzeroupper \n\t"
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
|
|
|
@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
|
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"xorpd %%xmm4 , %%xmm4 \n\t"
|
"xorpd %%xmm4 , %%xmm4 \n\t"
|
||||||
"xorpd %%xmm5 , %%xmm5 \n\t"
|
"xorpd %%xmm5 , %%xmm5 \n\t"
|
||||||
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
|
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
|
||||||
|
@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
|
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
|
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"xorpd %%xmm4 , %%xmm4 \n\t"
|
"xorpd %%xmm4 , %%xmm4 \n\t"
|
||||||
"xorpd %%xmm5 , %%xmm5 \n\t"
|
"xorpd %%xmm5 , %%xmm5 \n\t"
|
||||||
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
|
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
|
||||||
|
@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -0,0 +1,247 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x8 1
|
||||||
|
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
/*
 * dgemv_kernel_4x8: y[i] += alpha * sum_k( col_k[i] * x[k] ), k = 0..7, for
 * i = 0..n-1, written as one AVX/FMA inline-asm statement operating on
 * 8-byte (double) elements.
 *
 *   n     - number of rows to process. Rows are consumed 4 at a time (once,
 *           when bit 2 of n is set) and then 8 at a time until the counter
 *           reaches exactly zero; n == 0 falls straight through.
 *           Assumes n is a multiple of 4 - TODO confirm against the caller.
 *   ap    - ap[0..3] are the base pointers of columns 0..3; columns 4..7 are
 *           addressed as ap[0..3] + lda4 elements.
 *   x     - 8 multipliers, one per column (x[0..7] are read).
 *   y     - n doubles, read and updated in place.
 *   lda4  - element offset from column k to column k+4 (presumably 4 * lda;
 *           verify against the caller).
 *   alpha - pointer to the scalar applied to the accumulated products.
 *
 * NOTE(review): operands %0 (i), %1 (n) and %8 (lda4) are modified by the asm
 * (addq/subq) but are declared as input-only operands; GCC's extended-asm
 * rules expect such operands to be read-write ("+r") - confirm.
 */
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
|
||||||
|
{
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||||
|
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||||
|
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||||
|
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||||
|
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
|
||||||
|
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
|
||||||
|
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
|
||||||
|
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
|
||||||
|
|
||||||
|
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||||
|
|
||||||
|
"testq $0x04, %1 \n\t" // is bit 2 of n set?
|
||||||
|
"jz 2f \n\t" // no: skip the single 4-row step
|
||||||
|
|
||||||
|
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||||
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" // columns 0..3 (index i), two accumulators
|
||||||
|
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" // columns 4..7 (index lda4)
|
||||||
|
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" // combine the two accumulators
|
||||||
|
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" // scale by alpha
|
||||||
|
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" // add the old y values
|
||||||
|
|
||||||
|
|
||||||
|
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||||
|
|
||||||
|
"addq $4 , %8 \n\t"
|
||||||
|
"addq $4 , %0 \n\t"
|
||||||
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
|
"2: \n\t"
|
||||||
|
|
||||||
|
"cmpq $0, %1 \n\t" // any rows left for the 8-row loop?
|
||||||
|
"je 3f \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t" // main loop: 8 rows of y per iteration
|
||||||
|
|
||||||
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
|
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||||
|
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||||
|
|
||||||
|
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||||
|
"addq $8 , %0 \n\t" // i is advanced here; the y stores below compensate with -64/-32
|
||||||
|
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" // y += alpha * accumulator
|
||||||
|
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||||
|
|
||||||
|
"addq $8 , %8 \n\t"
|
||||||
|
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
||||||
|
|
||||||
|
"jnz 1b \n\t" // tests the flags set by the subq above
|
||||||
|
|
||||||
|
"3: \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (ap[0]), // 4
|
||||||
|
"r" (ap[1]), // 5
|
||||||
|
"r" (ap[2]), // 6
|
||||||
|
"r" (ap[3]), // 7
|
||||||
|
"r" (lda4), // 8
|
||||||
|
"r" (alpha) // 9
|
||||||
|
: "cc",
|
||||||
|
"%xmm0", "%xmm1",
|
||||||
|
"%xmm2", "%xmm3",
|
||||||
|
"%xmm4", "%xmm5",
|
||||||
|
"%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_4x4 1
|
||||||
|
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
/*
 * dgemv_kernel_4x4: y[i] += alpha * sum_k( ap[k][i] * x[k] ), k = 0..3, for
 * i = 0..n-1, written as one AVX/FMA inline-asm statement operating on
 * 8-byte (double) elements.
 *
 *   n     - number of rows to process. Rows are consumed 4 at a time (once,
 *           when bit 2 of n is set) and then 8 at a time until the counter
 *           reaches exactly zero; n == 0 falls straight through.
 *           Assumes n is a multiple of 4 - TODO confirm against the caller.
 *   ap    - ap[0..3] are the base pointers of the four columns.
 *   x     - 4 multipliers, one per column (x[0..3] are read).
 *   y     - n doubles, read and updated in place.
 *   alpha - pointer to the scalar applied to the accumulated products.
 *
 * NOTE(review): operands %0 (i) and %1 (n) are modified by the asm
 * (addq/subq) but are declared as input-only operands; GCC's extended-asm
 * rules expect such operands to be read-write ("+r") - confirm.
 */
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
{
|
||||||
|
|
||||||
|
BLASLONG register i = 0;
|
||||||
|
|
||||||
|
__asm__ __volatile__
|
||||||
|
(
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||||
|
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||||
|
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||||
|
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||||
|
|
||||||
|
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
||||||
|
|
||||||
|
"testq $0x04, %1 \n\t" // is bit 2 of n set?
|
||||||
|
"jz 2f \n\t" // no: skip the single 4-row step
|
||||||
|
|
||||||
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
|
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||||
|
|
||||||
|
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" // two accumulators, alternating per column
|
||||||
|
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" // combine the two accumulators
|
||||||
|
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" // scale by alpha
|
||||||
|
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" // add the old y values
|
||||||
|
|
||||||
|
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||||
|
|
||||||
|
"addq $4 , %0 \n\t"
|
||||||
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
|
"2: \n\t"
|
||||||
|
|
||||||
|
"cmpq $0, %1 \n\t" // any rows left for the 8-row loop?
|
||||||
|
"je 3f \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t" // main loop: 8 rows of y per iteration
|
||||||
|
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||||
|
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||||
|
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||||
|
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||||
|
|
||||||
|
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||||
|
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||||
|
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||||
|
|
||||||
|
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" // y += alpha * accumulator
|
||||||
|
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||||
|
|
||||||
|
"vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
|
||||||
|
"vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
|
||||||
|
|
||||||
|
"addq $8 , %0 \n\t"
|
||||||
|
"subq $8 , %1 \n\t"
|
||||||
|
"jnz 1b \n\t" // loop until the row counter reaches zero
|
||||||
|
|
||||||
|
"3: \n\t"
|
||||||
|
"vzeroupper \n\t"
|
||||||
|
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"r" (i), // 0
|
||||||
|
"r" (n), // 1
|
||||||
|
"r" (x), // 2
|
||||||
|
"r" (y), // 3
|
||||||
|
"r" (ap[0]), // 4
|
||||||
|
"r" (ap[1]), // 5
|
||||||
|
"r" (ap[2]), // 6
|
||||||
|
"r" (ap[3]), // 7
|
||||||
|
"r" (alpha) // 8
|
||||||
|
: "cc",
|
||||||
|
"%xmm4", "%xmm5",
|
||||||
|
"%xmm6", "%xmm7",
|
||||||
|
"%xmm8", "%xmm9",
|
||||||
|
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(HASWELL)
|
#if defined(HASWELL) || defined(STEAMROLLER)
|
||||||
#include "dgemv_t_microk_haswell-4.c"
|
#include "dgemv_t_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||||
"xorpd %%xmm11 , %%xmm11 \n\t"
|
"xorpd %%xmm11 , %%xmm11 \n\t"
|
||||||
|
|
||||||
"testq $2 , %1 \n\t"
|
"testq $2 , %1 \n\t"
|
||||||
"jz .L01LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||||
|
@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||||
"subq $2 , %1 \n\t"
|
"subq $2 , %1 \n\t"
|
||||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||||
|
|
||||||
".L01LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L01END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||||
|
@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||||
|
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L01END%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||||
"haddpd %%xmm11, %%xmm11 \n\t"
|
"haddpd %%xmm11, %%xmm11 \n\t"
|
||||||
|
@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||||
"xorpd %%xmm10 , %%xmm10 \n\t"
|
"xorpd %%xmm10 , %%xmm10 \n\t"
|
||||||
|
|
||||||
"testq $2 , %1 \n\t"
|
"testq $2 , %1 \n\t"
|
||||||
"jz .L01LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||||
|
@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||||
"addpd %%xmm12 , %%xmm10 \n\t"
|
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||||
"subq $2 , %1 \n\t"
|
"subq $2 , %1 \n\t"
|
||||||
|
|
||||||
".L01LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L01END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||||
"movups 16(%3,%0,8) , %%xmm14 \n\t"
|
"movups 16(%3,%0,8) , %%xmm14 \n\t"
|
||||||
|
@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"addpd %%xmm14 , %%xmm9 \n\t"
|
"addpd %%xmm14 , %%xmm9 \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L01END%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"addpd %%xmm9 , %%xmm10 \n\t"
|
"addpd %%xmm9 , %%xmm10 \n\t"
|
||||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||||
|
@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||||
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
|
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||||
|
@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||||
"subq $2 , %1 \n\t"
|
"subq $2 , %1 \n\t"
|
||||||
"movups %%xmm11, -16(%4,%0,8) \n\t"
|
"movups %%xmm11, -16(%4,%0,8) \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
|
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L08LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
||||||
|
|
||||||
|
@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L08LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L16END%= \n\t"
|
"je 3f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
// "prefetcht0 384(%2,%0,8) \n\t"
|
// "prefetcht0 384(%2,%0,8) \n\t"
|
||||||
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
||||||
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
|
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
|
||||||
|
@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
|
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L16END%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
|
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
|
||||||
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
|
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "dsymv_L_microk_bulldozer-2.c"
|
#include "dsymv_L_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "dsymv_L_microk_nehalem-2.c"
|
#include "dsymv_L_microk_nehalem-2.c"
|
||||||
|
|
|
@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||||
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
|
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||||
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||||
|
@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||||
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
||||||
|
|
||||||
"cmpq %0 , %1 \n\t"
|
"cmpq %0 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"vmovsd (%9), %%xmm4 \n\t"
|
"vmovsd (%9), %%xmm4 \n\t"
|
||||||
"vmovsd 8(%9), %%xmm5 \n\t"
|
"vmovsd 8(%9), %%xmm5 \n\t"
|
||||||
|
|
|
@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||||
"shufpd $0, %%xmm7, %%xmm7 \n\t"
|
"shufpd $0, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||||
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||||
"movups %%xmm12 , %%xmm11 \n\t"
|
"movups %%xmm12 , %%xmm11 \n\t"
|
||||||
|
@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||||
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
||||||
|
|
||||||
"cmpq %0 , %1 \n\t"
|
"cmpq %0 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"movsd (%9), %%xmm4 \n\t" // temp1[0]
|
"movsd (%9), %%xmm4 \n\t" // temp1[0]
|
||||||
"movsd 8(%9), %%xmm5 \n\t" // temp1[1]
|
"movsd 8(%9), %%xmm5 \n\t" // temp1[1]
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "dsymv_U_microk_bulldozer-2.c"
|
#include "dsymv_U_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "dsymv_U_microk_nehalem-2.c"
|
#include "dsymv_U_microk_nehalem-2.c"
|
||||||
|
|
|
@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||||
"xorq %0,%0 \n\t"
|
"xorq %0,%0 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||||
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||||
|
@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||||
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
|
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
|
||||||
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||||
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||||
|
|
|
@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||||
"xorq %0,%0 \n\t"
|
"xorq %0,%0 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||||
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||||
"movups %%xmm12 , %%xmm11 \n\t"
|
"movups %%xmm12 , %%xmm11 \n\t"
|
||||||
|
@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||||
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
||||||
|
|
||||||
"subq $2 , %1 \n\t"
|
"subq $2 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"haddpd %%xmm0, %%xmm0 \n\t"
|
"haddpd %%xmm0, %%xmm0 \n\t"
|
||||||
"haddpd %%xmm1, %%xmm1 \n\t"
|
"haddpd %%xmm1, %%xmm1 \n\t"
|
||||||
|
|
|
@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
"shufps $0, %%xmm0, %%xmm0 \n\t"
|
"shufps $0, %%xmm0, %%xmm0 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
// "prefetcht0 192(%2,%0,4) \n\t"
|
// "prefetcht0 192(%2,%0,4) \n\t"
|
||||||
// "prefetcht0 192(%3,%0,4) \n\t"
|
// "prefetcht0 192(%3,%0,4) \n\t"
|
||||||
|
|
||||||
|
@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
"addq $16, %0 \n\t"
|
||||||
"subq $16, %1 \n\t"
|
"subq $16, %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "sdot_microk_bulldozer-2.c"
|
#include "sdot_microk_bulldozer-2.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "sdot_microk_nehalem-2.c"
|
#include "sdot_microk_nehalem-2.c"
|
||||||
|
|
|
@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
|
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||||
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||||
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
|
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
|
||||||
|
@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
"addq $16, %0 \n\t"
|
||||||
"subq $16, %1 \n\t"
|
"subq $16, %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
|
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||||
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
|
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||||
|
|
|
@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
"xorps %%xmm7, %%xmm7 \n\t"
|
"xorps %%xmm7, %%xmm7 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||||
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
|
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
|
||||||
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||||
|
@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
"addq $16, %0 \n\t"
|
||||||
"subq $16, %1 \n\t"
|
"subq $16, %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
"addps %%xmm5, %%xmm4 \n\t"
|
"addps %%xmm5, %%xmm4 \n\t"
|
||||||
"addps %%xmm7, %%xmm6 \n\t"
|
"addps %%xmm7, %%xmm6 \n\t"
|
||||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||||
#include "sgemv_n_microk_bulldozer-4.c"
|
#include "sgemv_n_microk_bulldozer-4.c"
|
||||||
#elif defined(NEHALEM)
|
#elif defined(NEHALEM)
|
||||||
#include "sgemv_n_microk_nehalem-4.c"
|
#include "sgemv_n_microk_nehalem-4.c"
|
||||||
|
@ -39,8 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "sgemv_n_microk_haswell-4.c"
|
#include "sgemv_n_microk_haswell-4.c"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(STEAMROLLER)
|
||||||
|
#define NBMAX 2048
|
||||||
|
#else
|
||||||
#define NBMAX 4096
|
#define NBMAX 4096
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_4x8
|
#ifndef HAVE_KERNEL_4x8
|
||||||
|
|
||||||
|
@ -129,7 +132,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"shufps $0, %%xmm13, %%xmm13 \n\t"
|
"shufps $0, %%xmm13, %%xmm13 \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||||
|
|
||||||
"movups (%4,%0,4), %%xmm8 \n\t"
|
"movups (%4,%0,4), %%xmm8 \n\t"
|
||||||
|
@ -143,7 +146,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
|
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
|
||||||
|
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
@ -166,7 +169,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef HAVE_KERNEL_4x2
|
#ifndef HAVE_KERNEL_4x1
|
||||||
|
|
||||||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
@ -184,10 +187,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||||
"shufps $0, %%xmm12, %%xmm12 \n\t"
|
"shufps $0, %%xmm12, %%xmm12 \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L16END%= \n\t"
|
"je 2f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||||
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
|
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
|
||||||
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
|
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
|
||||||
|
@ -203,12 +206,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||||
|
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
|
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L16END%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"testq $0x04, %5 \n\t"
|
"testq $0x04, %5 \n\t"
|
||||||
"jz .L08LABEL%= \n\t"
|
"jz 3f \n\t"
|
||||||
|
|
||||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||||
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
|
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
|
||||||
|
@ -218,7 +221,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
|
|
||||||
".L08LABEL%=: \n\t"
|
"3: \n\t"
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
"r" (i), // 0
|
"r" (i), // 0
|
||||||
|
@ -262,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||||
(
|
(
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"movups (%2,%0,4) , %%xmm12 \n\t"
|
"movups (%2,%0,4) , %%xmm12 \n\t"
|
||||||
"movups (%3,%0,4) , %%xmm11 \n\t"
|
"movups (%3,%0,4) , %%xmm11 \n\t"
|
||||||
|
@ -271,7 +274,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||||
"movups %%xmm11, -16(%3,%0,4) \n\t"
|
"movups %%xmm11, -16(%3,%0,4) \n\t"
|
||||||
|
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
|
@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
"testq $0x04, %1 \n\t"
|
||||||
"jz .L08LABEL%= \n\t"
|
"jz 2f \n\t"
|
||||||
|
|
||||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||||
|
@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
||||||
|
|
||||||
".L08LABEL%=: \n\t"
|
"2: \n\t"
|
||||||
|
|
||||||
"testq $0x08, %1 \n\t"
|
"testq $0x08, %1 \n\t"
|
||||||
"jz .L16LABEL%= \n\t"
|
"jz 3f \n\t"
|
||||||
|
|
||||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||||
|
@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"subq $8 , %1 \n\t"
|
"subq $8 , %1 \n\t"
|
||||||
|
|
||||||
|
|
||||||
".L16LABEL%=: \n\t"
|
"3: \n\t"
|
||||||
|
|
||||||
"cmpq $0, %1 \n\t"
|
"cmpq $0, %1 \n\t"
|
||||||
"je .L16END%= \n\t"
|
"je 4f \n\t"
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
|
|
||||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||||
|
@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||||
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
||||||
|
|
||||||
"subq $16, %1 \n\t"
|
"subq $16, %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
".L16END%=: \n\t"
|
"4: \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
|
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
|
||||||
|
|
||||||
".align 16 \n\t"
|
".align 16 \n\t"
|
||||||
".L01LOOP%=: \n\t"
|
"1: \n\t"
|
||||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||||
|
|
||||||
|
@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||||
|
|
||||||
"addq $4 , %0 \n\t"
|
"addq $4 , %0 \n\t"
|
||||||
"subq $4 , %1 \n\t"
|
"subq $4 , %1 \n\t"
|
||||||
"jnz .L01LOOP%= \n\t"
|
"jnz 1b \n\t"
|
||||||
|
|
||||||
:
|
:
|
||||||
:
|
:
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue