Merge branch 'develop'
This commit is contained in:
commit
d0c51c4de9
|
@ -121,5 +121,11 @@ In chronological order:
|
|||
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
|
||||
ARMv8 support.
|
||||
|
||||
* Dan Kortschak
|
||||
* [2015-01-07] Added test for drotmg bug #484.
|
||||
|
||||
* Ton van den Heuvel <https://github.com/ton>
|
||||
* [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity().
|
||||
|
||||
* [Your name or handle] <[email or website]>
|
||||
* [Date] [Brief summary of your changes]
|
||||
|
|
|
@ -1,4 +1,24 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.14
|
||||
24-Mar-2015
|
||||
common:
|
||||
* Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.)
|
||||
* Improve ger and gemv for small matrices by stack allocation.
|
||||
e.g. make MAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.)
|
||||
* Introduce openblas_get_num_threads and openblas_get_num_procs.
|
||||
(#497. Thanks, Erik Schnetter.)
|
||||
* Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.)
|
||||
* Fix c/zsyr bug with negative incx. (#492.)
|
||||
* Fix race condition during shutdown causing a crash in
|
||||
gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.)
|
||||
|
||||
x86/x86-64:
|
||||
* Support AMD Steamroller.
|
||||
|
||||
ARM:
|
||||
* Add Cortex-A9 and Cortex-A15 targets.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.13
|
||||
3-Dec-2014
|
||||
|
|
|
@ -9,10 +9,10 @@
|
|||
|
||||
If you want to allocate 64 large pages,
|
||||
|
||||
$shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset
|
||||
$shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page
|
||||
$shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number
|
||||
$shell> echo 3355443200 > /pros/sys/kernel/shmall
|
||||
$shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset
|
||||
$shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page
|
||||
$shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number
|
||||
$shell> echo 3355443200 > /proc/sys/kernel/shmall
|
||||
|
||||
Also may add a few lines into /etc/security/limits.conf file.
|
||||
|
||||
|
|
|
@ -1,3 +1,8 @@
|
|||
# ifeq logical or
|
||||
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
|
||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV7)
|
||||
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
|
||||
|
|
|
@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
|
|||
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
|
||||
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
|
||||
OPENBLAS_BUILD_DIR := $(CURDIR)
|
||||
OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake
|
||||
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
|
||||
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
||||
|
||||
.PHONY : install
|
||||
|
@ -46,11 +46,11 @@ ifndef NO_CBLAS
|
|||
endif
|
||||
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
|
@ -95,7 +95,8 @@ endif
|
|||
endif
|
||||
#Generating OpenBLASConfig.cmake
|
||||
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
|
||||
ifndef NO_SHARED
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.13
|
||||
VERSION = 0.2.14
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -159,6 +159,19 @@ COMMON_PROF = -pg
|
|||
# Build Debug version
|
||||
# DEBUG = 1
|
||||
|
||||
# Improve GEMV and GER for small matrices by stack allocation.
|
||||
# For details, https://github.com/xianyi/OpenBLAS/pull/482
|
||||
#
|
||||
# MAX_STACK_ALLOC=2048
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoid conflicts with other BLAS libraries, especially when using
|
||||
# 64 bit integer interfaces in OpenBLAS.
|
||||
# For details, https://github.com/xianyi/OpenBLAS/pull/459
|
||||
#
|
||||
# SYMBOLPREFIX=
|
||||
# SYMBOLSUFFIX=
|
||||
|
||||
#
|
||||
# End of user configuration
|
||||
#
|
||||
|
|
|
@ -61,6 +61,9 @@ endif
|
|||
ifeq ($(TARGET), PILEDRIVER)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET), STEAMROLLER)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -85,6 +88,9 @@ endif
|
|||
ifeq ($(TARGET_CORE), PILEDRIVER)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), STEAMROLLER)
|
||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
@ -305,6 +311,10 @@ ifdef SANITY_CHECK
|
|||
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
|
||||
endif
|
||||
|
||||
ifdef MAX_STACK_ALLOC
|
||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||
endif
|
||||
|
||||
#
|
||||
# Architecture dependent settings
|
||||
#
|
||||
|
@ -354,6 +364,12 @@ endif
|
|||
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
|
||||
#check
|
||||
ifeq ($(USE_THREAD), 0)
|
||||
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
|
||||
endif
|
||||
|
||||
# ifeq logical or. GCC or LSB
|
||||
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
|
||||
CCOMMON_OPT += -fopenmp
|
||||
|
@ -392,7 +408,7 @@ endif
|
|||
ifeq ($(ARCH), x86_64)
|
||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||
ifneq ($(NO_AVX), 1)
|
||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
|
||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER
|
||||
endif
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += HASWELL
|
||||
|
|
|
@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt
|
|||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
|
||||
#### MIPS64:
|
||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||
|
|
|
@ -32,6 +32,7 @@ ISTANBUL
|
|||
BOBCAT
|
||||
BULLDOZER
|
||||
PILEDRIVER
|
||||
STEAMROLLER
|
||||
|
||||
c)VIA CPU:
|
||||
SSE_GENERIC
|
||||
|
@ -62,6 +63,11 @@ SPARC
|
|||
SPARCV7
|
||||
|
||||
6.ARM CPU:
|
||||
CORTEXA15
|
||||
CORTEXA9
|
||||
ARMV7
|
||||
ARMV6
|
||||
ARMV5
|
||||
|
||||
7.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
|
|
|
@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system
|
|||
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
||||
|
||||
# ACML custom
|
||||
ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
|
||||
LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
||||
#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
|
||||
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
|
||||
|
||||
# ACML 6.1 custom
|
||||
ACML=/home/werner/project/acml6.1/gfortran64_mp/lib
|
||||
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
|
||||
|
||||
|
||||
# Atlas Ubuntu
|
||||
#ATLAS=/usr/lib/atlas-base
|
||||
|
|
|
@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||
|
@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){
|
|||
}
|
||||
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
#ifndef COMPLEX
|
||||
char *trans[] = {"T", "N"};
|
||||
|
@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
FLOAT result;
|
||||
|
@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
|
||||
FLOAT wkopt[4];
|
||||
|
@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
FLOAT beta [] = {1.0, 1.0};
|
||||
char trans='N';
|
||||
blasint m, i, j;
|
||||
blasint m, n, i, j;
|
||||
int loops = 1;
|
||||
int has_param_n=0;
|
||||
int l;
|
||||
char *p;
|
||||
|
||||
|
@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){
|
|||
if ( p != NULL )
|
||||
loops = atoi(p);
|
||||
|
||||
if ((p = getenv("OPENBLAS_PARAM_N"))) {
|
||||
n = atoi(p);
|
||||
has_param_n=1;
|
||||
}
|
||||
|
||||
|
||||
#ifdef linux
|
||||
srandom(getpid());
|
||||
|
@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){
|
|||
|
||||
timeg=0;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
if ( has_param_n == 1 && n <= m )
|
||||
n=n;
|
||||
else
|
||||
n=m;
|
||||
|
||||
|
||||
|
||||
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
@ -189,7 +202,7 @@ int MAIN__(int argc, char *argv[]){
|
|||
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
||||
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
|
||||
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
|
||||
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
|
||||
|
@ -202,11 +215,11 @@ int MAIN__(int argc, char *argv[]){
|
|||
timeg /= loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6);
|
||||
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
||||
|
|
|
@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a,*work;
|
||||
FLOAT wkopt[4];
|
||||
|
@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b;
|
||||
blasint *ipiv;
|
||||
|
@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
#ifndef COMPLEX
|
||||
char *trans[] = {"T", "N"};
|
||||
|
@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *c;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
|
@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
int MAIN__(int argc, char *argv[]){
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b;
|
||||
FLOAT alpha[] = {1.0, 1.0};
|
||||
|
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
|
|||
return 0;
|
||||
}
|
||||
|
||||
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
||||
|
|
4
c_check
4
c_check
|
@ -81,6 +81,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) {
|
|||
$defined = 1;
|
||||
}
|
||||
|
||||
if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
|
|
16
cblas.h
16
cblas.h
|
@ -13,6 +13,12 @@ extern "C" {
|
|||
void openblas_set_num_threads(int num_threads);
|
||||
void goto_set_num_threads(int num_threads);
|
||||
|
||||
/*Get the number of threads at runtime.*/
|
||||
int openblas_get_num_threads(void);
|
||||
|
||||
/*Get the number of physical processors (cores).*/
|
||||
int openblas_get_num_procs(void);
|
||||
|
||||
/*Get the build configuration at runtime.*/
|
||||
char* openblas_get_config(void);
|
||||
|
||||
|
@ -341,6 +347,16 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
|
|||
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
|
||||
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
|
||||
|
||||
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
|
||||
float *c, OPENBLAS_CONST blasint cldc);
|
||||
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
|
||||
double *c, OPENBLAS_CONST blasint cldc);
|
||||
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
|
||||
float *c, OPENBLAS_CONST blasint cldc);
|
||||
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
|
||||
double *c, OPENBLAS_CONST blasint cldc);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
|
|
@ -13,6 +13,12 @@ extern "C" {
|
|||
void openblas_set_num_threads(int num_threads);
|
||||
void goto_set_num_threads(int num_threads);
|
||||
|
||||
/*Get the number of threads at runtime.*/
|
||||
int openblas_get_num_threads(void);
|
||||
|
||||
/*Get the number of physical processors (cores).*/
|
||||
int openblas_get_num_procs(void);
|
||||
|
||||
/*Get the build configuration at runtime.*/
|
||||
char* openblas_get_config(void);
|
||||
|
||||
|
@ -327,6 +333,16 @@ void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, bl
|
|||
blasint clda, blasint cldb);
|
||||
void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
|
||||
blasint clda, blasint cldb);
|
||||
|
||||
void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta,
|
||||
float *c, blasint cldc);
|
||||
void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta,
|
||||
double *c, blasint cldc);
|
||||
void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta,
|
||||
float *c, blasint cldc);
|
||||
void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta,
|
||||
double *c, blasint cldc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
|
8
common.h
8
common.h
|
@ -327,6 +327,14 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
#ifndef YIELDING
|
||||
#define YIELDING sched_yield()
|
||||
#endif
|
||||
|
|
|
@ -220,6 +220,7 @@
|
|||
#define COMATCOPY_K_CTC comatcopy_k_ctc
|
||||
#define COMATCOPY_K_RTC comatcopy_k_rtc
|
||||
|
||||
#define CGEADD_K cgeadd_k
|
||||
|
||||
#else
|
||||
|
||||
|
@ -402,6 +403,7 @@
|
|||
#define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc
|
||||
#define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc
|
||||
#define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc
|
||||
#define CGEADD_K gotoblas -> cgeadd_k
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -149,6 +149,7 @@
|
|||
#define DOMATCOPY_K_RN domatcopy_k_rn
|
||||
#define DOMATCOPY_K_CT domatcopy_k_ct
|
||||
#define DOMATCOPY_K_RT domatcopy_k_rt
|
||||
#define DGEADD_K dgeadd_k
|
||||
|
||||
#else
|
||||
|
||||
|
@ -267,6 +268,8 @@
|
|||
#define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct
|
||||
#define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt
|
||||
|
||||
#define DGEADD_K gotoblas -> dgeadd_k
|
||||
|
||||
#endif
|
||||
|
||||
#define DGEMM_NN dgemm_nn
|
||||
|
|
|
@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do
|
|||
void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
|
||||
void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
|
||||
|
||||
void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
|
||||
void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
|
||||
void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
|
||||
void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
|
|
|
@ -1762,6 +1762,11 @@ int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, dou
|
|||
int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
|
||||
int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
|
||||
|
||||
int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG);
|
||||
int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG);
|
||||
int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG);
|
||||
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
|
||||
|
||||
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
|
|
|
@ -634,7 +634,7 @@
|
|||
#define OMATCOPY_K_RN DOMATCOPY_K_RN
|
||||
#define OMATCOPY_K_CT DOMATCOPY_K_CT
|
||||
#define OMATCOPY_K_RT DOMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K DGEADD_K
|
||||
#else
|
||||
|
||||
#define AMAX_K SAMAX_K
|
||||
|
@ -932,6 +932,7 @@
|
|||
#define OMATCOPY_K_CT SOMATCOPY_K_CT
|
||||
#define OMATCOPY_K_RT SOMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
|
@ -1746,6 +1747,7 @@
|
|||
#define OMATCOPY_K_RNC ZOMATCOPY_K_RNC
|
||||
#define OMATCOPY_K_CTC ZOMATCOPY_K_CTC
|
||||
#define OMATCOPY_K_RTC ZOMATCOPY_K_RTC
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#else
|
||||
|
||||
|
@ -2159,6 +2161,8 @@
|
|||
#define OMATCOPY_K_CTC COMATCOPY_K_CTC
|
||||
#define OMATCOPY_K_RTC COMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -855,6 +855,10 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
|
||||
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
|
||||
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
|
||||
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
|
||||
int (*zgeadd_k) (BLASLONG, BLASLONG, float, double, double *, BLASLONG, double, double, double *, BLASLONG);
|
||||
|
||||
} gotoblas_t;
|
||||
|
||||
|
|
|
@ -153,6 +153,7 @@
|
|||
#define SOMATCOPY_K_CT somatcopy_k_ct
|
||||
#define SOMATCOPY_K_RT somatcopy_k_rt
|
||||
|
||||
#define SGEADD_K sgeadd_k
|
||||
|
||||
#else
|
||||
|
||||
|
@ -274,6 +275,7 @@
|
|||
#define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct
|
||||
#define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt
|
||||
|
||||
#define SGEADD_K gotoblas -> sgeadd_k
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
#define MMXSTORE movd
|
||||
#endif
|
||||
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
|
||||
//Enable some optimization for barcelona.
|
||||
#define BARCELONA_OPTIMIZATION
|
||||
#endif
|
||||
|
|
|
@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
|
||||
#ifdef ASSEMBLER
|
||||
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
|
||||
//Enable some optimization for barcelona.
|
||||
#define BARCELONA_OPTIMIZATION
|
||||
#endif
|
||||
|
|
|
@ -220,6 +220,7 @@
|
|||
#define ZOMATCOPY_K_CTC zomatcopy_k_ctc
|
||||
#define ZOMATCOPY_K_RTC zomatcopy_k_rtc
|
||||
|
||||
#define ZGEADD_K zgeadd_k
|
||||
|
||||
#else
|
||||
|
||||
|
@ -403,6 +404,8 @@
|
|||
#define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc
|
||||
#define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc
|
||||
|
||||
#define ZGEADD_K gotoblas -> zgeadd_k
|
||||
|
||||
#endif
|
||||
|
||||
#define ZGEMM_NN zgemm_nn
|
||||
|
|
10
cpuid.h
10
cpuid.h
|
@ -104,10 +104,11 @@
|
|||
#define CORE_ATOM 18
|
||||
#define CORE_NANO 19
|
||||
#define CORE_SANDYBRIDGE 20
|
||||
#define CORE_BOBCAT 21
|
||||
#define CORE_BULLDOZER 22
|
||||
#define CORE_BOBCAT 21
|
||||
#define CORE_BULLDOZER 22
|
||||
#define CORE_PILEDRIVER 23
|
||||
#define CORE_HASWELL 24
|
||||
#define CORE_HASWELL 24
|
||||
#define CORE_STEAMROLLER 25
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
|
@ -200,6 +201,7 @@ typedef struct {
|
|||
#define CPUTYPE_BOBCAT 45
|
||||
#define CPUTYPE_BULLDOZER 46
|
||||
#define CPUTYPE_PILEDRIVER 47
|
||||
#define CPUTYPE_HASWELL 48
|
||||
#define CPUTYPE_HASWELL 48
|
||||
#define CPUTYPE_STEAMROLLER 49
|
||||
|
||||
#endif
|
||||
|
|
95
cpuid_arm.c
95
cpuid_arm.c
|
@ -30,16 +30,27 @@
|
|||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV6 1
|
||||
#define CPU_ARMV7 2
|
||||
#define CPU_CORTEXA15 3
|
||||
#define CPU_CORTEXA9 3
|
||||
#define CPU_CORTEXA15 4
|
||||
|
||||
/*
 * Upper-case core names, indexed by the CPU_* code returned by detect().
 * Entry 0 is the fallback for a core that could not be identified; it is
 * spelled "UNKNOWN" so the emitted token matches the name used for an
 * undetected core elsewhere in this file.
 */
static char *cpuname[] = {
  "UNKNOWN",
  "ARMV6",
  "ARMV7",
  "CORTEXA9",
  "CORTEXA15"
};
|
||||
|
||||
|
||||
/*
 * Lower-case core names, indexed by the code returned by detect().
 * Each slot is spelled out explicitly so the index/name pairing is
 * visible at a glance.
 */
static char *cpuname_lower[] = {
  [0] = "unknown",
  [1] = "armv6",
  [2] = "armv7",
  [3] = "cortexa9",
  [4] = "cortexa15",
};
|
||||
|
||||
|
||||
int get_feature(char *search)
|
||||
{
|
||||
|
||||
|
@ -85,6 +96,29 @@ int detect(void)
|
|||
char buffer[512], *p;
|
||||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("CPU part", buffer, 8))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
if(p != NULL) {
|
||||
if (strstr(p, "0xc09")) {
|
||||
return CPU_CORTEXA9;
|
||||
}
|
||||
if (strstr(p, "0xc0f")) {
|
||||
return CPU_CORTEXA15;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
|
@ -142,21 +176,7 @@ void get_architecture(void)
|
|||
void get_subarchitecture(void)
|
||||
{
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_ARMV7:
|
||||
printf("ARMV7");
|
||||
break;
|
||||
|
||||
case CPU_ARMV6:
|
||||
printf("ARMV6");
|
||||
break;
|
||||
|
||||
default:
|
||||
printf("UNKNOWN");
|
||||
break;
|
||||
}
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
void get_subdirname(void)
|
||||
|
@ -170,6 +190,36 @@ void get_cpuconfig(void)
|
|||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
case CPU_CORTEXA9:
|
||||
printf("#define CORTEXA9\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
printf("#define HAVE_VFPV3\n");
|
||||
if ( get_feature("neon")) printf("#define HAVE_NEON\n");
|
||||
if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 128\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA15:
|
||||
printf("#define CORTEXA15\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
printf("#define HAVE_VFPV3\n");
|
||||
if ( get_feature("neon")) printf("#define HAVE_NEON\n");
|
||||
if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 128\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
|
||||
case CPU_ARMV7:
|
||||
printf("#define ARMV7\n");
|
||||
|
@ -206,18 +256,7 @@ void get_libname(void)
|
|||
{
|
||||
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_ARMV7:
|
||||
printf("armv7\n");
|
||||
break;
|
||||
|
||||
case CPU_ARMV6:
|
||||
printf("armv6\n");
|
||||
break;
|
||||
|
||||
}
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
||||
|
||||
|
|
18
cpuid_x86.c
18
cpuid_x86.c
|
@ -1162,6 +1162,12 @@ int get_cpuname(void){
|
|||
return CPUTYPE_PILEDRIVER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 0:
|
||||
if(support_avx())
|
||||
return CPUTYPE_STEAMROLLER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
|
@ -1290,6 +1296,7 @@ static char *cpuname[] = {
|
|||
"BULLDOZER",
|
||||
"PILEDRIVER",
|
||||
"HASWELL",
|
||||
"STEAMROLLER",
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
|
@ -1341,6 +1348,7 @@ static char *lowercpuname[] = {
|
|||
"bulldozer",
|
||||
"piledriver",
|
||||
"haswell",
|
||||
"steamroller",
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
|
@ -1369,6 +1377,7 @@ static char *corename[] = {
|
|||
"BULLDOZER",
|
||||
"PILEDRIVER",
|
||||
"HASWELL",
|
||||
"STEAMROLLER",
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
|
@ -1397,6 +1406,7 @@ static char *corename_lower[] = {
|
|||
"bulldozer",
|
||||
"piledriver",
|
||||
"haswell",
|
||||
"steamroller",
|
||||
};
|
||||
|
||||
|
||||
|
@ -1562,7 +1572,15 @@ int get_coretype(void){
|
|||
return CORE_PILEDRIVER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
|
||||
case 0:
|
||||
if(support_avx())
|
||||
return CORE_STEAMROLLER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX.
|
||||
}
|
||||
|
||||
|
||||
}else return CORE_BARCELONA;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
TOPDIR = ../..
|
||||
include ../../Makefile.system
|
||||
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||
|
||||
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||
|
||||
|
@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
|
|||
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
|
|
@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT;
|
|||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
|
@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL;
|
|||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0){
|
||||
//AMD STEAMROLLER
|
||||
if(support_avx())
|
||||
return &gotoblas_STEAMROLLER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
@ -315,6 +327,7 @@ static char *corename[] = {
|
|||
"Bulldozer",
|
||||
"Piledriver",
|
||||
"Haswell",
|
||||
"Steamroller",
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
@ -339,6 +352,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
|
||||
return corename[0];
|
||||
}
|
||||
|
@ -349,9 +363,9 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
char mname[20];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 20; i++)
|
||||
for ( i=1 ; i <= 21; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
|
@ -361,8 +375,8 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
}
|
||||
if (found < 0)
|
||||
{
|
||||
strncpy(mname,coretype,20);
|
||||
sprintf(message, "Core not found: %s\n",mname);
|
||||
//strncpy(mname,coretype,20);
|
||||
snprintf(message, 128, "Core not found: %s\n",coretype);
|
||||
openblas_warning(1, message);
|
||||
return(NULL);
|
||||
}
|
||||
|
@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
switch (found)
|
||||
{
|
||||
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
case 20: return (&gotoblas_HASWELL);
|
||||
case 19: return (&gotoblas_PILEDRIVER);
|
||||
case 18: return (&gotoblas_BULLDOZER);
|
||||
|
|
|
@ -241,6 +241,7 @@ void set_stack_limit(int limitMB){
|
|||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the numbers of CPU cores in multithreading.
|
||||
It can be set by openblas_set_num_threads(int num_threads);
|
||||
|
@ -323,6 +324,23 @@ int blas_get_cpu_number(void){
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Report the processor count.
 * In an SMP build the value comes from get_num_procs(); a build without
 * SMP always answers 1.
 */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
|
||||
|
||||
/*
 * Report the thread count.
 * In an SMP build the value comes from blas_get_cpu_number(); a build
 * without SMP always answers 1.
 */
int openblas_get_num_threads(void) {
#ifdef SMP
  return blas_get_cpu_number();
#else
  return 1;
#endif
}
|
||||
|
||||
struct release_t {
|
||||
void *address;
|
||||
void (*func)(struct release_t *);
|
||||
|
@ -1335,6 +1353,8 @@ void DESTRUCTOR gotoblas_quit(void) {
|
|||
|
||||
if (gotoblas_initialized == 0) return;
|
||||
|
||||
blas_shutdown();
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
|
@ -1356,8 +1376,6 @@ void DESTRUCTOR gotoblas_quit(void) {
|
|||
#ifdef PROFILE
|
||||
moncontrol (1);
|
||||
#endif
|
||||
|
||||
blas_shutdown();
|
||||
}
|
||||
|
||||
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
extern int openblas_get_num_procs(void);

/*
 * Underscore-suffixed entry point: forwards to openblas_get_num_procs()
 * and hands its result back unchanged.
 */
int openblas_get_num_procs_(void) {
  int nprocs = openblas_get_num_procs();

  return nprocs;
}
|
|
@ -0,0 +1,40 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
extern int openblas_get_num_threads(void);

/*
 * Underscore-suffixed entry point: forwards to openblas_get_num_threads()
 * and hands its result back unchanged.
 */
int openblas_get_num_threads_(void) {
  int nthreads = openblas_get_num_threads();

  return nthreads;
}
|
|
@ -166,7 +166,7 @@ int get_L2_size(void){
|
|||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL)
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
@ -251,7 +251,7 @@ void blas_set_parameter(void){
|
|||
|
||||
env_var_t p;
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
|
|
@ -100,7 +100,12 @@ else
|
|||
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
|
||||
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
|
||||
endif
|
||||
ifeq ($(NOFORTRAN), 2)
|
||||
#only build cblas without Fortran
|
||||
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
|
|
@ -23,7 +23,8 @@
|
|||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv,
|
||||
xerbla,
|
||||
saxpby,daxpby,caxpby,zaxpby
|
||||
saxpby,daxpby,caxpby,zaxpby,
|
||||
sgeadd,dgeadd,cgeadd,zgeadd,
|
||||
);
|
||||
|
||||
@cblasobjs = (
|
||||
|
@ -55,6 +56,7 @@
|
|||
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
|
||||
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
|
||||
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
|
||||
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd
|
||||
);
|
||||
|
||||
@exblasobjs = (
|
||||
|
@ -81,7 +83,10 @@
|
|||
|
||||
#both underscore and no underscore
|
||||
@misc_common_objs = (
|
||||
openblas_set_num_threads, openblas_get_parallel,
|
||||
openblas_get_parallel,
|
||||
openblas_get_num_procs,
|
||||
openblas_set_num_threads,
|
||||
openblas_get_num_threads,
|
||||
);
|
||||
|
||||
@misc_no_underscore_objs = (
|
||||
|
|
47
getarch.c
47
getarch.c
|
@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "PILEDRIVER"
|
||||
#endif
|
||||
|
||||
#if defined (FORCE_STEAMROLLER)
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "STEAMROLLER"
|
||||
#define ARCHCONFIG "-DSTEAMROLLER " \
|
||||
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
|
||||
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
|
||||
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
|
||||
#define LIBNAME "steamroller"
|
||||
#define CORENAME "STEAMROLLER"
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_SSE_GENERIC
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
|
@ -710,6 +727,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA9
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM"
|
||||
#define SUBARCHITECTURE "CORTEXA9"
|
||||
#define SUBDIRNAME "arm"
|
||||
#define ARCHCONFIG "-DCORTEXA9 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
|
||||
"-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "cortexa9"
|
||||
#define CORENAME "CORTEXA9"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA15
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM"
|
||||
#define SUBARCHITECTURE "CORTEXA15"
|
||||
#define SUBDIRNAME "arm"
|
||||
#define ARCHCONFIG "-DCORTEXA15 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
|
||||
"-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "cortexa15"
|
||||
#define CORENAME "CORTEXA15"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ARMV6
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM"
|
||||
|
|
|
@ -43,7 +43,8 @@ SBLAS2OBJS = \
|
|||
SBLAS3OBJS = \
|
||||
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
|
||||
strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
|
||||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)
|
||||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
||||
sgeadd.$(SUFFIX)
|
||||
|
||||
|
||||
DBLAS1OBJS = \
|
||||
|
@ -68,7 +69,8 @@ DBLAS2OBJS = \
|
|||
DBLAS3OBJS = \
|
||||
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
|
||||
dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
|
||||
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)
|
||||
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\
|
||||
dgeadd.$(SUFFIX)
|
||||
|
||||
CBLAS1OBJS = \
|
||||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||
|
@ -96,7 +98,8 @@ CBLAS3OBJS = \
|
|||
cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \
|
||||
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
|
||||
chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
|
||||
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)
|
||||
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\
|
||||
cgeadd.$(SUFFIX)
|
||||
|
||||
ZBLAS1OBJS = \
|
||||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||
|
@ -124,7 +127,8 @@ ZBLAS3OBJS = \
|
|||
zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \
|
||||
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
|
||||
zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
|
||||
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)
|
||||
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\
|
||||
zgeadd.$(SUFFIX)
|
||||
|
||||
ifeq ($(SUPPORT_GEMM3M), 1)
|
||||
|
||||
|
@ -269,7 +273,8 @@ CSBLAS2OBJS = \
|
|||
|
||||
CSBLAS3OBJS = \
|
||||
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
|
||||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)
|
||||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
||||
cblas_sgeadd.$(SUFFIX)
|
||||
|
||||
CDBLAS1OBJS = \
|
||||
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
|
||||
|
@ -285,7 +290,8 @@ CDBLAS2OBJS = \
|
|||
|
||||
CDBLAS3OBJS += \
|
||||
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
|
||||
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX)
|
||||
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \
|
||||
cblas_dgeadd.$(SUFFIX)
|
||||
|
||||
CCBLAS1OBJS = \
|
||||
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||
|
@ -308,7 +314,9 @@ CCBLAS3OBJS = \
|
|||
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
|
||||
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
|
||||
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
|
||||
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)
|
||||
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
|
||||
cblas_cgeadd.$(SUFFIX)
|
||||
|
||||
|
||||
|
||||
CZBLAS1OBJS = \
|
||||
|
@ -332,7 +340,9 @@ CZBLAS3OBJS = \
|
|||
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
|
||||
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
|
||||
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
|
||||
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX)
|
||||
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
|
||||
cblas_zgeadd.$(SUFFIX)
|
||||
|
||||
|
||||
ifeq ($(SUPPORT_GEMM3M), 1)
|
||||
|
||||
|
@ -2103,4 +2113,27 @@ zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c
|
|||
cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ERROR_NAME "DGEADD "
|
||||
#else
|
||||
#define ERROR_NAME "SGEADD "
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
||||
FLOAT *BETA, FLOAT *c, blasint *LDC)
|
||||
{
|
||||
|
||||
blasint m = *M;
|
||||
blasint n = *N;
|
||||
blasint lda = *LDA;
|
||||
blasint ldc = *LDC;
|
||||
FLOAT alpha = *ALPHA;
|
||||
FLOAT beta = *BETA;
|
||||
|
||||
blasint info;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
info = 0;
|
||||
|
||||
|
||||
if (lda < MAX(1, m)) info = 6;
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
|
||||
if (info != 0){
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#else
|
||||
void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta,
|
||||
FLOAT *c, blasint ldc)
|
||||
{
|
||||
/*
|
||||
void CNAME(enum CBLAS_ORDER order,
|
||||
blasint m, blasint n,
|
||||
FLOAT alpha,
|
||||
FLOAT *a, blasint lda,
|
||||
FLOAT beta,
|
||||
FLOAT *c, blasint ldc){ */
|
||||
|
||||
blasint info, t;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
info = 0;
|
||||
|
||||
if (order == CblasColMajor) {
|
||||
|
||||
info = -1;
|
||||
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
|
||||
}
|
||||
|
||||
if (order == CblasRowMajor) {
|
||||
info = -1;
|
||||
|
||||
t = n;
|
||||
n = m;
|
||||
m = t;
|
||||
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if ((m==0) || (n==0)) return;
|
||||
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
|
||||
GEADD_K(m,n,alpha, a, lda, beta, c, ldc);
|
||||
|
||||
|
||||
FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return;
|
||||
|
||||
}
|
|
@ -208,7 +208,20 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (incx < 0) x -= (lenx - 1) * incx;
|
||||
if (incy < 0) y -= (leny - 1) * incy;
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
// make it volatile because some gemv implementation (ex: dgemv_n.S)
|
||||
// do not restore all registers
|
||||
volatile int stack_alloc_size = m + n;
|
||||
if(stack_alloc_size < 128)
|
||||
//dgemv_n.S requires a 128-byte buffer
|
||||
stack_alloc_size = 128;
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||
stack_alloc_size = 0;
|
||||
FLOAT stack_buffer[stack_alloc_size];
|
||||
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||
#else
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
|
@ -237,7 +250,10 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
if(!stack_alloc_size)
|
||||
#endif
|
||||
blas_memory_free(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
|
|
@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
if (incy < 0) y -= (n - 1) * incy;
|
||||
if (incx < 0) x -= (m - 1) * incx;
|
||||
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
volatile int stack_alloc_size = m;
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
|
||||
stack_alloc_size = 0;
|
||||
FLOAT stack_buffer[stack_alloc_size];
|
||||
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
|
||||
#else
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#endif
|
||||
|
||||
#ifdef SMPTEST
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
#ifdef MAX_STACK_ALLOC
|
||||
if(!stack_alloc_size)
|
||||
#endif
|
||||
blas_memory_free(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
|
|
@ -0,0 +1,146 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ERROR_NAME "ZGEADD "
|
||||
#else
|
||||
#define ERROR_NAME "CGEADD "
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
||||
FLOAT *BETA, FLOAT *c, blasint *LDC)
|
||||
{
|
||||
|
||||
blasint m = *M;
|
||||
blasint n = *N;
|
||||
blasint lda = *LDA;
|
||||
blasint ldc = *LDC;
|
||||
|
||||
blasint info;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
info = 0;
|
||||
|
||||
|
||||
if (lda < MAX(1, m)) info = 6;
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
|
||||
if (info != 0){
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#else
|
||||
void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA,
|
||||
FLOAT *c, blasint ldc)
|
||||
{
|
||||
/*
|
||||
void CNAME(enum CBLAS_ORDER order,
|
||||
blasint m, blasint n,
|
||||
FLOAT alpha,
|
||||
FLOAT *a, blasint lda,
|
||||
FLOAT beta,
|
||||
FLOAT *c, blasint ldc){ */
|
||||
|
||||
blasint info, t;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
info = 0;
|
||||
|
||||
if (order == CblasColMajor) {
|
||||
|
||||
info = -1;
|
||||
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
|
||||
}
|
||||
|
||||
if (order == CblasRowMajor) {
|
||||
info = -1;
|
||||
|
||||
t = n;
|
||||
n = m;
|
||||
m = t;
|
||||
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if ((m==0) || (n==0)) return;
|
||||
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
|
||||
GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc);
|
||||
|
||||
|
||||
FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return;
|
||||
|
||||
}
|
|
@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
|
|||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
|
|
|
@ -329,23 +329,27 @@ endif
|
|||
###### BLAS extensions #####
|
||||
SBLASOBJS += \
|
||||
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX)
|
||||
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
sgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DBLASOBJS += \
|
||||
domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX)
|
||||
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
dgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CBLASOBJS += \
|
||||
comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
||||
comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
|
||||
comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||
cgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZBLASOBJS += \
|
||||
zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
|
||||
zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
|
||||
zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
|
||||
zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
|
||||
zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
|
||||
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
|
@ -3440,3 +3444,31 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC)
|
|||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
|
||||
|
||||
|
||||
# Build rules for the ?geadd kernels.
# Each precision has its own *GEADD_K variable naming the kernel source
# (relative to $(KERNELDIR)); it defaults to the generic C implementation
# unless it was already defined before this point.
# Precision is selected with -D/-U DOUBLE and COMPLEX.

ifndef SGEADD_K
SGEADD_K = ../generic/geadd.c
endif

$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@

ifndef DGEADD_K
DGEADD_K = ../generic/geadd.c
endif

# The prerequisite must be DGEADD_K (not SGEADD_K); otherwise an override of
# DGEADD_K is ignored and the double kernel is built from the single source.
$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@

ifndef CGEADD_K
CGEADD_K = ../generic/zgeadd.c
endif

$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@

ifndef ZGEADD_K
ZGEADD_K = ../generic/zgeadd.c
endif

$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV7
|
|
@ -0,0 +1 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV7
|
|
@ -0,0 +1,64 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/*
 * Generic real GEADD kernel: walks the matrices one column at a time.
 *
 * rows, cols  dimensions; the call is a no-op if either is <= 0
 * alpha, a    scalar and source matrix, consecutive columns lda elements apart
 * beta, b     scalar and destination matrix, consecutive columns ldb elements apart
 *
 * When alpha is exactly zero the source matrix is never read and each
 * destination column is only passed through SCAL_K with beta.  Otherwise
 * each column pair is handed to AXPBY_K with unit strides
 * (presumably b = alpha*a + beta*b -- confirm against the AXPBY_K kernel).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb)
{
	BLASLONG col;
	FLOAT *src = a;
	FLOAT *dst = b;

	if ( rows <= 0 || cols <= 0 )
	{
		return 0;
	}

	if ( alpha == 0.0 )
	{
		/* Source contributes nothing: scale the destination columns only. */
		for ( col = 0; col < cols; col++, dst += ldb )
		{
			SCAL_K(rows, 0,0, beta, dst, 1, NULL, 0,NULL,0);
		}
	}
	else
	{
		for ( col = 0; col < cols; col++, src += lda, dst += ldb )
		{
			AXPBY_K(rows, alpha, src, 1, beta, dst, 1);
		}
	}

	return 0;
}
|
||||
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/*
 * Generic complex GEADD kernel: walks the matrices one column at a time.
 *
 * rows, cols       dimensions; the call is a no-op if either is <= 0
 * alphar, alphai   the two components of alpha, passed as separate scalars
 * betar, betai     the two components of beta, passed as separate scalars
 * a, lda / b, ldb  source / destination matrix and leading dimension
 *
 * The leading dimensions are doubled before being used as pointer steps
 * (presumably because each complex element occupies two FLOATs -- confirm).
 *
 * When both components of alpha are exactly zero the source matrix is never
 * read and each destination column is only passed through SCAL_K with beta.
 * Otherwise each column pair is handed to AXPBY_K with unit strides.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alphar, FLOAT alphai, FLOAT *a, BLASLONG lda, FLOAT betar, FLOAT betai , FLOAT *b, BLASLONG ldb)
{
	BLASLONG col;
	BLASLONG src_step, dst_step;
	FLOAT *src = a;
	FLOAT *dst = b;

	if ( rows <= 0 || cols <= 0 )
	{
		return 0;
	}

	/* Column-to-column distance measured in FLOATs. */
	src_step = lda * 2;
	dst_step = ldb * 2;

	if ( alphar == 0.0 && alphai == 0.0 )
	{
		/* Source contributes nothing: scale the destination columns only. */
		for ( col = 0; col < cols; col++, dst += dst_step )
		{
			SCAL_K(rows, 0,0, betar, betai, dst, 1, NULL, 0,NULL,0);
		}
	}
	else
	{
		for ( col = 0; col < cols; col++, src += src_step, dst += dst_step )
		{
			AXPBY_K(rows, alphar, alphai, src, 1, betar, betai, dst, 1);
		}
	}

	return 0;
}
|
||||
|
||||
|
|
@ -548,8 +548,9 @@ gotoblas_t TABLE_NAME = {
|
|||
comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
|
||||
comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
|
||||
zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
|
||||
zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS
|
||||
zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS,
|
||||
|
||||
sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS
|
||||
|
||||
};
|
||||
|
||||
|
@ -941,6 +942,23 @@ static void init_parameter(void) {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef STEAMROLLER
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "Steamroller\n");
|
||||
#endif
|
||||
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
#ifdef EXPRECISION
|
||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef NANO
|
||||
|
||||
#ifdef DEBUG
|
||||
|
|
|
@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
|
||||
ZGEMMINCOPY = zgemm_ncopy_1.S
|
||||
ZGEMMITCOPY = zgemm_tcopy_1.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ =
|
||||
ZGEMMITCOPYOBJ =
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
|
||||
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
|
||||
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
|
||||
DSYMV_U_KERNEL = dsymv_U.c
|
||||
DSYMV_L_KERNEL = dsymv_L.c
|
||||
SSYMV_U_KERNEL = ssymv_U.c
|
||||
SSYMV_L_KERNEL = ssymv_L.c
|
||||
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
|
||||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
DCOPYKERNEL = dcopy_bulldozer.S
|
||||
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPYOBJ =
|
||||
ZGEMMITCOPYOBJ =
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
|
||||
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "caxpy_microk_bulldozer-2.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"prefetcht0 768(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
|
||||
|
@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 320(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||
|
@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %8 \n\t"
|
||||
"jne .L02END%= \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||
|
@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
".L02END%=: \n\t"
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
|
@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 320(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||
|
@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %6 \n\t"
|
||||
"jne .L02END%= \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||
|
@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
".L02END%=: \n\t"
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
|
@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 320(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||
|
@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
||||
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %5 \n\t"
|
||||
"jne .L02END%= \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
|
||||
|
@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
".L02END%=: \n\t"
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
|
@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
|||
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
|
||||
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
|
||||
|
||||
|
@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
|||
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
||||
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %6 \n\t"
|
||||
"jne .L02END%= \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
|
||||
|
||||
|
@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
|||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
".L02END%=: \n\t"
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
|
|
|
@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L08LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L08LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L08END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
|
@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"addq $16 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L08END%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
|
||||
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
|
||||
|
@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L08LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L08LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L08END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
|
@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"addq $16 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L08END%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
|
||||
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
|
||||
|
@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
|||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L08LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
|
||||
|
@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
|||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L08LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L08END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
|
||||
|
@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
|||
|
||||
"addq $16 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L08END%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
|
||||
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
|
||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NEHALEM)
|
||||
#include "daxpy_microk_nehalem-2.c"
|
||||
#elif defined(BULLDOZER)
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "daxpy_microk_bulldozer-2.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"vmovddup (%4), %%xmm0 \n\t" // alpha
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"prefetcht0 768(%3,%0,8) \n\t"
|
||||
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||
|
@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"shufpd $0, %%xmm0, %%xmm0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
// "prefetcht0 192(%2,%0,8) \n\t"
|
||||
// "prefetcht0 192(%3,%0,8) \n\t"
|
||||
|
||||
|
@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "ddot_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "ddot_microk_nehalem-2.c"
|
||||
|
|
|
@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
|
||||
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
|
||||
|
@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||
|
|
|
@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"xorpd %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
|
||||
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
|
||||
|
@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"addpd %%xmm5, %%xmm4 \n\t"
|
||||
"addpd %%xmm7, %%xmm6 \n\t"
|
||||
|
|
|
@ -1092,18 +1092,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro INIT4x1
|
||||
|
||||
vxorpd %xmm4 , %xmm4 , %xmm4
|
||||
vxorpd %xmm5 , %xmm5 , %xmm5
|
||||
vxorpd %ymm4 , %ymm4 , %ymm4
|
||||
vxorpd %ymm5 , %ymm5 , %ymm5
|
||||
vxorpd %ymm6 , %ymm6 , %ymm6
|
||||
vxorpd %ymm7 , %ymm7 , %ymm7
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL4x1
|
||||
|
||||
vbroadcastsd -12 * SIZE(BO), %ymm0
|
||||
vbroadcastsd -11 * SIZE(BO), %ymm1
|
||||
vbroadcastsd -10 * SIZE(BO), %ymm2
|
||||
vbroadcastsd -9 * SIZE(BO), %ymm3
|
||||
|
||||
vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
|
||||
vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
|
||||
|
||||
vbroadcastsd -8 * SIZE(BO), %ymm0
|
||||
vbroadcastsd -7 * SIZE(BO), %ymm1
|
||||
|
||||
vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
|
||||
vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
|
||||
|
||||
vbroadcastsd -6 * SIZE(BO), %ymm2
|
||||
vbroadcastsd -5 * SIZE(BO), %ymm3
|
||||
|
||||
vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
|
||||
vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
|
||||
vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
|
||||
vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
|
||||
|
||||
addq $ 8 *SIZE, BO
|
||||
addq $ 32*SIZE, AO
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNEL4x1_SUB
|
||||
vmovddup -12 * SIZE(BO), %xmm2
|
||||
vmovups -16 * SIZE(AO), %xmm0
|
||||
vmovups -14 * SIZE(AO), %xmm1
|
||||
vfmadd231pd %xmm0 ,%xmm2 , %xmm4
|
||||
vfmadd231pd %xmm1 ,%xmm2 , %xmm5
|
||||
vbroadcastsd -12 * SIZE(BO), %ymm2
|
||||
vmovups -16 * SIZE(AO), %ymm0
|
||||
vfmadd231pd %ymm0 ,%ymm2 , %ymm4
|
||||
addq $ 1*SIZE, BO
|
||||
addq $ 4*SIZE, AO
|
||||
|
||||
|
@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.macro SAVE4x1
|
||||
|
||||
vmovddup ALPHA, %xmm0
|
||||
vbroadcastsd ALPHA, %ymm0
|
||||
|
||||
vmulpd %xmm0 , %xmm4 , %xmm4
|
||||
vmulpd %xmm0 , %xmm5 , %xmm5
|
||||
vaddpd %ymm4,%ymm5, %ymm4
|
||||
vaddpd %ymm6,%ymm7, %ymm6
|
||||
vaddpd %ymm4,%ymm6, %ymm4
|
||||
|
||||
vmulpd %ymm0 , %ymm4 , %ymm4
|
||||
|
||||
|
||||
#if !defined(TRMMKERNEL)
|
||||
|
||||
vaddpd (CO1) , %xmm4, %xmm4
|
||||
vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
|
||||
vaddpd (CO1) , %ymm4, %ymm4
|
||||
|
||||
#endif
|
||||
|
||||
vmovups %xmm4 , (CO1)
|
||||
vmovups %xmm5 , 2 * SIZE(CO1)
|
||||
vmovups %ymm4 , (CO1)
|
||||
|
||||
addq $ 4*SIZE, CO1
|
||||
.endm
|
||||
|
@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.L1_12:
|
||||
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1
|
||||
|
||||
dec %rax
|
||||
jne .L1_12
|
||||
|
@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
.L1_12:
|
||||
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1_SUB
|
||||
KERNEL4x1
|
||||
|
||||
dec %rax
|
||||
jne .L1_12
|
||||
|
|
|
@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(NEHALEM)
|
||||
#include "dgemv_n_microk_nehalem-4.c"
|
||||
#elif defined(HASWELL)
|
||||
#elif defined(HASWELL) || defined(STEAMROLLER)
|
||||
#include "dgemv_n_microk_haswell-4.c"
|
||||
#endif
|
||||
|
||||
|
@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"shufpd $0, %%xmm13, %%xmm13 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
||||
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
|
||||
|
||||
|
@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
"shufpd $0, %%xmm12, %%xmm12 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
|
||||
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
|
||||
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
||||
|
@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L8LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
|
@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L8LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L16END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"subq $8 , %1 \n\t"
|
||||
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L16END%=: \n\t"
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
|
@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L8LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L8LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L8END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
|
@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L8END%=: \n\t"
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
|
|
|
@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"xorpd %%xmm4 , %%xmm4 \n\t"
|
||||
"xorpd %%xmm5 , %%xmm5 \n\t"
|
||||
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
|
||||
|
@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"xorpd %%xmm4 , %%xmm4 \n\t"
|
||||
"xorpd %%xmm5 , %%xmm5 \n\t"
|
||||
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
|
||||
|
@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -0,0 +1,247 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x8 1
|
||||
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
|
||||
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
|
||||
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
|
||||
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
|
||||
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %8 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
||||
"addq $8 , %0 \n\t"
|
||||
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
"addq $8 , %8 \n\t"
|
||||
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
||||
"subq $8 , %1 \n\t"
|
||||
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (lda4), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||
|
||||
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
|
||||
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
"vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
|
||||
"vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(HASWELL)
|
||||
#if defined(HASWELL) || defined(STEAMROLLER)
|
||||
#include "dgemv_t_microk_haswell-4.c"
|
||||
#endif
|
||||
|
||||
|
@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|||
"xorpd %%xmm11 , %%xmm11 \n\t"
|
||||
|
||||
"testq $2 , %1 \n\t"
|
||||
"jz .L01LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
|
@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|||
"subq $2 , %1 \n\t"
|
||||
"addpd %%xmm13 , %%xmm11 \n\t"
|
||||
|
||||
".L01LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
|
||||
|
@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||
"haddpd %%xmm11, %%xmm11 \n\t"
|
||||
|
@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"xorpd %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
"testq $2 , %1 \n\t"
|
||||
"jz .L01LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
|
@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"addpd %%xmm12 , %%xmm10 \n\t"
|
||||
"subq $2 , %1 \n\t"
|
||||
|
||||
".L01LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L01END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups 16(%3,%0,8) , %%xmm14 \n\t"
|
||||
|
@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|||
"subq $4 , %1 \n\t"
|
||||
"addpd %%xmm14 , %%xmm9 \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L01END%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"addpd %%xmm9 , %%xmm10 \n\t"
|
||||
"haddpd %%xmm10, %%xmm10 \n\t"
|
||||
|
@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
|||
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
"movups (%4,%0,8) , %%xmm11 \n\t"
|
||||
|
@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
|||
"subq $2 , %1 \n\t"
|
||||
"movups %%xmm11, -16(%4,%0,8) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L08LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
||||
|
||||
|
@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L08LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L16END%= \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
// "prefetcht0 384(%2,%0,8) \n\t"
|
||||
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
||||
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
|
||||
|
@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
|||
"subq $8 , %1 \n\t"
|
||||
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L16END%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
|
||||
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
|
||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "dsymv_L_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "dsymv_L_microk_nehalem-2.c"
|
||||
|
|
|
@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||
|
@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
||||
|
||||
"cmpq %0 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"vmovsd (%9), %%xmm4 \n\t"
|
||||
"vmovsd 8(%9), %%xmm5 \n\t"
|
||||
|
|
|
@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"shufpd $0, %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||
"movups %%xmm12 , %%xmm11 \n\t"
|
||||
|
@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|||
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
||||
|
||||
"cmpq %0 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"movsd (%9), %%xmm4 \n\t" // temp1[0]
|
||||
"movsd 8(%9), %%xmm5 \n\t" // temp1[1]
|
||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "dsymv_U_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "dsymv_U_microk_nehalem-2.c"
|
||||
|
|
|
@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"xorq %0,%0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||
|
@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
|
||||
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
|
||||
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
|
||||
|
|
|
@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"xorq %0,%0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
|
||||
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
|
||||
"movups %%xmm12 , %%xmm11 \n\t"
|
||||
|
@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|||
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
|
||||
|
||||
"subq $2 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"haddpd %%xmm0, %%xmm0 \n\t"
|
||||
"haddpd %%xmm1, %%xmm1 \n\t"
|
||||
|
|
|
@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
"shufps $0, %%xmm0, %%xmm0 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
// "prefetcht0 192(%2,%0,4) \n\t"
|
||||
// "prefetcht0 192(%3,%0,4) \n\t"
|
||||
|
||||
|
@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "sdot_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "sdot_microk_nehalem-2.c"
|
||||
|
|
|
@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
|
||||
|
@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
|
||||
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
|
||||
|
|
|
@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
"xorps %%xmm7, %%xmm7 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
|
||||
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
|
||||
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
|
||||
|
@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
|
|||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"addps %%xmm5, %%xmm4 \n\t"
|
||||
"addps %%xmm7, %%xmm6 \n\t"
|
||||
|
|
|
@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "sgemv_n_microk_bulldozer-4.c"
|
||||
#elif defined(NEHALEM)
|
||||
#include "sgemv_n_microk_nehalem-4.c"
|
||||
|
@ -39,8 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "sgemv_n_microk_haswell-4.c"
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(STEAMROLLER)
|
||||
#define NBMAX 2048
|
||||
#else
|
||||
#define NBMAX 4096
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x8
|
||||
|
||||
|
@ -129,7 +132,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"shufps $0, %%xmm13, %%xmm13 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||
|
||||
"movups (%4,%0,4), %%xmm8 \n\t"
|
||||
|
@ -143,7 +146,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
|
||||
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
@ -166,7 +169,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x2
|
||||
#ifndef HAVE_KERNEL_4x1
|
||||
|
||||
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
|
@ -184,10 +187,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
"shufps $0, %%xmm12, %%xmm12 \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L16END%= \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
|
||||
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
|
||||
|
@ -203,12 +206,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L16END%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"testq $0x04, %5 \n\t"
|
||||
"jz .L08LABEL%= \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
|
||||
|
@ -218,7 +221,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
".L08LABEL%=: \n\t"
|
||||
"3: \n\t"
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
|
@ -262,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
|||
(
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%2,%0,4) , %%xmm12 \n\t"
|
||||
"movups (%3,%0,4) , %%xmm11 \n\t"
|
||||
|
@ -271,7 +274,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
|||
"movups %%xmm11, -16(%3,%0,4) \n\t"
|
||||
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
|
@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz .L08LABEL%= \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"subq $4 , %1 \n\t"
|
||||
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
||||
|
||||
".L08LABEL%=: \n\t"
|
||||
"2: \n\t"
|
||||
|
||||
"testq $0x08, %1 \n\t"
|
||||
"jz .L16LABEL%= \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"subq $8 , %1 \n\t"
|
||||
|
||||
|
||||
".L16LABEL%=: \n\t"
|
||||
"3: \n\t"
|
||||
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je .L16END%= \n\t"
|
||||
"je 4f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|||
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
||||
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
".L16END%=: \n\t"
|
||||
"4: \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
|
||||
|
||||
".align 16 \n\t"
|
||||
".L01LOOP%=: \n\t"
|
||||
"1: \n\t"
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
|
@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz .L01LOOP%= \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue