Merge branch 'develop'

This commit is contained in:
Zhang Xianyi 2015-03-24 15:07:07 -05:00
commit d0c51c4de9
137 changed files with 1947 additions and 444 deletions

View File

@ -121,5 +121,11 @@ In chronological order:
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
ARMv8 support.
* Dan Kortschak
* [2015-01-07] Added test for drotmg bug #484.
* Ton van den Heuvel <https://github.com/ton>
* [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity().
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

View File

@ -1,4 +1,24 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.14
24-Mar-2015
common:
* Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.)
* Improve ger and gemv for small matrices by stack allocation.
e.g. make MAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.)
* Introduce openblas_get_num_threads and openblas_get_num_procs.
(#497. Thanks, Erik Schnetter.)
* Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.)
* Fix c/zsyr bug with negative incx. (#492.)
* Fix race condition during shutdown causing a crash in
gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.)
x86/x86-64:
* Support AMD Steamroller.
ARM:
* Add Cortex-A9 and Cortex-A15 targets.
====================================================================
Version 0.2.13
3-Dec-2014

View File

@ -9,10 +9,10 @@
If you want to allocate 64 large pages,
$shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset
$shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page
$shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number
$shell> echo 3355443200 > /pros/sys/kernel/shmall
$shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset
$shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page
$shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number
$shell> echo 3355443200 > /proc/sys/kernel/shmall
Also may add a few lines into /etc/security/limits.conf file.

View File

@ -1,3 +1,8 @@
# Logical OR via $(filter): $(filter $(CORE),CORTEXA9 CORTEXA15) expands to
# $(CORE) only when CORE is one of the listed names, so the ifeq below holds
# for CORTEXA9 or CORTEXA15 and both get the same hard-float ARMv7-A/VFPv3
# flags for the C and Fortran compilers.
# NOTE(review): an empty CORE also compares equal (both sides expand to
# nothing); presumably CORE is always set before this file is read - confirm.
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
ifeq ($(CORE), ARMV7)
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a

View File

@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
OPENBLAS_BUILD_DIR := $(CURDIR)
OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
.PHONY : install
@ -46,11 +46,11 @@ ifndef NO_CBLAS
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
#for install static library
@ -95,7 +95,8 @@ endif
endif
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.13
VERSION = 0.2.14
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -159,6 +159,19 @@ COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# Improve GEMV and GER for small matrices by stack allocation.
# For details, https://github.com/xianyi/OpenBLAS/pull/482
#
# MAX_STACK_ALLOC=2048
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoid conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
# For details, https://github.com/xianyi/OpenBLAS/pull/459
#
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
#
# End of user configuration
#

View File

@ -61,6 +61,9 @@ endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@ -85,6 +88,9 @@ endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@ -305,6 +311,10 @@ ifdef SANITY_CHECK
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
endif
ifdef MAX_STACK_ALLOC
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
#
# Architecture dependent settings
#
@ -354,6 +364,12 @@ endif
ifeq ($(USE_OPENMP), 1)
#check
ifeq ($(USE_THREAD), 0)
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
endif
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
CCOMMON_OPT += -fopenmp
@ -392,7 +408,7 @@ endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL

View File

@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks, Werner Saar.)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and parts of Level-1,2.

View File

@ -32,6 +32,7 @@ ISTANBUL
BOBCAT
BULLDOZER
PILEDRIVER
STEAMROLLER
c)VIA CPU:
SSE_GENERIC
@ -62,6 +63,11 @@ SPARC
SPARCV7
6.ARM CPU:
CORTEXA15
CORTEXA9
ARMV7
ARMV6
ARMV5
7.ARM 64-bit CPU:
ARMV8

View File

@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
# ACML custom
ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib
#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm
# ACML 6.1 custom
ACML=/home/werner/project/acml6.1/gfortran64_mp/lib
LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm
# Atlas Ubuntu
#ATLAS=/usr/lib/atlas-base

View File

@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){
}
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT result;
@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
FLOAT wkopt[4];
@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char trans='N';
blasint m, i, j;
blasint m, n, i, j;
int loops = 1;
int has_param_n=0;
int l;
char *p;
@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){
if ( p != NULL )
loops = atoi(p);
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
has_param_n=1;
}
#ifdef linux
srandom(getpid());
@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
if ( has_param_n == 1 && n <= m )
n=n;
else
n=m;
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
for (l=0; l<loops; l++)
{
@ -189,7 +202,7 @@ int MAIN__(int argc, char *argv[]){
gettimeofday( &start, (struct timezone *)0);
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
@ -202,11 +215,11 @@ int MAIN__(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6);
}
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a,*work;
FLOAT wkopt[4];
@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
#ifndef COMPLEX
char *trans[] = {"T", "N"};
@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *c;
FLOAT alpha[] = {1.0, 1.0};
@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){
#endif
int MAIN__(int argc, char *argv[]){
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT alpha[] = {1.0, 1.0};
@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){
return 0;
}
void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -81,6 +81,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) {
$defined = 1;
}
if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;

16
cblas.h
View File

@ -13,6 +13,12 @@ extern "C" {
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the number of threads at runtime.*/
int openblas_get_num_threads(void);
/*Get the number of physical processors (cores).*/
int openblas_get_num_procs(void);
/*Get the build configuration at runtime.*/
char* openblas_get_config(void);
@ -341,6 +347,16 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
double *c, OPENBLAS_CONST blasint cldc);
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
float *c, OPENBLAS_CONST blasint cldc);
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc);
#ifdef __cplusplus
}
#endif /* __cplusplus */

View File

@ -13,6 +13,12 @@ extern "C" {
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the number of threads at runtime.*/
int openblas_get_num_threads(void);
/*Get the number of physical processors (cores).*/
int openblas_get_num_procs(void);
/*Get the build configuration at runtime.*/
char* openblas_get_config(void);
@ -327,6 +333,16 @@ void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, bl
blasint clda, blasint cldb);
void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
blasint clda, blasint cldb);
void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta,
float *c, blasint cldc);
void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta,
double *c, blasint cldc);
void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta,
float *c, blasint cldc);
void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta,
double *c, blasint cldc);
#ifdef __cplusplus
}
#endif /* __cplusplus */

View File

@ -327,6 +327,14 @@ typedef int blasint;
#endif
#endif
/*
#ifdef STEAMROLLER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
#ifndef YIELDING
#define YIELDING sched_yield()
#endif

View File

@ -220,6 +220,7 @@
#define COMATCOPY_K_CTC comatcopy_k_ctc
#define COMATCOPY_K_RTC comatcopy_k_rtc
#define CGEADD_K cgeadd_k
#else
@ -402,6 +403,7 @@
#define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc
#define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc
#define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc
#define CGEADD_K gotoblas -> cgeadd_k
#endif

View File

@ -149,6 +149,7 @@
#define DOMATCOPY_K_RN domatcopy_k_rn
#define DOMATCOPY_K_CT domatcopy_k_ct
#define DOMATCOPY_K_RT domatcopy_k_rt
#define DGEADD_K dgeadd_k
#else
@ -267,6 +268,8 @@
#define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct
#define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt
#define DGEADD_K gotoblas -> dgeadd_k
#endif
#define DGEMM_NN dgemm_nn

View File

@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do
void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *);
void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *);
void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*);
void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*);
#ifdef __cplusplus
}

View File

@ -1762,6 +1762,11 @@ int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, dou
int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG);
int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG);
int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG);
int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG);
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
#ifdef __CUDACC__
}

View File

@ -634,7 +634,7 @@
#define OMATCOPY_K_RN DOMATCOPY_K_RN
#define OMATCOPY_K_CT DOMATCOPY_K_CT
#define OMATCOPY_K_RT DOMATCOPY_K_RT
#define GEADD_K DGEADD_K
#else
#define AMAX_K SAMAX_K
@ -932,6 +932,7 @@
#define OMATCOPY_K_CT SOMATCOPY_K_CT
#define OMATCOPY_K_RT SOMATCOPY_K_RT
#define GEADD_K SGEADD_K
#endif
#else
#ifdef XDOUBLE
@ -1746,6 +1747,7 @@
#define OMATCOPY_K_RNC ZOMATCOPY_K_RNC
#define OMATCOPY_K_CTC ZOMATCOPY_K_CTC
#define OMATCOPY_K_RTC ZOMATCOPY_K_RTC
#define GEADD_K ZGEADD_K
#else
@ -2159,6 +2161,8 @@
#define OMATCOPY_K_CTC COMATCOPY_K_CTC
#define OMATCOPY_K_RTC COMATCOPY_K_RTC
#define GEADD_K CGEADD_K
#endif
#endif

View File

@ -855,6 +855,10 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
int (*zgeadd_k) (BLASLONG, BLASLONG, float, double, double *, BLASLONG, double, double, double *, BLASLONG);
} gotoblas_t;

View File

@ -153,6 +153,7 @@
#define SOMATCOPY_K_CT somatcopy_k_ct
#define SOMATCOPY_K_RT somatcopy_k_rt
#define SGEADD_K sgeadd_k
#else
@ -274,6 +275,7 @@
#define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct
#define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt
#define SGEADD_K gotoblas -> sgeadd_k
#endif

View File

@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define MMXSTORE movd
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER)
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER)
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -220,6 +220,7 @@
#define ZOMATCOPY_K_CTC zomatcopy_k_ctc
#define ZOMATCOPY_K_RTC zomatcopy_k_rtc
#define ZGEADD_K zgeadd_k
#else
@ -403,6 +404,8 @@
#define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc
#define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc
#define ZGEADD_K gotoblas -> zgeadd_k
#endif
#define ZGEMM_NN zgemm_nn

10
cpuid.h
View File

@ -104,10 +104,11 @@
#define CORE_ATOM 18
#define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
#define CORE_HASWELL 24
#define CORE_HASWELL 24
#define CORE_STEAMROLLER 25
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@ -200,6 +201,7 @@ typedef struct {
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
#define CPUTYPE_HASWELL 48
#define CPUTYPE_HASWELL 48
#define CPUTYPE_STEAMROLLER 49
#endif

View File

@ -30,16 +30,27 @@
#define CPU_UNKNOWN 0
#define CPU_ARMV6 1
#define CPU_ARMV7 2
#define CPU_CORTEXA15 3
#define CPU_CORTEXA9 3
#define CPU_CORTEXA15 4
/* Upper-case core names, indexed by the CPU_* identifiers defined above
   (CPU_UNKNOWN = 0 ... CPU_CORTEXA15 = 4). get_subarchitecture() prints
   cpuname[detect()] verbatim, so entry 0 must read "UNKNOWN" exactly, as the
   former switch-based implementation printed for undetected cores. */
static char *cpuname[] = {
  "UNKNOWN",
  "ARMV6",
  "ARMV7",
  "CORTEXA9",
  "CORTEXA15"
};
/* Lower-case core names, indexed by the same CPU_* identifiers as cpuname[]
   (0 = unknown ... 4 = cortexa15). get_libname() prints
   cpuname_lower[detect()], so the order here must stay in step with the
   CPU_* numbering. */
static char *cpuname_lower[] = {
"unknown",
"armv6",
"armv7",
"cortexa9",
"cortexa15"
};
int get_feature(char *search)
{
@ -85,6 +96,29 @@ int detect(void)
char buffer[512], *p;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("CPU part", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL) {
if (strstr(p, "0xc09")) {
return CPU_CORTEXA9;
}
if (strstr(p, "0xc0f")) {
return CPU_CORTEXA15;
}
}
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
@ -142,21 +176,7 @@ void get_architecture(void)
void get_subarchitecture(void)
{
int d = detect();
switch (d)
{
case CPU_ARMV7:
printf("ARMV7");
break;
case CPU_ARMV6:
printf("ARMV6");
break;
default:
printf("UNKNOWN");
break;
}
printf("%s", cpuname[d]);
}
void get_subdirname(void)
@ -170,6 +190,36 @@ void get_cpuconfig(void)
int d = detect();
switch (d)
{
case CPU_CORTEXA9:
printf("#define CORTEXA9\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
if ( get_feature("neon")) printf("#define HAVE_NEON\n");
if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 128\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_CORTEXA15:
printf("#define CORTEXA15\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
if ( get_feature("neon")) printf("#define HAVE_NEON\n");
if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 128\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_ARMV7:
printf("#define ARMV7\n");
@ -206,18 +256,7 @@ void get_libname(void)
{
int d = detect();
switch (d)
{
case CPU_ARMV7:
printf("armv7\n");
break;
case CPU_ARMV6:
printf("armv6\n");
break;
}
printf("%s", cpuname_lower[d]);
}

View File

@ -1162,6 +1162,12 @@ int get_cpuname(void){
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 0:
if(support_avx())
return CPUTYPE_STEAMROLLER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
}
break;
case 5:
@ -1290,6 +1296,7 @@ static char *cpuname[] = {
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
"STEAMROLLER",
};
static char *lowercpuname[] = {
@ -1341,6 +1348,7 @@ static char *lowercpuname[] = {
"bulldozer",
"piledriver",
"haswell",
"steamroller",
};
static char *corename[] = {
@ -1369,6 +1377,7 @@ static char *corename[] = {
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
"STEAMROLLER",
};
static char *corename_lower[] = {
@ -1397,6 +1406,7 @@ static char *corename_lower[] = {
"bulldozer",
"piledriver",
"haswell",
"steamroller",
};
@ -1562,7 +1572,15 @@ int get_coretype(void){
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 0:
if(support_avx())
return CORE_STEAMROLLER;
else
return CORE_BARCELONA; //OS don't support AVX.
}
}else return CORE_BARCELONA;
}
}

View File

@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_config.$(SUFFIX) : openblas_get_config.c
$(CC) $(CFLAGS) -c $< -o $(@F)

View File

@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT;
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_STEAMROLLER;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#else
@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL;
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#endif
@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0){
//AMD STEAMROLLER
if(support_avx())
return &gotoblas_STEAMROLLER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else {
return &gotoblas_BARCELONA;
}
@ -315,6 +327,7 @@ static char *corename[] = {
"Bulldozer",
"Piledriver",
"Haswell",
"Steamroller",
};
char *gotoblas_corename(void) {
@ -339,6 +352,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
return corename[0];
}
@ -349,9 +363,9 @@ static gotoblas_t *force_coretype(char *coretype){
int i ;
int found = -1;
char message[128];
char mname[20];
//char mname[20];
for ( i=1 ; i <= 20; i++)
for ( i=1 ; i <= 21; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
@ -361,8 +375,8 @@ static gotoblas_t *force_coretype(char *coretype){
}
if (found < 0)
{
strncpy(mname,coretype,20);
sprintf(message, "Core not found: %s\n",mname);
//strncpy(mname,coretype,20);
snprintf(message, 128, "Core not found: %s\n",coretype);
openblas_warning(1, message);
return(NULL);
}
@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found)
{
case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);
case 19: return (&gotoblas_PILEDRIVER);
case 18: return (&gotoblas_BULLDOZER);

View File

@ -241,6 +241,7 @@ void set_stack_limit(int limitMB){
*/
#endif
/*
OpenBLAS uses the numbers of CPU cores in multithreading.
It can be set by openblas_set_num_threads(int num_threads);
@ -323,6 +324,23 @@ int blas_get_cpu_number(void){
}
#endif
/*
 * Report the processor count.
 * With SMP enabled the value comes from get_num_procs(); a build without
 * SMP always reports 1.
 */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
/*
 * Report the thread count.
 * With SMP enabled the value comes from blas_get_cpu_number(); a build
 * without SMP always reports 1.
 */
int openblas_get_num_threads(void) {
#ifdef SMP
  return blas_get_cpu_number();
#else
  return 1;
#endif
}
struct release_t {
void *address;
void (*func)(struct release_t *);
@ -1335,6 +1353,8 @@ void DESTRUCTOR gotoblas_quit(void) {
if (gotoblas_initialized == 0) return;
blas_shutdown();
#ifdef PROFILE
moncontrol (0);
#endif
@ -1356,8 +1376,6 @@ void DESTRUCTOR gotoblas_quit(void) {
#ifdef PROFILE
moncontrol (1);
#endif
blas_shutdown();
}
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@ -0,0 +1,40 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
/* Implemented elsewhere in the library; declared here so the alias below can forward to it. */
extern int openblas_get_num_procs(void);
/*
 * Trailing-underscore alias of openblas_get_num_procs(): forwards the call
 * and returns its result unchanged.
 * NOTE(review): presumably exported so that Fortran-mangled callers can
 * link against it -- confirm against the symbol export list.
 */
int openblas_get_num_procs_(void) {
return openblas_get_num_procs();
}

View File

@ -0,0 +1,40 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
/* Implemented elsewhere in the library; declared here so the alias below can forward to it. */
extern int openblas_get_num_threads(void);
/*
 * Trailing-underscore alias of openblas_get_num_threads(): forwards the call
 * and returns its result unchanged.
 * NOTE(review): presumably exported so that Fortran-mangled callers can
 * link against it -- confirm against the symbol export list.
 */
int openblas_get_num_threads_(void) {
return openblas_get_num_threads();
}

View File

@ -166,7 +166,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -251,7 +251,7 @@ void blas_set_parameter(void){
env_var_t p;
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
int size = 16;
#else
int size = get_L2_size();

View File

@ -100,7 +100,12 @@ else
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
endif
ifeq ($(NOFORTRAN), 2)
#only build cblas without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<

View File

@ -23,7 +23,8 @@
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv,
xerbla,
saxpby,daxpby,caxpby,zaxpby
saxpby,daxpby,caxpby,zaxpby,
sgeadd,dgeadd,cgeadd,zgeadd,
);
@cblasobjs = (
@ -55,6 +56,7 @@
cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd
);
@exblasobjs = (
@ -81,7 +83,10 @@
#both underscore and no underscore
@misc_common_objs = (
openblas_set_num_threads, openblas_get_parallel,
openblas_get_parallel,
openblas_get_num_procs,
openblas_set_num_threads,
openblas_get_num_threads,
);
@misc_no_underscore_objs = (

View File

@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "PILEDRIVER"
#endif
/*
 * Forced-target configuration for AMD Steamroller, active when
 * FORCE_STEAMROLLER is defined. It fixes the architecture and core names
 * and the cache, TLB and ISA-feature defines carried in ARCHCONFIG.
 */
#if defined (FORCE_STEAMROLLER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "STEAMROLLER"
#define ARCHCONFIG "-DSTEAMROLLER " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
#define LIBNAME "steamroller"
#define CORENAME "STEAMROLLER"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
@ -710,6 +727,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/*
 * Forced-target configuration for ARM Cortex-A9, active when
 * FORCE_CORTEXA9 is defined. It fixes the architecture and core names, the
 * kernel sub-directory ("arm") and the cache, TLB and FPU-feature defines
 * carried in ARCHCONFIG.
 */
#ifdef FORCE_CORTEXA9
#define FORCE
#define ARCHITECTURE "ARM"
#define SUBARCHITECTURE "CORTEXA9"
#define SUBDIRNAME "arm"
#define ARCHCONFIG "-DCORTEXA9 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
"-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "cortexa9"
#define CORENAME "CORTEXA9"
#else
#endif
/*
 * Forced-target configuration for ARM Cortex-A15, active when
 * FORCE_CORTEXA15 is defined. It fixes the architecture and core names, the
 * kernel sub-directory ("arm") and the cache, TLB and FPU-feature defines
 * carried in ARCHCONFIG.
 * NOTE(review): the cache geometry below (32-byte L1/L2 lines, 1 MiB L2) is
 * the same as the Cortex-A9 values -- confirm it is intended for Cortex-A15.
 */
#ifdef FORCE_CORTEXA15
#define FORCE
#define ARCHITECTURE "ARM"
#define SUBARCHITECTURE "CORTEXA15"
#define SUBDIRNAME "arm"
#define ARCHCONFIG "-DCORTEXA15 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
"-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "cortexa15"
#define CORENAME "CORTEXA15"
#else
#endif
#ifdef FORCE_ARMV6
#define FORCE
#define ARCHITECTURE "ARM"

View File

@ -43,7 +43,8 @@ SBLAS2OBJS = \
SBLAS3OBJS = \
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
sgeadd.$(SUFFIX)
DBLAS1OBJS = \
@ -68,7 +69,8 @@ DBLAS2OBJS = \
DBLAS3OBJS = \
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\
dgeadd.$(SUFFIX)
CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
@ -96,7 +98,8 @@ CBLAS3OBJS = \
cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\
cgeadd.$(SUFFIX)
ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
@ -124,7 +127,8 @@ ZBLAS3OBJS = \
zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\
zgeadd.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1)
@ -269,7 +273,8 @@ CSBLAS2OBJS = \
CSBLAS3OBJS = \
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
cblas_sgeadd.$(SUFFIX)
CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
@ -285,7 +290,8 @@ CDBLAS2OBJS = \
CDBLAS3OBJS += \
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX)
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \
cblas_dgeadd.$(SUFFIX)
CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
@ -308,7 +314,9 @@ CCBLAS3OBJS = \
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
cblas_cgeadd.$(SUFFIX)
CZBLAS1OBJS = \
@ -332,7 +340,9 @@ CZBLAS3OBJS = \
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX)
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
cblas_zgeadd.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1)
@ -2103,4 +2113,27 @@ zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c
cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
# ?geadd interface objects.
# The real variants (s/d) are compiled from geadd.c and the complex variants
# (c/z) from zgeadd.c. The cblas_ variants compile the same sources with
# -DCBLAS to select the CBLAS entry point.
sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c
$(CC) -c $(CFLAGS) $< -o $(@F)
dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c
$(CC) -c $(CFLAGS) $< -o $(@F)
cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c
$(CC) -c $(CFLAGS) $< -o $(@F)
zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c
$(CC) -c $(CFLAGS) $< -o $(@F)
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)

148
interface/geadd.c Normal file
View File

@ -0,0 +1,148 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#if defined(DOUBLE)
#define ERROR_NAME "DGEADD "
#else
#define ERROR_NAME "SGEADD "
#endif
#ifndef CBLAS

/*
 * Fortran-style entry point of the real ATLAS-style geadd routine.
 * All arguments are passed by reference; the matrices a (leading dimension
 * LDA) and c (leading dimension LDC) are m-by-n. The arithmetic itself is
 * delegated to GEADD_K.
 *
 * On an invalid argument xerbla is called with the 1-based position of the
 * offending argument (M=1, N=2, LDA=5, LDC=8) and the routine returns
 * without calling the kernel.
 */
void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
          FLOAT *BETA, FLOAT *c, blasint *LDC)
{
  blasint m     = *M;
  blasint n     = *N;
  blasint lda   = *LDA;
  blasint ldc   = *LDC;
  FLOAT   alpha = *ALPHA;
  FLOAT   beta  = *BETA;

  blasint info;

  PRINT_DEBUG_NAME;

  info = 0;

  /* Checked from the last argument to the first: a later assignment
     overrides an earlier one, so the lowest-numbered invalid argument is
     the one that gets reported. */
  if (ldc < MAX(1, m)) info = 8;
  if (lda < MAX(1, m)) info = 5;   /* LDA is the 5th argument */
  if (n < 0)           info = 2;
  if (m < 0)           info = 1;

  if (info != 0){
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS entry point of the real geadd routine. Scalars and dimensions are
 * passed by value and `order` selects the storage layout.
 *
 * Argument errors are reported through xerbla using the same numbering as
 * the Fortran entry point (m=1, n=2, lda=5, ldc=8). An unrecognised `order`
 * leaves info at 0, which is reported as well.
 */
void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta,
            FLOAT *c, blasint ldc)
{
  blasint info, t;

  PRINT_DEBUG_CNAME;

  info = 0;

  if (order == CblasColMajor) {
    info = -1;

    if (ldc < MAX(1, m)) info = 8;
    if (lda < MAX(1, m)) info = 5;
    if (n < 0)           info = 2;
    if (m < 0)           info = 1;
  }

  if (order == CblasRowMajor) {
    info = -1;

    /* Swap the dimensions before validating, so the checks and the kernel
       call below operate on the swapped m and n. */
    t = n;
    n = m;
    m = t;

    if (ldc < MAX(1, m)) info = 8;
    if (lda < MAX(1, m)) info = 5;
    if (n < 0)           info = 2;
    if (m < 0)           info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* Quick return for an empty matrix. */
  if ((m==0) || (n==0)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  GEADD_K(m,n,alpha, a, lda, beta, c, ldc);

  FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);

  IDEBUG_END;

  return;
}

View File

@ -208,7 +208,20 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0) x -= (lenx - 1) * incx;
if (incy < 0) y -= (leny - 1) * incy;
#ifdef MAX_STACK_ALLOC
  // Kept volatile because some gemv implementations (e.g. dgemv_n.S)
  // do not restore all registers.
  volatile int stack_alloc_size = m + n;
  if(stack_alloc_size < 128)
    // dgemv_n.S requires a buffer of at least 128 bytes
    stack_alloc_size = 128;
  // A size of 0 means "too large for the stack": fall back to blas_memory_alloc.
  if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
    stack_alloc_size = 0;
  // A variable-length array must have a size greater than zero, so declare
  // a single unused element when the stack path is disabled.
  FLOAT stack_buffer[stack_alloc_size ? stack_alloc_size : 1];
  buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
#else
  buffer = (FLOAT *)blas_memory_alloc(1);
#endif
#ifdef SMP
@ -237,7 +250,10 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
blas_memory_free(buffer);
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size)
#endif
blas_memory_free(buffer);
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);

View File

@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order,
if (incy < 0) y -= (n - 1) * incy;
if (incx < 0) x -= (m - 1) * incx;
#ifdef MAX_STACK_ALLOC
  volatile int stack_alloc_size = m;
  // A size of 0 means "too large for the stack": fall back to blas_memory_alloc.
  if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
    stack_alloc_size = 0;
  // A variable-length array must have a size greater than zero, so declare
  // a single unused element when the stack path is disabled.
  FLOAT stack_buffer[stack_alloc_size ? stack_alloc_size : 1];
  buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
#else
  buffer = (FLOAT *)blas_memory_alloc(1);
#endif
#ifdef SMPTEST
nthreads = num_cpu_avail(2);
@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
blas_memory_free(buffer);
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size)
#endif
blas_memory_free(buffer);
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);

146
interface/zgeadd.c Normal file
View File

@ -0,0 +1,146 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#if defined(DOUBLE)
#define ERROR_NAME "ZGEADD "
#else
#define ERROR_NAME "CGEADD "
#endif
#ifndef CBLAS

/*
 * Fortran-style entry point of the complex ATLAS-style geadd routine.
 * All arguments are passed by reference; ALPHA and BETA each point at a
 * (real, imaginary) pair that is split out for the GEADD_K call below.
 *
 * On an invalid argument xerbla is called with the 1-based position of the
 * offending argument (M=1, N=2, LDA=5, LDC=8) and the routine returns
 * without calling the kernel.
 */
void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
          FLOAT *BETA, FLOAT *c, blasint *LDC)
{
  blasint m   = *M;
  blasint n   = *N;
  blasint lda = *LDA;
  blasint ldc = *LDC;

  blasint info;

  PRINT_DEBUG_NAME;

  info = 0;

  /* Checked from the last argument to the first: a later assignment
     overrides an earlier one, so the lowest-numbered invalid argument is
     the one that gets reported. */
  if (ldc < MAX(1, m)) info = 8;
  if (lda < MAX(1, m)) info = 5;   /* LDA is the 5th argument */
  if (n < 0)           info = 2;
  if (m < 0)           info = 1;

  if (info != 0){
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS entry point of the complex geadd routine. Dimensions are passed by
 * value, ALPHA and BETA by pointer to a (real, imaginary) pair, and `order`
 * selects the storage layout.
 *
 * Argument errors are reported through xerbla using the same numbering as
 * the Fortran entry point (m=1, n=2, lda=5, ldc=8). An unrecognised `order`
 * leaves info at 0, which is reported as well.
 */
void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA,
            FLOAT *c, blasint ldc)
{
  blasint info, t;

  PRINT_DEBUG_CNAME;

  info = 0;

  if (order == CblasColMajor) {
    info = -1;

    if (ldc < MAX(1, m)) info = 8;
    if (lda < MAX(1, m)) info = 5;
    if (n < 0)           info = 2;
    if (m < 0)           info = 1;
  }

  if (order == CblasRowMajor) {
    info = -1;

    /* Swap the dimensions before validating, so the checks and the kernel
       call below operate on the swapped m and n. */
    t = n;
    n = m;
    m = t;

    if (ldc < MAX(1, m)) info = 8;
    if (lda < MAX(1, m)) info = 5;
    if (n < 0)           info = 2;
    if (m < 0)           info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* Quick return for an empty matrix. */
  if ((m==0) || (n==0)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc);

  FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n);

  IDEBUG_END;

  return;
}

View File

@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
FUNCTION_PROFILE_START();
if (incx < 0 ) x -= (n - 1) * incx;
if (incx < 0 ) x -= (n - 1) * incx * 2;
buffer = (FLOAT *)blas_memory_alloc(1);

View File

@ -329,23 +329,27 @@ endif
###### BLAS extensions #####
SBLASOBJS += \
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX)
somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
sgeadd_k$(TSUFFIX).$(SUFFIX)
DBLASOBJS += \
domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX)
domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
dgeadd_k$(TSUFFIX).$(SUFFIX)
CBLASOBJS += \
comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
cgeadd_k$(TSUFFIX).$(SUFFIX)
ZBLASOBJS += \
zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX)
zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
zgeadd_k$(TSUFFIX).$(SUFFIX)
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
@ -3440,3 +3444,31 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
# Single-precision real GEADD kernel. The generic C implementation is used
# unless SGEADD_K has already been defined.
ifndef SGEADD_K
SGEADD_K = ../generic/geadd.c
endif
$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
# Double-precision real GEADD kernel. The generic C implementation is used
# unless DGEADD_K has already been defined.
ifndef DGEADD_K
DGEADD_K = ../generic/geadd.c
endif

# The prerequisite must be DGEADD_K (not SGEADD_K), otherwise an override of
# the double-precision kernel source would be silently ignored.
$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
# Complex GEADD kernels (single precision: cgeadd_k, double precision:
# zgeadd_k). Each uses the generic C implementation unless its *GEADD_K
# variable has already been defined; precision is selected with
# -UDOUBLE / -DDOUBLE and both are compiled with -DCOMPLEX.
ifndef CGEADD_K
CGEADD_K = ../generic/zgeadd.c
endif
$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@
ifndef ZGEADD_K
ZGEADD_K = ../generic/zgeadd.c
endif
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@

View File

@ -0,0 +1 @@
# This target reuses the ARMv7 kernel selection unchanged.
include $(KERNELDIR)/KERNEL.ARMV7

View File

@ -0,0 +1 @@
# This target reuses the ARMv7 kernel selection unchanged.
include $(KERNELDIR)/KERNEL.ARMV7

64
kernel/generic/geadd.c Normal file
View File

@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Generic real GEADD kernel.
 * Walks the matrix one column at a time (columns of a are lda elements
 * apart, columns of b are ldb apart) and hands each column to a level-1
 * kernel: SCAL_K with beta when alpha is zero, AXPBY_K with alpha and beta
 * otherwise. Always returns 0.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb)
{
	BLASLONG col;

	/* Nothing to do for an empty matrix. */
	if ( rows <= 0 || cols <= 0 ) return(0);

	if ( alpha == 0.0 )
	{
		/* a does not take part: only b is passed to SCAL_K, column by column. */
		for ( col = 0; col < cols; col++ )
			SCAL_K(rows, 0,0, beta, b + col * ldb, 1, NULL, 0,NULL,0);

		return(0);
	}

	for ( col = 0; col < cols; col++ )
		AXPBY_K(rows, alpha, a + col * lda, 1, beta, b + col * ldb, 1);

	return(0);
}

65
kernel/generic/zgeadd.c Normal file
View File

@ -0,0 +1,65 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Generic complex GEADD kernel.
 * Walks the matrix one column at a time and hands each column to a level-1
 * kernel: SCAL_K with (betar, betai) when both parts of alpha are zero,
 * AXPBY_K with alpha and beta otherwise. Always returns 0.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alphar, FLOAT alphai, FLOAT *a, BLASLONG lda, FLOAT betar, FLOAT betai , FLOAT *b, BLASLONG ldb)
{
	BLASLONG col;
	BLASLONG a_step, b_step;

	/* Nothing to do for an empty matrix. */
	if ( rows <= 0 || cols <= 0 ) return(0);

	/* Column strides in FLOAT units: the leading dimensions are doubled,
	   presumably because each complex element occupies two FLOATs. */
	a_step = lda * 2;
	b_step = ldb * 2;

	if ( alphar == 0.0 && alphai == 0.0 )
	{
		/* a does not take part: only b is passed to SCAL_K, column by column. */
		for ( col = 0; col < cols; col++ )
			SCAL_K(rows, 0,0, betar, betai, b + col * b_step, 1, NULL, 0,NULL,0);

		return(0);
	}

	for ( col = 0; col < cols; col++ )
		AXPBY_K(rows, alphar, alphai, a + col * a_step, 1, betar, betai, b + col * b_step, 1);

	return(0);
}

View File

@ -548,8 +548,9 @@ gotoblas_t TABLE_NAME = {
comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS
zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS,
sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS
};
@ -941,6 +942,23 @@ static void init_parameter(void) {
#endif
#endif
#ifdef STEAMROLLER
#ifdef DEBUG
fprintf(stderr, "Steamroller\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO
#ifdef DEBUG

View File

@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMINCOPY = zgemm_ncopy_1.S
ZGEMMITCOPY = zgemm_tcopy_1.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S

View File

@ -0,0 +1,88 @@
# Kernel source selection: each variable names the source file used to build
# one routine, and each *OBJ variable names the object file it produces.
# NOTE(review): this list is added as a whole new file in the change that
# introduces the STEAMROLLER target - presumably the Steamroller kernel list;
# confirm the file name against the kernel directory.
#
# Level-1 kernels: axpy (double / complex / double complex) and dot (single / double).
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
# Symmetric matrix-vector kernels, upper (U) and lower (L) variants.
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
SSYMV_L_KERNEL = ssymv_L.c
# General matrix-vector kernels, N and T variants. ZGEMV N is the only one
# taken from an assembly source (.S); the others listed here are C files.
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_4.c
# Double-precision copy reuses the bulldozer-named assembly source.
DCOPYKERNEL = dcopy_bulldozer.S
# SGEMM: piledriver-named 16x2 kernel, generic 16-wide I*COPY sources and
# bulldozer-named 2-wide O*COPY sources.
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: piledriver-named 8x2 kernel, generic 8-wide I*COPY sources and
# bulldozer-named 2-wide O*COPY sources.
DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: piledriver-named 4x2 kernel with generic complex copy sources
# (4-wide I*COPY, 2-wide O*COPY).
CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: piledriver-named 2x2 kernel. The I*COPY sources and objects are left
# empty, so only the 2-wide O*COPY routines are named for ZGEMM.
# NOTE(review): presumably the square 2x2 kernel lets the build reuse the
# O*COPY routines for both operands - confirm against the kernel Makefile.
ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# GEMM3M kernels reuse the barcelona-named assembly sources.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
# TRSM kernels: generic C sources throughout, except DTRSM LT and RN, which use
# the bulldozer-named 8x2 assembly sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "caxpy_microk_bulldozer-2.c"
#endif

View File

@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 768(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %8 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %6 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 320(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %5 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
"cmpq $0 , %1 \n\t"
"je .L01END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"2: \n\t"
"cmpq $4, %6 \n\t"
"jne .L02END%= \n\t"
"jne 3f \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
".L02END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:

View File

@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L08END%=: \n\t"
"3: \n\t"
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
"prefetcht0 192(%5,%0,4) \n\t"
@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L08END%=: \n\t"
"3: \n\t"
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"addq $8 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L08END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,4) \n\t"
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"addq $16 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L08END%=: \n\t"
"3: \n\t"
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "daxpy_microk_nehalem-2.c"
#elif defined(BULLDOZER)
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "daxpy_microk_bulldozer-2.c"
#endif

View File

@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmovddup (%4), %%xmm0 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 768(%3,%0,8) \n\t"
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"shufpd $0, %%xmm0, %%xmm0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 192(%2,%0,8) \n\t"
// "prefetcht0 192(%3,%0,8) \n\t"
@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "ddot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"

View File

@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
"vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
"vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"

View File

@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xorpd %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
"movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"addpd %%xmm5, %%xmm4 \n\t"
"addpd %%xmm7, %%xmm6 \n\t"

View File

@ -1092,18 +1092,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
vxorpd %xmm4 , %xmm4 , %xmm4
vxorpd %xmm5 , %xmm5 , %xmm5
vxorpd %ymm4 , %ymm4 , %ymm4
vxorpd %ymm5 , %ymm5 , %ymm5
vxorpd %ymm6 , %ymm6 , %ymm6
vxorpd %ymm7 , %ymm7 , %ymm7
.endm
.macro KERNEL4x1
vbroadcastsd -12 * SIZE(BO), %ymm0
vbroadcastsd -11 * SIZE(BO), %ymm1
vbroadcastsd -10 * SIZE(BO), %ymm2
vbroadcastsd -9 * SIZE(BO), %ymm3
vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
vbroadcastsd -8 * SIZE(BO), %ymm0
vbroadcastsd -7 * SIZE(BO), %ymm1
vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
vbroadcastsd -6 * SIZE(BO), %ymm2
vbroadcastsd -5 * SIZE(BO), %ymm3
vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
addq $ 8 *SIZE, BO
addq $ 32*SIZE, AO
.endm
.macro KERNEL4x1_SUB
vmovddup -12 * SIZE(BO), %xmm2
vmovups -16 * SIZE(AO), %xmm0
vmovups -14 * SIZE(AO), %xmm1
vfmadd231pd %xmm0 ,%xmm2 , %xmm4
vfmadd231pd %xmm1 ,%xmm2 , %xmm5
vbroadcastsd -12 * SIZE(BO), %ymm2
vmovups -16 * SIZE(AO), %ymm0
vfmadd231pd %ymm0 ,%ymm2 , %ymm4
addq $ 1*SIZE, BO
addq $ 4*SIZE, AO
@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro SAVE4x1
vmovddup ALPHA, %xmm0
vbroadcastsd ALPHA, %ymm0
vmulpd %xmm0 , %xmm4 , %xmm4
vmulpd %xmm0 , %xmm5 , %xmm5
vaddpd %ymm4,%ymm5, %ymm4
vaddpd %ymm6,%ymm7, %ymm6
vaddpd %ymm4,%ymm6, %ymm4
vmulpd %ymm0 , %ymm4 , %ymm4
#if !defined(TRMMKERNEL)
vaddpd (CO1) , %xmm4, %xmm4
vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
vaddpd (CO1) , %ymm4, %ymm4
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm5 , 2 * SIZE(CO1)
vmovups %ymm4 , (CO1)
addq $ 4*SIZE, CO1
.endm
@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1
dec %rax
jne .L1_12
@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L1_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1
dec %rax
jne .L1_12

View File

@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NEHALEM)
#include "dgemv_n_microk_nehalem-4.c"
#elif defined(HASWELL)
#elif defined(HASWELL) || defined(STEAMROLLER)
#include "dgemv_n_microk_haswell-4.c"
#endif
@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufpd $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"shufpd $0, %%xmm12, %%xmm12 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"jz 2f \n\t"
"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:
@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastsd (%8), %%ymm6 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L8LABEL%= \n\t"
"jz 2f \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L8LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L8END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L8END%=: \n\t"
"3: \n\t"
"vzeroupper \n\t"
:

View File

@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufpd $0, %%xmm6 , %%xmm6 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"xorpd %%xmm4 , %%xmm4 \n\t"
"xorpd %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_4x8 1
/*
 * dgemv_kernel_4x8 - AVX/FMA micro-kernel updating y with eight columns of A.
 *
 * For every row index r in [0, n):
 *
 *   y[r] += alpha[0] * ( ap[0][r]      * x[0] + ap[1][r]      * x[1]
 *                      + ap[2][r]      * x[2] + ap[3][r]      * x[3]
 *                      + ap[0][lda4+r] * x[4] + ap[1][lda4+r] * x[5]
 *                      + ap[2][lda4+r] * x[6] + ap[3][lda4+r] * x[7] )
 *
 * Parameters:
 *   n     - number of rows to process.  The code handles one optional group
 *           of 4 rows (when bit 2 of n is set) and then groups of 8 until the
 *           counter reaches exactly zero, so n must be a non-negative
 *           multiple of 4.
 *   ap    - ap[0..3] point at the first four columns; columns 4..7 are
 *           addressed as ap[j] + lda4 elements.
 *   x     - eight multipliers x[0..7], each broadcast to a full vector.
 *   y     - n values, updated in place.
 *   lda4  - element offset from ap[j] to the matching column j+4.
 *   alpha - pointer to the scalar applied to the accumulated products.
 *
 * The asm addresses elements with a scale of 8 and uses packed-double
 * instructions, i.e. it assumes FLOAT is an 8-byte double.
 *
 * i, n and lda4 are advanced inside the asm, so they are declared as
 * read-write ("+r") operands.  Declaring them input-only, as before, lets the
 * compiler assume the registers still hold the original values afterwards.
 * n and lda4 are by-value parameters, so the caller sees no change.
 */
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"vzeroupper                                        \n\t"
	"vbroadcastsd    (%[x]), %%ymm12                   \n\t"	// x0
	"vbroadcastsd   8(%[x]), %%ymm13                   \n\t"	// x1
	"vbroadcastsd  16(%[x]), %%ymm14                   \n\t"	// x2
	"vbroadcastsd  24(%[x]), %%ymm15                   \n\t"	// x3
	"vbroadcastsd  32(%[x]), %%ymm0                    \n\t"	// x4
	"vbroadcastsd  40(%[x]), %%ymm1                    \n\t"	// x5
	"vbroadcastsd  48(%[x]), %%ymm2                    \n\t"	// x6
	"vbroadcastsd  56(%[x]), %%ymm3                    \n\t"	// x7
	"vbroadcastsd    (%[alpha]), %%ymm6                \n\t"	// alpha

	// Optional leading group of 4 rows, taken when bit 2 of n is set.
	"testq  $0x04, %[n]                                \n\t"
	"jz     2f                                         \n\t"

	"vmovupd        (%[y],%[i],8), %%ymm7              \n\t"	// 4 * y
	"vxorpd         %%ymm4 , %%ymm4, %%ymm4            \n\t"
	"vxorpd         %%ymm5 , %%ymm5, %%ymm5            \n\t"

	"vfmadd231pd   (%[a0],%[i],8), %%ymm12, %%ymm4     \n\t"
	"vfmadd231pd   (%[a1],%[i],8), %%ymm13, %%ymm5     \n\t"
	"vfmadd231pd   (%[a2],%[i],8), %%ymm14, %%ymm4     \n\t"
	"vfmadd231pd   (%[a3],%[i],8), %%ymm15, %%ymm5     \n\t"

	"vfmadd231pd   (%[a0],%[lda4],8), %%ymm0 , %%ymm4  \n\t"
	"vfmadd231pd   (%[a1],%[lda4],8), %%ymm1 , %%ymm5  \n\t"
	"vfmadd231pd   (%[a2],%[lda4],8), %%ymm2 , %%ymm4  \n\t"
	"vfmadd231pd   (%[a3],%[lda4],8), %%ymm3 , %%ymm5  \n\t"

	"vaddpd         %%ymm4 , %%ymm5 , %%ymm5           \n\t"
	"vmulpd         %%ymm6 , %%ymm5 , %%ymm5           \n\t"
	"vaddpd         %%ymm7 , %%ymm5 , %%ymm5           \n\t"

	"vmovupd        %%ymm5, (%[y],%[i],8)              \n\t"	// 4 * y

	"addq   $4 , %[lda4]                               \n\t"
	"addq   $4 , %[i]                                  \n\t"
	"subq   $4 , %[n]                                  \n\t"

	"2:                                                \n\t"

	"cmpq   $0, %[n]                                   \n\t"
	"je     3f                                         \n\t"

	// Main loop: 8 rows per iteration.
	".align 16                                         \n\t"
	"1:                                                \n\t"

	"vxorpd         %%ymm4 , %%ymm4, %%ymm4            \n\t"
	"vxorpd         %%ymm5 , %%ymm5, %%ymm5            \n\t"
	"vmovupd        (%[y],%[i],8), %%ymm8              \n\t"	// 4 * y
	"vmovupd      32(%[y],%[i],8), %%ymm9              \n\t"	// 4 * y

	"vfmadd231pd   (%[a0],%[i],8), %%ymm12, %%ymm4     \n\t"
	"vfmadd231pd 32(%[a0],%[i],8), %%ymm12, %%ymm5     \n\t"
	"vfmadd231pd   (%[a1],%[i],8), %%ymm13, %%ymm4     \n\t"
	"vfmadd231pd 32(%[a1],%[i],8), %%ymm13, %%ymm5     \n\t"
	"vfmadd231pd   (%[a2],%[i],8), %%ymm14, %%ymm4     \n\t"
	"vfmadd231pd 32(%[a2],%[i],8), %%ymm14, %%ymm5     \n\t"
	"vfmadd231pd   (%[a3],%[i],8), %%ymm15, %%ymm4     \n\t"
	"vfmadd231pd 32(%[a3],%[i],8), %%ymm15, %%ymm5     \n\t"

	"vfmadd231pd   (%[a0],%[lda4],8), %%ymm0 , %%ymm4  \n\t"
	// i is advanced here, so the y stores below use -64/-32 offsets.
	"addq   $8 , %[i]                                  \n\t"
	"vfmadd231pd 32(%[a0],%[lda4],8), %%ymm0 , %%ymm5  \n\t"
	"vfmadd231pd   (%[a1],%[lda4],8), %%ymm1 , %%ymm4  \n\t"
	"vfmadd231pd 32(%[a1],%[lda4],8), %%ymm1 , %%ymm5  \n\t"
	"vfmadd231pd   (%[a2],%[lda4],8), %%ymm2 , %%ymm4  \n\t"
	"vfmadd231pd 32(%[a2],%[lda4],8), %%ymm2 , %%ymm5  \n\t"
	"vfmadd231pd   (%[a3],%[lda4],8), %%ymm3 , %%ymm4  \n\t"
	"vfmadd231pd 32(%[a3],%[lda4],8), %%ymm3 , %%ymm5  \n\t"

	"vfmadd231pd    %%ymm6 , %%ymm4 , %%ymm8           \n\t"
	"vfmadd231pd    %%ymm6 , %%ymm5 , %%ymm9           \n\t"

	"addq   $8 , %[lda4]                               \n\t"
	"vmovupd        %%ymm8,-64(%[y],%[i],8)            \n\t"	// 4 * y
	// The moves do not touch the flags, so jnz tests the result of this subq.
	"subq   $8 , %[n]                                  \n\t"
	"vmovupd        %%ymm9,-32(%[y],%[i],8)            \n\t"	// 4 * y

	"jnz    1b                                         \n\t"

	"3:                                                \n\t"
	"vzeroupper                                        \n\t"

	:
	  [i]     "+r" (i),		// row index, advanced by the asm
	  [n]     "+r" (n),		// remaining rows, counted down to 0
	  [lda4]  "+r" (lda4)		// offset to columns 4..7, advanced with i
	:
	  [x]     "r" (x),
	  [y]     "r" (y),
	  [a0]    "r" (ap[0]),
	  [a1]    "r" (ap[1]),
	  [a2]    "r" (ap[2]),
	  [a3]    "r" (ap[3]),
	  [alpha] "r" (alpha)
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm2", "%xmm3",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}
#define HAVE_KERNEL_4x4 1
/*
 * dgemv_kernel_4x4 - AVX/FMA micro-kernel updating y with four columns of A.
 *
 * For every row index r in [0, n):
 *
 *   y[r] += alpha[0] * ( ap[0][r] * x[0] + ap[1][r] * x[1]
 *                      + ap[2][r] * x[2] + ap[3][r] * x[3] )
 *
 * Parameters:
 *   n     - number of rows to process.  The code handles one optional group
 *           of 4 rows (when bit 2 of n is set) and then groups of 8 until the
 *           counter reaches exactly zero, so n must be a non-negative
 *           multiple of 4.
 *   ap    - ap[0..3] point at the four columns.
 *   x     - four multipliers x[0..3], each broadcast to a full vector.
 *   y     - n values, updated in place.
 *   alpha - pointer to the scalar applied to the accumulated products.
 *
 * The asm addresses elements with a scale of 8 and uses packed-double
 * instructions, i.e. it assumes FLOAT is an 8-byte double.
 *
 * i and n are advanced inside the asm, so they are declared as read-write
 * ("+r") operands.  Declaring them input-only, as before, lets the compiler
 * assume the registers still hold the original values afterwards.  n is a
 * by-value parameter, so the caller sees no change.
 */
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	BLASLONG register i = 0;

	__asm__  __volatile__
	(
	"vzeroupper                                      \n\t"
	"vbroadcastsd    (%[x]), %%ymm12                 \n\t"	// x0
	"vbroadcastsd   8(%[x]), %%ymm13                 \n\t"	// x1
	"vbroadcastsd  16(%[x]), %%ymm14                 \n\t"	// x2
	"vbroadcastsd  24(%[x]), %%ymm15                 \n\t"	// x3
	"vbroadcastsd    (%[alpha]), %%ymm6              \n\t"	// alpha

	// Optional leading group of 4 rows, taken when bit 2 of n is set.
	"testq  $0x04, %[n]                              \n\t"
	"jz     2f                                       \n\t"

	"vxorpd         %%ymm4 , %%ymm4, %%ymm4          \n\t"
	"vxorpd         %%ymm5 , %%ymm5, %%ymm5          \n\t"
	"vmovupd        (%[y],%[i],8), %%ymm7            \n\t"	// 4 * y

	"vfmadd231pd   (%[a0],%[i],8), %%ymm12, %%ymm4   \n\t"
	"vfmadd231pd   (%[a1],%[i],8), %%ymm13, %%ymm5   \n\t"
	"vfmadd231pd   (%[a2],%[i],8), %%ymm14, %%ymm4   \n\t"
	"vfmadd231pd   (%[a3],%[i],8), %%ymm15, %%ymm5   \n\t"

	"vaddpd         %%ymm4 , %%ymm5 , %%ymm5         \n\t"
	"vmulpd         %%ymm6 , %%ymm5 , %%ymm5         \n\t"
	"vaddpd         %%ymm7 , %%ymm5 , %%ymm5         \n\t"

	"vmovupd        %%ymm5, (%[y],%[i],8)            \n\t"	// 4 * y

	"addq   $4 , %[i]                                \n\t"
	"subq   $4 , %[n]                                \n\t"

	"2:                                              \n\t"

	"cmpq   $0, %[n]                                 \n\t"
	"je     3f                                       \n\t"

	// Main loop: 8 rows per iteration.
	".align 16                                       \n\t"
	"1:                                              \n\t"

	"vxorpd         %%ymm4 , %%ymm4, %%ymm4          \n\t"
	"vxorpd         %%ymm5 , %%ymm5, %%ymm5          \n\t"
	"vmovupd        (%[y],%[i],8), %%ymm8            \n\t"	// 4 * y
	"vmovupd      32(%[y],%[i],8), %%ymm9            \n\t"	// 4 * y

	"vfmadd231pd   (%[a0],%[i],8), %%ymm12, %%ymm4   \n\t"
	"vfmadd231pd 32(%[a0],%[i],8), %%ymm12, %%ymm5   \n\t"
	"vfmadd231pd   (%[a1],%[i],8), %%ymm13, %%ymm4   \n\t"
	"vfmadd231pd 32(%[a1],%[i],8), %%ymm13, %%ymm5   \n\t"
	"vfmadd231pd   (%[a2],%[i],8), %%ymm14, %%ymm4   \n\t"
	"vfmadd231pd 32(%[a2],%[i],8), %%ymm14, %%ymm5   \n\t"
	"vfmadd231pd   (%[a3],%[i],8), %%ymm15, %%ymm4   \n\t"
	"vfmadd231pd 32(%[a3],%[i],8), %%ymm15, %%ymm5   \n\t"

	"vfmadd231pd    %%ymm6 , %%ymm4 , %%ymm8         \n\t"
	"vfmadd231pd    %%ymm6 , %%ymm5 , %%ymm9         \n\t"

	"vmovupd        %%ymm8,   (%[y],%[i],8)          \n\t"	// 4 * y
	"vmovupd        %%ymm9, 32(%[y],%[i],8)          \n\t"	// 4 * y

	"addq   $8 , %[i]                                \n\t"
	"subq   $8 , %[n]                                \n\t"
	"jnz    1b                                       \n\t"

	"3:                                              \n\t"
	"vzeroupper                                      \n\t"

	:
	  [i]     "+r" (i),		// row index, advanced by the asm
	  [n]     "+r" (n)		// remaining rows, counted down to 0
	:
	  [x]     "r" (x),
	  [y]     "r" (y),
	  [a0]    "r" (ap[0]),
	  [a1]    "r" (ap[1]),
	  [a2]    "r" (ap[2]),
	  [a3]    "r" (ap[3]),
	  [alpha] "r" (alpha)
	: "cc",
	  "%xmm4", "%xmm5",
	  "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(HASWELL)
#if defined(HASWELL) || defined(STEAMROLLER)
#include "dgemv_t_microk_haswell-4.c"
#endif
@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"xorpd %%xmm11 , %%xmm11 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"jz 2f \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"subq $2 , %1 \n\t"
"addpd %%xmm13 , %%xmm11 \n\t"
".L01LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%5,%0,8) , %%xmm14 \n\t" // x
"movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"3: \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
"haddpd %%xmm11, %%xmm11 \n\t"
@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"xorpd %%xmm10 , %%xmm10 \n\t"
"testq $2 , %1 \n\t"
"jz .L01LABEL%= \n\t"
"jz 2f \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"addpd %%xmm12 , %%xmm10 \n\t"
"subq $2 , %1 \n\t"
".L01LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L01END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups 16(%3,%0,8) , %%xmm14 \n\t"
@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"subq $4 , %1 \n\t"
"addpd %%xmm14 , %%xmm9 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L01END%=: \n\t"
"3: \n\t"
"addpd %%xmm9 , %%xmm10 \n\t"
"haddpd %%xmm10, %%xmm10 \n\t"
@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,8) , %%xmm12 \n\t"
"movups (%4,%0,8) , %%xmm11 \n\t"
@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"subq $2 , %1 \n\t"
"movups %%xmm11, -16(%4,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"2: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 3f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 384(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $8 , %1 \n\t"
"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"3: \n\t"
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "dsymv_L_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "dsymv_L_microk_nehalem-2.c"

View File

@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vmovsd (%9), %%xmm4 \n\t"
"vmovsd 8(%9), %%xmm5 \n\t"

View File

@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"shufpd $0, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"movups %%xmm12 , %%xmm11 \n\t"
@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
"cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"movsd (%9), %%xmm4 \n\t" // temp1[0]
"movsd 8(%9), %%xmm5 \n\t" // temp1[1]

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "dsymv_U_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "dsymv_U_microk_nehalem-2.c"

View File

@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovups %%xmm9 , -32(%3,%0,8) \n\t"
"vmovups %%xmm11 , -16(%3,%0,8) \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
"vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"

View File

@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
"movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
"movups %%xmm12 , %%xmm11 \n\t"
@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
"subq $2 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"haddpd %%xmm0, %%xmm0 \n\t"
"haddpd %%xmm1, %%xmm1 \n\t"

View File

@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"shufps $0, %%xmm0, %%xmm0 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 192(%2,%0,4) \n\t"
// "prefetcht0 192(%3,%0,4) \n\t"
@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "sdot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"

View File

@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
"vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"

View File

@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"xorps %%xmm7, %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
"movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"addps %%xmm5, %%xmm4 \n\t"
"addps %%xmm7, %%xmm6 \n\t"

View File

@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
@ -39,8 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemv_n_microk_haswell-4.c"
#endif
#if defined(STEAMROLLER)
#define NBMAX 2048
#else
#define NBMAX 4096
#endif
#ifndef HAVE_KERNEL_4x8
@ -129,7 +132,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"shufps $0, %%xmm13, %%xmm13 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t"
@ -143,7 +146,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:
@ -166,7 +169,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
#endif
#ifndef HAVE_KERNEL_4x2
#ifndef HAVE_KERNEL_4x1
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@ -184,10 +187,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"shufps $0, %%xmm12, %%xmm12 \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 2f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@ -203,12 +206,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"2: \n\t"
"testq $0x04, %5 \n\t"
"jz .L08LABEL%= \n\t"
"jz 3f \n\t"
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@ -218,7 +221,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
".L08LABEL%=: \n\t"
"3: \n\t"
:
:
"r" (i), // 0
@ -262,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
(
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4) , %%xmm12 \n\t"
"movups (%3,%0,4) , %%xmm11 \n\t"
@ -271,7 +274,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"movups %%xmm11, -16(%3,%0,4) \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

View File

@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
"testq $0x04, %1 \n\t"
"jz .L08LABEL%= \n\t"
"jz 2f \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
".L08LABEL%=: \n\t"
"2: \n\t"
"testq $0x08, %1 \n\t"
"jz .L16LABEL%= \n\t"
"jz 3f \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"subq $8 , %1 \n\t"
".L16LABEL%=: \n\t"
"3: \n\t"
"cmpq $0, %1 \n\t"
"je .L16END%= \n\t"
"je 4f \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
".L16END%=: \n\t"
"4: \n\t"
:
:
@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%xmm8 \n\t" // alpha
".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
:
:

Some files were not shown because too many files have changed in this diff Show More