Merge branch 'develop'
This commit is contained in:
		
						commit
						e5ac3007e0
					
				|  | @ -1,4 +1,22 @@ | |||
| OpenBLAS ChangeLog | ||||
| ==================================================================== | ||||
| Version 0.2.6 | ||||
| 2-Mar-2013 | ||||
| common: | ||||
| 	* Improved OpenMP performance slightly. (d744c9) | ||||
| 	* Improved cblas.h compatibility with Intel MKL. (#185) | ||||
| 	* Fixed the overflow bug in single-threaded Cholesky factorization. | ||||
| 	* Fixed the buffer overflow bug in multithreaded hbmv and sbmv. (#174) | ||||
| 
 | ||||
| x86/x86-64: | ||||
| 	* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar) | ||||
| 	  We will tune the performance in the future. | ||||
| 	* Auto-detect Intel Xeon E7540. | ||||
| 	* Fixed the buffer overflow bug in gemv. (#173) | ||||
| 	* Fixed the s/cdot bug involving invalid reading of NaN on x86_64. (#189) | ||||
| 
 | ||||
| MIPS64: | ||||
| 
 | ||||
| ==================================================================== | ||||
| Version 0.2.5 | ||||
| 26-Nov-2012 | ||||
|  |  | |||
							
								
								
									
										2
									
								
								Makefile
								
								
								
								
							
							
						
						
									
										2
									
								
								Makefile
								
								
								
								
							|  | @ -314,7 +314,7 @@ clean :: | |||
| #endif
 | ||||
| 	@$(MAKE) -C reference clean | ||||
| 	@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | ||||
| 	@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | ||||
| 	@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib | ||||
| 	@if test -d $(NETLIB_LAPACK_DIR); then \
 | ||||
| 	echo deleting $(NETLIB_LAPACK_DIR); \
 | ||||
| 	rm -rf $(NETLIB_LAPACK_DIR) ;\
 | ||||
|  |  | |||
|  | @ -1,3 +1,5 @@ | |||
| # This is triggered by Makefile.system and runs before any of the code is built.
 | ||||
| 
 | ||||
| export BINARY | ||||
| export USE_OPENMP | ||||
| 
 | ||||
|  | @ -15,7 +17,7 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | ||||
| endif | ||||
| 
 | ||||
| all: getarch_2nd | ||||
| all: getarch_2nd cblas_noconst.h | ||||
| 	./getarch_2nd  0 >> $(TARGET_MAKE) | ||||
| 	./getarch_2nd  1 >> $(TARGET_CONF) | ||||
| 
 | ||||
|  | @ -36,4 +38,7 @@ else | |||
| 	$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c | ||||
| endif | ||||
| 
 | ||||
| cblas_noconst.h : cblas.h | ||||
| 	perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h	 | ||||
| 
 | ||||
| dummy: | ||||
|  | @ -3,7 +3,7 @@ | |||
| #
 | ||||
| 
 | ||||
| # This library's version
 | ||||
| VERSION = 0.2.5 | ||||
| VERSION = 0.2.6 | ||||
| 
 | ||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library 
 | ||||
|  |  | |||
|  | @ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE | |||
| export GOTOBLAS_MAKEFILE = 1 | ||||
| 
 | ||||
| # Generating Makefile.conf and config.h
 | ||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | ||||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | ||||
| 
 | ||||
| ifndef TARGET_CORE | ||||
| include $(TOPDIR)/Makefile.conf | ||||
|  | @ -277,14 +277,14 @@ ifeq ($(ARCH), x86) | |||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 | ||||
| 	       CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE  | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
| ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE  | ||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
|  |  | |||
|  | @ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt | |||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | ||||
| - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | ||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | ||||
| - **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. | ||||
| - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar) | ||||
| 
 | ||||
| #### MIPS64: | ||||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | ||||
|  |  | |||
|  | @ -29,6 +29,7 @@ BARCELONA | |||
| SHANGHAI | ||||
| ISTANBUL | ||||
| BOBCAT | ||||
| BULLDOZER | ||||
| 
 | ||||
| c)VIA CPU: | ||||
| SSE_GENERIC | ||||
|  |  | |||
							
								
								
									
										448
									
								
								cblas.h
								
								
								
								
							
							
						
						
									
										448
									
								
								cblas.h
								
								
								
								
							|  | @ -1,291 +1,293 @@ | |||
| #ifndef CBLAS_H | ||||
| #define CBLAS_H | ||||
| 
 | ||||
| #include <stddef.h> | ||||
| #include "common.h" | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| extern "C" { | ||||
| 	/* Assume C declarations for C++ */ | ||||
| #endif  /* __cplusplus */ | ||||
| 
 | ||||
| #include <stddef.h> | ||||
| #include "common.h" | ||||
| 
 | ||||
| /*Set the number of threads at runtime.*/ | ||||
| void openblas_set_num_threads(int num_threads); | ||||
| void goto_set_num_threads(int num_threads); | ||||
| 
 | ||||
| /*Get the build configuration at runtime.*/ | ||||
| char* openblas_get_config(void); | ||||
| 
 | ||||
| #define CBLAS_INDEX size_t | ||||
| 
 | ||||
| enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102}; | ||||
| enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; | ||||
| enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122}; | ||||
| enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132}; | ||||
| enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142}; | ||||
| typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | ||||
| typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; | ||||
| typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO; | ||||
| typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; | ||||
| typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE; | ||||
| 
 | ||||
| float  cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); | ||||
| double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); | ||||
| float  cblas_sdot(blasint n, float  *x, blasint incx, float  *y, blasint incy); | ||||
| double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| float  cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); | ||||
| double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | ||||
| float  cblas_sdot(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy); | ||||
| double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | ||||
| 
 | ||||
| openblas_complex_float  cblas_cdotu(blasint n, float  *x, blasint incx, float  *y, blasint incy); | ||||
| openblas_complex_float  cblas_cdotc(blasint n, float  *x, blasint incx, float  *y, blasint incy); | ||||
| openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| openblas_complex_float  cblas_cdotu(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy); | ||||
| openblas_complex_float  cblas_cdotc(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy); | ||||
| openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | ||||
| openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | ||||
| 
 | ||||
| void  cblas_cdotu_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret); | ||||
| void  cblas_cdotc_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret); | ||||
| void  cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); | ||||
| void  cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); | ||||
| void  cblas_cdotu_sub(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy, openblas_complex_float  *ret); | ||||
| void  cblas_cdotc_sub(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy, openblas_complex_float  *ret); | ||||
| void  cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | ||||
| void  cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | ||||
| 
 | ||||
| float  cblas_sasum (blasint n, float  *x, blasint incx); | ||||
| double cblas_dasum (blasint n, double *x, blasint incx); | ||||
| float  cblas_scasum(blasint n, float  *x, blasint incx); | ||||
| double cblas_dzasum(blasint n, double *x, blasint incx); | ||||
| float  cblas_sasum (const blasint n, const float  *x, const blasint incx); | ||||
| double cblas_dasum (const blasint n, const double *x, const blasint incx); | ||||
| float  cblas_scasum(const blasint n, const float  *x, const blasint incx); | ||||
| double cblas_dzasum(const blasint n, const double *x, const blasint incx); | ||||
| 
 | ||||
| float  cblas_snrm2 (blasint N, float  *X, blasint incX); | ||||
| double cblas_dnrm2 (blasint N, double *X, blasint incX); | ||||
| float  cblas_scnrm2(blasint N, float  *X, blasint incX); | ||||
| double cblas_dznrm2(blasint N, double *X, blasint incX); | ||||
| float  cblas_snrm2 (const blasint N, const float  *X, const blasint incX); | ||||
| double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); | ||||
| float  cblas_scnrm2(const blasint N, const float  *X, const blasint incX); | ||||
| double cblas_dznrm2(const blasint N, const double *X, const blasint incX); | ||||
| 
 | ||||
| CBLAS_INDEX cblas_isamax(blasint n, float  *x, blasint incx); | ||||
| CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); | ||||
| CBLAS_INDEX cblas_icamax(blasint n, float  *x, blasint incx); | ||||
| CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); | ||||
| CBLAS_INDEX cblas_isamax(const blasint n, const float  *x, const blasint incx); | ||||
| CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); | ||||
| CBLAS_INDEX cblas_icamax(const blasint n, const float  *x, const blasint incx); | ||||
| CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); | ||||
| 
 | ||||
| void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); | ||||
| void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); | ||||
| void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); | ||||
| void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); | ||||
| void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); | ||||
| void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); | ||||
| void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); | ||||
| void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); | ||||
| 
 | ||||
| void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); | ||||
| void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); | ||||
| void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | ||||
| void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | ||||
| void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | ||||
| void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | ||||
| 
 | ||||
| void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); | ||||
| void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); | ||||
| void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); | ||||
| void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | ||||
| void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | ||||
| void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | ||||
| void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | ||||
| 
 | ||||
| void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); | ||||
| void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double  s); | ||||
| void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); | ||||
| void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double  s); | ||||
| 
 | ||||
| void cblas_srotg(float *a, float *b, float *c, float *s); | ||||
| void cblas_drotg(double *a, double *b, double *c, double *s); | ||||
| 
 | ||||
| void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); | ||||
| void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); | ||||
| void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); | ||||
| void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); | ||||
| 
 | ||||
| void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); | ||||
| void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); | ||||
| void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); | ||||
| void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); | ||||
| 
 | ||||
| void cblas_sscal(blasint N, float alpha, float *X, blasint incX); | ||||
| void cblas_dscal(blasint N, double alpha, double *X, blasint incX); | ||||
| void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); | ||||
| void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); | ||||
| void cblas_csscal(blasint N, float alpha, float *X, blasint incX); | ||||
| void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); | ||||
| void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); | ||||
| void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); | ||||
| void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); | ||||
| void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); | ||||
| void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); | ||||
| void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_sgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n, | ||||
| 		 float alpha, float  *a, blasint lda,  float  *x, blasint incx,  float beta,  float  *y, blasint incy); | ||||
| void cblas_dgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n, | ||||
| 		 double alpha, double  *a, blasint lda,  double  *x, blasint incx,  double beta,  double  *y, blasint incy); | ||||
| void cblas_cgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n, | ||||
| 		 float *alpha, float  *a, blasint lda,  float  *x, blasint incx,  float *beta,  float  *y, blasint incy); | ||||
| void cblas_zgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n, | ||||
| 		 double *alpha, double  *a, blasint lda,  double  *x, blasint incx,  double *beta,  double  *y, blasint incy); | ||||
| void cblas_sgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n, | ||||
| 		 const float alpha, const float  *a, const blasint lda,  const float  *x, const blasint incx,  const float beta,  float  *y, const blasint incy); | ||||
| void cblas_dgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n, | ||||
| 		 const double alpha, const double  *a, const blasint lda,  const double  *x, const blasint incx,  const double beta,  double  *y, const blasint incy); | ||||
| void cblas_cgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n, | ||||
| 		 const float *alpha, const float  *a, const blasint lda,  const float  *x, const blasint incx,  const float *beta,  float  *y, const blasint incy); | ||||
| void cblas_zgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n, | ||||
| 		 const double *alpha, const double  *a, const blasint lda,  const double  *x, const blasint incx,  const double *beta,  double  *y, const blasint incy); | ||||
| 
 | ||||
| void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float   alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda); | ||||
| void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double  alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | ||||
| void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda); | ||||
| void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda); | ||||
| void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | ||||
| void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | ||||
| void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float   alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda); | ||||
| void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double  alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | ||||
| void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float  *alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda); | ||||
| void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float  *alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda); | ||||
| void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | ||||
| void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | ||||
| 
 | ||||
| void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | ||||
| void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | ||||
| void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | ||||
| void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | ||||
| void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | ||||
| void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | ||||
| void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | ||||
| void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | ||||
| 
 | ||||
| void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, | ||||
|                 blasint incX, float *Y, blasint incY, float *A, blasint lda); | ||||
| void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, | ||||
|                 blasint incX, double *Y, blasint incY, double *A, blasint lda); | ||||
| void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, | ||||
|                 float *Y, blasint incY, float *A, blasint lda); | ||||
| void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, | ||||
|                 double *Y, blasint incY, double *A, blasint lda); | ||||
| void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, | ||||
|                 const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | ||||
| void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, | ||||
|                 const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | ||||
| void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, | ||||
|                 const float *Y, const blasint incY, float *A, const blasint lda); | ||||
| void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, | ||||
|                 const double *Y, const blasint incY, double *A, const blasint lda); | ||||
| 
 | ||||
| void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | ||||
|                  blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | ||||
| void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | ||||
|                  blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | ||||
| void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | ||||
|                  blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | ||||
| void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | ||||
|                  blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | ||||
| void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | ||||
|                  const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | ||||
| void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | ||||
|                  const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | ||||
| void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | ||||
|                  const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | ||||
| void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | ||||
|                  const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | ||||
| 
 | ||||
| void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, | ||||
|                  blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | ||||
| void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, | ||||
|                  blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | ||||
| void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, | ||||
|                  const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | ||||
| void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, | ||||
|                  const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | ||||
| 
 | ||||
| 
 | ||||
| void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | ||||
| void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | ||||
| void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | ||||
| void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, float *Ap, float *X, blasint incX); | ||||
| void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, double *Ap, double *X, blasint incX); | ||||
| void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, float *Ap, float *X, blasint incX); | ||||
| void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, double *Ap, double *X, blasint incX); | ||||
| void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const float *Ap, float *X, const blasint incX); | ||||
| void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const double *Ap, double *X, const blasint incX); | ||||
| void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const float *Ap, float *X, const blasint incX); | ||||
| void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const double *Ap, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, float *Ap, float *X, blasint incX); | ||||
| void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, double *Ap, double *X, blasint incX); | ||||
| void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, float *Ap, float *X, blasint incX); | ||||
| void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | ||||
|                  blasint N, double *Ap, double *X, blasint incX); | ||||
| void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const float *Ap, float *X, const blasint incX); | ||||
| void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const double *Ap, double *X, const blasint incX); | ||||
| void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const float *Ap, float *X, const blasint incX); | ||||
| void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | ||||
|                  const blasint N, const double *Ap, double *X, const blasint incX); | ||||
| 
 | ||||
| void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, | ||||
|                  blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | ||||
| void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, | ||||
|                  blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | ||||
| void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, | ||||
|                  blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | ||||
| void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, | ||||
|                  blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | ||||
| void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, | ||||
|                  const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | ||||
| void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, | ||||
|                  const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | ||||
| void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, | ||||
|                  const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | ||||
| void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, | ||||
|                  const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | ||||
| 
 | ||||
| 
 | ||||
| void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, | ||||
|                  float *X, blasint incX, float beta, float *Y, blasint incY); | ||||
| void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, | ||||
|                  double *X, blasint incX, double beta, double *Y, blasint incY); | ||||
| void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, | ||||
|                  const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | ||||
| void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, | ||||
|                  const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | ||||
| 
 | ||||
| void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); | ||||
| void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); | ||||
| void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); | ||||
| void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); | ||||
| 
 | ||||
| void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); | ||||
| void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); | ||||
| void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); | ||||
| void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); | ||||
| 
 | ||||
| void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); | ||||
| void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); | ||||
| void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); | ||||
| void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); | ||||
| void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); | ||||
| void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); | ||||
| void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); | ||||
| void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); | ||||
| 
 | ||||
| void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | ||||
| 		 float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | ||||
| void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | ||||
| 		 double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | ||||
| void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | ||||
| 		 const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | ||||
| void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | ||||
| 		 const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | ||||
| 
 | ||||
| void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | ||||
| 		 float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); | ||||
| void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | ||||
| 		 double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); | ||||
| void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | ||||
| 		 const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | ||||
| void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | ||||
| 		 const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | ||||
| 
 | ||||
| void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | ||||
| 		 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | ||||
| void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | ||||
| 		 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | ||||
| void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | ||||
| 		 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | ||||
| void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | ||||
| 		 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | ||||
| void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | ||||
| 		 const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | ||||
| void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | ||||
| 		 const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | ||||
| void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | ||||
| 		 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | ||||
| void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | ||||
| 		 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | ||||
|                  float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | ||||
| void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | ||||
|                  double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | ||||
| void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | ||||
|                  float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | ||||
| void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | ||||
|                  double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | ||||
| void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | ||||
|                  const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | ||||
| void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | ||||
|                  const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | ||||
| void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | ||||
|                  const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | ||||
| void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | ||||
|                  const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		 blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | ||||
| void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		 blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | ||||
| void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		 blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); | ||||
| void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		 blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); | ||||
| void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		 const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | ||||
| void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		 const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | ||||
| void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		 const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); | ||||
| void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		 const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		  blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | ||||
| void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		  blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | ||||
| void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		  blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | ||||
| void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | ||||
| 		  blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | ||||
| void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		  const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | ||||
| void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		  const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | ||||
| void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		  const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | ||||
| void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | ||||
| 		  const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | ||||
| void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | ||||
| void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | ||||
| void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | ||||
| void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | ||||
| void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | ||||
| void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | ||||
| void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | ||||
| 
 | ||||
| void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | ||||
| void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | ||||
| void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | ||||
| void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | ||||
|                  enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | ||||
| void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | ||||
| void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | ||||
| void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | ||||
| void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | ||||
|                  const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | ||||
| 
 | ||||
| void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | ||||
|                  float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | ||||
| void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | ||||
|                  double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | ||||
| void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | ||||
|                  const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | ||||
| void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | ||||
|                  const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | ||||
|                  float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | ||||
| void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | ||||
|                  double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | ||||
| void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | ||||
|                  const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | ||||
| void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | ||||
|                  const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | ||||
|                   float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | ||||
| void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | ||||
|                   double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | ||||
| void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | ||||
|                   const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | ||||
| void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | ||||
|                   const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | ||||
| 
 | ||||
| void cblas_xerbla(blasint p, char *rout, char *form, ...); | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| } | ||||
|       | ||||
| #endif  /* __cplusplus */ | ||||
| 
 | ||||
| #endif | ||||
|  |  | |||
							
								
								
									
										6
									
								
								common.h
								
								
								
								
							
							
						
						
									
										6
									
								
								common.h
								
								
								
								
							|  | @ -390,7 +390,8 @@ typedef int blasint; | |||
| /* C99 supports complex floating numbers natively, which GCC also offers as an
 | ||||
|    extension since version 3.0.  If neither is available, use a compatible | ||||
|    structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | ||||
| #if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 | ||||
| #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | ||||
|      (__GNUC__ >= 3 && !defined(__cplusplus))) | ||||
|   #define OPENBLAS_COMPLEX_C99 | ||||
|   typedef float _Complex openblas_complex_float; | ||||
|   typedef double _Complex openblas_complex_double; | ||||
|  | @ -557,7 +558,8 @@ typedef struct { | |||
| #include "common_level3.h" | ||||
| #include "common_lapack.h" | ||||
| #ifdef CBLAS | ||||
| #include "cblas.h" | ||||
| /* This header file is generated from "cblas.h" (see Makefile.prebuild). */ | ||||
| #include "cblas_noconst.h" | ||||
| #endif | ||||
| 
 | ||||
| #ifndef ASSEMBLER | ||||
|  |  | |||
							
								
								
									
										1
									
								
								cpuid.h
								
								
								
								
							
							
						
						
									
										1
									
								
								cpuid.h
								
								
								
								
							|  | @ -126,6 +126,7 @@ | |||
| #define HAVE_128BITFPU   (1 << 16) | ||||
| #define HAVE_FASTMOVU    (1 << 17) | ||||
| #define HAVE_AVX      (1 <<  18) | ||||
| #define HAVE_FMA4     (1 <<  19) | ||||
| 
 | ||||
| #define CACHE_INFO_L1_I     1 | ||||
| #define CACHE_INFO_L1_D     2 | ||||
|  |  | |||
							
								
								
									
										32
									
								
								cpuid_x86.c
								
								
								
								
							
							
						
						
									
										32
									
								
								cpuid_x86.c
								
								
								
								
							|  | @ -43,6 +43,8 @@ | |||
| #ifdef NO_AVX | ||||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | ||||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | ||||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | ||||
| #define CORE_BULLDOZER CORE_BARCELONA | ||||
| #endif | ||||
| 
 | ||||
| #ifndef CPUIDEMU | ||||
|  | @ -116,8 +118,9 @@ static inline int have_excpuid(void){ | |||
| 
 | ||||
| #ifndef NO_AVX | ||||
| static inline void xgetbv(int op, int * eax, int * edx){ | ||||
|   //Use binary code for xgetbv
 | ||||
|   __asm__ __volatile__ | ||||
|     ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | ||||
|     (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
|  | @ -228,6 +231,9 @@ int get_cputype(int gettype){ | |||
|       cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
|       if ((ecx & (1 <<  6)) != 0) feature |= HAVE_SSE4A; | ||||
|       if ((ecx & (1 <<  7)) != 0) feature |= HAVE_MISALIGNSSE; | ||||
| #ifndef NO_AVX | ||||
|       if ((ecx & (1 <<  16)) != 0) feature |= HAVE_FMA4; | ||||
| #endif | ||||
|       if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | ||||
|       if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | ||||
|     } | ||||
|  | @ -1030,6 +1036,8 @@ int get_cpuname(void){ | |||
| 	    return CPUTYPE_SANDYBRIDGE; | ||||
| 	  else | ||||
| 	    return CPUTYPE_NEHALEM; | ||||
| 	case 14: | ||||
| 	  // Xeon E7540
 | ||||
| 	case 15: | ||||
| 	  //Xeon Processor E7 (Westmere-EX)
 | ||||
| 	  return CPUTYPE_NEHALEM; | ||||
|  | @ -1075,8 +1083,12 @@ int get_cpuname(void){ | |||
| 	return CPUTYPE_OPTERON; | ||||
|       case  1: | ||||
|       case 10: | ||||
|       case  6:   //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | ||||
| 	return CPUTYPE_BARCELONA; | ||||
|       case  6:   //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | ||||
| 	if(support_avx()) | ||||
| 	  return CPUTYPE_BULLDOZER; | ||||
| 	else | ||||
| 	  return CPUTYPE_BARCELONA; //OS doesn't support AVX.
 | ||||
|       case  5: | ||||
| 	return CPUTYPE_BOBCAT; | ||||
|       } | ||||
|  | @ -1398,6 +1410,8 @@ int get_coretype(void){ | |||
| 	    return CORE_SANDYBRIDGE; | ||||
| 	  else | ||||
| 	    return CORE_NEHALEM; //OS doesn't support AVX
 | ||||
| 	case 14: | ||||
| 	  //Xeon E7540
 | ||||
| 	case 15: | ||||
| 	  //Xeon Processor E7 (Westmere-EX)
 | ||||
| 	  return CORE_NEHALEM; | ||||
|  | @ -1427,8 +1441,13 @@ int get_coretype(void){ | |||
|     if (family == 0xf){ | ||||
|       if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;  | ||||
|       else if (exfamily == 5) return CORE_BOBCAT;  | ||||
|       else if (exfamily == 6) return CORE_BARCELONA;  //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | ||||
|       else return CORE_BARCELONA; | ||||
|       else if (exfamily == 6) { | ||||
| 	//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | ||||
| 	if(support_avx()) | ||||
| 	  return CORE_BULLDOZER; | ||||
| 	else | ||||
| 	  return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels.
 | ||||
|       }else return CORE_BARCELONA; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|  | @ -1494,6 +1513,9 @@ void get_cpuconfig(void){ | |||
|       printf("#define DTB_SIZE %d\n", info.size * 1024); | ||||
|       printf("#define DTB_ASSOCIATIVE %d\n", info.associative); | ||||
|       printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); | ||||
|     } else { | ||||
|       //fall back for some virtual machines.
 | ||||
|       printf("#define DTB_DEFAULT_ENTRIES 32\n"); | ||||
|     } | ||||
|      | ||||
|     features = get_cputype(GET_FEATURE); | ||||
|  | @ -1511,6 +1533,7 @@ void get_cpuconfig(void){ | |||
|     if (features & HAVE_AVX )    printf("#define HAVE_AVX\n"); | ||||
|     if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | ||||
|     if (features & HAVE_3DNOW)   printf("#define HAVE_3DNOW\n"); | ||||
|     if (features & HAVE_FMA4 )    printf("#define HAVE_FMA4\n"); | ||||
|     if (features & HAVE_CFLUSH)  printf("#define HAVE_CFLUSH\n"); | ||||
|     if (features & HAVE_HIT)     printf("#define HAVE_HIT 1\n"); | ||||
|     if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | ||||
|  | @ -1577,5 +1600,6 @@ void get_sse(void){ | |||
|   if (features & HAVE_AVX )    printf("HAVE_AVX=1\n"); | ||||
|   if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | ||||
|   if (features & HAVE_3DNOW)   printf("HAVE_3DNOW=1\n"); | ||||
|   if (features & HAVE_FMA4 )    printf("HAVE_FMA4=1\n"); | ||||
| 
 | ||||
| } | ||||
|  |  | |||
|  | @ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| 
 | ||||
|   a = (FLOAT *)args -> a; | ||||
|   x = (FLOAT *)args -> b; | ||||
|   y = (FLOAT *)args -> c; | ||||
| 
 | ||||
|   lda  = args -> lda; | ||||
|   incx = args -> ldb; | ||||
|  | @ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
|   n_from = 0; | ||||
|   n_to   = n; | ||||
| 
 | ||||
|   //Use the first n * COMPSIZE elements of each thread's sb buffer as y
 | ||||
|   y = buffer;    | ||||
|   buffer += ((COMPSIZE * n  + 1023) & ~1023); | ||||
| 
 | ||||
|   if (range_m) { | ||||
|     n_from = *(range_m + 0); | ||||
|     n_to   = *(range_m + 1); | ||||
|  | @ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
|     a += n_from * lda  * COMPSIZE; | ||||
|   } | ||||
| 
 | ||||
|   if (range_n) y += *range_n * COMPSIZE; | ||||
| 
 | ||||
|   if (incx != 1) { | ||||
|     COPY_K(n, x, incx, buffer, 1); | ||||
|  | @ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| 
 | ||||
|   if (num_cpu) { | ||||
|     queue[0].sa = NULL; | ||||
|     queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; | ||||
|     queue[0].sb = buffer; | ||||
|     queue[num_cpu - 1].next = NULL; | ||||
|    | ||||
|     exec_blas(num_cpu, queue); | ||||
|  | @ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| #else | ||||
| 	    ONE, ZERO, | ||||
| #endif | ||||
| 	    buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); | ||||
| 	    (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); | ||||
|   } | ||||
| 
 | ||||
|   AXPYU_K(n, 0, 0, | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| TOPDIR	= ../.. | ||||
| include ../../Makefile.system | ||||
| 
 | ||||
| COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | ||||
| COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) | ||||
| 
 | ||||
| COMMONOBJS	+= slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX)  dlamc3.$(SUFFIX) | ||||
| 
 | ||||
|  | @ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. | |||
| openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | ||||
| 	$(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| 
 | ||||
| openblas_get_config.$(SUFFIX) : openblas_get_config.c | ||||
| 	$(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| 
 | ||||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | ||||
| 	$(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| 
 | ||||
|  |  | |||
|  | @ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ | |||
| 					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| 	  } | ||||
| 	} | ||||
| 	queue->sb=sb; | ||||
|       } | ||||
| 	 | ||||
| #ifdef MONITOR | ||||
|  |  | |||
|  | @ -49,8 +49,12 @@ | |||
| 
 | ||||
| int blas_server_avail = 0; | ||||
| 
 | ||||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | ||||
| 
 | ||||
| void goto_set_num_threads(int num_threads) { | ||||
| 
 | ||||
|   int i=0; | ||||
| 
 | ||||
|   if (num_threads < 1) num_threads = blas_num_threads; | ||||
| 
 | ||||
|   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | ||||
|  | @ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) { | |||
| 
 | ||||
|   omp_set_num_threads(blas_cpu_number); | ||||
| 
 | ||||
|   //adjust buffer for each thread
 | ||||
|   for(i=0; i<blas_cpu_number; i++){ | ||||
|     if(blas_thread_buffer[i]==NULL){ | ||||
|       blas_thread_buffer[i]=blas_memory_alloc(2); | ||||
|     } | ||||
|   } | ||||
|   for(; i<MAX_CPU_NUMBER; i++){ | ||||
|     if(blas_thread_buffer[i]!=NULL){ | ||||
|       blas_memory_free(blas_thread_buffer[i]); | ||||
|       blas_thread_buffer[i]=NULL; | ||||
|     } | ||||
|   } | ||||
| #if defined(ARCH_MIPS64)  | ||||
|   //set parameters for different number of threads.
 | ||||
|   blas_set_parameter(); | ||||
|  | @ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) { | |||
| 
 | ||||
| /* Initialize the threading layer: query the CPU count, mark the BLAS server as available, | ||||
|    and set up the per-thread buffer table. Always returns 0. */ | ||||
| int blas_thread_init(void){ | ||||
| 
 | ||||
|   int i=0; | ||||
| 
 | ||||
|   blas_get_cpu_number(); | ||||
| 
 | ||||
|   blas_server_avail = 1; | ||||
| 
 | ||||
|   /* Give each of the first blas_num_threads slots its own buffer. */ | ||||
|   for(i=0; i<blas_num_threads; i++){ | ||||
|     blas_thread_buffer[i]=blas_memory_alloc(2); | ||||
|   } | ||||
|   /* Clear the remaining slots so they read as "no buffer allocated". */ | ||||
|   for(; i<MAX_CPU_NUMBER; i++){ | ||||
|       blas_thread_buffer[i]=NULL; | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
| /* Shut down the threading layer: mark the BLAS server as unavailable and release every | ||||
|    per-thread buffer that is still allocated. Always returns 0. */ | ||||
| int BLASFUNC(blas_thread_shutdown)(void){ | ||||
| 
 | ||||
|   int i=0; | ||||
|   blas_server_avail = 0; | ||||
| 
 | ||||
|   /* Free each non-NULL slot and reset it to NULL so it is not freed twice. */ | ||||
|   for(i=0; i<MAX_CPU_NUMBER; i++){ | ||||
|     if(blas_thread_buffer[i]!=NULL){ | ||||
|       blas_memory_free(blas_thread_buffer[i]); | ||||
|       blas_thread_buffer[i]=NULL; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
| } | ||||
| 
 | ||||
|  | @ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| static void exec_threads(blas_queue_t *queue){ | ||||
| 
 | ||||
|   void *buffer, *sa, *sb; | ||||
|   int pos=0, release_flag=0; | ||||
|    | ||||
|   buffer = NULL; | ||||
|   sa = queue -> sa; | ||||
|  | @ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ | |||
| 
 | ||||
|   if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | ||||
| 
 | ||||
|     pos = omp_get_thread_num(); | ||||
|     buffer = blas_thread_buffer[pos]; | ||||
| 
 | ||||
|     //fallback: this thread has no preallocated buffer, so allocate a temporary one
 | ||||
|     if(buffer==NULL) { | ||||
|       buffer = blas_memory_alloc(2); | ||||
|       release_flag=1; | ||||
|     } | ||||
| 
 | ||||
|     if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | ||||
|      | ||||
|  | @ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| 					    + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| 	  } | ||||
|       } | ||||
|       queue->sb=sb; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|  | @ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| 
 | ||||
|     } | ||||
| 
 | ||||
|   if (buffer != NULL) blas_memory_free(buffer); | ||||
|   if (release_flag) blas_memory_free(buffer); | ||||
| 
 | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| 					  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | ||||
| 	    } | ||||
| 	} | ||||
| 	queue->sb=sb; | ||||
|       } | ||||
| 	 | ||||
| #ifdef MONITOR | ||||
|  |  | |||
|  | @ -63,9 +63,11 @@ extern gotoblas_t  gotoblas_BARCELONA; | |||
| extern gotoblas_t  gotoblas_BOBCAT; | ||||
| #ifndef NO_AVX | ||||
| extern gotoblas_t  gotoblas_SANDYBRIDGE; | ||||
| extern gotoblas_t  gotoblas_BULLDOZER; | ||||
| #else | ||||
| //Without AVX, use NEHALEM kernels for Sandy Bridge and BARCELONA kernels for Bulldozer
 | ||||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | ||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
|  | @ -78,8 +80,9 @@ extern gotoblas_t  gotoblas_SANDYBRIDGE; | |||
| 
 | ||||
| #ifndef NO_AVX | ||||
| static inline void xgetbv(int op, int * eax, int * edx){ | ||||
|   //Use binary code for xgetbv
 | ||||
|   __asm__ __volatile__ | ||||
|     ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | ||||
|     (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
|  | @ -163,7 +166,8 @@ static gotoblas_t *get_coretype(void){ | |||
| 		   | ||||
| 	//Intel Xeon Processor 5600 (Westmere-EP)
 | ||||
| 	//Xeon Processor E7 (Westmere-EX)
 | ||||
| 	if (model == 12 || model == 15) return &gotoblas_NEHALEM; | ||||
| 	//Xeon E7540
 | ||||
| 	if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; | ||||
| 
 | ||||
| 	//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
 | ||||
| 	//Intel Core i7-3000 / Xeon E5
 | ||||
|  | @ -171,7 +175,7 @@ static gotoblas_t *get_coretype(void){ | |||
| 	  if(support_avx()) | ||||
| 	    return &gotoblas_SANDYBRIDGE; | ||||
| 	  else{ | ||||
| 	    fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); | ||||
| 	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | ||||
| 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 | ||||
| 	  } | ||||
| 	} | ||||
|  | @ -182,7 +186,7 @@ static gotoblas_t *get_coretype(void){ | |||
| 	  if(support_avx()) | ||||
| 	    return &gotoblas_SANDYBRIDGE; | ||||
| 	  else{ | ||||
| 	    fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); | ||||
| 	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | ||||
| 	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 | ||||
| 	  } | ||||
| 	} | ||||
|  | @ -202,6 +206,14 @@ static gotoblas_t *get_coretype(void){ | |||
| 	else return &gotoblas_OPTERON; | ||||
|       }  else if (exfamily == 5) { | ||||
| 	return &gotoblas_BOBCAT; | ||||
|       } else if (exfamily == 6) { | ||||
| 	//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | ||||
| 	  if(support_avx()) | ||||
| 	    return &gotoblas_BULLDOZER; | ||||
| 	  else{ | ||||
| 	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | ||||
| 	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
 | ||||
| 	  }	 | ||||
|       } else { | ||||
| 	return &gotoblas_BARCELONA; | ||||
|       } | ||||
|  | @ -238,6 +250,7 @@ static char *corename[] = { | |||
|     "Nano", | ||||
|     "Sandybridge", | ||||
|     "Bobcat", | ||||
|     "Bulldozer", | ||||
| }; | ||||
| 
 | ||||
| char *gotoblas_corename(void) { | ||||
|  | @ -259,6 +272,7 @@ char *gotoblas_corename(void) { | |||
|   if (gotoblas == &gotoblas_NANO)         return corename[15]; | ||||
|   if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16]; | ||||
|   if (gotoblas == &gotoblas_BOBCAT)       return corename[17]; | ||||
|   if (gotoblas == &gotoblas_BULLDOZER)    return corename[18]; | ||||
| 
 | ||||
|   return corename[0]; | ||||
| } | ||||
|  |  | |||
|  | @ -0,0 +1,59 @@ | |||
| /*****************************************************************************
 | ||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are | ||||
| met: | ||||
| 
 | ||||
|    1. Redistributions of source code must retain the above copyright | ||||
|       notice, this list of conditions and the following disclaimer. | ||||
| 
 | ||||
|    2. Redistributions in binary form must reproduce the above copyright | ||||
|       notice, this list of conditions and the following disclaimer in | ||||
|       the documentation and/or other materials provided with the | ||||
|       distribution. | ||||
|    3. Neither the name of the ISCAS nor the names of its contributors may  | ||||
|       be used to endorse or promote products derived from this software  | ||||
|       without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  | ||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE  | ||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  | ||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE  | ||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR  | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER  | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,  | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE  | ||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| 
 | ||||
| **********************************************************************************/ | ||||
| 
 | ||||
| #include "common.h" | ||||
| 
 | ||||
| /* Space-separated names of the build options below that were defined when this file was | ||||
|    compiled; assembled by adjacent string-literal concatenation, empty if none were set. */ | ||||
| static char* openblas_config_str="" | ||||
| #ifdef USE64BITINT | ||||
|   "USE64BITINT " | ||||
| #endif | ||||
| #ifdef NO_CBLAS | ||||
|   "NO_CBLAS " | ||||
| #endif | ||||
| #ifdef NO_LAPACK | ||||
|   "NO_LAPACK " | ||||
| #endif | ||||
| #ifdef NO_LAPACKE | ||||
|   "NO_LAPACKE " | ||||
| #endif | ||||
| #ifdef DYNAMIC_ARCH | ||||
|   "DYNAMIC_ARCH " | ||||
| #endif | ||||
| #ifdef NO_AFFINITY | ||||
|   "NO_AFFINITY " | ||||
| #endif | ||||
|   ; | ||||
| 
 | ||||
| /* Return openblas_config_str, the compile-time configuration string. The result points to | ||||
|    static storage. NOTE(review): CNAME is a macro defined outside this file, presumably | ||||
|    expanding to openblas_get_config -- confirm against the build rules. */ | ||||
| char* CNAME() { | ||||
|   return openblas_config_str; | ||||
| } | ||||
| 
 | ||||
|  | @ -163,7 +163,7 @@ int get_L2_size(void){ | |||
| 
 | ||||
|   int eax, ebx, ecx, edx; | ||||
| 
 | ||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ | ||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | ||||
|     defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
|   defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | ||||
| 
 | ||||
|  |  | |||
|  | @ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | ||||
| EXTRALIB += -lgfortran | ||||
| endif | ||||
| ifeq ($(USE_OPENMP), 1) | ||||
| ifeq ($(C_COMPILER), GCC) | ||||
| EXTRALIB += -lgomp | ||||
| endif | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ||||
|  |  | |||
|  | @ -74,6 +74,7 @@ | |||
| 
 | ||||
| @misc_no_underscore_objs = ( | ||||
|                             openblas_set_num_threads, goto_set_num_threads, | ||||
|                             openblas_get_config, | ||||
|                            ); | ||||
| 
 | ||||
| @misc_underscore_objs = ( | ||||
|  |  | |||
							
								
								
									
										18
									
								
								getarch.c
								
								
								
								
							
							
						
						
									
										18
									
								
								getarch.c
								
								
								
								
							|  | @ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME  "OPTERON" | ||||
| #endif | ||||
| 
 | ||||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) | ||||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) | ||||
| #define FORCE | ||||
| #define FORCE_INTEL | ||||
| #define ARCHITECTURE    "X86" | ||||
|  | @ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME  "BOBCAT" | ||||
| #endif | ||||
| 
 | ||||
| #if defined (FORCE_BULLDOZER) | ||||
| #define FORCE | ||||
| #define FORCE_INTEL | ||||
| #define ARCHITECTURE    "X86" | ||||
| #define SUBARCHITECTURE "BULLDOZER" | ||||
| #define ARCHCONFIG   "-DBULLDOZER " \ | ||||
| 		     "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ | ||||
| 		     "-DL2_SIZE=1024000 -DL2_LINESIZE=64  -DL3_SIZE=16777216 " \ | ||||
| 		     "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | ||||
| 		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | ||||
| 		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | ||||
|                      "-DHAVE_AVX -DHAVE_FMA4" | ||||
| #define LIBNAME   "bulldozer" | ||||
| #define CORENAME  "BULLDOZER" | ||||
| #endif | ||||
| 
 | ||||
| #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | ||||
| #define FORCE_INTEL | ||||
|  |  | |||
|  | @ -34,7 +34,7 @@ int main(int argc, char **argv) { | |||
| #ifdef USE64BITINT | ||||
| 	printf("#define USE64BITINT\n"); | ||||
| #endif | ||||
| 	printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); | ||||
| 	printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); | ||||
|   } | ||||
| 
 | ||||
|   return 0; | ||||
|  |  | |||
|  | @ -810,6 +810,22 @@ static void init_parameter(void) { | |||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifdef BULLDOZER | ||||
| 
 | ||||
| #ifdef DEBUG | ||||
|   fprintf(stderr, "Bulldozer\n"); | ||||
| #endif | ||||
| 
 | ||||
|   TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | ||||
|   TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | ||||
|   TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | ||||
|   TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | ||||
| #ifdef EXPRECISION | ||||
|   TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | ||||
|   TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifdef NANO | ||||
| 
 | ||||
| #ifdef DEBUG | ||||
|  |  | |||
|  | @ -0,0 +1,59 @@ | |||
| SGEMMKERNEL    =  gemm_kernel_4x4_barcelona.S | ||||
| SGEMMINCOPY    =   | ||||
| SGEMMITCOPY    =   | ||||
| SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c | ||||
| SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c | ||||
| SGEMMINCOPYOBJ =   | ||||
| SGEMMITCOPYOBJ =   | ||||
| SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL    =  gemm_kernel_2x4_barcelona.S | ||||
| DGEMMINCOPY    =  ../generic/gemm_ncopy_2.c | ||||
| DGEMMITCOPY    =  ../generic/gemm_tcopy_2.c | ||||
| DGEMMONCOPY    =  ../generic/gemm_ncopy_4.c | ||||
| DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c | ||||
| DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMKERNEL    =  zgemm_kernel_2x2_barcelona.S | ||||
| CGEMMINCOPY    =   | ||||
| CGEMMITCOPY    =   | ||||
| CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c | ||||
| CGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c | ||||
| CGEMMINCOPYOBJ = | ||||
| CGEMMITCOPYOBJ =   | ||||
| CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL    =  zgemm_kernel_1x2_barcelona.S | ||||
| ZGEMMINCOPY    =  ../generic/zgemm_ncopy_1.c | ||||
| ZGEMMITCOPY    =  ../generic/zgemm_tcopy_1.c | ||||
| ZGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c | ||||
| ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c | ||||
| ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| STRSMKERNEL_LN	=  trsm_kernel_LN_4x4_sse.S | ||||
| STRSMKERNEL_LT	=  trsm_kernel_LT_4x4_sse.S | ||||
| STRSMKERNEL_RN	=  trsm_kernel_LT_4x4_sse.S | ||||
| STRSMKERNEL_RT	=  trsm_kernel_RT_4x4_sse.S | ||||
| 
 | ||||
| DTRSMKERNEL_LN	=  trsm_kernel_LN_2x4_sse2.S | ||||
| DTRSMKERNEL_LT	=  trsm_kernel_LT_2x4_sse2.S | ||||
| DTRSMKERNEL_RN	=  trsm_kernel_LT_2x4_sse2.S | ||||
| DTRSMKERNEL_RT	=  trsm_kernel_RT_2x4_sse2.S | ||||
| 
 | ||||
| CTRSMKERNEL_LN	=  ztrsm_kernel_LN_2x2_sse.S | ||||
| CTRSMKERNEL_LT	=  ztrsm_kernel_LT_2x2_sse.S | ||||
| CTRSMKERNEL_RN	=  ztrsm_kernel_LT_2x2_sse.S | ||||
| CTRSMKERNEL_RT	=  ztrsm_kernel_RT_2x2_sse.S | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	=  ztrsm_kernel_LT_1x2_sse2.S | ||||
| ZTRSMKERNEL_LT	=  ztrsm_kernel_LT_1x2_sse2.S | ||||
| ZTRSMKERNEL_RN	=  ztrsm_kernel_LT_1x2_sse2.S | ||||
| ZTRSMKERNEL_RT	=  ztrsm_kernel_RT_1x2_sse2.S | ||||
| 
 | ||||
| CGEMM3MKERNEL    =  zgemm3m_kernel_4x4_barcelona.S | ||||
| ZGEMM3MKERNEL    =  zgemm3m_kernel_2x4_barcelona.S | ||||
|  | @ -596,7 +596,7 @@ | |||
| .L22: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| 	addps	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -842,7 +842,7 @@ | |||
| .L32: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| 	addss	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -1168,7 +1168,7 @@ | |||
| 
 | ||||
| .L52: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -1198,7 +1198,7 @@ | |||
| 	addps	%xmm0, %xmm5 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm2 | ||||
|  | @ -1347,7 +1347,7 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -1531,7 +1531,7 @@ | |||
| 
 | ||||
| .L72: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulss	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -1778,7 +1778,7 @@ | |||
| 
 | ||||
| .L92: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(AA), %xmm0 | ||||
|  | @ -1793,7 +1793,7 @@ | |||
| 	mulps	12 * SIZE(BB), %xmm0 | ||||
| 	addps	%xmm0, %xmm7 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm3 | ||||
|  | @ -1924,7 +1924,7 @@ | |||
| 
 | ||||
| .L102: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 2 * SIZE(AA), %xmm0 | ||||
|  | @ -2069,7 +2069,7 @@ | |||
| 
 | ||||
| .L112: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 1 * SIZE(AA), %xmm0 | ||||
|  |  | |||
|  | @ -89,17 +89,22 @@ | |||
| #endif | ||||
| 
 | ||||
| #define STACKSIZE	16 | ||||
| #define ARGS	16 | ||||
| 
 | ||||
| #define M		 4 + STACKSIZE(%esp) | ||||
| #define N		 8 + STACKSIZE(%esp) | ||||
| #define ALPHA		16 + STACKSIZE(%esp) | ||||
| #define A		20 + STACKSIZE(%esp) | ||||
| #define STACK_LDA	24 + STACKSIZE(%esp) | ||||
| #define STACK_X		28 + STACKSIZE(%esp) | ||||
| #define STACK_INCX	32 + STACKSIZE(%esp) | ||||
| #define Y		36 + STACKSIZE(%esp) | ||||
| #define STACK_INCY	40 + STACKSIZE(%esp) | ||||
| #define BUFFER		44 + STACKSIZE(%esp) | ||||
| #define M		 4 + STACKSIZE+ARGS(%esp) | ||||
| #define N		 8 + STACKSIZE+ARGS(%esp) | ||||
| #define ALPHA		16 + STACKSIZE+ARGS(%esp) | ||||
| #define A		20 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_LDA	24 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_X		28 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCX	32 + STACKSIZE+ARGS(%esp) | ||||
| #define Y		36 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCY	40 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER		44 + STACKSIZE+ARGS(%esp) | ||||
| #define MMM	0+ARGS(%esp) | ||||
| #define YY	4+ARGS(%esp) | ||||
| #define AA	8+ARGS(%esp) | ||||
| #define LDAX	12+ARGS(%esp) | ||||
| 	 | ||||
| #define I	%eax | ||||
| #define J	%ebx | ||||
|  | @ -114,6 +119,7 @@ | |||
| 
 | ||||
| 	PROLOGUE | ||||
| 
 | ||||
| 	subl	$ARGS,%esp | ||||
| 	pushl	%ebp | ||||
| 	pushl	%edi | ||||
| 	pushl	%esi | ||||
|  | @ -121,7 +127,34 @@ | |||
| 
 | ||||
| 	PROFCODE | ||||
| 
 | ||||
| 	movl	Y,J | ||||
| 	movl	J,YY				# backup Y | ||||
| 	movl	A,J | ||||
| 	movl	J,AA				# backup A | ||||
| 	movl	M,J | ||||
| 	movl	J,MMM				# backup M | ||||
| .L0t: | ||||
| 	xorl	J,J | ||||
| 	addl	$1,J | ||||
| 	sall	$21,J | ||||
| 	subl	J,MMM | ||||
| 	movl	J,M | ||||
| 	jge		.L00t | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| 	movl	MMM,%eax | ||||
| 	addl	J,%eax | ||||
| 	jle		.L999x | ||||
| 	movl	%eax,M | ||||
| 
 | ||||
| .L00t: | ||||
| 	movl	AA,%eax | ||||
| 	movl	%eax,A | ||||
| 
 | ||||
| 	movl	YY,J | ||||
| 	movl	J,Y | ||||
| 	movl	STACK_LDA,  LDA | ||||
| 
 | ||||
| 	movl	STACK_X,    X | ||||
| 	movl	STACK_INCX, INCX | ||||
| 
 | ||||
|  | @ -651,12 +684,22 @@ | |||
| 	addss	0 * SIZE(X), %xmm0 | ||||
| 	movss	%xmm0, (Y1) | ||||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L999: | ||||
| 	movl	M,J | ||||
| 	leal	(,J,SIZE),%eax | ||||
| 	addl	%eax,AA | ||||
| 	movl	YY,J | ||||
| 	addl	%eax,J | ||||
| 	movl	J,YY | ||||
| 	jmp		.L0t | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999x: | ||||
| 	popl	%ebx | ||||
| 	popl	%esi | ||||
| 	popl	%edi	 | ||||
| 	popl	%ebp | ||||
| 	addl	$ARGS,%esp | ||||
| 	ret | ||||
| 
 | ||||
| 	EPILOGUE | ||||
|  |  | |||
|  | @ -76,17 +76,22 @@ | |||
| #endif | ||||
| 
 | ||||
| #define STACKSIZE	16 | ||||
| #define ARGS	16 | ||||
| 
 | ||||
| #define M		 4 + STACKSIZE(%esp) | ||||
| #define N		 8 + STACKSIZE(%esp) | ||||
| #define ALPHA		16 + STACKSIZE(%esp) | ||||
| #define A		24 + STACKSIZE(%esp) | ||||
| #define STACK_LDA	28 + STACKSIZE(%esp) | ||||
| #define STACK_X		32 + STACKSIZE(%esp) | ||||
| #define STACK_INCX	36 + STACKSIZE(%esp) | ||||
| #define Y		40 + STACKSIZE(%esp) | ||||
| #define STACK_INCY	44 + STACKSIZE(%esp) | ||||
| #define BUFFER		48 + STACKSIZE(%esp) | ||||
| #define M		 4 + STACKSIZE+ARGS(%esp) | ||||
| #define N		 8 + STACKSIZE+ARGS(%esp) | ||||
| #define ALPHA		16 + STACKSIZE+ARGS(%esp) | ||||
| #define A		24 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_LDA	28 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_X		32 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCX	36 + STACKSIZE+ARGS(%esp) | ||||
| #define Y		40 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCY	44 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER		48 + STACKSIZE+ARGS(%esp) | ||||
| 
 | ||||
| #define MMM	0+ARGS(%esp) | ||||
| #define YY	4+ARGS(%esp) | ||||
| #define AA	8+ARGS(%esp) | ||||
| 	 | ||||
| #define I	%eax | ||||
| #define J	%ebx | ||||
|  | @ -101,6 +106,8 @@ | |||
| 
 | ||||
| 	PROLOGUE | ||||
| 
 | ||||
| 
 | ||||
| 	subl	$ARGS,%esp | ||||
| 	pushl	%ebp | ||||
| 	pushl	%edi | ||||
| 	pushl	%esi | ||||
|  | @ -108,6 +115,33 @@ | |||
| 
 | ||||
| 	PROFCODE | ||||
| 
 | ||||
| 	movl	Y,J | ||||
| 	movl	J,YY				# backup Y | ||||
| 	movl	A,J | ||||
| 	movl	J,AA				# backup A | ||||
| 	movl	M,J | ||||
| 	movl	J,MMM				# backup MM | ||||
| .L0t:					# head of the loop that processes M in blocks | ||||
| 	xorl	J,J | ||||
| 	addl	$1,J				# J = 1 | ||||
| 	sall	$20,J				# J = 2^20, the block size | ||||
| 	subl	J,MMM				# MMM -= J; these flags are tested by jge below | ||||
| 	movl	J,M				# M = J for a full block (movl leaves the flags alone) | ||||
| 	jge		.L00t			# MMM >= 0 after the subtraction: run a full block | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| 	movl	MMM,%eax | ||||
| 	addl	J,%eax				# eax = MMM + J = count left before the subtraction | ||||
| 	jle		.L999x			# nothing left: leave the routine | ||||
| 	movl	%eax,M				# last, partial block: M = remaining count | ||||
| 
 | ||||
| .L00t:					# per-block setup | ||||
| 	movl	AA,%eax | ||||
| 	movl	%eax,A				# A = AA | ||||
| 
 | ||||
| 	movl	YY,J | ||||
| 	movl	J,Y				# Y = YY | ||||
| 
 | ||||
| 	movl	STACK_LDA,  LDA			# reload LDA, X and INCX for this block | ||||
| 	movl	STACK_X,    X | ||||
| 	movl	STACK_INCX, INCX | ||||
|  | @ -677,10 +711,22 @@ | |||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L999:					# end of one block: advance AA and YY, then loop | ||||
| 	movl	M,J | ||||
| 	leal	(,J,SIZE),%eax			# eax = M * SIZE | ||||
| 	addl	%eax,AA				# AA += M * SIZE | ||||
| 	movl	YY,J | ||||
| 	addl	%eax,J | ||||
| 	movl	J,YY				# YY += M * SIZE | ||||
| 	jmp		.L0t			# back to the loop head for the next block | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999x:					# exit path, reached from .L0t when nothing is left | ||||
| 
 | ||||
| 	popl	%ebx				# pop %ebx, %esi, %edi, %ebp | ||||
| 	popl	%esi | ||||
| 	popl	%edi	 | ||||
| 	popl	%ebp | ||||
| 	addl	$ARGS,%esp			# undo the "subl $ARGS,%esp" of the prologue | ||||
| 	ret | ||||
| 
 | ||||
| 	EPILOGUE | ||||
|  |  | |||
|  | @ -89,17 +89,24 @@ | |||
| #endif | ||||
| 
 | ||||
| #define STACKSIZE	16 | ||||
| #define ARGS	20 | ||||
| 
 | ||||
| #define M		 4 + STACKSIZE(%esp) | ||||
| #define N		 8 + STACKSIZE(%esp) | ||||
| #define ALPHA		16 + STACKSIZE(%esp) | ||||
| #define A		20 + STACKSIZE(%esp) | ||||
| #define STACK_LDA	24 + STACKSIZE(%esp) | ||||
| #define STACK_X		28 + STACKSIZE(%esp) | ||||
| #define STACK_INCX	32 + STACKSIZE(%esp) | ||||
| #define Y		36 + STACKSIZE(%esp) | ||||
| #define STACK_INCY	40 + STACKSIZE(%esp) | ||||
| #define BUFFER		44 + STACKSIZE(%esp) | ||||
| #define M		 4 + STACKSIZE+ARGS(%esp) | ||||
| #define N		 8 + STACKSIZE+ARGS(%esp) | ||||
| #define ALPHA		16 + STACKSIZE+ARGS(%esp) | ||||
| #define A		20 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_LDA	24 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_X		28 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCX	32 + STACKSIZE+ARGS(%esp) | ||||
| #define Y		36 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCY	40 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER		44 + STACKSIZE+ARGS(%esp) | ||||
| 
 | ||||
| #define MMM	0+STACKSIZE(%esp) | ||||
| #define NN	4+STACKSIZE(%esp) | ||||
| #define AA	8+STACKSIZE(%esp) | ||||
| #define LDAX	12+STACKSIZE(%esp) | ||||
| #define XX	16+STACKSIZE(%esp) | ||||
| 	 | ||||
| #define I	%eax | ||||
| #define J	%ebx | ||||
|  | @ -114,6 +121,7 @@ | |||
| 
 | ||||
| 	PROLOGUE | ||||
| 
 | ||||
| 	subl	$ARGS,%esp | ||||
| 	pushl	%ebp | ||||
| 	pushl	%edi | ||||
| 	pushl	%esi | ||||
|  | @ -122,7 +130,42 @@ | |||
| 	PROFCODE | ||||
| 
 | ||||
| 	movl	STACK_LDA,  LDA | ||||
| 	movl	LDA,LDAX			# backup LDA | ||||
| 	movl	STACK_X,    X | ||||
| 	movl	X,XX | ||||
| 	movl	N,J | ||||
| 	movl	J,NN				# backup N | ||||
| 	movl	A,J | ||||
| 	movl	J,AA				# backup A | ||||
|     movl	M,J | ||||
| 	movl	J,MMM				# mov M to MMM | ||||
| .L0t:					# head of the loop that processes M in blocks | ||||
| 	xorl	J,J | ||||
| 	addl	$1,J				# J = 1 | ||||
| 	sall    $22,J                           # J = 2^22 elements; 2^22*sizeof(float) = buffer size (16MB) | ||||
| 	subl    $8, J                           # Keep the last 8 floats of the buffer unused. | ||||
| 	                                        # Now split M into blocks of J elements. | ||||
| 	subl	J,MMM				# MMM=MMM-J; these flags are tested by jge below | ||||
| 	movl	J,M				# M = J for a full block (movl leaves the flags alone) | ||||
| 	jge		.L00t			# MMM >= 0 after the subtraction: run a full block | ||||
| 	ALIGN_4 | ||||
| 	 | ||||
| 	movl	MMM,%eax | ||||
| 	addl	J,%eax				# eax = MMM + J = count left before the subtraction | ||||
| 	jle		.L999x			# nothing left: leave the routine | ||||
| 	movl	%eax,M				# last, partial block: M = remaining count | ||||
| 
 | ||||
| .L00t:					# per-block setup from the saved copies | ||||
| 	movl	AA,%eax | ||||
| 	movl	%eax,A			 	# A = AA | ||||
| 
 | ||||
| 	movl	NN,%eax | ||||
| 	movl	%eax,N				# reset N from its backup NN | ||||
| 
 | ||||
| 
 | ||||
| 	movl	LDAX,  LDA			# reset LDA from its backup LDAX | ||||
| 	movl	XX,X				# X = XX | ||||
| 
 | ||||
| 	movl	STACK_INCX, INCX		# reload INCX and INCY for this block | ||||
| 	movl	STACK_INCY, INCY | ||||
| 
 | ||||
|  | @ -198,6 +241,20 @@ | |||
| 	jg	.L06 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| // Pad with zeros so that stale values are never loaded from the buffer. | ||||
| // Writes 8 - (M & 7) zero elements at Y1, i.e. between 1 and 8 stores | ||||
| // (a full 8 when M is a multiple of 8). | ||||
| // NOTE(review): Y1 presumably points just past the data copied into the | ||||
| // buffer by the loop above - confirm against the full file. | ||||
| 	movl	M,  I | ||||
| 	movl	$8, J | ||||
| 	andl	$7, I				# I = M & 7 | ||||
| 	xorps	%xmm0, %xmm0			# xmm0 = 0 | ||||
| 	subl	I, J				# J = 8 - (M & 7), the number of zero stores | ||||
| 	ALIGN_2 | ||||
| .L07: | ||||
| 	movss	%xmm0, 0 * SIZE(Y1)		# store one zero at Y1 | ||||
| 	addl	$SIZE, Y1			# advance Y1 by one element | ||||
| 	decl	J | ||||
| 	jg	.L07				# repeat until J reaches 0 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L10: | ||||
| 	movl	Y, Y1 | ||||
| 
 | ||||
|  | @ -628,10 +685,22 @@ | |||
| 	ALIGN_4 | ||||
|  	 | ||||
| .L999:					# end of one block: advance AA and XX, then loop | ||||
| 	movl	M,J | ||||
| 	leal	(,J,SIZE),%eax			# eax = M * SIZE | ||||
| 	addl	%eax,AA				# AA += M * SIZE | ||||
| 	movl	XX,J | ||||
| 	addl	%eax,J | ||||
| 	movl	J,XX				# XX += M * SIZE | ||||
| 						# NOTE(review): no INCX factor is applied to XX here; | ||||
| 						# confirm this is right when INCX is not 1. | ||||
| 	jmp		.L0t			# back to the loop head for the next block | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999x:					# exit path, reached from .L0t when nothing is left | ||||
| 	popl	%ebx				# pop %ebx, %esi, %edi, %ebp | ||||
| 	popl	%esi | ||||
| 	popl	%edi	 | ||||
| 	popl	%ebp | ||||
| 
 | ||||
| 	addl	$ARGS,%esp			# undo the "subl $ARGS,%esp" of the prologue | ||||
| 	ret | ||||
| 
 | ||||
| 	EPILOGUE | ||||
|  |  | |||
|  | @ -76,17 +76,23 @@ | |||
| #endif | ||||
| 
 | ||||
| #define STACKSIZE	16 | ||||
| #define ARGS	16 | ||||
| 
 | ||||
| #define M		 4 + STACKSIZE(%esp) | ||||
| #define N		 8 + STACKSIZE(%esp) | ||||
| #define ALPHA		16 + STACKSIZE(%esp) | ||||
| #define A		24 + STACKSIZE(%esp) | ||||
| #define STACK_LDA	28 + STACKSIZE(%esp) | ||||
| #define STACK_X		32 + STACKSIZE(%esp) | ||||
| #define STACK_INCX	36 + STACKSIZE(%esp) | ||||
| #define Y		40 + STACKSIZE(%esp) | ||||
| #define STACK_INCY	44 + STACKSIZE(%esp) | ||||
| #define BUFFER		48 + STACKSIZE(%esp) | ||||
| #define M		 4 + STACKSIZE+ARGS(%esp) | ||||
| #define N		 8 + STACKSIZE+ARGS(%esp) | ||||
| #define ALPHA		16 + STACKSIZE+ARGS(%esp) | ||||
| #define A		24 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_LDA	28 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_X		32 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCX	36 + STACKSIZE+ARGS(%esp) | ||||
| #define Y		40 + STACKSIZE+ARGS(%esp) | ||||
| #define STACK_INCY	44 + STACKSIZE+ARGS(%esp) | ||||
| #define BUFFER		48 + STACKSIZE+ARGS(%esp) | ||||
| 
 | ||||
| #define MMM	0+STACKSIZE(%esp) | ||||
| #define AA	4+STACKSIZE(%esp) | ||||
| #define LDAX 8+STACKSIZE(%esp) | ||||
| #define NN	12+STACKSIZE(%esp) | ||||
| 
 | ||||
| #define I	%eax | ||||
| #define J	%ebx | ||||
|  | @ -101,6 +107,8 @@ | |||
| 
 | ||||
| 	PROLOGUE | ||||
| 
 | ||||
| 	subl	$ARGS,%esp | ||||
| 
 | ||||
| 	pushl	%ebp | ||||
| 	pushl	%edi | ||||
| 	pushl	%esi | ||||
|  | @ -108,7 +116,40 @@ | |||
| 
 | ||||
| 	PROFCODE | ||||
| 
 | ||||
| 
 | ||||
| 	movl	STACK_LDA,  LDA | ||||
| 	movl	LDA,LDAX			# backup LDA | ||||
| 	movl	N,J | ||||
| 	movl	J,NN				# backup N | ||||
| 	movl	A,J | ||||
| 	movl	J,AA				# backup A | ||||
|     movl	M,J | ||||
| 	movl	J,MMM				# mov M to MMM | ||||
| .L0t:					# head of the loop that processes M in blocks | ||||
| 	xorl	J,J | ||||
| 	addl	$1,J				# J = 1 | ||||
| 	sall    $21,J                           # J = 2^21 elements; 2^21*sizeof(double) = buffer size (16MB) | ||||
| 	subl    $4, J                           # Keep the last 4 doubles of the buffer unused. | ||||
| 	                                        # Now split M into blocks of J elements. | ||||
| 	subl	J,MMM				# MMM=MMM-J; these flags are tested by jge below | ||||
| 	movl	J,M				# M = J for a full block (movl leaves the flags alone) | ||||
| 	jge		.L00t			# MMM >= 0 after the subtraction: run a full block | ||||
| 	ALIGN_4 | ||||
| 	 | ||||
| 	movl	MMM,%eax | ||||
| 	addl	J,%eax				# eax = MMM + J = count left before the subtraction | ||||
| 	jle		.L999x			# nothing left: leave the routine | ||||
| 	movl	%eax,M				# last, partial block: M = remaining count | ||||
| 
 | ||||
| .L00t:					# per-block setup from the saved copies | ||||
| 	movl	AA,%eax | ||||
| 	movl	%eax,A			 	# A = AA | ||||
| 
 | ||||
| 	movl	NN,%eax | ||||
| 	movl	%eax,N				# reset N from its backup NN | ||||
| 
 | ||||
| 
 | ||||
| 	movl	LDAX,  LDA			# reset LDA from its backup LDAX | ||||
| 	movl	STACK_X,    X			# X is reloaded from STACK_X on every pass | ||||
| 	movl	STACK_INCX, INCX | ||||
| 	movl	STACK_INCY, INCY | ||||
|  | @ -117,6 +158,7 @@ | |||
| 	leal	(,INCY, SIZE), INCY | ||||
| 	leal	(,LDA,  SIZE), LDA | ||||
| 
 | ||||
| 
 | ||||
| 	subl	$-16 * SIZE, A | ||||
| 
 | ||||
| 	cmpl	$0, N | ||||
|  | @ -560,10 +602,19 @@ | |||
| 	ALIGN_4 | ||||
| 	 | ||||
| .L999:					# end of one block: advance AA, then loop | ||||
| 	movl 	M,J | ||||
| 	leal 	(,J,SIZE),%eax			# eax = M * SIZE | ||||
| 	addl	%eax,AA				# AA += M * SIZE | ||||
| 						# NOTE(review): only AA advances here, while .L00t | ||||
| 						# reloads X from STACK_X on every pass; confirm that | ||||
| 						# later blocks are meant to read X from its start. | ||||
| 	jmp		.L0t			# back to the loop head for the next block | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999x:					# exit path, reached from .L0t when nothing is left | ||||
| 	popl	%ebx				# pop %ebx, %esi, %edi, %ebp | ||||
| 	popl	%esi | ||||
| 	popl	%edi	 | ||||
| 	popl	%ebp | ||||
| 
 | ||||
| 	addl	$ARGS,%esp			# undo the "subl $ARGS,%esp" of the prologue | ||||
| 	ret | ||||
| 
 | ||||
| 	EPILOGUE | ||||
|  |  | |||
|  | @ -269,7 +269,7 @@ | |||
| 	sarl	$5, I | ||||
| 	jle	.L113 | ||||
| 
 | ||||
| #if defined(BARCELONA) | ||||
| #if defined(BARCELONA) || defined(BULLDOZER) | ||||
| 
 | ||||
| 	movaps	%xmm0, %xmm1 | ||||
| 	mulps	-32 * SIZE(X), %xmm1 | ||||
|  |  | |||
|  | @ -253,7 +253,7 @@ | |||
| 	sarl	$4, I | ||||
| 	jle	.L113 | ||||
| 
 | ||||
| #if defined(BARCELONA) | ||||
| #if defined(BARCELONA) || defined(BULLDOZER) | ||||
| 
 | ||||
| 	movaps  %xmm0, %xmm1 | ||||
| 	mulpd	-16 * SIZE(X), %xmm1 | ||||
|  |  | |||
|  | @ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN	4096 | ||||
| #define STACK_OFFSET	1024 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | ||||
|  | @ -439,7 +439,7 @@ | |||
| .L22: | ||||
| 	mulsd	%xmm0, %xmm2 | ||||
| 	addsd	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movlpd	 2 * SIZE(BB), %xmm2 | ||||
|  | @ -488,7 +488,7 @@ | |||
| 	movlpd	40 * SIZE(BB), %xmm3 | ||||
| 	addsd	%xmm0, %xmm7 | ||||
| 	movlpd	 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA) | ||||
| #endif | ||||
| 	mulsd	%xmm1, %xmm2 | ||||
|  | @ -1697,7 +1697,7 @@ | |||
| 
 | ||||
| .L42: | ||||
| 	mulpd	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulpd	 2 * SIZE(BB), %xmm0 | ||||
|  | @ -1727,7 +1727,7 @@ | |||
| 	addpd	%xmm0, %xmm7 | ||||
| 	movapd	16 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA) | ||||
| #endif | ||||
| 	mulpd	%xmm1, %xmm2 | ||||
|  |  | |||
|  | @ -64,7 +64,7 @@ | |||
| #define BORIG	60(%esp) | ||||
| #define BUFFER 128(%esp) | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | ||||
|  | @ -437,7 +437,7 @@ | |||
| .L32: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| 	addss	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -833,7 +833,7 @@ | |||
| .L22: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| 	addps	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -1848,7 +1848,7 @@ | |||
| 
 | ||||
| .L72: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulss	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -2109,7 +2109,7 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -2429,7 +2429,7 @@ | |||
| 
 | ||||
| .L52: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -2459,7 +2459,7 @@ | |||
| 	addps	%xmm0, %xmm5 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm2 | ||||
|  | @ -2952,7 +2952,7 @@ | |||
| 
 | ||||
| .L112: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 1 * SIZE(AA), %xmm0 | ||||
|  | @ -3148,7 +3148,7 @@ | |||
| 
 | ||||
| .L102: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 2 * SIZE(AA), %xmm0 | ||||
|  | @ -3389,7 +3389,7 @@ | |||
| 
 | ||||
| .L92: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(AA), %xmm0 | ||||
|  | @ -3404,7 +3404,7 @@ | |||
| 	mulps	12 * SIZE(BB), %xmm0 | ||||
| 	addps	%xmm0, %xmm7 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm3 | ||||
|  |  | |||
|  | @ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN	4096 | ||||
| #define STACK_OFFSET	1024 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | ||||
|  | @ -910,7 +910,7 @@ | |||
| .L22: | ||||
| 	mulsd	%xmm0, %xmm2 | ||||
| 	addsd	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movlpd	 2 * SIZE(BB), %xmm2 | ||||
|  | @ -959,7 +959,7 @@ | |||
| 	movlpd	40 * SIZE(BB), %xmm3 | ||||
| 	addsd	%xmm0, %xmm7 | ||||
| 	movlpd	 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA) | ||||
| #endif | ||||
| 	mulsd	%xmm1, %xmm2 | ||||
|  | @ -1439,7 +1439,7 @@ | |||
| 
 | ||||
| .L42: | ||||
| 	mulpd	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulpd	 2 * SIZE(BB), %xmm0 | ||||
|  | @ -1469,7 +1469,7 @@ | |||
| 	addpd	%xmm0, %xmm7 | ||||
| 	movapd	16 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA) | ||||
| #endif | ||||
| 	mulpd	%xmm1, %xmm2 | ||||
|  |  | |||
|  | @ -64,7 +64,7 @@ | |||
| #define BORIG	60(%esp) | ||||
| #define BUFFER 128(%esp) | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | ||||
|  | @ -872,7 +872,7 @@ | |||
| .L22: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| 	addps	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -1316,7 +1316,7 @@ | |||
| .L32: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| 	addss	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -1855,7 +1855,7 @@ | |||
| 
 | ||||
| .L52: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -1885,7 +1885,7 @@ | |||
| 	addps	%xmm0, %xmm5 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm2 | ||||
|  | @ -2249,7 +2249,7 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -2562,7 +2562,7 @@ | |||
| 
 | ||||
| .L72: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulss	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -2957,7 +2957,7 @@ | |||
| 
 | ||||
| .L92: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(AA), %xmm0 | ||||
|  | @ -2972,7 +2972,7 @@ | |||
| 	mulps	12 * SIZE(BB), %xmm0 | ||||
| 	addps	%xmm0, %xmm7 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm3 | ||||
|  | @ -3280,7 +3280,7 @@ | |||
| 
 | ||||
| .L102: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 2 * SIZE(AA), %xmm0 | ||||
|  | @ -3515,7 +3515,7 @@ | |||
| 
 | ||||
| .L112: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 1 * SIZE(AA), %xmm0 | ||||
|  |  | |||
|  | @ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN	4096 | ||||
| #define STACK_OFFSET	1024 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | ||||
|  | @ -1036,7 +1036,7 @@ | |||
| 
 | ||||
| .L42: | ||||
| 	mulpd	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulpd	 2 * SIZE(BB), %xmm0 | ||||
|  | @ -1066,7 +1066,7 @@ | |||
| 	addpd	%xmm0, %xmm7 | ||||
| 	movapd	16 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA) | ||||
| #endif | ||||
| 	mulpd	%xmm1, %xmm2 | ||||
|  | @ -2224,7 +2224,7 @@ | |||
| .L22: | ||||
| 	mulsd	%xmm0, %xmm2 | ||||
| 	addsd	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movlpd	 2 * SIZE(BB), %xmm2 | ||||
|  | @ -2273,7 +2273,7 @@ | |||
| 	movlpd	40 * SIZE(BB), %xmm3 | ||||
| 	addsd	%xmm0, %xmm7 | ||||
| 	movlpd	 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA) | ||||
| #endif | ||||
| 	mulsd	%xmm1, %xmm2 | ||||
|  |  | |||
|  | @ -64,7 +64,7 @@ | |||
| #define BORIG	60(%esp) | ||||
| #define BUFFER 128(%esp) | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | ||||
|  | @ -439,7 +439,7 @@ | |||
| 
 | ||||
| .L92: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(AA), %xmm0 | ||||
|  | @ -454,7 +454,7 @@ | |||
| 	mulps	12 * SIZE(BB), %xmm0 | ||||
| 	addps	%xmm0, %xmm7 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm3 | ||||
|  | @ -758,7 +758,7 @@ | |||
| 
 | ||||
| .L102: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 2 * SIZE(AA), %xmm0 | ||||
|  | @ -993,7 +993,7 @@ | |||
| 
 | ||||
| .L112: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 1 * SIZE(AA), %xmm0 | ||||
|  | @ -1324,7 +1324,7 @@ | |||
| 
 | ||||
| .L52: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -1354,7 +1354,7 @@ | |||
| 	addps	%xmm0, %xmm5 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm2 | ||||
|  | @ -1718,7 +1718,7 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -2031,7 +2031,7 @@ | |||
| 
 | ||||
| .L72: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulss	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -2859,7 +2859,7 @@ | |||
| .L22: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| 	addps	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -3303,7 +3303,7 @@ | |||
| .L32: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| 	addss	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BB), %xmm2 | ||||
|  |  | |||
|  | @ -74,7 +74,7 @@ | |||
| #define	BB	%ecx | ||||
| #define LDC	%ebp | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define movsd	movlps | ||||
| #endif | ||||
| 
 | ||||
|  | @ -625,7 +625,7 @@ | |||
| .L22: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| 	addps	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -870,7 +870,7 @@ | |||
| .L32: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| 	addss	%xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BB), %xmm2 | ||||
|  | @ -1173,7 +1173,7 @@ | |||
| 
 | ||||
| .L52: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -1203,7 +1203,7 @@ | |||
| 	addps	%xmm0, %xmm5 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm2 | ||||
|  | @ -1359,7 +1359,7 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -1536,7 +1536,7 @@ | |||
| 
 | ||||
| .L72: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	mulss	 4 * SIZE(BB), %xmm0 | ||||
|  | @ -1794,7 +1794,7 @@ | |||
| 
 | ||||
| .L92: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(AA), %xmm0 | ||||
|  | @ -1809,7 +1809,7 @@ | |||
| 	mulps	12 * SIZE(BB), %xmm0 | ||||
| 	addps	%xmm0, %xmm7 | ||||
| 	movaps	32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	mulps	%xmm1, %xmm3 | ||||
|  | @ -1936,7 +1936,7 @@ | |||
| 
 | ||||
| .L102: | ||||
| 	mulps	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movsd	 2 * SIZE(AA), %xmm0 | ||||
|  | @ -2069,7 +2069,7 @@ | |||
| 
 | ||||
| .L112: | ||||
| 	mulss	%xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| 	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA) | ||||
| #endif | ||||
| 	movss	 1 * SIZE(AA), %xmm0 | ||||
|  |  | |||
|  | @ -71,7 +71,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #ifdef BARCELONA | ||||
| #if defined(BARCELONA)  || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetchnta | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 5) | ||||
|  |  | |||
|  | @ -58,7 +58,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #ifdef BARCELONA | ||||
| #if defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetchnta | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(8 * 5) | ||||
|  |  | |||
|  | @ -71,7 +71,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #ifdef BARCELONA | ||||
| #if defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetchnta | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 5) | ||||
|  |  | |||
|  | @ -58,7 +58,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #ifdef BARCELONA | ||||
| #if defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetchnta | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(8 * 5) | ||||
|  |  | |||
|  | @ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN	4096 | ||||
| #define STACK_OFFSET	1024 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH      prefetch | ||||
|  | @ -533,7 +533,7 @@ | |||
| 	addps	%xmm0, %xmm7 | ||||
| 	movsd	16 * SIZE(AA), %xmm0 | ||||
| 	mulps	%xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	addps	%xmm2, %xmm4 | ||||
|  |  | |||
|  | @ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN	4096 | ||||
| #define STACK_OFFSET	1024 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH      prefetch | ||||
|  | @ -994,7 +994,7 @@ | |||
| 	addps	%xmm0, %xmm7 | ||||
| 	movsd	16 * SIZE(AA), %xmm0 | ||||
| 	mulps	%xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	addps	%xmm2, %xmm4 | ||||
|  |  | |||
|  | @ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN	4096 | ||||
| #define STACK_OFFSET	1024 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH      prefetch | ||||
|  | @ -1820,7 +1820,7 @@ | |||
| 	addps	%xmm0, %xmm7 | ||||
| 	movsd	16 * SIZE(AA), %xmm0 | ||||
| 	mulps	%xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 	prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | ||||
| 	addps	%xmm2, %xmm4 | ||||
|  |  | |||
|  | @ -0,0 +1,62 @@ | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | ||||
| ZGEMVTKERNEL = zgemv_t_dup.S | ||||
| 
 | ||||
| SGEMMKERNEL    =  gemm_kernel_8x4_barcelona.S | ||||
| SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c | ||||
| SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c | ||||
| SGEMMONCOPY    =  gemm_ncopy_4_opteron.S | ||||
| SGEMMOTCOPY    =  gemm_tcopy_4_opteron.S | ||||
| SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)  | ||||
| SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)  | ||||
| SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL    =  dgemm_kernel_4x4_bulldozer.S | ||||
| DGEMMINCOPY    = | ||||
| DGEMMITCOPY    = | ||||
| DGEMMONCOPY    =  gemm_ncopy_4_opteron.S | ||||
| DGEMMOTCOPY    =  gemm_tcopy_4_opteron.S | ||||
| DGEMMINCOPYOBJ = | ||||
| DGEMMITCOPYOBJ = | ||||
| DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMKERNEL    =  zgemm_kernel_4x2_barcelona.S | ||||
| CGEMMINCOPY    =  ../generic/zgemm_ncopy_4.c | ||||
| CGEMMITCOPY    =  ../generic/zgemm_tcopy_4.c | ||||
| CGEMMONCOPY    =  zgemm_ncopy_2.S | ||||
| CGEMMOTCOPY    =  zgemm_tcopy_2.S | ||||
| CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL    =  zgemm_kernel_2x2_barcelona.S | ||||
| ZGEMMINCOPY    = | ||||
| ZGEMMITCOPY    = | ||||
| ZGEMMONCOPY    =  zgemm_ncopy_2.S | ||||
| ZGEMMOTCOPY    =  zgemm_tcopy_2.S | ||||
| ZGEMMINCOPYOBJ = | ||||
| ZGEMMITCOPYOBJ = | ||||
| ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| 
 | ||||
| STRSMKERNEL_LN	=  trsm_kernel_LN_8x4_sse.S | ||||
| STRSMKERNEL_LT	=  trsm_kernel_LT_8x4_sse.S | ||||
| STRSMKERNEL_RN	=  trsm_kernel_LT_8x4_sse.S | ||||
| STRSMKERNEL_RT	=  trsm_kernel_RT_8x4_sse.S | ||||
| 
 | ||||
| DTRSMKERNEL_LN	=  trsm_kernel_LN_4x4_barcelona.S | ||||
| DTRSMKERNEL_LT	=  trsm_kernel_LT_4x4_barcelona.S | ||||
| DTRSMKERNEL_RN	=  trsm_kernel_LT_4x4_barcelona.S | ||||
| DTRSMKERNEL_RT	=  trsm_kernel_RT_4x4_barcelona.S | ||||
| 
 | ||||
| CTRSMKERNEL_LN	=  ztrsm_kernel_LN_4x2_sse.S | ||||
| CTRSMKERNEL_LT	=  ztrsm_kernel_LT_4x2_sse.S | ||||
| CTRSMKERNEL_RN	=  ztrsm_kernel_LT_4x2_sse.S | ||||
| CTRSMKERNEL_RT	=  ztrsm_kernel_RT_4x2_sse.S | ||||
| 
 | ||||
| ZTRSMKERNEL_LN	=  ztrsm_kernel_LN_2x2_sse2.S | ||||
| ZTRSMKERNEL_LT	=  ztrsm_kernel_LT_2x2_sse2.S | ||||
| ZTRSMKERNEL_RN	=  ztrsm_kernel_LT_2x2_sse2.S | ||||
| ZTRSMKERNEL_RT	=  ztrsm_kernel_RT_2x2_sse2.S | ||||
| 
 | ||||
| CGEMM3MKERNEL    =  zgemm3m_kernel_8x4_barcelona.S | ||||
| ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_barcelona.S | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -530,7 +530,7 @@ | |||
| #endif | ||||
| 	movsd	-32 * SIZE(Y), %xmm8 | ||||
| 
 | ||||
| 	pshufd	$0x39, %xmm4,  %xmm5 | ||||
| 	pshufd	$0x29, %xmm4,  %xmm5 | ||||
| 
 | ||||
| 	mulps	%xmm8,  %xmm5 | ||||
| 	addps	%xmm5,  %xmm3 | ||||
|  | @ -750,7 +750,8 @@ | |||
| 	xorps	%xmm5, %xmm5 | ||||
| 	movhlps	%xmm4, %xmm5 | ||||
| 
 | ||||
| 	mulps	-32 * SIZE(Y), %xmm5 | ||||
| 	movlps  -32 * SIZE(Y), %xmm4 | ||||
| 	mulps	%xmm4, %xmm5 | ||||
| 	addps	%xmm5, %xmm0 | ||||
| 
 | ||||
| 	addq	$2 * SIZE, X | ||||
|  | @ -992,7 +993,7 @@ | |||
| 	movsd	-32 * SIZE(Y), %xmm8 | ||||
| 
 | ||||
| 	movss	%xmm5, %xmm4 | ||||
| 	shufps	$0x93, %xmm5,  %xmm4 | ||||
| 	shufps	$0x93, %xmm4,  %xmm4 | ||||
| 
 | ||||
| 	mulps	%xmm8,  %xmm4 | ||||
| 	addps	%xmm4,  %xmm3 | ||||
|  |  | |||
|  | @ -930,7 +930,7 @@ | |||
| .L22: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| 	addps	%xmm9, %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movaps	 4 * SIZE(BO), %xmm9 | ||||
|  | @ -983,7 +983,7 @@ | |||
| 	addps	%xmm8, %xmm3 | ||||
| 	movaps	 0 * SIZE(AO), %xmm8 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm10, %xmm9 | ||||
|  | @ -1178,7 +1178,7 @@ | |||
| .L32: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| 	addps	%xmm9, %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movsd	 4 * SIZE(BO), %xmm9 | ||||
|  | @ -1423,7 +1423,7 @@ | |||
| .L42: | ||||
| 	mulss	%xmm8, %xmm9 | ||||
| 	addss	%xmm9, %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BO), %xmm9 | ||||
|  | @ -1765,7 +1765,7 @@ | |||
| 
 | ||||
| .L62: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	 4 * SIZE(BO), %xmm8 | ||||
|  | @ -1793,7 +1793,7 @@ | |||
| 	addps	%xmm8, %xmm5 | ||||
| 	movaps	32 * SIZE(AO), %xmm8 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm10, %xmm11 | ||||
|  | @ -1822,7 +1822,7 @@ | |||
| 	addps	%xmm10, %xmm5 | ||||
| 	movaps	48 * SIZE(AO), %xmm10 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm12, %xmm13 | ||||
|  | @ -1851,7 +1851,7 @@ | |||
| 	addps	%xmm12, %xmm5 | ||||
| 	movaps	64 * SIZE(AO), %xmm12 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm14, %xmm15 | ||||
|  | @ -2024,7 +2024,7 @@ | |||
| 
 | ||||
| .L72: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -2208,7 +2208,7 @@ | |||
| .L82: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| 	addps	%xmm9, %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movsd	 4 * SIZE(BO), %xmm9 | ||||
|  | @ -2395,7 +2395,7 @@ | |||
| .L92: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| 	addps	%xmm9, %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movss	 4 * SIZE(BO), %xmm9 | ||||
|  | @ -2670,7 +2670,7 @@ | |||
| 
 | ||||
| .L112: | ||||
| 	mulps	%xmm9, %xmm8 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 
 | ||||
|  | @ -2687,7 +2687,7 @@ | |||
| 	addps	%xmm9, %xmm4 | ||||
| 	movaps	 8 * SIZE(BO), %xmm9 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm9, %xmm10 | ||||
|  | @ -2704,7 +2704,7 @@ | |||
| 	addps	%xmm9, %xmm4 | ||||
| 	movaps	32 * SIZE(BO), %xmm9 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm11, %xmm12 | ||||
|  | @ -2721,7 +2721,7 @@ | |||
| 	addps	%xmm11, %xmm4 | ||||
| 	movaps	24 * SIZE(BO), %xmm11 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm11, %xmm14 | ||||
|  | @ -2857,7 +2857,7 @@ | |||
| 
 | ||||
| .L122: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movaps	-28 * SIZE(AO), %xmm8 | ||||
|  | @ -2873,7 +2873,7 @@ | |||
| 	addps	%xmm8, %xmm3 | ||||
| 	movaps	  0 * SIZE(AO), %xmm8 | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO) | ||||
| #endif | ||||
| 	mulps	%xmm10, %xmm11 | ||||
|  | @ -3003,7 +3003,7 @@ | |||
| 
 | ||||
| .L132: | ||||
| 	mulps	%xmm8, %xmm9 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movsd	-30 * SIZE(AO), %xmm8 | ||||
|  | @ -3150,7 +3150,7 @@ | |||
| 
 | ||||
| .L142: | ||||
| 	mulss	%xmm8, %xmm9 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO) | ||||
| #endif | ||||
| 	movss	-31 * SIZE(AO), %xmm8 | ||||
|  |  | |||
|  | @ -39,7 +39,7 @@ | |||
| #define ASSEMBLER | ||||
| #include "common.h" | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define RPREFETCHSIZE (12 + 4) | ||||
| #define WPREFETCHSIZE (48 + 4) | ||||
| #define MOVNTQ	 MOVQ | ||||
|  | @ -79,7 +79,7 @@ | |||
| #define AO3	%r13 | ||||
| #define AO4	%rax | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define RPREFETCH prefetch | ||||
| #else | ||||
| #define RPREFETCH prefetch | ||||
|  |  | |||
|  | @ -39,7 +39,7 @@ | |||
| #define ASSEMBLER | ||||
| #include "common.h" | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define RPREFETCHSIZE (12 + 4) | ||||
| #define WPREFETCHSIZE (12 + 4) | ||||
| #define MOVNTQ	 MOVQ | ||||
|  | @ -96,7 +96,7 @@ | |||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define RPREFETCH prefetch | ||||
| #else | ||||
| #define RPREFETCH prefetch | ||||
|  |  | |||
|  | @ -469,7 +469,7 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L71: | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 	prefetch	PREFETCHSIZE * SIZE(X) | ||||
| #endif | ||||
| 
 | ||||
|  |  | |||
|  | @ -266,7 +266,7 @@ | |||
| 	sarq	$5, I | ||||
| 	jle	.L113 | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 
 | ||||
| 	movaps	%xmm0, %xmm1 | ||||
| 	mulps	-32 * SIZE(X), %xmm1 | ||||
|  |  | |||
|  | @ -251,7 +251,7 @@ | |||
| 	sarq	$4, I | ||||
| 	jle	.L113 | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| 
 | ||||
| 	movaps  %xmm0, %xmm1 | ||||
| 	mulpd	-16 * SIZE(X), %xmm1 | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| /*********************************************************************/ | ||||
| /* Copyright 2009, 2010 The University of Texas at Austin.           */ | ||||
| /* All rights reserved.                                              */ | ||||
| /*                                                                   */ | ||||
|  | @ -47,7 +46,7 @@ | |||
| 	 | ||||
| #ifndef WINDOWS_ABI | ||||
| 
 | ||||
| #define STACKSIZE	64 | ||||
| #define STACKSIZE	128 | ||||
| 	 | ||||
| #define OLD_M	  %rdi | ||||
| #define OLD_N	  %rsi | ||||
|  | @ -57,6 +56,10 @@ | |||
| #define STACK_Y		16 + STACKSIZE(%rsp) | ||||
| #define STACK_INCY	24 + STACKSIZE(%rsp) | ||||
| #define STACK_BUFFER	32 + STACKSIZE(%rsp) | ||||
| #define MMM		56(%rsp) | ||||
| #define NN		64(%rsp) | ||||
| #define AA		72(%rsp) | ||||
| #define LDAX	80(%rsp) | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
|  | @ -71,6 +74,10 @@ | |||
| #define STACK_Y		 72 + STACKSIZE(%rsp) | ||||
| #define STACK_INCY	 80 + STACKSIZE(%rsp) | ||||
| #define STACK_BUFFER	 88 + STACKSIZE(%rsp) | ||||
| #define MMM	216(%rsp) | ||||
| #define NN	224(%rsp) | ||||
| #define AA	232(%rsp) | ||||
| #define LDAX 240(%rsp) | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
|  | @ -127,29 +134,48 @@ | |||
| 	movups	%xmm14, 192(%rsp) | ||||
| 	movups	%xmm15, 208(%rsp) | ||||
| 
 | ||||
| 	movq	OLD_M,	      M | ||||
| 	movq	OLD_N,        N | ||||
| 	movq	OLD_A,        A | ||||
| 	movq	OLD_LDA,      LDA | ||||
| 	movq	OLD_M,	      MMM | ||||
| 	movq	OLD_N,        NN | ||||
| 	movq	OLD_A,        X | ||||
| 	movq	X,	      AA | ||||
| 	movq	OLD_LDA,      X | ||||
| 	movq	X,	      LDAX | ||||
| 	movq	OLD_X,        X | ||||
| #else | ||||
| 	movq	OLD_M,	      M | ||||
| 	movq	OLD_N,        N | ||||
| 	movq	OLD_A,        A | ||||
| 	movq	OLD_LDA,      LDA | ||||
| 	movq	OLD_M,	      MMM | ||||
| 	movq	OLD_N,        NN | ||||
| 	movq	OLD_A,        AA | ||||
| 	movq	OLD_LDA,      LDAX | ||||
| #endif | ||||
| 
 | ||||
| 	movq	STACK_INCX,   INCX | ||||
| 	movq	STACK_Y,      Y | ||||
| 	movq	STACK_INCY,   INCY | ||||
| 	movq	STACK_BUFFER, BUFFER | ||||
| 
 | ||||
| #ifndef WINDOWS_ABI | ||||
| 	pshufd	$0, %xmm0, ALPHA | ||||
| #else | ||||
| 	pshufd	$0, %xmm3, ALPHA | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| .L0t: | ||||
| 	xorq	M,M | ||||
| 	addq	$1,M | ||||
| 	salq	$22,M | ||||
| 	subq	M,MMM | ||||
| 	jge		.L00t | ||||
| 	ALIGN_4 | ||||
| 	 | ||||
| 	movq	MMM,%rax | ||||
| 	addq	M,%rax | ||||
| 	jle		.L999x | ||||
| 	movq	%rax,M | ||||
| 
 | ||||
| .L00t: | ||||
| 	movq	LDAX,LDA | ||||
| 	movq	NN,N | ||||
| 	movq	AA,A | ||||
| 	movq	STACK_INCX,   INCX | ||||
| 	movq	STACK_Y,      Y | ||||
| 	movq	STACK_INCY,   INCY | ||||
| 	movq	STACK_BUFFER, BUFFER | ||||
| 
 | ||||
| 	leaq	(,INCX, SIZE), INCX | ||||
| 	leaq	(,INCY, SIZE), INCY | ||||
| 	leaq	(,LDA,  SIZE), LDA | ||||
|  | @ -6341,6 +6367,12 @@ | |||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999: | ||||
| 	leaq	(,M,SIZE),%rax | ||||
| 	addq	%rax,AA | ||||
| 	jmp		.L0t | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999x: | ||||
| 	movq	  0(%rsp), %rbx | ||||
| 	movq	  8(%rsp), %rbp | ||||
| 	movq	 16(%rsp), %r12 | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlpd | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlpd | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ | |||
| #define PREFETCHW    prefetcht0 | ||||
| #endif | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define movsd movlps | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ | |||
| #define PREFETCHW    prefetcht0 | ||||
| #endif | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define movsd movlps | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ | |||
| #define PREFETCHW    prefetcht0 | ||||
| #endif | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define movsd movlps | ||||
|  |  | |||
|  | @ -699,7 +699,7 @@ | |||
| 	movsd	-32 * SIZE(X), %xmm4 | ||||
| 
 | ||||
| 	pshufd	$0xb1,  %xmm4, %xmm12  | ||||
| 	shufps	$0x39,  %xmm8, %xmm8 | ||||
| 	shufps	$0x59,  %xmm8, %xmm8 | ||||
| 	mulps	%xmm8,  %xmm4 | ||||
| 	addps	%xmm4,  %xmm0 | ||||
| 	mulps	%xmm8,  %xmm12 | ||||
|  | @ -1336,7 +1336,7 @@ | |||
| 
 | ||||
| 	movss	%xmm9,  %xmm8 | ||||
| 	pshufd	$0xb1,  %xmm4, %xmm12  | ||||
| 	shufps	$0x93,  %xmm8, %xmm8 | ||||
| 	shufps	$0x03,  %xmm8, %xmm8 | ||||
| 	mulps	%xmm8,  %xmm4 | ||||
| 	addps	%xmm4,  %xmm0 | ||||
| 	mulps	%xmm8,  %xmm12 | ||||
|  | @ -1697,7 +1697,7 @@ | |||
| 	movsd	-32 * SIZE(Y), %xmm4 | ||||
| 
 | ||||
| 	pshufd	$0xb1,  %xmm4, %xmm12  | ||||
| 	shufps	$0x39,  %xmm8, %xmm8 | ||||
| 	shufps	$0xa9,  %xmm8, %xmm8 | ||||
| 	mulps	%xmm8,  %xmm4 | ||||
| 	addps	%xmm4,  %xmm0 | ||||
| 	mulps	%xmm8,  %xmm12 | ||||
|  | @ -2024,7 +2024,7 @@ | |||
| 
 | ||||
| 	movss	%xmm9,  %xmm8 | ||||
| 	pshufd	$0xb1,  %xmm4, %xmm12  | ||||
| 	shufps	$0x93,  %xmm8, %xmm8 | ||||
| 	shufps	$0x03,  %xmm8, %xmm8 | ||||
| 	mulps	%xmm8,  %xmm4 | ||||
| 	addps	%xmm4,  %xmm0 | ||||
| 	mulps	%xmm8,  %xmm12 | ||||
|  |  | |||
|  | @ -85,7 +85,7 @@ | |||
| #define movsd movlpd | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define RPREFETCHSIZE 32 | ||||
| #define WPREFETCHSIZE 48 | ||||
| #endif | ||||
|  |  | |||
|  | @ -160,7 +160,7 @@ | |||
| #define a3     %xmm14 | ||||
| #define	xt1    %xmm15 | ||||
| 
 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define MOVDDUP(a, b, c)	movddup	a(b), c | ||||
| #define MOVDDUP2(a, b, c)	movddup	a##b, c | ||||
| #else | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlpd | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  | @ -167,7 +167,7 @@ | |||
| #define a3     %xmm14 | ||||
| #define	xt1    %xmm15 | ||||
| 
 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define MOVDDUP(a, b, c)	movddup	a(b), c | ||||
| #define MOVDDUP2(a, b, c)	movddup	a##b, c | ||||
| #else | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlpd | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  | @ -166,7 +166,7 @@ | |||
| #define	xt1    %xmm14 | ||||
| #define	xt2    %xmm15 | ||||
| 
 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define MOVDDUP(a, b, c)	movddup	a(b), c | ||||
| #define MOVDDUP2(a, b, c)	movddup	a##b, c | ||||
| #else | ||||
|  |  | |||
|  | @ -76,7 +76,7 @@ | |||
| #define movsd		movlpd | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE	(16 * 16) | ||||
|  | @ -166,7 +166,7 @@ | |||
| #define a3     %xmm14 | ||||
| #define	xt1    %xmm15 | ||||
| 
 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | ||||
| #define MOVDDUP(a, b, c)	movddup	a(b), c | ||||
| #define MOVDDUP2(a, b, c)	movddup	a##b, c | ||||
| #else | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ | |||
| #define BORIG	 72(%rsp) | ||||
| #define BUFFER	128(%rsp) | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHNTA  prefetchnta | ||||
|  |  | |||
|  | @ -95,7 +95,7 @@ | |||
| #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHNTA  prefetchnta | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ | |||
| #define BORIG	 72(%rsp) | ||||
| #define BUFFER	128(%rsp) | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHNTA  prefetchnta | ||||
|  |  | |||
|  | @ -95,7 +95,7 @@ | |||
| #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHNTA  prefetchnta | ||||
|  |  | |||
|  | @ -86,7 +86,7 @@ | |||
| #define BORIG	 72(%rsp) | ||||
| #define BUFFER	128(%rsp) | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHNTA  prefetchnta | ||||
|  |  | |||
|  | @ -95,7 +95,7 @@ | |||
| #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | ||||
| 
 | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define PREFETCH     prefetch | ||||
| #define PREFETCHW    prefetchw | ||||
| #define PREFETCHNTA  prefetchnta | ||||
|  |  | |||
|  | @ -74,6 +74,13 @@ | |||
| #define ALIGNED_ACCESS | ||||
| #endif | ||||
| 
 | ||||
| #ifdef BULLDOZER | ||||
| #define PREFETCH	prefetch | ||||
| #define PREFETCHW	prefetchw | ||||
| #define PREFETCHSIZE (128 *   5) | ||||
| #define ALIGNED_ACCESS | ||||
| #endif | ||||
| 
 | ||||
| #ifdef NANO | ||||
| #define PREFETCH        prefetcht0 | ||||
| #define PREFETCHW       prefetcht0 | ||||
|  |  | |||
|  | @ -85,7 +85,7 @@ | |||
| #define movsd		movlps | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| #define ALIGNED_ACCESS | ||||
| #define MOVUPS_A	movaps | ||||
| #define MOVUPS_XL	movaps | ||||
|  |  | |||
|  | @ -66,7 +66,9 @@ static FLOAT dm1 = -1.; | |||
| #endif | ||||
| 
 | ||||
| #define GEMM_PQ  MAX(GEMM_P, GEMM_Q) | ||||
| #define REAL_GEMM_R (GEMM_R - GEMM_PQ) | ||||
| 
 | ||||
| // Leave some space for GEMM_ALIGN in sb2.
 | ||||
| #define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ) | ||||
| 
 | ||||
| #if 0 | ||||
| #define SHARED_ARRAY | ||||
|  | @ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| 			sa, | ||||
| 			sb2, | ||||
| 			a + (is + js * lda) * COMPSIZE, lda, | ||||
| 			- is + js); | ||||
| 			is - js); | ||||
| #endif | ||||
| 
 | ||||
| 	} | ||||
|  |  | |||
							
								
								
									
										2
									
								
								make.inc
								
								
								
								
							
							
						
						
									
										2
									
								
								make.inc
								
								
								
								
							|  | @ -4,7 +4,7 @@ DRVOPTS  = $(OPTS) | |||
| LOADER   = $(FORTRAN) | ||||
| TIMER     = NONE | ||||
| ARCHFLAGS= -ru | ||||
| RANLIB   = ranlib | ||||
| #RANLIB   = ranlib
 | ||||
| BLASLIB      =  | ||||
| TMGLIB       = tmglib.a | ||||
| EIGSRCLIB    = eigsrc.a | ||||
|  |  | |||
|  | @ -48,7 +48,8 @@ typedef int blasint; | |||
| /* C99 supports complex floating-point numbers natively, which GCC also offers as an
 | ||||
|    extension since version 3.0.  If neither is available, use a compatible | ||||
|    structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | ||||
| #if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 | ||||
| #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | ||||
|      (__GNUC__ >= 3 && !defined(__cplusplus))) | ||||
|   #define OPENBLAS_COMPLEX_C99 | ||||
|   #include <complex.h> | ||||
|   typedef float _Complex openblas_complex_float; | ||||
|  |  | |||
							
								
								
									
										2
									
								
								param.h
								
								
								
								
							
							
						
						
									
										2
									
								
								param.h
								
								
								
								
							|  | @ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | ||||
| 
 | ||||
| #define SNUMOPT		8 | ||||
| #define DNUMOPT		4 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue