Merge branch 'develop' into loongson3a
This commit is contained in:
		
						commit
						35b943f17f
					
				| 
						 | 
					@ -1,4 +1,22 @@
 | 
				
			||||||
OpenBLAS ChangeLog
 | 
					OpenBLAS ChangeLog
 | 
				
			||||||
 | 
					====================================================================
 | 
				
			||||||
 | 
					Version 0.2.6
 | 
				
			||||||
 | 
					2-Mar-2013
 | 
				
			||||||
 | 
					common:
 | 
				
			||||||
 | 
						* Improved OpenMP performance slightly. (d744c9)
 | 
				
			||||||
 | 
						* Improved cblas.h compatibility with Intel MKL. (#185)
 | 
				
			||||||
 | 
						* Fixed the overflow bug in single-threaded Cholesky factorization.
 | 
				
			||||||
 | 
						* Fixed the buffer overflow bug in multithreaded hbmv and sbmv. (#174)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					x86/x86-64:
 | 
				
			||||||
 | 
						* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
 | 
				
			||||||
 | 
						  We will tune the performance in the future.
 | 
				
			||||||
 | 
						* Auto-detect Intel Xeon E7540.
 | 
				
			||||||
 | 
						* Fixed the buffer overflow bug in gemv. (#173)
 | 
				
			||||||
 | 
						* Fixed the s/cdot bug involving an invalid read of NaN on x86_64. (#189)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					MIPS64:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
====================================================================
 | 
					====================================================================
 | 
				
			||||||
Version 0.2.5
 | 
					Version 0.2.5
 | 
				
			||||||
26-Nov-2012
 | 
					26-Nov-2012
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										8
									
								
								Makefile
								
								
								
								
							
							
						
						
									
										8
									
								
								Makefile
								
								
								
								
							| 
						 | 
					@ -225,9 +225,9 @@ ifndef NOFORTRAN
 | 
				
			||||||
	-@echo "LOADOPTS    = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
						-@echo "LOADOPTS    = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
				
			||||||
	-@echo "CC          = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
						-@echo "CC          = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
				
			||||||
ifdef INTERFACE64
 | 
					ifdef INTERFACE64
 | 
				
			||||||
	-@echo "CFLAGS      = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H  -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
						-@echo "override CFLAGS      = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H  -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
				
			||||||
else
 | 
					else
 | 
				
			||||||
	-@echo "CFLAGS      = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
						-@echo "override CFLAGS      = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
	-@echo "ARCH        = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
						-@echo "ARCH        = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
				
			||||||
	-@echo "ARCHFLAGS   = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
						-@echo "ARCHFLAGS   = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
 | 
				
			||||||
| 
						 | 
					@ -267,7 +267,7 @@ else
 | 
				
			||||||
ifeq ($(OSNAME), FreeBSD)
 | 
					ifeq ($(OSNAME), FreeBSD)
 | 
				
			||||||
	fetch $(LAPACK_URL)
 | 
						fetch $(LAPACK_URL)
 | 
				
			||||||
else
 | 
					else
 | 
				
			||||||
	wget $(LAPACK_URL)
 | 
						wget -O $@ $(LAPACK_URL)
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
| 
						 | 
					@ -314,7 +314,7 @@ clean ::
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	@$(MAKE) -C reference clean
 | 
						@$(MAKE) -C reference clean
 | 
				
			||||||
	@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
 | 
						@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
 | 
				
			||||||
	@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
 | 
						@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
 | 
				
			||||||
	@if test -d $(NETLIB_LAPACK_DIR); then \
 | 
						@if test -d $(NETLIB_LAPACK_DIR); then \
 | 
				
			||||||
	echo deleting $(NETLIB_LAPACK_DIR); \
 | 
						echo deleting $(NETLIB_LAPACK_DIR); \
 | 
				
			||||||
	rm -rf $(NETLIB_LAPACK_DIR) ;\
 | 
						rm -rf $(NETLIB_LAPACK_DIR) ;\
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,3 +1,5 @@
 | 
				
			||||||
 | 
					# This is triggered by Makefile.system and runs before any of the code is built.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export BINARY
 | 
					export BINARY
 | 
				
			||||||
export USE_OPENMP
 | 
					export USE_OPENMP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -15,7 +17,7 @@ ifdef CPUIDEMU
 | 
				
			||||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
 | 
					EXFLAGS = -DCPUIDEMU -DVENDOR=99
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
all: getarch_2nd
 | 
					all: getarch_2nd cblas_noconst.h
 | 
				
			||||||
	./getarch_2nd  0 >> $(TARGET_MAKE)
 | 
						./getarch_2nd  0 >> $(TARGET_MAKE)
 | 
				
			||||||
	./getarch_2nd  1 >> $(TARGET_CONF)
 | 
						./getarch_2nd  1 >> $(TARGET_CONF)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -36,4 +38,7 @@ else
 | 
				
			||||||
	$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
 | 
						$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cblas_noconst.h : cblas.h
 | 
				
			||||||
 | 
						perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h	
 | 
				
			||||||
 | 
					
 | 
				
			||||||
dummy:
 | 
					dummy:
 | 
				
			||||||
| 
						 | 
					@ -3,7 +3,7 @@
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# This library's version
 | 
					# This library's version
 | 
				
			||||||
VERSION = 0.2.5
 | 
					VERSION = 0.2.6
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 | 
					# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 | 
				
			||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library 
 | 
					# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library 
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE
 | 
				
			||||||
export GOTOBLAS_MAKEFILE = 1
 | 
					export GOTOBLAS_MAKEFILE = 1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Generating Makefile.conf and config.h
 | 
					# Generating Makefile.conf and config.h
 | 
				
			||||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
 | 
					DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ifndef TARGET_CORE
 | 
					ifndef TARGET_CORE
 | 
				
			||||||
include $(TOPDIR)/Makefile.conf
 | 
					include $(TOPDIR)/Makefile.conf
 | 
				
			||||||
| 
						 | 
					@ -277,14 +277,14 @@ ifeq ($(ARCH), x86)
 | 
				
			||||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 | 
					DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 | 
				
			||||||
	       CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 | 
						       CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 | 
				
			||||||
ifneq ($(NO_AVX), 1)
 | 
					ifneq ($(NO_AVX), 1)
 | 
				
			||||||
DYNAMIC_CORE += SANDYBRIDGE 
 | 
					DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ifeq ($(ARCH), x86_64)
 | 
					ifeq ($(ARCH), x86_64)
 | 
				
			||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 | 
					DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 | 
				
			||||||
ifneq ($(NO_AVX), 1)
 | 
					ifneq ($(NO_AVX), 1)
 | 
				
			||||||
DYNAMIC_CORE += SANDYBRIDGE 
 | 
					DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -27,7 +27,7 @@ On X86 box, compile this library for loongson3a CPU.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    make DEBUG=1
 | 
					    make DEBUG=1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Intall to the directory (Optional)
 | 
					### Install to the directory (Optional)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Example:
 | 
					Example:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt
 | 
				
			||||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
 | 
					- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
 | 
				
			||||||
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
 | 
					- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
 | 
				
			||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
 | 
					- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
 | 
				
			||||||
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
 | 
					- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#### MIPS64:
 | 
					#### MIPS64:
 | 
				
			||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
 | 
					- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,8 +8,8 @@ Supported List:
 | 
				
			||||||
1.X86/X86_64
 | 
					1.X86/X86_64
 | 
				
			||||||
a)Intel CPU:
 | 
					a)Intel CPU:
 | 
				
			||||||
P2
 | 
					P2
 | 
				
			||||||
COPPERMINE
 | 
					 | 
				
			||||||
KATMAI
 | 
					KATMAI
 | 
				
			||||||
 | 
					COPPERMINE
 | 
				
			||||||
NORTHWOOD
 | 
					NORTHWOOD
 | 
				
			||||||
PRESCOTT
 | 
					PRESCOTT
 | 
				
			||||||
BANIAS
 | 
					BANIAS
 | 
				
			||||||
| 
						 | 
					@ -29,6 +29,7 @@ BARCELONA
 | 
				
			||||||
SHANGHAI
 | 
					SHANGHAI
 | 
				
			||||||
ISTANBUL
 | 
					ISTANBUL
 | 
				
			||||||
BOBCAT
 | 
					BOBCAT
 | 
				
			||||||
 | 
					BULLDOZER
 | 
				
			||||||
 | 
					
 | 
				
			||||||
c)VIA CPU:
 | 
					c)VIA CPU:
 | 
				
			||||||
SSE_GENERIC
 | 
					SSE_GENERIC
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										448
									
								
								cblas.h
								
								
								
								
							
							
						
						
									
										448
									
								
								cblas.h
								
								
								
								
							| 
						 | 
					@ -1,291 +1,293 @@
 | 
				
			||||||
#ifndef CBLAS_H
 | 
					#ifndef CBLAS_H
 | 
				
			||||||
#define CBLAS_H
 | 
					#define CBLAS_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <stddef.h>
 | 
				
			||||||
 | 
					#include "common.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef __cplusplus
 | 
					#ifdef __cplusplus
 | 
				
			||||||
extern "C" {
 | 
					extern "C" {
 | 
				
			||||||
	/* Assume C declarations for C++ */
 | 
						/* Assume C declarations for C++ */
 | 
				
			||||||
#endif  /* __cplusplus */
 | 
					#endif  /* __cplusplus */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <stddef.h>
 | 
					 | 
				
			||||||
#include "common.h"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
/*Set the number of threads on runtime.*/
 | 
					/*Set the number of threads on runtime.*/
 | 
				
			||||||
void openblas_set_num_threads(int num_threads);
 | 
					void openblas_set_num_threads(int num_threads);
 | 
				
			||||||
void goto_set_num_threads(int num_threads);
 | 
					void goto_set_num_threads(int num_threads);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*Get the build configuration at runtime.*/
 | 
				
			||||||
 | 
					char* openblas_get_config(void);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define CBLAS_INDEX size_t
 | 
					#define CBLAS_INDEX size_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102};
 | 
					typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
 | 
				
			||||||
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114};
 | 
					typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
 | 
				
			||||||
enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122};
 | 
					typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
 | 
				
			||||||
enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132};
 | 
					typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
 | 
				
			||||||
enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142};
 | 
					typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
float  cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy);
 | 
					float  cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
 | 
				
			||||||
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
 | 
					double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
 | 
				
			||||||
float  cblas_sdot(blasint n, float  *x, blasint incx, float  *y, blasint incy);
 | 
					float  cblas_sdot(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy);
 | 
				
			||||||
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
openblas_complex_float  cblas_cdotu(blasint n, float  *x, blasint incx, float  *y, blasint incy);
 | 
					openblas_complex_float  cblas_cdotu(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy);
 | 
				
			||||||
openblas_complex_float  cblas_cdotc(blasint n, float  *x, blasint incx, float  *y, blasint incy);
 | 
					openblas_complex_float  cblas_cdotc(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy);
 | 
				
			||||||
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
 | 
				
			||||||
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void  cblas_cdotu_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret);
 | 
					void  cblas_cdotu_sub(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy, openblas_complex_float  *ret);
 | 
				
			||||||
void  cblas_cdotc_sub(blasint n, float  *x, blasint incx, float  *y, blasint incy, openblas_complex_float  *ret);
 | 
					void  cblas_cdotc_sub(const blasint n, const float  *x, const blasint incx, const float  *y, const blasint incy, openblas_complex_float  *ret);
 | 
				
			||||||
void  cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
 | 
					void  cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
 | 
				
			||||||
void  cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
 | 
					void  cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
float  cblas_sasum (blasint n, float  *x, blasint incx);
 | 
					float  cblas_sasum (const blasint n, const float  *x, const blasint incx);
 | 
				
			||||||
double cblas_dasum (blasint n, double *x, blasint incx);
 | 
					double cblas_dasum (const blasint n, const double *x, const blasint incx);
 | 
				
			||||||
float  cblas_scasum(blasint n, float  *x, blasint incx);
 | 
					float  cblas_scasum(const blasint n, const float  *x, const blasint incx);
 | 
				
			||||||
double cblas_dzasum(blasint n, double *x, blasint incx);
 | 
					double cblas_dzasum(const blasint n, const double *x, const blasint incx);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
float  cblas_snrm2 (blasint N, float  *X, blasint incX);
 | 
					float  cblas_snrm2 (const blasint N, const float  *X, const blasint incX);
 | 
				
			||||||
double cblas_dnrm2 (blasint N, double *X, blasint incX);
 | 
					double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
 | 
				
			||||||
float  cblas_scnrm2(blasint N, float  *X, blasint incX);
 | 
					float  cblas_scnrm2(const blasint N, const float  *X, const blasint incX);
 | 
				
			||||||
double cblas_dznrm2(blasint N, double *X, blasint incX);
 | 
					double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
CBLAS_INDEX cblas_isamax(blasint n, float  *x, blasint incx);
 | 
					CBLAS_INDEX cblas_isamax(const blasint n, const float  *x, const blasint incx);
 | 
				
			||||||
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
 | 
					CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
 | 
				
			||||||
CBLAS_INDEX cblas_icamax(blasint n, float  *x, blasint incx);
 | 
					CBLAS_INDEX cblas_icamax(const blasint n, const float  *x, const blasint incx);
 | 
				
			||||||
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
 | 
					CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy);
 | 
					void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
 | 
				
			||||||
void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy);
 | 
					void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
 | 
				
			||||||
void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy);
 | 
					void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
 | 
				
			||||||
void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy);
 | 
					void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
 | 
					void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
 | 
				
			||||||
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
 | 
				
			||||||
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
 | 
					void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
 | 
				
			||||||
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
 | 
					void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
 | 
				
			||||||
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
 | 
				
			||||||
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
 | 
					void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
 | 
				
			||||||
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
 | 
					void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
 | 
					void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
 | 
				
			||||||
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double  s);
 | 
					void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double  s);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_srotg(float *a, float *b, float *c, float *s);
 | 
					void cblas_srotg(float *a, float *b, float *c, float *s);
 | 
				
			||||||
void cblas_drotg(double *a, double *b, double *c, double *s);
 | 
					void cblas_drotg(double *a, double *b, double *c, double *s);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
 | 
					void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
 | 
				
			||||||
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
 | 
					void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
 | 
					void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
 | 
				
			||||||
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
 | 
					void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
 | 
					void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
 | 
					void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
 | 
				
			||||||
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
 | 
					void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
 | 
				
			||||||
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
 | 
					void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
 | 
				
			||||||
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
 | 
					void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
 | 
				
			||||||
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
 | 
					void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 | 
					void cblas_sgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
 | 
				
			||||||
		 float alpha, float  *a, blasint lda,  float  *x, blasint incx,  float beta,  float  *y, blasint incy);
 | 
							 const float alpha, const float  *a, const blasint lda,  const float  *x, const blasint incx,  const float beta,  float  *y, const blasint incy);
 | 
				
			||||||
void cblas_dgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 | 
					void cblas_dgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
 | 
				
			||||||
		 double alpha, double  *a, blasint lda,  double  *x, blasint incx,  double beta,  double  *y, blasint incy);
 | 
							 const double alpha, const double  *a, const blasint lda,  const double  *x, const blasint incx,  const double beta,  double  *y, const blasint incy);
 | 
				
			||||||
void cblas_cgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 | 
					void cblas_cgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
 | 
				
			||||||
		 float *alpha, float  *a, blasint lda,  float  *x, blasint incx,  float *beta,  float  *y, blasint incy);
 | 
							 const float *alpha, const float  *a, const blasint lda,  const float  *x, const blasint incx,  const float *beta,  float  *y, const blasint incy);
 | 
				
			||||||
void cblas_zgemv(enum CBLAS_ORDER order,  enum CBLAS_TRANSPOSE trans,  blasint m, blasint n,
 | 
					void cblas_zgemv(const enum CBLAS_ORDER order,  const enum CBLAS_TRANSPOSE trans,  const blasint m, const blasint n,
 | 
				
			||||||
		 double *alpha, double  *a, blasint lda,  double  *x, blasint incx,  double *beta,  double  *y, blasint incy);
 | 
							 const double *alpha, const double  *a, const blasint lda,  const double  *x, const blasint incx,  const double *beta,  double  *y, const blasint incy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float   alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
 | 
					void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float   alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda);
 | 
				
			||||||
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double  alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
 | 
					void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double  alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
 | 
				
			||||||
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
 | 
					void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float  *alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda);
 | 
				
			||||||
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float  *alpha, float  *X, blasint incX, float  *Y, blasint incY, float  *A, blasint lda);
 | 
					void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float  *alpha, const float  *X, const blasint incX, const float  *Y, const blasint incY, float  *A, const blasint lda);
 | 
				
			||||||
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
 | 
					void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
 | 
				
			||||||
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
 | 
					void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 | 
					void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 | 
					void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 | 
					void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 | 
					void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 | 
					void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 | 
					void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
 | 
					void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
 | 
					void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
 | 
					void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
 | 
				
			||||||
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
 | 
					void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
 | 
				
			||||||
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
 | 
					void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
 | 
				
			||||||
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
 | 
					void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
 | 
					void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
 | 
				
			||||||
                blasint incX, float *Y, blasint incY, float *A, blasint lda);
 | 
					                const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
 | 
				
			||||||
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
 | 
					void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
 | 
				
			||||||
                blasint incX, double *Y, blasint incY, double *A, blasint lda);
 | 
					                const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
 | 
				
			||||||
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
 | 
					void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
 | 
				
			||||||
                float *Y, blasint incY, float *A, blasint lda);
 | 
					                const float *Y, const blasint incY, float *A, const blasint lda);
 | 
				
			||||||
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
 | 
					void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
 | 
				
			||||||
                double *Y, blasint incY, double *A, blasint lda);
 | 
					                const double *Y, const blasint incY, double *A, const blasint lda);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
 | 
					void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
 | 
				
			||||||
                 blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
 | 
					                 const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
 | 
					void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
 | 
				
			||||||
                 blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
 | 
					                 const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
 | 
				
			||||||
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
 | 
					void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
 | 
				
			||||||
                 blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
 | 
					                 const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
 | 
					void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
 | 
				
			||||||
                 blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
 | 
					                 const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
 | 
					void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
 | 
				
			||||||
                 blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
 | 
					                 const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
 | 
					void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
 | 
				
			||||||
                 blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
 | 
					                 const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
 | 
				
			||||||
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
 | 
					                 const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, float *Ap, float *X, blasint incX);
 | 
					                 const blasint N, const float *Ap, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, double *Ap, double *X, blasint incX);
 | 
					                 const blasint N, const double *Ap, double *X, const blasint incX);
 | 
				
			||||||
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, float *Ap, float *X, blasint incX);
 | 
					                 const blasint N, const float *Ap, float *X, const blasint incX);
 | 
				
			||||||
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, double *Ap, double *X, blasint incX);
 | 
					                 const blasint N, const double *Ap, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, float *Ap, float *X, blasint incX);
 | 
					                 const blasint N, const float *Ap, float *X, const blasint incX);
 | 
				
			||||||
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, double *Ap, double *X, blasint incX);
 | 
					                 const blasint N, const double *Ap, double *X, const blasint incX);
 | 
				
			||||||
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, float *Ap, float *X, blasint incX);
 | 
					                 const blasint N, const float *Ap, float *X, const blasint incX);
 | 
				
			||||||
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
 | 
					void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
 | 
				
			||||||
                 blasint N, double *Ap, double *X, blasint incX);
 | 
					                 const blasint N, const double *Ap, double *X, const blasint incX);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
 | 
					void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
 | 
				
			||||||
                 blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
 | 
					                 const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
 | 
					void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
 | 
				
			||||||
                 blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
 | 
					                 const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
 | 
				
			||||||
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
 | 
					void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
 | 
				
			||||||
                 blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
 | 
					                 const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
 | 
					void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
 | 
				
			||||||
                 blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
 | 
					                 const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
 | 
					void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
 | 
				
			||||||
                 float *X, blasint incX, float beta, float *Y, blasint incY);
 | 
					                 const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
 | 
					void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
 | 
				
			||||||
                 double *X, blasint incX, double beta, double *Y, blasint incY);
 | 
					                 const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
 | 
					void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
 | 
				
			||||||
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
 | 
					void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
 | 
					void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
 | 
				
			||||||
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
 | 
					void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
 | 
					void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
 | 
				
			||||||
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
 | 
					void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
 | 
				
			||||||
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
 | 
					void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
 | 
				
			||||||
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
 | 
					void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
 | 
					void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
 | 
				
			||||||
		 float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
 | 
							 const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
 | 
					void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
 | 
				
			||||||
		 double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
 | 
							 const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
 | 
					void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
 | 
				
			||||||
		 float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
 | 
							 const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
 | 
				
			||||||
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
 | 
					void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
 | 
				
			||||||
		 double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
 | 
							 const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 | 
					void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
 | 
				
			||||||
		 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 | 
							 const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 | 
					void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
 | 
				
			||||||
		 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 | 
							 const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
 | 
				
			||||||
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 | 
					void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
 | 
				
			||||||
		 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 | 
							 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
 | 
					void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
 | 
				
			||||||
		 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 | 
							 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
 | 
					void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
 | 
				
			||||||
                 float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 | 
					                 const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
 | 
					void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
 | 
				
			||||||
                 double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 | 
					                 const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
 | 
				
			||||||
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
 | 
					void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
 | 
				
			||||||
                 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 | 
					                 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
 | 
					void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
 | 
				
			||||||
                 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 | 
					                 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		 blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
 | 
							 const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		 blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
 | 
							 const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
 | 
				
			||||||
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		 blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
 | 
							 const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		 blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
 | 
							 const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		  blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 | 
							  const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		  blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 | 
							  const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
 | 
				
			||||||
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		  blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 | 
							  const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
 | 
					void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
 | 
				
			||||||
		  blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 | 
							  const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
 | 
				
			||||||
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
 | 
				
			||||||
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
 | 
				
			||||||
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
 | 
				
			||||||
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
 | 
				
			||||||
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
 | 
				
			||||||
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
 | 
					void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
 | 
				
			||||||
                 enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
 | 
					                 const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
 | 
					void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
 | 
				
			||||||
                 float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
 | 
					                 const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
 | 
					void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
 | 
				
			||||||
                 double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
 | 
					                 const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
 | 
					void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
 | 
				
			||||||
                 float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
 | 
					                 const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
 | 
					void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
 | 
				
			||||||
                 double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
 | 
					                 const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
 | 
					void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
 | 
				
			||||||
                  float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
 | 
					                  const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
 | 
				
			||||||
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
 | 
					void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
 | 
				
			||||||
                  double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
 | 
					                  const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void cblas_xerbla(blasint p, char *rout, char *form, ...);
 | 
					void cblas_xerbla(blasint p, char *rout, char *form, ...);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef __cplusplus
 | 
					#ifdef __cplusplus
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
     
 | 
					 | 
				
			||||||
#endif  /* __cplusplus */
 | 
					#endif  /* __cplusplus */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										6
									
								
								common.h
								
								
								
								
							
							
						
						
									
										6
									
								
								common.h
								
								
								
								
							| 
						 | 
					@ -390,7 +390,8 @@ typedef int blasint;
 | 
				
			||||||
/* C99 supports complex floating numbers natively, which GCC also offers as an
 | 
					/* C99 supports complex floating numbers natively, which GCC also offers as an
 | 
				
			||||||
   extension since version 3.0.  If neither are available, use a compatible
 | 
					   extension since version 3.0.  If neither are available, use a compatible
 | 
				
			||||||
   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
 | 
					   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
 | 
				
			||||||
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
 | 
					#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
 | 
				
			||||||
 | 
					     (__GNUC__ >= 3 && !defined(__cplusplus)))
 | 
				
			||||||
  #define OPENBLAS_COMPLEX_C99
 | 
					  #define OPENBLAS_COMPLEX_C99
 | 
				
			||||||
  typedef float _Complex openblas_complex_float;
 | 
					  typedef float _Complex openblas_complex_float;
 | 
				
			||||||
  typedef double _Complex openblas_complex_double;
 | 
					  typedef double _Complex openblas_complex_double;
 | 
				
			||||||
| 
						 | 
					@ -557,7 +558,8 @@ typedef struct {
 | 
				
			||||||
#include "common_level3.h"
 | 
					#include "common_level3.h"
 | 
				
			||||||
#include "common_lapack.h"
 | 
					#include "common_lapack.h"
 | 
				
			||||||
#ifdef CBLAS
 | 
					#ifdef CBLAS
 | 
				
			||||||
#include "cblas.h"
 | 
					/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
 | 
				
			||||||
 | 
					#include "cblas_noconst.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef ASSEMBLER
 | 
					#ifndef ASSEMBLER
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										3
									
								
								cpuid.h
								
								
								
								
							
							
						
						
									
										3
									
								
								cpuid.h
								
								
								
								
							| 
						 | 
					@ -125,7 +125,8 @@
 | 
				
			||||||
#define HAVE_MISALIGNSSE (1 << 15)
 | 
					#define HAVE_MISALIGNSSE (1 << 15)
 | 
				
			||||||
#define HAVE_128BITFPU   (1 << 16)
 | 
					#define HAVE_128BITFPU   (1 << 16)
 | 
				
			||||||
#define HAVE_FASTMOVU    (1 << 17)
 | 
					#define HAVE_FASTMOVU    (1 << 17)
 | 
				
			||||||
#define HAVE_AVX     (1 <<  18)
 | 
					#define HAVE_AVX      (1 <<  18)
 | 
				
			||||||
 | 
					#define HAVE_FMA4     (1 <<  19)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define CACHE_INFO_L1_I     1
 | 
					#define CACHE_INFO_L1_I     1
 | 
				
			||||||
#define CACHE_INFO_L1_D     2
 | 
					#define CACHE_INFO_L1_D     2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										32
									
								
								cpuid_x86.c
								
								
								
								
							
							
						
						
									
										32
									
								
								cpuid_x86.c
								
								
								
								
							| 
						 | 
					@ -43,6 +43,8 @@
 | 
				
			||||||
#ifdef NO_AVX
 | 
					#ifdef NO_AVX
 | 
				
			||||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
 | 
					#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
 | 
				
			||||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
 | 
					#define CORE_SANDYBRIDGE CORE_NEHALEM
 | 
				
			||||||
 | 
					#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
 | 
				
			||||||
 | 
					#define CORE_BULLDOZER CORE_BARCELONA
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef CPUIDEMU
 | 
					#ifndef CPUIDEMU
 | 
				
			||||||
| 
						 | 
					@ -116,8 +118,9 @@ static inline int have_excpuid(void){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef NO_AVX
 | 
					#ifndef NO_AVX
 | 
				
			||||||
static inline void xgetbv(int op, int * eax, int * edx){
 | 
					static inline void xgetbv(int op, int * eax, int * edx){
 | 
				
			||||||
 | 
					  //Use binary code for xgetbv
 | 
				
			||||||
  __asm__ __volatile__
 | 
					  __asm__ __volatile__
 | 
				
			||||||
    ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
 | 
					    (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -228,6 +231,9 @@ int get_cputype(int gettype){
 | 
				
			||||||
      cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
 | 
					      cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
 | 
				
			||||||
      if ((ecx & (1 <<  6)) != 0) feature |= HAVE_SSE4A;
 | 
					      if ((ecx & (1 <<  6)) != 0) feature |= HAVE_SSE4A;
 | 
				
			||||||
      if ((ecx & (1 <<  7)) != 0) feature |= HAVE_MISALIGNSSE;
 | 
					      if ((ecx & (1 <<  7)) != 0) feature |= HAVE_MISALIGNSSE;
 | 
				
			||||||
 | 
					#ifndef NO_AVX
 | 
				
			||||||
 | 
					      if ((ecx & (1 <<  16)) != 0) feature |= HAVE_FMA4;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
      if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
 | 
					      if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
 | 
				
			||||||
      if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
 | 
					      if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
| 
						 | 
					@ -1030,6 +1036,8 @@ int get_cpuname(void){
 | 
				
			||||||
	    return CPUTYPE_SANDYBRIDGE;
 | 
						    return CPUTYPE_SANDYBRIDGE;
 | 
				
			||||||
	  else
 | 
						  else
 | 
				
			||||||
	    return CPUTYPE_NEHALEM;
 | 
						    return CPUTYPE_NEHALEM;
 | 
				
			||||||
 | 
						case 14:
 | 
				
			||||||
 | 
						  // Xeon E7540
 | 
				
			||||||
	case 15:
 | 
						case 15:
 | 
				
			||||||
	  //Xeon Processor E7 (Westmere-EX)
 | 
						  //Xeon Processor E7 (Westmere-EX)
 | 
				
			||||||
	  return CPUTYPE_NEHALEM;
 | 
						  return CPUTYPE_NEHALEM;
 | 
				
			||||||
| 
						 | 
					@ -1075,8 +1083,12 @@ int get_cpuname(void){
 | 
				
			||||||
	return CPUTYPE_OPTERON;
 | 
						return CPUTYPE_OPTERON;
 | 
				
			||||||
      case  1:
 | 
					      case  1:
 | 
				
			||||||
      case 10:
 | 
					      case 10:
 | 
				
			||||||
      case  6:   //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | 
					 | 
				
			||||||
	return CPUTYPE_BARCELONA;
 | 
						return CPUTYPE_BARCELONA;
 | 
				
			||||||
 | 
					      case  6:   //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | 
				
			||||||
 | 
						if(support_avx())
 | 
				
			||||||
 | 
						  return CPUTYPE_BULLDOZER;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
						  return CPUTYPE_BARCELONA; //OS don't support AVX.
 | 
				
			||||||
      case  5:
 | 
					      case  5:
 | 
				
			||||||
	return CPUTYPE_BOBCAT;
 | 
						return CPUTYPE_BOBCAT;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
| 
						 | 
					@ -1398,6 +1410,8 @@ int get_coretype(void){
 | 
				
			||||||
	    return CORE_SANDYBRIDGE;
 | 
						    return CORE_SANDYBRIDGE;
 | 
				
			||||||
	  else
 | 
						  else
 | 
				
			||||||
	    return CORE_NEHALEM; //OS doesn't support AVX
 | 
						    return CORE_NEHALEM; //OS doesn't support AVX
 | 
				
			||||||
 | 
						case 14:
 | 
				
			||||||
 | 
						  //Xeon E7540
 | 
				
			||||||
	case 15:
 | 
						case 15:
 | 
				
			||||||
	  //Xeon Processor E7 (Westmere-EX)
 | 
						  //Xeon Processor E7 (Westmere-EX)
 | 
				
			||||||
	  return CORE_NEHALEM;
 | 
						  return CORE_NEHALEM;
 | 
				
			||||||
| 
						 | 
					@ -1427,8 +1441,13 @@ int get_coretype(void){
 | 
				
			||||||
    if (family == 0xf){
 | 
					    if (family == 0xf){
 | 
				
			||||||
      if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; 
 | 
					      if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; 
 | 
				
			||||||
      else if (exfamily == 5) return CORE_BOBCAT; 
 | 
					      else if (exfamily == 5) return CORE_BOBCAT; 
 | 
				
			||||||
      else if (exfamily == 6) return CORE_BARCELONA;  //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | 
					      else if (exfamily == 6) {
 | 
				
			||||||
      else return CORE_BARCELONA;
 | 
						//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | 
				
			||||||
 | 
						if(support_avx())
 | 
				
			||||||
 | 
						  return CORE_BULLDOZER;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
						  return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
 | 
				
			||||||
 | 
					      }else return CORE_BARCELONA;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1494,6 +1513,9 @@ void get_cpuconfig(void){
 | 
				
			||||||
      printf("#define DTB_SIZE %d\n", info.size * 1024);
 | 
					      printf("#define DTB_SIZE %d\n", info.size * 1024);
 | 
				
			||||||
      printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
 | 
					      printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
 | 
				
			||||||
      printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
 | 
					      printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					      //fall back for some virtual machines.
 | 
				
			||||||
 | 
					      printf("#define DTB_DEFAULT_ENTRIES 32\n");
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    features = get_cputype(GET_FEATURE);
 | 
					    features = get_cputype(GET_FEATURE);
 | 
				
			||||||
| 
						 | 
					@ -1511,6 +1533,7 @@ void get_cpuconfig(void){
 | 
				
			||||||
    if (features & HAVE_AVX )    printf("#define HAVE_AVX\n");
 | 
					    if (features & HAVE_AVX )    printf("#define HAVE_AVX\n");
 | 
				
			||||||
    if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
 | 
					    if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
 | 
				
			||||||
    if (features & HAVE_3DNOW)   printf("#define HAVE_3DNOW\n");
 | 
					    if (features & HAVE_3DNOW)   printf("#define HAVE_3DNOW\n");
 | 
				
			||||||
 | 
					    if (features & HAVE_FMA4 )    printf("#define HAVE_FMA4\n");
 | 
				
			||||||
    if (features & HAVE_CFLUSH)  printf("#define HAVE_CFLUSH\n");
 | 
					    if (features & HAVE_CFLUSH)  printf("#define HAVE_CFLUSH\n");
 | 
				
			||||||
    if (features & HAVE_HIT)     printf("#define HAVE_HIT 1\n");
 | 
					    if (features & HAVE_HIT)     printf("#define HAVE_HIT 1\n");
 | 
				
			||||||
    if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
 | 
					    if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
 | 
				
			||||||
| 
						 | 
					@ -1577,5 +1600,6 @@ void get_sse(void){
 | 
				
			||||||
  if (features & HAVE_AVX )    printf("HAVE_AVX=1\n");
 | 
					  if (features & HAVE_AVX )    printf("HAVE_AVX=1\n");
 | 
				
			||||||
  if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
 | 
					  if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
 | 
				
			||||||
  if (features & HAVE_3DNOW)   printf("HAVE_3DNOW=1\n");
 | 
					  if (features & HAVE_3DNOW)   printf("HAVE_3DNOW=1\n");
 | 
				
			||||||
 | 
					  if (features & HAVE_FMA4 )    printf("HAVE_FMA4=1\n");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  a = (FLOAT *)args -> a;
 | 
					  a = (FLOAT *)args -> a;
 | 
				
			||||||
  x = (FLOAT *)args -> b;
 | 
					  x = (FLOAT *)args -> b;
 | 
				
			||||||
  y = (FLOAT *)args -> c;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  lda  = args -> lda;
 | 
					  lda  = args -> lda;
 | 
				
			||||||
  incx = args -> ldb;
 | 
					  incx = args -> ldb;
 | 
				
			||||||
| 
						 | 
					@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 | 
				
			||||||
  n_from = 0;
 | 
					  n_from = 0;
 | 
				
			||||||
  n_to   = n;
 | 
					  n_to   = n;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //Use y as each thread's n* COMPSIZE elements in sb buffer
 | 
				
			||||||
 | 
					  y = buffer;   
 | 
				
			||||||
 | 
					  buffer += ((COMPSIZE * n  + 1023) & ~1023);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (range_m) {
 | 
					  if (range_m) {
 | 
				
			||||||
    n_from = *(range_m + 0);
 | 
					    n_from = *(range_m + 0);
 | 
				
			||||||
    n_to   = *(range_m + 1);
 | 
					    n_to   = *(range_m + 1);
 | 
				
			||||||
| 
						 | 
					@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
 | 
				
			||||||
    a += n_from * lda  * COMPSIZE;
 | 
					    a += n_from * lda  * COMPSIZE;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (range_n) y += *range_n * COMPSIZE;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (incx != 1) {
 | 
					  if (incx != 1) {
 | 
				
			||||||
    COPY_K(n, x, incx, buffer, 1);
 | 
					    COPY_K(n, x, incx, buffer, 1);
 | 
				
			||||||
| 
						 | 
					@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (num_cpu) {
 | 
					  if (num_cpu) {
 | 
				
			||||||
    queue[0].sa = NULL;
 | 
					    queue[0].sa = NULL;
 | 
				
			||||||
    queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
 | 
					    queue[0].sb = buffer;
 | 
				
			||||||
    queue[num_cpu - 1].next = NULL;
 | 
					    queue[num_cpu - 1].next = NULL;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
    exec_blas(num_cpu, queue);
 | 
					    exec_blas(num_cpu, queue);
 | 
				
			||||||
| 
						 | 
					@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
	    ONE, ZERO,
 | 
						    ONE, ZERO,
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	    buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
 | 
						    (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  AXPYU_K(n, 0, 0,
 | 
					  AXPYU_K(n, 0, 0,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,7 @@
 | 
				
			||||||
TOPDIR	= ../..
 | 
					TOPDIR	= ../..
 | 
				
			||||||
include ../../Makefile.system
 | 
					include ../../Makefile.system
 | 
				
			||||||
 | 
					
 | 
				
			||||||
COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
 | 
					COMMONOBJS	 = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
COMMONOBJS	+= slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX)  dlamc3.$(SUFFIX)
 | 
					COMMONOBJS	+= slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX)  dlamc3.$(SUFFIX)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
 | 
				
			||||||
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
 | 
					openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
 | 
				
			||||||
	$(CC) $(CFLAGS) -c $< -o $(@F)
 | 
						$(CC) $(CFLAGS) -c $< -o $(@F)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					openblas_get_config.$(SUFFIX) : openblas_get_config.c
 | 
				
			||||||
 | 
						$(CC) $(CFLAGS) -c $< -o $(@F)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
 | 
					blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
 | 
				
			||||||
	$(CC) $(CFLAGS) -c $< -o $(@F)
 | 
						$(CC) $(CFLAGS) -c $< -o $(@F)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
 | 
				
			||||||
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
										+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						queue->sb=sb;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#ifdef MONITOR
 | 
					#ifdef MONITOR
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -49,8 +49,12 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int blas_server_avail = 0;
 | 
					int blas_server_avail = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void * blas_thread_buffer[MAX_CPU_NUMBER];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void goto_set_num_threads(int num_threads) {
 | 
					void goto_set_num_threads(int num_threads) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int i=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (num_threads < 1) num_threads = blas_num_threads;
 | 
					  if (num_threads < 1) num_threads = blas_num_threads;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 | 
					  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 | 
				
			||||||
| 
						 | 
					@ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  omp_set_num_threads(blas_cpu_number);
 | 
					  omp_set_num_threads(blas_cpu_number);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //adjust buffer for each thread
 | 
				
			||||||
 | 
					  for(i=0; i<blas_cpu_number; i++){
 | 
				
			||||||
 | 
					    if(blas_thread_buffer[i]==NULL){
 | 
				
			||||||
 | 
					      blas_thread_buffer[i]=blas_memory_alloc(2);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  for(; i<MAX_CPU_NUMBER; i++){
 | 
				
			||||||
 | 
					    if(blas_thread_buffer[i]!=NULL){
 | 
				
			||||||
 | 
					      blas_memory_free(blas_thread_buffer[i]);
 | 
				
			||||||
 | 
					      blas_thread_buffer[i]=NULL;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
#if defined(ARCH_MIPS64) 
 | 
					#if defined(ARCH_MIPS64) 
 | 
				
			||||||
  //set parameters for different number of threads.
 | 
					  //set parameters for different number of threads.
 | 
				
			||||||
  blas_set_parameter();
 | 
					  blas_set_parameter();
 | 
				
			||||||
| 
						 | 
					@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int blas_thread_init(void){
 | 
					int blas_thread_init(void){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int i=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  blas_get_cpu_number();
 | 
					  blas_get_cpu_number();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  blas_server_avail = 1;
 | 
					  blas_server_avail = 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(i=0; i<blas_num_threads; i++){
 | 
				
			||||||
 | 
					    blas_thread_buffer[i]=blas_memory_alloc(2);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  for(; i<MAX_CPU_NUMBER; i++){
 | 
				
			||||||
 | 
					      blas_thread_buffer[i]=NULL;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return 0;
 | 
					  return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int BLASFUNC(blas_thread_shutdown)(void){
 | 
					int BLASFUNC(blas_thread_shutdown)(void){
 | 
				
			||||||
 | 
					  int i=0;
 | 
				
			||||||
  blas_server_avail = 0;
 | 
					  blas_server_avail = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(i=0; i<MAX_CPU_NUMBER; i++){
 | 
				
			||||||
 | 
					    if(blas_thread_buffer[i]!=NULL){
 | 
				
			||||||
 | 
					      blas_memory_free(blas_thread_buffer[i]);
 | 
				
			||||||
 | 
					      blas_thread_buffer[i]=NULL;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return 0;
 | 
					  return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 | 
				
			||||||
static void exec_threads(blas_queue_t *queue){
 | 
					static void exec_threads(blas_queue_t *queue){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  void *buffer, *sa, *sb;
 | 
					  void *buffer, *sa, *sb;
 | 
				
			||||||
 | 
					  int pos=0, release_flag=0;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  buffer = NULL;
 | 
					  buffer = NULL;
 | 
				
			||||||
  sa = queue -> sa;
 | 
					  sa = queue -> sa;
 | 
				
			||||||
| 
						 | 
					@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
 | 
					  if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    buffer = blas_memory_alloc(2);
 | 
					    pos = omp_get_thread_num();
 | 
				
			||||||
 | 
					    buffer = blas_thread_buffer[pos];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    //fallback
 | 
				
			||||||
 | 
					    if(buffer==NULL) {
 | 
				
			||||||
 | 
					      buffer = blas_memory_alloc(2);
 | 
				
			||||||
 | 
					      release_flag=1;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
 | 
					    if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
| 
						 | 
					@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){
 | 
				
			||||||
					    + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
										    + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					      queue->sb=sb;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (buffer != NULL) blas_memory_free(buffer);
 | 
					  if (release_flag) blas_memory_free(buffer);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
 | 
				
			||||||
					  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
										  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						queue->sb=sb;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#ifdef MONITOR
 | 
					#ifdef MONITOR
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -63,9 +63,11 @@ extern gotoblas_t  gotoblas_BARCELONA;
 | 
				
			||||||
extern gotoblas_t  gotoblas_BOBCAT;
 | 
					extern gotoblas_t  gotoblas_BOBCAT;
 | 
				
			||||||
#ifndef NO_AVX
 | 
					#ifndef NO_AVX
 | 
				
			||||||
extern gotoblas_t  gotoblas_SANDYBRIDGE;
 | 
					extern gotoblas_t  gotoblas_SANDYBRIDGE;
 | 
				
			||||||
 | 
					extern gotoblas_t  gotoblas_BULLDOZER;
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
//Use NEHALEM kernels for sandy bridge
 | 
					//Use NEHALEM kernels for sandy bridge
 | 
				
			||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
 | 
					#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
 | 
				
			||||||
 | 
					#define gotoblas_BULLDOZER gotoblas_BARCELONA
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -78,8 +80,9 @@ extern gotoblas_t  gotoblas_SANDYBRIDGE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef NO_AVX
 | 
					#ifndef NO_AVX
 | 
				
			||||||
static inline void xgetbv(int op, int * eax, int * edx){
 | 
					static inline void xgetbv(int op, int * eax, int * edx){
 | 
				
			||||||
 | 
					  //Use binary code for xgetbv
 | 
				
			||||||
  __asm__ __volatile__
 | 
					  __asm__ __volatile__
 | 
				
			||||||
    ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
 | 
					    (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -163,7 +166,8 @@ static gotoblas_t *get_coretype(void){
 | 
				
			||||||
		  
 | 
							  
 | 
				
			||||||
	//Intel Xeon Processor 5600 (Westmere-EP)
 | 
						//Intel Xeon Processor 5600 (Westmere-EP)
 | 
				
			||||||
	//Xeon Processor E7 (Westmere-EX)
 | 
						//Xeon Processor E7 (Westmere-EX)
 | 
				
			||||||
	if (model == 12 || model == 15) return &gotoblas_NEHALEM;
 | 
						//Xeon E7540
 | 
				
			||||||
 | 
						if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
 | 
						//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
 | 
				
			||||||
	//Intel Core i7-3000 / Xeon E5
 | 
						//Intel Core i7-3000 / Xeon E5
 | 
				
			||||||
| 
						 | 
					@ -171,7 +175,7 @@ static gotoblas_t *get_coretype(void){
 | 
				
			||||||
	  if(support_avx())
 | 
						  if(support_avx())
 | 
				
			||||||
	    return &gotoblas_SANDYBRIDGE;
 | 
						    return &gotoblas_SANDYBRIDGE;
 | 
				
			||||||
	  else{
 | 
						  else{
 | 
				
			||||||
	    fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
 | 
						    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
 | 
				
			||||||
	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 | 
						    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					@ -182,7 +186,7 @@ static gotoblas_t *get_coretype(void){
 | 
				
			||||||
	  if(support_avx())
 | 
						  if(support_avx())
 | 
				
			||||||
	    return &gotoblas_SANDYBRIDGE;
 | 
						    return &gotoblas_SANDYBRIDGE;
 | 
				
			||||||
	  else{
 | 
						  else{
 | 
				
			||||||
	    fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
 | 
						    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
 | 
				
			||||||
	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 | 
						    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					@ -202,6 +206,14 @@ static gotoblas_t *get_coretype(void){
 | 
				
			||||||
	else return &gotoblas_OPTERON;
 | 
						else return &gotoblas_OPTERON;
 | 
				
			||||||
      }  else if (exfamily == 5) {
 | 
					      }  else if (exfamily == 5) {
 | 
				
			||||||
	return &gotoblas_BOBCAT;
 | 
						return &gotoblas_BOBCAT;
 | 
				
			||||||
 | 
					      } else if (exfamily == 6) {
 | 
				
			||||||
 | 
						//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
 | 
				
			||||||
 | 
						  if(support_avx())
 | 
				
			||||||
 | 
						    return &gotoblas_BULLDOZER;
 | 
				
			||||||
 | 
						  else{
 | 
				
			||||||
 | 
						    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
 | 
				
			||||||
 | 
						    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
 | 
				
			||||||
 | 
						  }	
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
	return &gotoblas_BARCELONA;
 | 
						return &gotoblas_BARCELONA;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
| 
						 | 
					@ -238,6 +250,7 @@ static char *corename[] = {
 | 
				
			||||||
    "Nano",
 | 
					    "Nano",
 | 
				
			||||||
    "Sandybridge",
 | 
					    "Sandybridge",
 | 
				
			||||||
    "Bobcat",
 | 
					    "Bobcat",
 | 
				
			||||||
 | 
					    "Bulldozer",
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
char *gotoblas_corename(void) {
 | 
					char *gotoblas_corename(void) {
 | 
				
			||||||
| 
						 | 
					@ -259,6 +272,7 @@ char *gotoblas_corename(void) {
 | 
				
			||||||
  if (gotoblas == &gotoblas_NANO)         return corename[15];
 | 
					  if (gotoblas == &gotoblas_NANO)         return corename[15];
 | 
				
			||||||
  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
 | 
					  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
 | 
				
			||||||
  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
 | 
					  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
 | 
				
			||||||
 | 
					  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return corename[0];
 | 
					  return corename[0];
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -273,6 +287,15 @@ void gotoblas_dynamic_init(void) {
 | 
				
			||||||
  if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
 | 
					  if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
  if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
 | 
					  if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
 | 
				
			||||||
 | 
					  /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
 | 
				
			||||||
 | 
					  if (sizeof(void*) == 8) {
 | 
				
			||||||
 | 
					      if (gotoblas == &gotoblas_KATMAI ||
 | 
				
			||||||
 | 
					          gotoblas == &gotoblas_COPPERMINE ||
 | 
				
			||||||
 | 
					          gotoblas == &gotoblas_NORTHWOOD ||
 | 
				
			||||||
 | 
					          gotoblas == &gotoblas_BANIAS ||
 | 
				
			||||||
 | 
					          gotoblas == &gotoblas_ATHLON)
 | 
				
			||||||
 | 
					          gotoblas = &gotoblas_PRESCOTT;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  if (gotoblas && gotoblas -> init) {
 | 
					  if (gotoblas && gotoblas -> init) {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,59 @@
 | 
				
			||||||
 | 
					/*****************************************************************************
 | 
				
			||||||
 | 
					Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
 | 
				
			||||||
 | 
					All rights reserved.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Redistribution and use in source and binary forms, with or without
 | 
				
			||||||
 | 
					modification, are permitted provided that the following conditions are
 | 
				
			||||||
 | 
					met:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   1. Redistributions of source code must retain the above copyright
 | 
				
			||||||
 | 
					      notice, this list of conditions and the following disclaimer.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   2. Redistributions in binary form must reproduce the above copyright
 | 
				
			||||||
 | 
					      notice, this list of conditions and the following disclaimer in
 | 
				
			||||||
 | 
					      the documentation and/or other materials provided with the
 | 
				
			||||||
 | 
					      distribution.
 | 
				
			||||||
 | 
					   3. Neither the name of the ISCAS nor the names of its contributors may 
 | 
				
			||||||
 | 
					      be used to endorse or promote products derived from this software 
 | 
				
			||||||
 | 
					      without specific prior written permission.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 | 
				
			||||||
 | 
					AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 | 
				
			||||||
 | 
					IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 | 
				
			||||||
 | 
					ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 | 
				
			||||||
 | 
					LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 | 
				
			||||||
 | 
					DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 | 
				
			||||||
 | 
					SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 | 
				
			||||||
 | 
					CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
 | 
				
			||||||
 | 
					OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
 | 
				
			||||||
 | 
					USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					**********************************************************************************/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include "common.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static char* openblas_config_str=""
 | 
				
			||||||
 | 
					#ifdef USE64BITINT
 | 
				
			||||||
 | 
					  "USE64BITINT "
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef NO_CBLAS
 | 
				
			||||||
 | 
					  "NO_CBLAS "
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef NO_LAPACK
 | 
				
			||||||
 | 
					  "NO_LAPACK "
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef NO_LAPACKE
 | 
				
			||||||
 | 
					  "NO_LAPACKE "
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef DYNAMIC_ARCH
 | 
				
			||||||
 | 
					  "DYNAMIC_ARCH "
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#ifdef NO_AFFINITY
 | 
				
			||||||
 | 
					  "NO_AFFINITY "
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					  ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					char* CNAME() {
 | 
				
			||||||
 | 
					  return openblas_config_str;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -163,7 +163,7 @@ int get_L2_size(void){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int eax, ebx, ecx, edx;
 | 
					  int eax, ebx, ecx, edx;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
 | 
					#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
 | 
				
			||||||
    defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
 | 
					    defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
 | 
				
			||||||
  defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
 | 
					  defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT)
 | 
				
			||||||
ifeq ($(F_COMPILER), GFORTRAN)
 | 
					ifeq ($(F_COMPILER), GFORTRAN)
 | 
				
			||||||
EXTRALIB += -lgfortran
 | 
					EXTRALIB += -lgfortran
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					ifeq ($(USE_OPENMP), 1)
 | 
				
			||||||
 | 
					ifeq ($(C_COMPILER), GCC)
 | 
				
			||||||
 | 
					EXTRALIB += -lgomp
 | 
				
			||||||
 | 
					endif
 | 
				
			||||||
 | 
					endif
 | 
				
			||||||
endif
 | 
					endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ifeq ($(OSNAME), CYGWIN_NT)
 | 
					ifeq ($(OSNAME), CYGWIN_NT)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -74,6 +74,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@misc_no_underscore_objs = (
 | 
					@misc_no_underscore_objs = (
 | 
				
			||||||
                            openblas_set_num_threads, goto_set_num_threads,
 | 
					                            openblas_set_num_threads, goto_set_num_threads,
 | 
				
			||||||
 | 
					                            openblas_get_config,
 | 
				
			||||||
                           );
 | 
					                           );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@misc_underscore_objs = (
 | 
					@misc_underscore_objs = (
 | 
				
			||||||
| 
						 | 
					@ -110,7 +111,7 @@
 | 
				
			||||||
		# already provided by @blasobjs: xerbla, lsame
 | 
							# already provided by @blasobjs: xerbla, lsame
 | 
				
			||||||
		ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
 | 
							ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
 | 
				
			||||||
		ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
 | 
							ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
 | 
				
			||||||
		ilaver, slamch,
 | 
							ilaver, slamch, slamc3,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		# SCLAUX  -- Auxiliary routines called from both REAL and COMPLEX.
 | 
							# SCLAUX  -- Auxiliary routines called from both REAL and COMPLEX.
 | 
				
			||||||
		# excluded: second_$(TIMER)
 | 
							# excluded: second_$(TIMER)
 | 
				
			||||||
| 
						 | 
					@ -147,7 +148,7 @@
 | 
				
			||||||
		dlasr,  dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
 | 
							dlasr,  dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
 | 
				
			||||||
		dsteqr, dsterf, dlaisnan, disnan,
 | 
							dsteqr, dsterf, dlaisnan, disnan,
 | 
				
			||||||
		dlartgp, dlartgs,
 | 
							dlartgp, dlartgs,
 | 
				
			||||||
		dlamch,
 | 
							dlamch, dlamc3,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		# SLASRC  -- Single precision real LAPACK routines
 | 
							# SLASRC  -- Single precision real LAPACK routines
 | 
				
			||||||
		# already provided by @lapackobjs:
 | 
							# already provided by @lapackobjs:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										56
									
								
								getarch.c
								
								
								
								
							
							
						
						
									
										56
									
								
								getarch.c
								
								
								
								
							| 
						 | 
					@ -96,14 +96,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
/* #define FORCE_PENRYN		*/
 | 
					/* #define FORCE_PENRYN		*/
 | 
				
			||||||
/* #define FORCE_DUNNINGTON	*/
 | 
					/* #define FORCE_DUNNINGTON	*/
 | 
				
			||||||
/* #define FORCE_NEHALEM	*/
 | 
					/* #define FORCE_NEHALEM	*/
 | 
				
			||||||
 | 
					/* #define FORCE_SANDYBRIDGE	*/
 | 
				
			||||||
 | 
					/* #define FORCE_ATOM		*/
 | 
				
			||||||
/* #define FORCE_ATHLON		*/
 | 
					/* #define FORCE_ATHLON		*/
 | 
				
			||||||
/* #define FORCE_OPTERON	*/
 | 
					/* #define FORCE_OPTERON	*/
 | 
				
			||||||
/* #define FORCE_OPTERON_SSE3	*/
 | 
					/* #define FORCE_OPTERON_SSE3	*/
 | 
				
			||||||
/* #define FORCE_BARCELONA	*/
 | 
					/* #define FORCE_BARCELONA	*/
 | 
				
			||||||
/* #define FORCE_SHANGHAI	*/
 | 
					/* #define FORCE_SHANGHAI	*/
 | 
				
			||||||
/* #define FORCE_ISTANBUL	*/
 | 
					/* #define FORCE_ISTANBUL	*/
 | 
				
			||||||
 | 
					/* #define FORCE_BOBCAT		*/
 | 
				
			||||||
/* #define FORCE_BULLDOZER	*/
 | 
					/* #define FORCE_BULLDOZER	*/
 | 
				
			||||||
/* #define FORCE_BOBCAT	*/
 | 
					 | 
				
			||||||
/* #define FORCE_SSE_GENERIC	*/
 | 
					/* #define FORCE_SSE_GENERIC	*/
 | 
				
			||||||
/* #define FORCE_VIAC3		*/
 | 
					/* #define FORCE_VIAC3		*/
 | 
				
			||||||
/* #define FORCE_NANO		*/
 | 
					/* #define FORCE_NANO		*/
 | 
				
			||||||
| 
						 | 
					@ -118,12 +120,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
/* #define FORCE_PPC440FP2	*/
 | 
					/* #define FORCE_PPC440FP2	*/
 | 
				
			||||||
/* #define FORCE_CELL		*/
 | 
					/* #define FORCE_CELL		*/
 | 
				
			||||||
/* #define FORCE_SICORTEX	*/
 | 
					/* #define FORCE_SICORTEX	*/
 | 
				
			||||||
/* #define FORCE_LOONGSON3A      */
 | 
					/* #define FORCE_LOONGSON3A	*/
 | 
				
			||||||
/* #define FORCE_LOONGSON3B      */
 | 
					/* #define FORCE_LOONGSON3B	*/
 | 
				
			||||||
/* #define FORCE_ITANIUM2	*/
 | 
					/* #define FORCE_ITANIUM2	*/
 | 
				
			||||||
/* #define FORCE_GENERIC	*/
 | 
					 | 
				
			||||||
/* #define FORCE_SPARC		*/
 | 
					/* #define FORCE_SPARC		*/
 | 
				
			||||||
/* #define FORCE_SPARCV7	*/
 | 
					/* #define FORCE_SPARCV7	*/
 | 
				
			||||||
 | 
					/* #define FORCE_GENERIC	*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef FORCE_P2
 | 
					#ifdef FORCE_P2
 | 
				
			||||||
#define FORCE
 | 
					#define FORCE
 | 
				
			||||||
| 
						 | 
					@ -139,20 +141,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
#define CORENAME  "P5"
 | 
					#define CORENAME  "P5"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef FORCE_COPPERMINE
 | 
					 | 
				
			||||||
#define FORCE
 | 
					 | 
				
			||||||
#define FORCE_INTEL
 | 
					 | 
				
			||||||
#define ARCHITECTURE    "X86"
 | 
					 | 
				
			||||||
#define SUBARCHITECTURE "PENTIUM3"
 | 
					 | 
				
			||||||
#define ARCHCONFIG   "-DPENTIUM3 " \
 | 
					 | 
				
			||||||
		     "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
 | 
					 | 
				
			||||||
		     "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
 | 
					 | 
				
			||||||
		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 | 
					 | 
				
			||||||
		     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
 | 
					 | 
				
			||||||
#define LIBNAME   "coppermine"
 | 
					 | 
				
			||||||
#define CORENAME  "COPPERMINE"
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef FORCE_KATMAI
 | 
					#ifdef FORCE_KATMAI
 | 
				
			||||||
#define FORCE
 | 
					#define FORCE
 | 
				
			||||||
#define FORCE_INTEL
 | 
					#define FORCE_INTEL
 | 
				
			||||||
| 
						 | 
					@ -167,6 +155,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
#define CORENAME  "KATMAI"
 | 
					#define CORENAME  "KATMAI"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef FORCE_COPPERMINE
 | 
				
			||||||
 | 
					#define FORCE
 | 
				
			||||||
 | 
					#define FORCE_INTEL
 | 
				
			||||||
 | 
					#define ARCHITECTURE    "X86"
 | 
				
			||||||
 | 
					#define SUBARCHITECTURE "PENTIUM3"
 | 
				
			||||||
 | 
					#define ARCHCONFIG   "-DPENTIUM3 " \
 | 
				
			||||||
 | 
							     "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
 | 
				
			||||||
 | 
							     "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
 | 
				
			||||||
 | 
							     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
 | 
				
			||||||
 | 
							     "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
 | 
				
			||||||
 | 
					#define LIBNAME   "coppermine"
 | 
				
			||||||
 | 
					#define CORENAME  "COPPERMINE"
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef FORCE_NORTHWOOD
 | 
					#ifdef FORCE_NORTHWOOD
 | 
				
			||||||
#define FORCE
 | 
					#define FORCE
 | 
				
			||||||
#define FORCE_INTEL
 | 
					#define FORCE_INTEL
 | 
				
			||||||
| 
						 | 
					@ -350,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
#define CORENAME  "OPTERON"
 | 
					#define CORENAME  "OPTERON"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
 | 
					#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
 | 
				
			||||||
#define FORCE
 | 
					#define FORCE
 | 
				
			||||||
#define FORCE_INTEL
 | 
					#define FORCE_INTEL
 | 
				
			||||||
#define ARCHITECTURE    "X86"
 | 
					#define ARCHITECTURE    "X86"
 | 
				
			||||||
| 
						 | 
					@ -380,6 +382,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
#define CORENAME  "BOBCAT"
 | 
					#define CORENAME  "BOBCAT"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if defined (FORCE_BULLDOZER)
 | 
				
			||||||
 | 
					#define FORCE
 | 
				
			||||||
 | 
					#define FORCE_INTEL
 | 
				
			||||||
 | 
					#define ARCHITECTURE    "X86"
 | 
				
			||||||
 | 
					#define SUBARCHITECTURE "BULLDOZER"
 | 
				
			||||||
 | 
					#define ARCHCONFIG   "-DBULLDOZER " \
 | 
				
			||||||
 | 
							     "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
 | 
				
			||||||
 | 
							     "-DL2_SIZE=1024000 -DL2_LINESIZE=64  -DL3_SIZE=16777216 " \
 | 
				
			||||||
 | 
							     "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
 | 
				
			||||||
 | 
							     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
 | 
				
			||||||
 | 
							     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
 | 
				
			||||||
 | 
					                     "-DHAVE_AVX -DHAVE_FMA4"
 | 
				
			||||||
 | 
					#define LIBNAME   "bulldozer"
 | 
				
			||||||
 | 
					#define CORENAME  "BULLDOZER"
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef FORCE_SSE_GENERIC
 | 
					#ifdef FORCE_SSE_GENERIC
 | 
				
			||||||
#define FORCE
 | 
					#define FORCE
 | 
				
			||||||
#define FORCE_INTEL
 | 
					#define FORCE_INTEL
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -34,7 +34,7 @@ int main(int argc, char **argv) {
 | 
				
			||||||
#ifdef USE64BITINT
 | 
					#ifdef USE64BITINT
 | 
				
			||||||
	printf("#define USE64BITINT\n");
 | 
						printf("#define USE64BITINT\n");
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
 | 
						printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return 0;
 | 
					  return 0;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -634,10 +634,10 @@ static void init_parameter(void) {
 | 
				
			||||||
  TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
 | 
					  TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH)
 | 
					#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef DEBUG
 | 
					#ifdef DEBUG
 | 
				
			||||||
  fprintf(stderr, "Katmai, Coppermine, Banias\n");
 | 
					  fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  TABLE_NAME.sgemm_p =  64 * (l2 >> 7);
 | 
					  TABLE_NAME.sgemm_p =  64 * (l2 >> 7);
 | 
				
			||||||
| 
						 | 
					@ -810,6 +810,22 @@ static void init_parameter(void) {
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef BULLDOZER
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef DEBUG
 | 
				
			||||||
 | 
					  fprintf(stderr, "Bulldozer\n");
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
 | 
				
			||||||
 | 
					  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
 | 
				
			||||||
 | 
					  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
 | 
				
			||||||
 | 
					  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
 | 
				
			||||||
 | 
					#ifdef EXPRECISION
 | 
				
			||||||
 | 
					  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
 | 
				
			||||||
 | 
					  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef NANO
 | 
					#ifdef NANO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef DEBUG
 | 
					#ifdef DEBUG
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,59 @@
 | 
				
			||||||
 | 
					SGEMMKERNEL    =  gemm_kernel_4x4_barcelona.S
 | 
				
			||||||
 | 
					SGEMMINCOPY    =  
 | 
				
			||||||
 | 
					SGEMMITCOPY    =  
 | 
				
			||||||
 | 
					SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
 | 
				
			||||||
 | 
					SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 | 
				
			||||||
 | 
					SGEMMINCOPYOBJ =  
 | 
				
			||||||
 | 
					SGEMMITCOPYOBJ =  
 | 
				
			||||||
 | 
					SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					DGEMMKERNEL    =  gemm_kernel_2x4_barcelona.S
 | 
				
			||||||
 | 
					DGEMMINCOPY    =  ../generic/gemm_ncopy_2.c
 | 
				
			||||||
 | 
					DGEMMITCOPY    =  ../generic/gemm_tcopy_2.c
 | 
				
			||||||
 | 
					DGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
 | 
				
			||||||
 | 
					DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 | 
				
			||||||
 | 
					DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					CGEMMKERNEL    =  zgemm_kernel_2x2_barcelona.S
 | 
				
			||||||
 | 
					CGEMMINCOPY    =  
 | 
				
			||||||
 | 
					CGEMMITCOPY    =  
 | 
				
			||||||
 | 
					CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
 | 
				
			||||||
 | 
					CGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
 | 
				
			||||||
 | 
					CGEMMINCOPYOBJ =
 | 
				
			||||||
 | 
					CGEMMITCOPYOBJ =  
 | 
				
			||||||
 | 
					CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					ZGEMMKERNEL    =  zgemm_kernel_1x2_barcelona.S
 | 
				
			||||||
 | 
					ZGEMMINCOPY    =  ../generic/zgemm_ncopy_1.c
 | 
				
			||||||
 | 
					ZGEMMITCOPY    =  ../generic/zgemm_tcopy_1.c
 | 
				
			||||||
 | 
					ZGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
 | 
				
			||||||
 | 
					ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
 | 
				
			||||||
 | 
					ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					STRSMKERNEL_LN	=  trsm_kernel_LN_4x4_sse.S
 | 
				
			||||||
 | 
					STRSMKERNEL_LT	=  trsm_kernel_LT_4x4_sse.S
 | 
				
			||||||
 | 
					STRSMKERNEL_RN	=  trsm_kernel_LT_4x4_sse.S
 | 
				
			||||||
 | 
					STRSMKERNEL_RT	=  trsm_kernel_RT_4x4_sse.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					DTRSMKERNEL_LN	=  trsm_kernel_LN_2x4_sse2.S
 | 
				
			||||||
 | 
					DTRSMKERNEL_LT	=  trsm_kernel_LT_2x4_sse2.S
 | 
				
			||||||
 | 
					DTRSMKERNEL_RN	=  trsm_kernel_LT_2x4_sse2.S
 | 
				
			||||||
 | 
					DTRSMKERNEL_RT	=  trsm_kernel_RT_2x4_sse2.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					CTRSMKERNEL_LN	=  ztrsm_kernel_LN_2x2_sse.S
 | 
				
			||||||
 | 
					CTRSMKERNEL_LT	=  ztrsm_kernel_LT_2x2_sse.S
 | 
				
			||||||
 | 
					CTRSMKERNEL_RN	=  ztrsm_kernel_LT_2x2_sse.S
 | 
				
			||||||
 | 
					CTRSMKERNEL_RT	=  ztrsm_kernel_RT_2x2_sse.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ZTRSMKERNEL_LN	=  ztrsm_kernel_LT_1x2_sse2.S
 | 
				
			||||||
 | 
					ZTRSMKERNEL_LT	=  ztrsm_kernel_LT_1x2_sse2.S
 | 
				
			||||||
 | 
					ZTRSMKERNEL_RN	=  ztrsm_kernel_LT_1x2_sse2.S
 | 
				
			||||||
 | 
					ZTRSMKERNEL_RT	=  ztrsm_kernel_RT_1x2_sse2.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					CGEMM3MKERNEL    =  zgemm3m_kernel_4x4_barcelona.S
 | 
				
			||||||
 | 
					ZGEMM3MKERNEL    =  zgemm3m_kernel_2x4_barcelona.S
 | 
				
			||||||
| 
						 | 
					@ -596,7 +596,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 4 * SIZE(BB), %xmm2
 | 
						movsd	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -842,7 +842,7 @@
 | 
				
			||||||
.L32:
 | 
					.L32:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
	addss	%xmm2, %xmm4
 | 
						addss	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BB), %xmm2
 | 
						movss	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1168,7 +1168,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L52:
 | 
					.L52:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	 4 * SIZE(BB), %xmm0
 | 
						mulps	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1198,7 +1198,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm5
 | 
						addps	%xmm0, %xmm5
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1347,7 +1347,7 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L62:
 | 
					.L62:
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1531,7 +1531,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L72:
 | 
					.L72:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulss	 4 * SIZE(BB), %xmm0
 | 
						mulss	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1778,7 +1778,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L92:
 | 
					.L92:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(AA), %xmm0
 | 
						movaps	 4 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1793,7 +1793,7 @@
 | 
				
			||||||
	mulps	12 * SIZE(BB), %xmm0
 | 
						mulps	12 * SIZE(BB), %xmm0
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm3
 | 
						mulps	%xmm1, %xmm3
 | 
				
			||||||
| 
						 | 
					@ -1924,7 +1924,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L102:
 | 
					.L102:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 2 * SIZE(AA), %xmm0
 | 
						movsd	 2 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2069,7 +2069,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L112:
 | 
					.L112:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 1 * SIZE(AA), %xmm0
 | 
						movss	 1 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -89,17 +89,22 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define STACKSIZE	16
 | 
					#define STACKSIZE	16
 | 
				
			||||||
 | 
					#define ARGS	16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define M		 4 + STACKSIZE(%esp)
 | 
					#define M		 4 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define N		 8 + STACKSIZE(%esp)
 | 
					#define N		 8 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define ALPHA		16 + STACKSIZE(%esp)
 | 
					#define ALPHA		16 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define A		20 + STACKSIZE(%esp)
 | 
					#define A		20 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_LDA	24 + STACKSIZE(%esp)
 | 
					#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_X		28 + STACKSIZE(%esp)
 | 
					#define STACK_X		28 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCX	32 + STACKSIZE(%esp)
 | 
					#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define Y		36 + STACKSIZE(%esp)
 | 
					#define Y		36 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCY	40 + STACKSIZE(%esp)
 | 
					#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define BUFFER		44 + STACKSIZE(%esp)
 | 
					#define BUFFER		44 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
 | 
					#define MMM	0+ARGS(%esp)
 | 
				
			||||||
 | 
					#define YY	4+ARGS(%esp)
 | 
				
			||||||
 | 
					#define AA	8+ARGS(%esp)
 | 
				
			||||||
 | 
					#define LDAX	12+ARGS(%esp)
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#define I	%eax
 | 
					#define I	%eax
 | 
				
			||||||
#define J	%ebx
 | 
					#define J	%ebx
 | 
				
			||||||
| 
						 | 
					@ -114,6 +119,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROLOGUE
 | 
						PROLOGUE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						subl	$ARGS,%esp
 | 
				
			||||||
	pushl	%ebp
 | 
						pushl	%ebp
 | 
				
			||||||
	pushl	%edi
 | 
						pushl	%edi
 | 
				
			||||||
	pushl	%esi
 | 
						pushl	%esi
 | 
				
			||||||
| 
						 | 
					@ -121,7 +127,34 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROFCODE
 | 
						PROFCODE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	Y,J
 | 
				
			||||||
 | 
						movl	J,YY				# backup Y
 | 
				
			||||||
 | 
						movl	A,J
 | 
				
			||||||
 | 
						movl	J,AA				# backup A
 | 
				
			||||||
 | 
						movl	M,J
 | 
				
			||||||
 | 
						movl	J,MMM				# backup M
 | 
				
			||||||
 | 
					.L0t:
 | 
				
			||||||
 | 
						xorl	J,J
 | 
				
			||||||
 | 
						addl	$1,J
 | 
				
			||||||
 | 
						sall	$21,J
 | 
				
			||||||
 | 
						subl	J,MMM
 | 
				
			||||||
 | 
						movl	J,M
 | 
				
			||||||
 | 
						jge		.L00t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	MMM,%eax
 | 
				
			||||||
 | 
						addl	J,%eax
 | 
				
			||||||
 | 
						jle		.L999x
 | 
				
			||||||
 | 
						movl	%eax,M
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L00t:
 | 
				
			||||||
 | 
						movl	AA,%eax
 | 
				
			||||||
 | 
						movl	%eax,A
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	YY,J
 | 
				
			||||||
 | 
						movl	J,Y
 | 
				
			||||||
	movl	STACK_LDA,  LDA
 | 
						movl	STACK_LDA,  LDA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movl	STACK_X,    X
 | 
						movl	STACK_X,    X
 | 
				
			||||||
	movl	STACK_INCX, INCX
 | 
						movl	STACK_INCX, INCX
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -651,12 +684,22 @@
 | 
				
			||||||
	addss	0 * SIZE(X), %xmm0
 | 
						addss	0 * SIZE(X), %xmm0
 | 
				
			||||||
	movss	%xmm0, (Y1)
 | 
						movss	%xmm0, (Y1)
 | 
				
			||||||
	ALIGN_3
 | 
						ALIGN_3
 | 
				
			||||||
 | 
					 | 
				
			||||||
.L999:
 | 
					.L999:
 | 
				
			||||||
 | 
						movl	M,J
 | 
				
			||||||
 | 
						leal	(,J,SIZE),%eax
 | 
				
			||||||
 | 
						addl	%eax,AA
 | 
				
			||||||
 | 
						movl	YY,J
 | 
				
			||||||
 | 
						addl	%eax,J
 | 
				
			||||||
 | 
						movl	J,YY
 | 
				
			||||||
 | 
						jmp		.L0t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L999x:
 | 
				
			||||||
	popl	%ebx
 | 
						popl	%ebx
 | 
				
			||||||
	popl	%esi
 | 
						popl	%esi
 | 
				
			||||||
	popl	%edi	
 | 
						popl	%edi	
 | 
				
			||||||
	popl	%ebp
 | 
						popl	%ebp
 | 
				
			||||||
 | 
						addl	$ARGS,%esp
 | 
				
			||||||
	ret
 | 
						ret
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	EPILOGUE
 | 
						EPILOGUE
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,17 +76,22 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define STACKSIZE	16
 | 
					#define STACKSIZE	16
 | 
				
			||||||
 | 
					#define ARGS	16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define M		 4 + STACKSIZE(%esp)
 | 
					#define M		 4 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define N		 8 + STACKSIZE(%esp)
 | 
					#define N		 8 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define ALPHA		16 + STACKSIZE(%esp)
 | 
					#define ALPHA		16 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define A		24 + STACKSIZE(%esp)
 | 
					#define A		24 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_LDA	28 + STACKSIZE(%esp)
 | 
					#define STACK_LDA	28 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_X		32 + STACKSIZE(%esp)
 | 
					#define STACK_X		32 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCX	36 + STACKSIZE(%esp)
 | 
					#define STACK_INCX	36 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define Y		40 + STACKSIZE(%esp)
 | 
					#define Y		40 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCY	44 + STACKSIZE(%esp)
 | 
					#define STACK_INCY	44 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define BUFFER		48 + STACKSIZE(%esp)
 | 
					#define BUFFER		48 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MMM	0+ARGS(%esp)
 | 
				
			||||||
 | 
					#define YY	4+ARGS(%esp)
 | 
				
			||||||
 | 
					#define AA	8+ARGS(%esp)
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#define I	%eax
 | 
					#define I	%eax
 | 
				
			||||||
#define J	%ebx
 | 
					#define J	%ebx
 | 
				
			||||||
| 
						 | 
					@ -101,6 +106,8 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROLOGUE
 | 
						PROLOGUE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						subl	$ARGS,%esp
 | 
				
			||||||
	pushl	%ebp
 | 
						pushl	%ebp
 | 
				
			||||||
	pushl	%edi
 | 
						pushl	%edi
 | 
				
			||||||
	pushl	%esi
 | 
						pushl	%esi
 | 
				
			||||||
| 
						 | 
					@ -108,6 +115,33 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROFCODE
 | 
						PROFCODE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	Y,J
 | 
				
			||||||
 | 
						movl	J,YY				# backup Y
 | 
				
			||||||
 | 
						movl	A,J
 | 
				
			||||||
 | 
						movl	J,AA				# backup A
 | 
				
			||||||
 | 
						movl	M,J
 | 
				
			||||||
 | 
						movl	J,MMM				# backup M
 | 
				
			||||||
 | 
					.L0t:
 | 
				
			||||||
 | 
						xorl	J,J
 | 
				
			||||||
 | 
						addl	$1,J
 | 
				
			||||||
 | 
						sall	$20,J
 | 
				
			||||||
 | 
						subl	J,MMM
 | 
				
			||||||
 | 
						movl	J,M
 | 
				
			||||||
 | 
						jge		.L00t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	MMM,%eax
 | 
				
			||||||
 | 
						addl	J,%eax
 | 
				
			||||||
 | 
						jle		.L999x
 | 
				
			||||||
 | 
						movl	%eax,M
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L00t:
 | 
				
			||||||
 | 
						movl	AA,%eax
 | 
				
			||||||
 | 
						movl	%eax,A
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	YY,J
 | 
				
			||||||
 | 
						movl	J,Y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movl	STACK_LDA,  LDA
 | 
						movl	STACK_LDA,  LDA
 | 
				
			||||||
	movl	STACK_X,    X
 | 
						movl	STACK_X,    X
 | 
				
			||||||
	movl	STACK_INCX, INCX
 | 
						movl	STACK_INCX, INCX
 | 
				
			||||||
| 
						 | 
					@ -677,10 +711,22 @@
 | 
				
			||||||
	ALIGN_3
 | 
						ALIGN_3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L999:
 | 
					.L999:
 | 
				
			||||||
 | 
						movl	M,J
 | 
				
			||||||
 | 
						leal	(,J,SIZE),%eax
 | 
				
			||||||
 | 
						addl	%eax,AA
 | 
				
			||||||
 | 
						movl	YY,J
 | 
				
			||||||
 | 
						addl	%eax,J
 | 
				
			||||||
 | 
						movl	J,YY
 | 
				
			||||||
 | 
						jmp		.L0t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L999x:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	popl	%ebx
 | 
						popl	%ebx
 | 
				
			||||||
	popl	%esi
 | 
						popl	%esi
 | 
				
			||||||
	popl	%edi	
 | 
						popl	%edi	
 | 
				
			||||||
	popl	%ebp
 | 
						popl	%ebp
 | 
				
			||||||
 | 
						addl	$ARGS,%esp
 | 
				
			||||||
	ret
 | 
						ret
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	EPILOGUE
 | 
						EPILOGUE
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -89,17 +89,24 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define STACKSIZE	16
 | 
					#define STACKSIZE	16
 | 
				
			||||||
 | 
					#define ARGS	20
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define M		 4 + STACKSIZE(%esp)
 | 
					#define M		 4 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define N		 8 + STACKSIZE(%esp)
 | 
					#define N		 8 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define ALPHA		16 + STACKSIZE(%esp)
 | 
					#define ALPHA		16 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define A		20 + STACKSIZE(%esp)
 | 
					#define A		20 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_LDA	24 + STACKSIZE(%esp)
 | 
					#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_X		28 + STACKSIZE(%esp)
 | 
					#define STACK_X		28 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCX	32 + STACKSIZE(%esp)
 | 
					#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define Y		36 + STACKSIZE(%esp)
 | 
					#define Y		36 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCY	40 + STACKSIZE(%esp)
 | 
					#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define BUFFER		44 + STACKSIZE(%esp)
 | 
					#define BUFFER		44 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MMM	0+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define NN	4+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define AA	8+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define LDAX	12+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define XX	16+STACKSIZE(%esp)
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#define I	%eax
 | 
					#define I	%eax
 | 
				
			||||||
#define J	%ebx
 | 
					#define J	%ebx
 | 
				
			||||||
| 
						 | 
					@ -114,6 +121,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROLOGUE
 | 
						PROLOGUE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						subl	$ARGS,%esp
 | 
				
			||||||
	pushl	%ebp
 | 
						pushl	%ebp
 | 
				
			||||||
	pushl	%edi
 | 
						pushl	%edi
 | 
				
			||||||
	pushl	%esi
 | 
						pushl	%esi
 | 
				
			||||||
| 
						 | 
					@ -122,7 +130,42 @@
 | 
				
			||||||
	PROFCODE
 | 
						PROFCODE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movl	STACK_LDA,  LDA
 | 
						movl	STACK_LDA,  LDA
 | 
				
			||||||
 | 
						movl	LDA,LDAX			# backup LDA
 | 
				
			||||||
	movl	STACK_X,    X
 | 
						movl	STACK_X,    X
 | 
				
			||||||
 | 
						movl	X,XX
 | 
				
			||||||
 | 
						movl	N,J
 | 
				
			||||||
 | 
						movl	J,NN				# backup N
 | 
				
			||||||
 | 
						movl	A,J
 | 
				
			||||||
 | 
						movl	J,AA				# backup A
 | 
				
			||||||
 | 
					    movl	M,J
 | 
				
			||||||
 | 
						movl	J,MMM				# mov M to MMM
 | 
				
			||||||
 | 
					.L0t:
 | 
				
			||||||
 | 
						xorl	J,J
 | 
				
			||||||
 | 
						addl	$1,J
 | 
				
			||||||
 | 
						sall    $22,J                           # J=2^22*sizeof(float)=buffer size(16MB)
 | 
				
			||||||
 | 
						subl    $8, J                           # Don't use the last 8 floats in the buffer.
 | 
				
			||||||
 | 
						                                        # Now, split M by block J
 | 
				
			||||||
 | 
						subl	J,MMM				# MMM=MMM-J
 | 
				
			||||||
 | 
						movl	J,M		
 | 
				
			||||||
 | 
						jge		.L00t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						movl	MMM,%eax
 | 
				
			||||||
 | 
						addl	J,%eax
 | 
				
			||||||
 | 
						jle		.L999x
 | 
				
			||||||
 | 
						movl	%eax,M
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L00t:
 | 
				
			||||||
 | 
						movl	AA,%eax
 | 
				
			||||||
 | 
						movl	%eax,A			 	# mov AA to A
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	NN,%eax
 | 
				
			||||||
 | 
						movl	%eax,N				# reset N
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	LDAX,  LDA			# reset LDA
 | 
				
			||||||
 | 
						movl	XX,X
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movl	STACK_INCX, INCX
 | 
						movl	STACK_INCX, INCX
 | 
				
			||||||
	movl	STACK_INCY, INCY
 | 
						movl	STACK_INCY, INCY
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -198,6 +241,20 @@
 | 
				
			||||||
	jg	.L06
 | 
						jg	.L06
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Pad with zeros to prevent loading dirty values from the buffer.
 | 
				
			||||||
 | 
						movl	M,  I
 | 
				
			||||||
 | 
						movl	$8, J
 | 
				
			||||||
 | 
						andl	$7, I
 | 
				
			||||||
 | 
						xorps	%xmm0, %xmm0
 | 
				
			||||||
 | 
						subl	I, J
 | 
				
			||||||
 | 
						ALIGN_2
 | 
				
			||||||
 | 
					.L07:
 | 
				
			||||||
 | 
						movss	%xmm0, 0 * SIZE(Y1)
 | 
				
			||||||
 | 
						addl	$SIZE, Y1
 | 
				
			||||||
 | 
						decl	J
 | 
				
			||||||
 | 
						jg	.L07
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L10:
 | 
					.L10:
 | 
				
			||||||
	movl	Y, Y1
 | 
						movl	Y, Y1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -628,10 +685,22 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 	
 | 
					 	
 | 
				
			||||||
.L999:
 | 
					.L999:
 | 
				
			||||||
 | 
						movl	M,J
 | 
				
			||||||
 | 
						leal	(,J,SIZE),%eax
 | 
				
			||||||
 | 
						addl	%eax,AA
 | 
				
			||||||
 | 
						movl	XX,J
 | 
				
			||||||
 | 
						addl	%eax,J
 | 
				
			||||||
 | 
						movl	J,XX
 | 
				
			||||||
 | 
						jmp		.L0t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L999x:
 | 
				
			||||||
	popl	%ebx
 | 
						popl	%ebx
 | 
				
			||||||
	popl	%esi
 | 
						popl	%esi
 | 
				
			||||||
	popl	%edi	
 | 
						popl	%edi	
 | 
				
			||||||
	popl	%ebp
 | 
						popl	%ebp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						addl	$ARGS,%esp
 | 
				
			||||||
	ret
 | 
						ret
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	EPILOGUE
 | 
						EPILOGUE
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,17 +76,23 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define STACKSIZE	16
 | 
					#define STACKSIZE	16
 | 
				
			||||||
 | 
					#define ARGS	16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define M		 4 + STACKSIZE(%esp)
 | 
					#define M		 4 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define N		 8 + STACKSIZE(%esp)
 | 
					#define N		 8 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define ALPHA		16 + STACKSIZE(%esp)
 | 
					#define ALPHA		16 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define A		24 + STACKSIZE(%esp)
 | 
					#define A		24 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_LDA	28 + STACKSIZE(%esp)
 | 
					#define STACK_LDA	28 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_X		32 + STACKSIZE(%esp)
 | 
					#define STACK_X		32 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCX	36 + STACKSIZE(%esp)
 | 
					#define STACK_INCX	36 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define Y		40 + STACKSIZE(%esp)
 | 
					#define Y		40 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define STACK_INCY	44 + STACKSIZE(%esp)
 | 
					#define STACK_INCY	44 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
#define BUFFER		48 + STACKSIZE(%esp)
 | 
					#define BUFFER		48 + STACKSIZE+ARGS(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define MMM	0+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define AA	4+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define LDAX 8+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					#define NN	12+STACKSIZE(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define I	%eax
 | 
					#define I	%eax
 | 
				
			||||||
#define J	%ebx
 | 
					#define J	%ebx
 | 
				
			||||||
| 
						 | 
					@ -101,6 +107,8 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROLOGUE
 | 
						PROLOGUE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						subl	$ARGS,%esp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	pushl	%ebp
 | 
						pushl	%ebp
 | 
				
			||||||
	pushl	%edi
 | 
						pushl	%edi
 | 
				
			||||||
	pushl	%esi
 | 
						pushl	%esi
 | 
				
			||||||
| 
						 | 
					@ -108,7 +116,40 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	PROFCODE
 | 
						PROFCODE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movl	STACK_LDA,  LDA
 | 
						movl	STACK_LDA,  LDA
 | 
				
			||||||
 | 
						movl	LDA,LDAX			# backup LDA
 | 
				
			||||||
 | 
						movl	N,J
 | 
				
			||||||
 | 
						movl	J,NN				# backup N
 | 
				
			||||||
 | 
						movl	A,J
 | 
				
			||||||
 | 
						movl	J,AA				# backup A
 | 
				
			||||||
 | 
					    movl	M,J
 | 
				
			||||||
 | 
						movl	J,MMM				# mov M to MMM
 | 
				
			||||||
 | 
					.L0t:
 | 
				
			||||||
 | 
						xorl	J,J
 | 
				
			||||||
 | 
						addl	$1,J
 | 
				
			||||||
 | 
						sall    $21,J                           # J=2^21*sizeof(double)=buffer size(16MB)
 | 
				
			||||||
 | 
						subl    $4, J                           # Don't use the last 4 doubles in the buffer.
 | 
				
			||||||
 | 
						                                        # Now, split M by block J
 | 
				
			||||||
 | 
						subl	J,MMM				# MMM=MMM-J
 | 
				
			||||||
 | 
						movl	J,M		
 | 
				
			||||||
 | 
						jge		.L00t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						movl	MMM,%eax
 | 
				
			||||||
 | 
						addl	J,%eax
 | 
				
			||||||
 | 
						jle		.L999x
 | 
				
			||||||
 | 
						movl	%eax,M
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L00t:
 | 
				
			||||||
 | 
						movl	AA,%eax
 | 
				
			||||||
 | 
						movl	%eax,A			 	# mov AA to A
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	NN,%eax
 | 
				
			||||||
 | 
						movl	%eax,N				# reset N
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movl	LDAX,  LDA			# reset LDA
 | 
				
			||||||
	movl	STACK_X,    X
 | 
						movl	STACK_X,    X
 | 
				
			||||||
	movl	STACK_INCX, INCX
 | 
						movl	STACK_INCX, INCX
 | 
				
			||||||
	movl	STACK_INCY, INCY
 | 
						movl	STACK_INCY, INCY
 | 
				
			||||||
| 
						 | 
					@ -117,6 +158,7 @@
 | 
				
			||||||
	leal	(,INCY, SIZE), INCY
 | 
						leal	(,INCY, SIZE), INCY
 | 
				
			||||||
	leal	(,LDA,  SIZE), LDA
 | 
						leal	(,LDA,  SIZE), LDA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	subl	$-16 * SIZE, A
 | 
						subl	$-16 * SIZE, A
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	cmpl	$0, N
 | 
						cmpl	$0, N
 | 
				
			||||||
| 
						 | 
					@ -560,10 +602,19 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
.L999:
 | 
					.L999:
 | 
				
			||||||
 | 
						movl 	M,J
 | 
				
			||||||
 | 
						leal 	(,J,SIZE),%eax
 | 
				
			||||||
 | 
						addl	%eax,AA
 | 
				
			||||||
 | 
						jmp		.L0t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L999x:
 | 
				
			||||||
	popl	%ebx
 | 
						popl	%ebx
 | 
				
			||||||
	popl	%esi
 | 
						popl	%esi
 | 
				
			||||||
	popl	%edi	
 | 
						popl	%edi	
 | 
				
			||||||
	popl	%ebp
 | 
						popl	%ebp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						addl	$ARGS,%esp
 | 
				
			||||||
	ret
 | 
						ret
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	EPILOGUE
 | 
						EPILOGUE
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -269,7 +269,7 @@
 | 
				
			||||||
	sarl	$5, I
 | 
						sarl	$5, I
 | 
				
			||||||
	jle	.L113
 | 
						jle	.L113
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA)
 | 
					#if defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movaps	%xmm0, %xmm1
 | 
						movaps	%xmm0, %xmm1
 | 
				
			||||||
	mulps	-32 * SIZE(X), %xmm1
 | 
						mulps	-32 * SIZE(X), %xmm1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -253,7 +253,7 @@
 | 
				
			||||||
	sarl	$4, I
 | 
						sarl	$4, I
 | 
				
			||||||
	jle	.L113
 | 
						jle	.L113
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA)
 | 
					#if defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movaps  %xmm0, %xmm1
 | 
						movaps  %xmm0, %xmm1
 | 
				
			||||||
	mulpd	-16 * SIZE(X), %xmm1
 | 
						mulpd	-16 * SIZE(X), %xmm1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -69,7 +69,7 @@
 | 
				
			||||||
#define STACK_ALIGN	4096
 | 
					#define STACK_ALIGN	4096
 | 
				
			||||||
#define STACK_OFFSET	1024
 | 
					#define STACK_OFFSET	1024
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHSIZE (8 * 10 + 4)
 | 
					#define PREFETCHSIZE (8 * 10 + 4)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					@ -439,7 +439,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulsd	%xmm0, %xmm2
 | 
						mulsd	%xmm0, %xmm2
 | 
				
			||||||
	addsd	%xmm2, %xmm4
 | 
						addsd	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movlpd	 2 * SIZE(BB), %xmm2
 | 
						movlpd	 2 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -488,7 +488,7 @@
 | 
				
			||||||
	movlpd	40 * SIZE(BB), %xmm3
 | 
						movlpd	40 * SIZE(BB), %xmm3
 | 
				
			||||||
	addsd	%xmm0, %xmm7
 | 
						addsd	%xmm0, %xmm7
 | 
				
			||||||
	movlpd	 8 * SIZE(AA), %xmm0
 | 
						movlpd	 8 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
						PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulsd	%xmm1, %xmm2
 | 
						mulsd	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1697,7 +1697,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L42:
 | 
					.L42:
 | 
				
			||||||
	mulpd	%xmm0, %xmm2
 | 
						mulpd	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulpd	 2 * SIZE(BB), %xmm0
 | 
						mulpd	 2 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1727,7 +1727,7 @@
 | 
				
			||||||
	addpd	%xmm0, %xmm7
 | 
						addpd	%xmm0, %xmm7
 | 
				
			||||||
	movapd	16 * SIZE(AA), %xmm0
 | 
						movapd	16 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulpd	%xmm1, %xmm2
 | 
						mulpd	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -64,7 +64,7 @@
 | 
				
			||||||
#define BORIG	60(%esp)
 | 
					#define BORIG	60(%esp)
 | 
				
			||||||
#define BUFFER 128(%esp)
 | 
					#define BUFFER 128(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHSIZE (16 * 10 + 8)
 | 
					#define PREFETCHSIZE (16 * 10 + 8)
 | 
				
			||||||
| 
						 | 
					@ -437,7 +437,7 @@
 | 
				
			||||||
.L32:
 | 
					.L32:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
	addss	%xmm2, %xmm4
 | 
						addss	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BB), %xmm2
 | 
						movss	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -833,7 +833,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(BB), %xmm2
 | 
						movaps	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1848,7 +1848,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L72:
 | 
					.L72:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulss	 4 * SIZE(BB), %xmm0
 | 
						mulss	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2109,7 +2109,7 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L62:
 | 
					.L62:
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2429,7 +2429,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L52:
 | 
					.L52:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	 4 * SIZE(BB), %xmm0
 | 
						mulps	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2459,7 +2459,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm5
 | 
						addps	%xmm0, %xmm5
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2952,7 +2952,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L112:
 | 
					.L112:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 1 * SIZE(AA), %xmm0
 | 
						movss	 1 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -3148,7 +3148,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L102:
 | 
					.L102:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 2 * SIZE(AA), %xmm0
 | 
						movsd	 2 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -3389,7 +3389,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L92:
 | 
					.L92:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(AA), %xmm0
 | 
						movaps	 4 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -3404,7 +3404,7 @@
 | 
				
			||||||
	mulps	12 * SIZE(BB), %xmm0
 | 
						mulps	12 * SIZE(BB), %xmm0
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm3
 | 
						mulps	%xmm1, %xmm3
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -69,7 +69,7 @@
 | 
				
			||||||
#define STACK_ALIGN	4096
 | 
					#define STACK_ALIGN	4096
 | 
				
			||||||
#define STACK_OFFSET	1024
 | 
					#define STACK_OFFSET	1024
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHSIZE (8 * 10 + 4)
 | 
					#define PREFETCHSIZE (8 * 10 + 4)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					@ -910,7 +910,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulsd	%xmm0, %xmm2
 | 
						mulsd	%xmm0, %xmm2
 | 
				
			||||||
	addsd	%xmm2, %xmm4
 | 
						addsd	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movlpd	 2 * SIZE(BB), %xmm2
 | 
						movlpd	 2 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -959,7 +959,7 @@
 | 
				
			||||||
	movlpd	40 * SIZE(BB), %xmm3
 | 
						movlpd	40 * SIZE(BB), %xmm3
 | 
				
			||||||
	addsd	%xmm0, %xmm7
 | 
						addsd	%xmm0, %xmm7
 | 
				
			||||||
	movlpd	 8 * SIZE(AA), %xmm0
 | 
						movlpd	 8 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
						PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulsd	%xmm1, %xmm2
 | 
						mulsd	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1439,7 +1439,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L42:
 | 
					.L42:
 | 
				
			||||||
	mulpd	%xmm0, %xmm2
 | 
						mulpd	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulpd	 2 * SIZE(BB), %xmm0
 | 
						mulpd	 2 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1469,7 +1469,7 @@
 | 
				
			||||||
	addpd	%xmm0, %xmm7
 | 
						addpd	%xmm0, %xmm7
 | 
				
			||||||
	movapd	16 * SIZE(AA), %xmm0
 | 
						movapd	16 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulpd	%xmm1, %xmm2
 | 
						mulpd	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -64,7 +64,7 @@
 | 
				
			||||||
#define BORIG	60(%esp)
 | 
					#define BORIG	60(%esp)
 | 
				
			||||||
#define BUFFER 128(%esp)
 | 
					#define BUFFER 128(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHSIZE (16 * 10 + 8)
 | 
					#define PREFETCHSIZE (16 * 10 + 8)
 | 
				
			||||||
| 
						 | 
					@ -872,7 +872,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(BB), %xmm2
 | 
						movaps	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1316,7 +1316,7 @@
 | 
				
			||||||
.L32:
 | 
					.L32:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
	addss	%xmm2, %xmm4
 | 
						addss	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BB), %xmm2
 | 
						movss	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1855,7 +1855,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L52:
 | 
					.L52:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	 4 * SIZE(BB), %xmm0
 | 
						mulps	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1885,7 +1885,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm5
 | 
						addps	%xmm0, %xmm5
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2249,7 +2249,7 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L62:
 | 
					.L62:
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2562,7 +2562,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L72:
 | 
					.L72:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulss	 4 * SIZE(BB), %xmm0
 | 
						mulss	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2957,7 +2957,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L92:
 | 
					.L92:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(AA), %xmm0
 | 
						movaps	 4 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2972,7 +2972,7 @@
 | 
				
			||||||
	mulps	12 * SIZE(BB), %xmm0
 | 
						mulps	12 * SIZE(BB), %xmm0
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm3
 | 
						mulps	%xmm1, %xmm3
 | 
				
			||||||
| 
						 | 
					@ -3280,7 +3280,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L102:
 | 
					.L102:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 2 * SIZE(AA), %xmm0
 | 
						movsd	 2 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -3515,7 +3515,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L112:
 | 
					.L112:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 1 * SIZE(AA), %xmm0
 | 
						movss	 1 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -69,7 +69,7 @@
 | 
				
			||||||
#define STACK_ALIGN	4096
 | 
					#define STACK_ALIGN	4096
 | 
				
			||||||
#define STACK_OFFSET	1024
 | 
					#define STACK_OFFSET	1024
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHSIZE (8 * 10 + 4)
 | 
					#define PREFETCHSIZE (8 * 10 + 4)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					@ -1036,7 +1036,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L42:
 | 
					.L42:
 | 
				
			||||||
	mulpd	%xmm0, %xmm2
 | 
						mulpd	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulpd	 2 * SIZE(BB), %xmm0
 | 
						mulpd	 2 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1066,7 +1066,7 @@
 | 
				
			||||||
	addpd	%xmm0, %xmm7
 | 
						addpd	%xmm0, %xmm7
 | 
				
			||||||
	movapd	16 * SIZE(AA), %xmm0
 | 
						movapd	16 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulpd	%xmm1, %xmm2
 | 
						mulpd	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2224,7 +2224,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulsd	%xmm0, %xmm2
 | 
						mulsd	%xmm0, %xmm2
 | 
				
			||||||
	addsd	%xmm2, %xmm4
 | 
						addsd	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						PREFETCH (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movlpd	 2 * SIZE(BB), %xmm2
 | 
						movlpd	 2 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -2273,7 +2273,7 @@
 | 
				
			||||||
	movlpd	40 * SIZE(BB), %xmm3
 | 
						movlpd	40 * SIZE(BB), %xmm3
 | 
				
			||||||
	addsd	%xmm0, %xmm7
 | 
						addsd	%xmm0, %xmm7
 | 
				
			||||||
	movlpd	 8 * SIZE(AA), %xmm0
 | 
						movlpd	 8 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
						PREFETCH (PREFETCHSIZE  + 8) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulsd	%xmm1, %xmm2
 | 
						mulsd	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -64,7 +64,7 @@
 | 
				
			||||||
#define BORIG	60(%esp)
 | 
					#define BORIG	60(%esp)
 | 
				
			||||||
#define BUFFER 128(%esp)
 | 
					#define BUFFER 128(%esp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHSIZE (16 * 10 + 8)
 | 
					#define PREFETCHSIZE (16 * 10 + 8)
 | 
				
			||||||
| 
						 | 
					@ -439,7 +439,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L92:
 | 
					.L92:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(AA), %xmm0
 | 
						movaps	 4 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -454,7 +454,7 @@
 | 
				
			||||||
	mulps	12 * SIZE(BB), %xmm0
 | 
						mulps	12 * SIZE(BB), %xmm0
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm3
 | 
						mulps	%xmm1, %xmm3
 | 
				
			||||||
| 
						 | 
					@ -758,7 +758,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L102:
 | 
					.L102:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 2 * SIZE(AA), %xmm0
 | 
						movsd	 2 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -993,7 +993,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L112:
 | 
					.L112:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 1 * SIZE(AA), %xmm0
 | 
						movss	 1 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1324,7 +1324,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L52:
 | 
					.L52:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	 4 * SIZE(BB), %xmm0
 | 
						mulps	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1354,7 +1354,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm5
 | 
						addps	%xmm0, %xmm5
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1718,7 +1718,7 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L62:
 | 
					.L62:
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2031,7 +2031,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L72:
 | 
					.L72:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulss	 4 * SIZE(BB), %xmm0
 | 
						mulss	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2859,7 +2859,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(BB), %xmm2
 | 
						movaps	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -3303,7 +3303,7 @@
 | 
				
			||||||
.L32:
 | 
					.L32:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
	addss	%xmm2, %xmm4
 | 
						addss	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BB), %xmm2
 | 
						movss	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -74,7 +74,7 @@
 | 
				
			||||||
#define	BB	%ecx
 | 
					#define	BB	%ecx
 | 
				
			||||||
#define LDC	%ebp
 | 
					#define LDC	%ebp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define movsd	movlps
 | 
					#define movsd	movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -625,7 +625,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 4 * SIZE(BB), %xmm2
 | 
						movsd	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -870,7 +870,7 @@
 | 
				
			||||||
.L32:
 | 
					.L32:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
	addss	%xmm2, %xmm4
 | 
						addss	%xmm2, %xmm4
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BB), %xmm2
 | 
						movss	 4 * SIZE(BB), %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1173,7 +1173,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L52:
 | 
					.L52:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	 4 * SIZE(BB), %xmm0
 | 
						mulps	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1203,7 +1203,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm5
 | 
						addps	%xmm0, %xmm5
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
| 
						 | 
					@ -1359,7 +1359,7 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L62:
 | 
					.L62:
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1536,7 +1536,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L72:
 | 
					.L72:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulss	 4 * SIZE(BB), %xmm0
 | 
						mulss	 4 * SIZE(BB), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1794,7 +1794,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L92:
 | 
					.L92:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(AA), %xmm0
 | 
						movaps	 4 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -1809,7 +1809,7 @@
 | 
				
			||||||
	mulps	12 * SIZE(BB), %xmm0
 | 
						mulps	12 * SIZE(BB), %xmm0
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movaps	32 * SIZE(AA), %xmm0
 | 
						movaps	32 * SIZE(AA), %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm1, %xmm3
 | 
						mulps	%xmm1, %xmm3
 | 
				
			||||||
| 
						 | 
					@ -1936,7 +1936,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L102:
 | 
					.L102:
 | 
				
			||||||
	mulps	%xmm0, %xmm2
 | 
						mulps	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 2 * SIZE(AA), %xmm0
 | 
						movsd	 2 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					@ -2069,7 +2069,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L112:
 | 
					.L112:
 | 
				
			||||||
	mulss	%xmm0, %xmm2
 | 
						mulss	%xmm0, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
						prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 1 * SIZE(AA), %xmm0
 | 
						movss	 1 * SIZE(AA), %xmm0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -71,7 +71,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef BARCELONA
 | 
					#if defined(BARCELONA)  || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetchnta
 | 
					#define PREFETCH	prefetchnta
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 5)
 | 
					#define PREFETCHSIZE	(16 * 5)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -58,7 +58,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef BARCELONA
 | 
					#if defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetchnta
 | 
					#define PREFETCH	prefetchnta
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(8 * 5)
 | 
					#define PREFETCHSIZE	(8 * 5)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -71,7 +71,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef BARCELONA
 | 
					#if defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetchnta
 | 
					#define PREFETCH	prefetchnta
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 5)
 | 
					#define PREFETCHSIZE	(16 * 5)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -58,7 +58,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef BARCELONA
 | 
					#if defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetchnta
 | 
					#define PREFETCH	prefetchnta
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(8 * 5)
 | 
					#define PREFETCHSIZE	(8 * 5)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -75,7 +75,7 @@
 | 
				
			||||||
#define STACK_ALIGN	4096
 | 
					#define STACK_ALIGN	4096
 | 
				
			||||||
#define STACK_OFFSET	1024
 | 
					#define STACK_OFFSET	1024
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCHSIZE (16 * 10 + 8)
 | 
					#define PREFETCHSIZE (16 * 10 + 8)
 | 
				
			||||||
#define WPREFETCHSIZE 112
 | 
					#define WPREFETCHSIZE 112
 | 
				
			||||||
#define PREFETCH      prefetch
 | 
					#define PREFETCH      prefetch
 | 
				
			||||||
| 
						 | 
					@ -533,7 +533,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movsd	16 * SIZE(AA), %xmm0
 | 
						movsd	16 * SIZE(AA), %xmm0
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA)
 | 
						prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -75,7 +75,7 @@
 | 
				
			||||||
#define STACK_ALIGN	4096
 | 
					#define STACK_ALIGN	4096
 | 
				
			||||||
#define STACK_OFFSET	1024
 | 
					#define STACK_OFFSET	1024
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCHSIZE (16 * 10 + 8)
 | 
					#define PREFETCHSIZE (16 * 10 + 8)
 | 
				
			||||||
#define WPREFETCHSIZE 112
 | 
					#define WPREFETCHSIZE 112
 | 
				
			||||||
#define PREFETCH      prefetch
 | 
					#define PREFETCH      prefetch
 | 
				
			||||||
| 
						 | 
					@ -994,7 +994,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movsd	16 * SIZE(AA), %xmm0
 | 
						movsd	16 * SIZE(AA), %xmm0
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA)
 | 
						prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -75,7 +75,7 @@
 | 
				
			||||||
#define STACK_ALIGN	4096
 | 
					#define STACK_ALIGN	4096
 | 
				
			||||||
#define STACK_OFFSET	1024
 | 
					#define STACK_OFFSET	1024
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCHSIZE (16 * 10 + 8)
 | 
					#define PREFETCHSIZE (16 * 10 + 8)
 | 
				
			||||||
#define WPREFETCHSIZE 112
 | 
					#define WPREFETCHSIZE 112
 | 
				
			||||||
#define PREFETCH      prefetch
 | 
					#define PREFETCH      prefetch
 | 
				
			||||||
| 
						 | 
					@ -1820,7 +1820,7 @@
 | 
				
			||||||
	addps	%xmm0, %xmm7
 | 
						addps	%xmm0, %xmm7
 | 
				
			||||||
	movsd	16 * SIZE(AA), %xmm0
 | 
						movsd	16 * SIZE(AA), %xmm0
 | 
				
			||||||
	mulps	%xmm1, %xmm2
 | 
						mulps	%xmm1, %xmm2
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
	prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA)
 | 
						prefetcht1     (PREFETCHSIZE + 16) * SIZE(AA)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	addps	%xmm2, %xmm4
 | 
						addps	%xmm2, %xmm4
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1,62 @@
 | 
				
			||||||
 | 
					ZGEMVNKERNEL = zgemv_n_dup.S
 | 
				
			||||||
 | 
					ZGEMVTKERNEL = zgemv_t_dup.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					SGEMMKERNEL    =  gemm_kernel_8x4_barcelona.S
 | 
				
			||||||
 | 
					SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
 | 
				
			||||||
 | 
					SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
 | 
				
			||||||
 | 
					SGEMMONCOPY    =  gemm_ncopy_4_opteron.S
 | 
				
			||||||
 | 
					SGEMMOTCOPY    =  gemm_tcopy_4_opteron.S
 | 
				
			||||||
 | 
					SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX) 
 | 
				
			||||||
 | 
					SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX) 
 | 
				
			||||||
 | 
					SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					DGEMMKERNEL    =  dgemm_kernel_4x4_bulldozer.S
 | 
				
			||||||
 | 
					DGEMMINCOPY    =
 | 
				
			||||||
 | 
					DGEMMITCOPY    =
 | 
				
			||||||
 | 
					DGEMMONCOPY    =  gemm_ncopy_4_opteron.S
 | 
				
			||||||
 | 
					DGEMMOTCOPY    =  gemm_tcopy_4_opteron.S
 | 
				
			||||||
 | 
					DGEMMINCOPYOBJ =
 | 
				
			||||||
 | 
					DGEMMITCOPYOBJ =
 | 
				
			||||||
 | 
					DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					CGEMMKERNEL    =  zgemm_kernel_4x2_barcelona.S
 | 
				
			||||||
 | 
					CGEMMINCOPY    =  ../generic/zgemm_ncopy_4.c
 | 
				
			||||||
 | 
					CGEMMITCOPY    =  ../generic/zgemm_tcopy_4.c
 | 
				
			||||||
 | 
					CGEMMONCOPY    =  zgemm_ncopy_2.S
 | 
				
			||||||
 | 
					CGEMMOTCOPY    =  zgemm_tcopy_2.S
 | 
				
			||||||
 | 
					CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					ZGEMMKERNEL    =  zgemm_kernel_2x2_barcelona.S
 | 
				
			||||||
 | 
					ZGEMMINCOPY    =
 | 
				
			||||||
 | 
					ZGEMMITCOPY    =
 | 
				
			||||||
 | 
					ZGEMMONCOPY    =  zgemm_ncopy_2.S
 | 
				
			||||||
 | 
					ZGEMMOTCOPY    =  zgemm_tcopy_2.S
 | 
				
			||||||
 | 
					ZGEMMINCOPYOBJ =
 | 
				
			||||||
 | 
					ZGEMMITCOPYOBJ =
 | 
				
			||||||
 | 
					ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					STRSMKERNEL_LN	=  trsm_kernel_LN_8x4_sse.S
 | 
				
			||||||
 | 
					STRSMKERNEL_LT	=  trsm_kernel_LT_8x4_sse.S
 | 
				
			||||||
 | 
					STRSMKERNEL_RN	=  trsm_kernel_LT_8x4_sse.S
 | 
				
			||||||
 | 
					STRSMKERNEL_RT	=  trsm_kernel_RT_8x4_sse.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					DTRSMKERNEL_LN	=  trsm_kernel_LN_4x4_barcelona.S
 | 
				
			||||||
 | 
					DTRSMKERNEL_LT	=  trsm_kernel_LT_4x4_barcelona.S
 | 
				
			||||||
 | 
					DTRSMKERNEL_RN	=  trsm_kernel_LT_4x4_barcelona.S
 | 
				
			||||||
 | 
					DTRSMKERNEL_RT	=  trsm_kernel_RT_4x4_barcelona.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					CTRSMKERNEL_LN	=  ztrsm_kernel_LN_4x2_sse.S
 | 
				
			||||||
 | 
					CTRSMKERNEL_LT	=  ztrsm_kernel_LT_4x2_sse.S
 | 
				
			||||||
 | 
					CTRSMKERNEL_RN	=  ztrsm_kernel_LT_4x2_sse.S
 | 
				
			||||||
 | 
					CTRSMKERNEL_RT	=  ztrsm_kernel_RT_4x2_sse.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ZTRSMKERNEL_LN	=  ztrsm_kernel_LN_2x2_sse2.S
 | 
				
			||||||
 | 
					ZTRSMKERNEL_LT	=  ztrsm_kernel_LT_2x2_sse2.S
 | 
				
			||||||
 | 
					ZTRSMKERNEL_RN	=  ztrsm_kernel_LT_2x2_sse2.S
 | 
				
			||||||
 | 
					ZTRSMKERNEL_RT	=  ztrsm_kernel_RT_2x2_sse2.S
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					CGEMM3MKERNEL    =  zgemm3m_kernel_8x4_barcelona.S
 | 
				
			||||||
 | 
					ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_barcelona.S
 | 
				
			||||||
| 
						 | 
					@ -69,7 +69,7 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	%xmm0,  ALPHA
 | 
						movaps	%xmm0,  ALPHA
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
	movaps	%xmm3,  ALPHA
 | 
						
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movq	40(%rsp), X
 | 
						movq	40(%rsp), X
 | 
				
			||||||
	movq	48(%rsp), INCX
 | 
						movq	48(%rsp), INCX
 | 
				
			||||||
| 
						 | 
					@ -79,6 +79,10 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	SAVEREGISTERS
 | 
						SAVEREGISTERS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef WINDOWS_ABI
 | 
				
			||||||
 | 
						movaps	%xmm3,  ALPHA
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
						
 | 
				
			||||||
	shufps	$0, ALPHA, ALPHA
 | 
						shufps	$0, ALPHA, ALPHA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	leaq	(, INCX, SIZE), INCX
 | 
						leaq	(, INCX, SIZE), INCX
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -69,7 +69,6 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	%xmm0,  ALPHA
 | 
						movaps	%xmm0,  ALPHA
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
	movaps	%xmm3,  ALPHA
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movq	40(%rsp), X
 | 
						movq	40(%rsp), X
 | 
				
			||||||
	movq	48(%rsp), INCX
 | 
						movq	48(%rsp), INCX
 | 
				
			||||||
| 
						 | 
					@ -79,6 +78,10 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	SAVEREGISTERS
 | 
						SAVEREGISTERS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef WINDOWS_ABI
 | 
				
			||||||
 | 
						movaps	%xmm3,  ALPHA
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	unpcklpd ALPHA, ALPHA
 | 
						unpcklpd ALPHA, ALPHA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	leaq	(, INCX, SIZE), INCX
 | 
						leaq	(, INCX, SIZE), INCX
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| 
						 | 
					@ -74,6 +74,11 @@
 | 
				
			||||||
#define STACK_Y		 72 + STACKSIZE(%rsp)
 | 
					#define STACK_Y		 72 + STACKSIZE(%rsp)
 | 
				
			||||||
#define STACK_INCY	 80 + STACKSIZE(%rsp)
 | 
					#define STACK_INCY	 80 + STACKSIZE(%rsp)
 | 
				
			||||||
#define STACK_BUFFER	 88 + STACKSIZE(%rsp)
 | 
					#define STACK_BUFFER	 88 + STACKSIZE(%rsp)
 | 
				
			||||||
 | 
					//Temp variables for M,N,A,LDA
 | 
				
			||||||
 | 
					#define MMM	224(%rsp)
 | 
				
			||||||
 | 
					#define NN	232(%rsp)
 | 
				
			||||||
 | 
					#define AA	240(%rsp)
 | 
				
			||||||
 | 
					#define LDAX	248(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -134,6 +139,12 @@
 | 
				
			||||||
	movq	OLD_A,        A
 | 
						movq	OLD_A,        A
 | 
				
			||||||
	movq	OLD_LDA,      LDA
 | 
						movq	OLD_LDA,      LDA
 | 
				
			||||||
	movq	OLD_X,        X
 | 
						movq	OLD_X,        X
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						movq	M,	      MMM
 | 
				
			||||||
 | 
						movq	N,            NN
 | 
				
			||||||
 | 
						movq	A,            AA
 | 
				
			||||||
 | 
						movq	LDA,	      LDAX
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
	movq	OLD_M,	      MMM
 | 
						movq	OLD_M,	      MMM
 | 
				
			||||||
	movq	OLD_N,        NN
 | 
						movq	OLD_N,        NN
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -530,7 +530,7 @@
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	-32 * SIZE(Y), %xmm8
 | 
						movsd	-32 * SIZE(Y), %xmm8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	pshufd	$0x39, %xmm4,  %xmm5
 | 
						pshufd	$0x29, %xmm4,  %xmm5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	mulps	%xmm8,  %xmm5
 | 
						mulps	%xmm8,  %xmm5
 | 
				
			||||||
	addps	%xmm5,  %xmm3
 | 
						addps	%xmm5,  %xmm3
 | 
				
			||||||
| 
						 | 
					@ -750,7 +750,8 @@
 | 
				
			||||||
	xorps	%xmm5, %xmm5
 | 
						xorps	%xmm5, %xmm5
 | 
				
			||||||
	movhlps	%xmm4, %xmm5
 | 
						movhlps	%xmm4, %xmm5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	mulps	-32 * SIZE(Y), %xmm5
 | 
						movlps  -32 * SIZE(Y), %xmm4
 | 
				
			||||||
 | 
						mulps	%xmm4, %xmm5
 | 
				
			||||||
	addps	%xmm5, %xmm0
 | 
						addps	%xmm5, %xmm0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	addq	$2 * SIZE, X
 | 
						addq	$2 * SIZE, X
 | 
				
			||||||
| 
						 | 
					@ -992,7 +993,7 @@
 | 
				
			||||||
	movsd	-32 * SIZE(Y), %xmm8
 | 
						movsd	-32 * SIZE(Y), %xmm8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movss	%xmm5, %xmm4
 | 
						movss	%xmm5, %xmm4
 | 
				
			||||||
	shufps	$0x93, %xmm5,  %xmm4
 | 
						shufps	$0x93, %xmm4,  %xmm4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	mulps	%xmm8,  %xmm4
 | 
						mulps	%xmm8,  %xmm4
 | 
				
			||||||
	addps	%xmm4,  %xmm3
 | 
						addps	%xmm4,  %xmm3
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -930,7 +930,7 @@
 | 
				
			||||||
.L22:
 | 
					.L22:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
	addps	%xmm9, %xmm0
 | 
						addps	%xmm9, %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	 4 * SIZE(BO), %xmm9
 | 
						movaps	 4 * SIZE(BO), %xmm9
 | 
				
			||||||
| 
						 | 
					@ -983,7 +983,7 @@
 | 
				
			||||||
	addps	%xmm8, %xmm3
 | 
						addps	%xmm8, %xmm3
 | 
				
			||||||
	movaps	 0 * SIZE(AO), %xmm8
 | 
						movaps	 0 * SIZE(AO), %xmm8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm10, %xmm9
 | 
						mulps	%xmm10, %xmm9
 | 
				
			||||||
| 
						 | 
					@ -1178,7 +1178,7 @@
 | 
				
			||||||
.L32:
 | 
					.L32:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
	addps	%xmm9, %xmm0
 | 
						addps	%xmm9, %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 4 * SIZE(BO), %xmm9
 | 
						movsd	 4 * SIZE(BO), %xmm9
 | 
				
			||||||
| 
						 | 
					@ -1423,7 +1423,7 @@
 | 
				
			||||||
.L42:
 | 
					.L42:
 | 
				
			||||||
	mulss	%xmm8, %xmm9
 | 
						mulss	%xmm8, %xmm9
 | 
				
			||||||
	addss	%xmm9, %xmm0
 | 
						addss	%xmm9, %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BO), %xmm9
 | 
						movss	 4 * SIZE(BO), %xmm9
 | 
				
			||||||
| 
						 | 
					@ -1765,7 +1765,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L62:
 | 
					.L62:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	 4 * SIZE(BO), %xmm8
 | 
						mulps	 4 * SIZE(BO), %xmm8
 | 
				
			||||||
| 
						 | 
					@ -1793,7 +1793,7 @@
 | 
				
			||||||
	addps	%xmm8, %xmm5
 | 
						addps	%xmm8, %xmm5
 | 
				
			||||||
	movaps	32 * SIZE(AO), %xmm8
 | 
						movaps	32 * SIZE(AO), %xmm8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm10, %xmm11
 | 
						mulps	%xmm10, %xmm11
 | 
				
			||||||
| 
						 | 
					@ -1822,7 +1822,7 @@
 | 
				
			||||||
	addps	%xmm10, %xmm5
 | 
						addps	%xmm10, %xmm5
 | 
				
			||||||
	movaps	48 * SIZE(AO), %xmm10
 | 
						movaps	48 * SIZE(AO), %xmm10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm12, %xmm13
 | 
						mulps	%xmm12, %xmm13
 | 
				
			||||||
| 
						 | 
					@ -1851,7 +1851,7 @@
 | 
				
			||||||
	addps	%xmm12, %xmm5
 | 
						addps	%xmm12, %xmm5
 | 
				
			||||||
	movaps	64 * SIZE(AO), %xmm12
 | 
						movaps	64 * SIZE(AO), %xmm12
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm14, %xmm15
 | 
						mulps	%xmm14, %xmm15
 | 
				
			||||||
| 
						 | 
					@ -2024,7 +2024,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L72:
 | 
					.L72:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2208,7 +2208,7 @@
 | 
				
			||||||
.L82:
 | 
					.L82:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
	addps	%xmm9, %xmm0
 | 
						addps	%xmm9, %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	 4 * SIZE(BO), %xmm9
 | 
						movsd	 4 * SIZE(BO), %xmm9
 | 
				
			||||||
| 
						 | 
					@ -2395,7 +2395,7 @@
 | 
				
			||||||
.L92:
 | 
					.L92:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
	addps	%xmm9, %xmm0
 | 
						addps	%xmm9, %xmm0
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	 4 * SIZE(BO), %xmm9
 | 
						movss	 4 * SIZE(BO), %xmm9
 | 
				
			||||||
| 
						 | 
					@ -2670,7 +2670,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L112:
 | 
					.L112:
 | 
				
			||||||
	mulps	%xmm9, %xmm8
 | 
						mulps	%xmm9, %xmm8
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2687,7 +2687,7 @@
 | 
				
			||||||
	addps	%xmm9, %xmm4
 | 
						addps	%xmm9, %xmm4
 | 
				
			||||||
	movaps	 8 * SIZE(BO), %xmm9
 | 
						movaps	 8 * SIZE(BO), %xmm9
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm9, %xmm10
 | 
						mulps	%xmm9, %xmm10
 | 
				
			||||||
| 
						 | 
					@ -2704,7 +2704,7 @@
 | 
				
			||||||
	addps	%xmm9, %xmm4
 | 
						addps	%xmm9, %xmm4
 | 
				
			||||||
	movaps	32 * SIZE(BO), %xmm9
 | 
						movaps	32 * SIZE(BO), %xmm9
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm11, %xmm12
 | 
						mulps	%xmm11, %xmm12
 | 
				
			||||||
| 
						 | 
					@ -2721,7 +2721,7 @@
 | 
				
			||||||
	addps	%xmm11, %xmm4
 | 
						addps	%xmm11, %xmm4
 | 
				
			||||||
	movaps	24 * SIZE(BO), %xmm11
 | 
						movaps	24 * SIZE(BO), %xmm11
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm11, %xmm14
 | 
						mulps	%xmm11, %xmm14
 | 
				
			||||||
| 
						 | 
					@ -2857,7 +2857,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L122:
 | 
					.L122:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movaps	-28 * SIZE(AO), %xmm8
 | 
						movaps	-28 * SIZE(AO), %xmm8
 | 
				
			||||||
| 
						 | 
					@ -2873,7 +2873,7 @@
 | 
				
			||||||
	addps	%xmm8, %xmm3
 | 
						addps	%xmm8, %xmm3
 | 
				
			||||||
	movaps	  0 * SIZE(AO), %xmm8
 | 
						movaps	  0 * SIZE(AO), %xmm8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	mulps	%xmm10, %xmm11
 | 
						mulps	%xmm10, %xmm11
 | 
				
			||||||
| 
						 | 
					@ -3003,7 +3003,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L132:
 | 
					.L132:
 | 
				
			||||||
	mulps	%xmm8, %xmm9
 | 
						mulps	%xmm8, %xmm9
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movsd	-30 * SIZE(AO), %xmm8
 | 
						movsd	-30 * SIZE(AO), %xmm8
 | 
				
			||||||
| 
						 | 
					@ -3150,7 +3150,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L142:
 | 
					.L142:
 | 
				
			||||||
	mulss	%xmm8, %xmm9
 | 
						mulss	%xmm8, %xmm9
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
						PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	movss	-31 * SIZE(AO), %xmm8
 | 
						movss	-31 * SIZE(AO), %xmm8
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -39,7 +39,7 @@
 | 
				
			||||||
#define ASSEMBLER
 | 
					#define ASSEMBLER
 | 
				
			||||||
#include "common.h"
 | 
					#include "common.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define RPREFETCHSIZE (12 + 4)
 | 
					#define RPREFETCHSIZE (12 + 4)
 | 
				
			||||||
#define WPREFETCHSIZE (48 + 4)
 | 
					#define WPREFETCHSIZE (48 + 4)
 | 
				
			||||||
#define MOVNTQ	 MOVQ
 | 
					#define MOVNTQ	 MOVQ
 | 
				
			||||||
| 
						 | 
					@ -79,7 +79,7 @@
 | 
				
			||||||
#define AO3	%r13
 | 
					#define AO3	%r13
 | 
				
			||||||
#define AO4	%rax
 | 
					#define AO4	%rax
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define RPREFETCH prefetch
 | 
					#define RPREFETCH prefetch
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
#define RPREFETCH prefetch
 | 
					#define RPREFETCH prefetch
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -39,7 +39,7 @@
 | 
				
			||||||
#define ASSEMBLER
 | 
					#define ASSEMBLER
 | 
				
			||||||
#include "common.h"
 | 
					#include "common.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define RPREFETCHSIZE (12 + 4)
 | 
					#define RPREFETCHSIZE (12 + 4)
 | 
				
			||||||
#define WPREFETCHSIZE (12 + 4)
 | 
					#define WPREFETCHSIZE (12 + 4)
 | 
				
			||||||
#define MOVNTQ	 MOVQ
 | 
					#define MOVNTQ	 MOVQ
 | 
				
			||||||
| 
						 | 
					@ -96,7 +96,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define RPREFETCH prefetch
 | 
					#define RPREFETCH prefetch
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
#define RPREFETCH prefetch
 | 
					#define RPREFETCH prefetch
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -469,7 +469,7 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L71:
 | 
					.L71:
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
	prefetch	PREFETCHSIZE * SIZE(X)
 | 
						prefetch	PREFETCHSIZE * SIZE(X)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -266,7 +266,7 @@
 | 
				
			||||||
	sarq	$5, I
 | 
						sarq	$5, I
 | 
				
			||||||
	jle	.L113
 | 
						jle	.L113
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movaps	%xmm0, %xmm1
 | 
						movaps	%xmm0, %xmm1
 | 
				
			||||||
	mulps	-32 * SIZE(X), %xmm1
 | 
						mulps	-32 * SIZE(X), %xmm1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -251,7 +251,7 @@
 | 
				
			||||||
	sarq	$4, I
 | 
						sarq	$4, I
 | 
				
			||||||
	jle	.L113
 | 
						jle	.L113
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movaps  %xmm0, %xmm1
 | 
						movaps  %xmm0, %xmm1
 | 
				
			||||||
	mulpd	-16 * SIZE(X), %xmm1
 | 
						mulpd	-16 * SIZE(X), %xmm1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,4 +1,3 @@
 | 
				
			||||||
/*********************************************************************/
 | 
					 | 
				
			||||||
/* Copyright 2009, 2010 The University of Texas at Austin.           */
 | 
					/* Copyright 2009, 2010 The University of Texas at Austin.           */
 | 
				
			||||||
/* All rights reserved.                                              */
 | 
					/* All rights reserved.                                              */
 | 
				
			||||||
/*                                                                   */
 | 
					/*                                                                   */
 | 
				
			||||||
| 
						 | 
					@ -47,7 +46,7 @@
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#ifndef WINDOWS_ABI
 | 
					#ifndef WINDOWS_ABI
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define STACKSIZE	64
 | 
					#define STACKSIZE	128
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
#define OLD_M	  %rdi
 | 
					#define OLD_M	  %rdi
 | 
				
			||||||
#define OLD_N	  %rsi
 | 
					#define OLD_N	  %rsi
 | 
				
			||||||
| 
						 | 
					@ -57,6 +56,10 @@
 | 
				
			||||||
#define STACK_Y		16 + STACKSIZE(%rsp)
 | 
					#define STACK_Y		16 + STACKSIZE(%rsp)
 | 
				
			||||||
#define STACK_INCY	24 + STACKSIZE(%rsp)
 | 
					#define STACK_INCY	24 + STACKSIZE(%rsp)
 | 
				
			||||||
#define STACK_BUFFER	32 + STACKSIZE(%rsp)
 | 
					#define STACK_BUFFER	32 + STACKSIZE(%rsp)
 | 
				
			||||||
 | 
					#define MMM		56(%rsp)
 | 
				
			||||||
 | 
					#define NN		64(%rsp)
 | 
				
			||||||
 | 
					#define AA		72(%rsp)
 | 
				
			||||||
 | 
					#define LDAX	80(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -71,6 +74,10 @@
 | 
				
			||||||
#define STACK_Y		 72 + STACKSIZE(%rsp)
 | 
					#define STACK_Y		 72 + STACKSIZE(%rsp)
 | 
				
			||||||
#define STACK_INCY	 80 + STACKSIZE(%rsp)
 | 
					#define STACK_INCY	 80 + STACKSIZE(%rsp)
 | 
				
			||||||
#define STACK_BUFFER	 88 + STACKSIZE(%rsp)
 | 
					#define STACK_BUFFER	 88 + STACKSIZE(%rsp)
 | 
				
			||||||
 | 
					#define MMM	216(%rsp)
 | 
				
			||||||
 | 
					#define NN	224(%rsp)
 | 
				
			||||||
 | 
					#define AA	232(%rsp)
 | 
				
			||||||
 | 
					#define LDAX 240(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -127,29 +134,48 @@
 | 
				
			||||||
	movups	%xmm14, 192(%rsp)
 | 
						movups	%xmm14, 192(%rsp)
 | 
				
			||||||
	movups	%xmm15, 208(%rsp)
 | 
						movups	%xmm15, 208(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movq	OLD_M,	      M
 | 
						movq	OLD_M,	      MMM
 | 
				
			||||||
	movq	OLD_N,        N
 | 
						movq	OLD_N,        NN
 | 
				
			||||||
	movq	OLD_A,        A
 | 
						movq	OLD_A,        X
 | 
				
			||||||
	movq	OLD_LDA,      LDA
 | 
						movq	X,	      AA
 | 
				
			||||||
 | 
						movq	OLD_LDA,      X
 | 
				
			||||||
 | 
						movq	X,	      LDAX
 | 
				
			||||||
	movq	OLD_X,        X
 | 
						movq	OLD_X,        X
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
	movq	OLD_M,	      M
 | 
						movq	OLD_M,	      MMM
 | 
				
			||||||
	movq	OLD_N,        N
 | 
						movq	OLD_N,        NN
 | 
				
			||||||
	movq	OLD_A,        A
 | 
						movq	OLD_A,        AA
 | 
				
			||||||
	movq	OLD_LDA,      LDA
 | 
						movq	OLD_LDA,      LDAX
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					 | 
				
			||||||
	movq	STACK_INCX,   INCX
 | 
					 | 
				
			||||||
	movq	STACK_Y,      Y
 | 
					 | 
				
			||||||
	movq	STACK_INCY,   INCY
 | 
					 | 
				
			||||||
	movq	STACK_BUFFER, BUFFER
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifndef WINDOWS_ABI
 | 
					#ifndef WINDOWS_ABI
 | 
				
			||||||
	pshufd	$0, %xmm0, ALPHA
 | 
						pshufd	$0, %xmm0, ALPHA
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
	pshufd	$0, %xmm3, ALPHA
 | 
						pshufd	$0, %xmm3, ALPHA
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L0t:
 | 
				
			||||||
 | 
						xorq	M,M
 | 
				
			||||||
 | 
						addq	$1,M
 | 
				
			||||||
 | 
						salq	$22,M
 | 
				
			||||||
 | 
						subq	M,MMM
 | 
				
			||||||
 | 
						jge		.L00t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						movq	MMM,%rax
 | 
				
			||||||
 | 
						addq	M,%rax
 | 
				
			||||||
 | 
						jle		.L999x
 | 
				
			||||||
 | 
						movq	%rax,M
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L00t:
 | 
				
			||||||
 | 
						movq	LDAX,LDA
 | 
				
			||||||
 | 
						movq	NN,N
 | 
				
			||||||
 | 
						movq	AA,A
 | 
				
			||||||
 | 
						movq	STACK_INCX,   INCX
 | 
				
			||||||
 | 
						movq	STACK_Y,      Y
 | 
				
			||||||
 | 
						movq	STACK_INCY,   INCY
 | 
				
			||||||
 | 
						movq	STACK_BUFFER, BUFFER
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	leaq	(,INCX, SIZE), INCX
 | 
						leaq	(,INCX, SIZE), INCX
 | 
				
			||||||
	leaq	(,INCY, SIZE), INCY
 | 
						leaq	(,INCY, SIZE), INCY
 | 
				
			||||||
	leaq	(,LDA,  SIZE), LDA
 | 
						leaq	(,LDA,  SIZE), LDA
 | 
				
			||||||
| 
						 | 
					@ -6341,6 +6367,12 @@
 | 
				
			||||||
	ALIGN_4
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.L999:
 | 
					.L999:
 | 
				
			||||||
 | 
						leaq	(,M,SIZE),%rax
 | 
				
			||||||
 | 
						addq	%rax,AA
 | 
				
			||||||
 | 
						jmp		.L0t
 | 
				
			||||||
 | 
						ALIGN_4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.L999x:
 | 
				
			||||||
	movq	  0(%rsp), %rbx
 | 
						movq	  0(%rsp), %rbx
 | 
				
			||||||
	movq	  8(%rsp), %rbp
 | 
						movq	  8(%rsp), %rbp
 | 
				
			||||||
	movq	 16(%rsp), %r12
 | 
						movq	 16(%rsp), %r12
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlpd
 | 
					#define movsd		movlpd
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlpd
 | 
					#define movsd		movlpd
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,7 +86,7 @@
 | 
				
			||||||
#define PREFETCHW    prefetcht0
 | 
					#define PREFETCHW    prefetcht0
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define movsd movlps
 | 
					#define movsd movlps
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,7 +86,7 @@
 | 
				
			||||||
#define PREFETCHW    prefetcht0
 | 
					#define PREFETCHW    prefetcht0
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define movsd movlps
 | 
					#define movsd movlps
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,7 +86,7 @@
 | 
				
			||||||
#define PREFETCHW    prefetcht0
 | 
					#define PREFETCHW    prefetcht0
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define movsd movlps
 | 
					#define movsd movlps
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -699,7 +699,7 @@
 | 
				
			||||||
	movsd	-32 * SIZE(X), %xmm4
 | 
						movsd	-32 * SIZE(X), %xmm4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	pshufd	$0xb1,  %xmm4, %xmm12 
 | 
						pshufd	$0xb1,  %xmm4, %xmm12 
 | 
				
			||||||
	shufps	$0x39,  %xmm8, %xmm8
 | 
						shufps	$0x59,  %xmm8, %xmm8
 | 
				
			||||||
	mulps	%xmm8,  %xmm4
 | 
						mulps	%xmm8,  %xmm4
 | 
				
			||||||
	addps	%xmm4,  %xmm0
 | 
						addps	%xmm4,  %xmm0
 | 
				
			||||||
	mulps	%xmm8,  %xmm12
 | 
						mulps	%xmm8,  %xmm12
 | 
				
			||||||
| 
						 | 
					@ -1336,7 +1336,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movss	%xmm9,  %xmm8
 | 
						movss	%xmm9,  %xmm8
 | 
				
			||||||
	pshufd	$0xb1,  %xmm4, %xmm12 
 | 
						pshufd	$0xb1,  %xmm4, %xmm12 
 | 
				
			||||||
	shufps	$0x93,  %xmm8, %xmm8
 | 
						shufps	$0x03,  %xmm8, %xmm8
 | 
				
			||||||
	mulps	%xmm8,  %xmm4
 | 
						mulps	%xmm8,  %xmm4
 | 
				
			||||||
	addps	%xmm4,  %xmm0
 | 
						addps	%xmm4,  %xmm0
 | 
				
			||||||
	mulps	%xmm8,  %xmm12
 | 
						mulps	%xmm8,  %xmm12
 | 
				
			||||||
| 
						 | 
					@ -1697,7 +1697,7 @@
 | 
				
			||||||
	movsd	-32 * SIZE(Y), %xmm4
 | 
						movsd	-32 * SIZE(Y), %xmm4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	pshufd	$0xb1,  %xmm4, %xmm12 
 | 
						pshufd	$0xb1,  %xmm4, %xmm12 
 | 
				
			||||||
	shufps	$0x39,  %xmm8, %xmm8
 | 
						shufps	$0xa9,  %xmm8, %xmm8
 | 
				
			||||||
	mulps	%xmm8,  %xmm4
 | 
						mulps	%xmm8,  %xmm4
 | 
				
			||||||
	addps	%xmm4,  %xmm0
 | 
						addps	%xmm4,  %xmm0
 | 
				
			||||||
	mulps	%xmm8,  %xmm12
 | 
						mulps	%xmm8,  %xmm12
 | 
				
			||||||
| 
						 | 
					@ -2024,7 +2024,7 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	movss	%xmm9,  %xmm8
 | 
						movss	%xmm9,  %xmm8
 | 
				
			||||||
	pshufd	$0xb1,  %xmm4, %xmm12 
 | 
						pshufd	$0xb1,  %xmm4, %xmm12 
 | 
				
			||||||
	shufps	$0x93,  %xmm8, %xmm8
 | 
						shufps	$0x03,  %xmm8, %xmm8
 | 
				
			||||||
	mulps	%xmm8,  %xmm4
 | 
						mulps	%xmm8,  %xmm4
 | 
				
			||||||
	addps	%xmm4,  %xmm0
 | 
						addps	%xmm4,  %xmm0
 | 
				
			||||||
	mulps	%xmm8,  %xmm12
 | 
						mulps	%xmm8,  %xmm12
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -85,7 +85,7 @@
 | 
				
			||||||
#define movsd movlpd
 | 
					#define movsd movlpd
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define RPREFETCHSIZE 32
 | 
					#define RPREFETCHSIZE 32
 | 
				
			||||||
#define WPREFETCHSIZE 48
 | 
					#define WPREFETCHSIZE 48
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -160,7 +160,7 @@
 | 
				
			||||||
#define a3     %xmm14
 | 
					#define a3     %xmm14
 | 
				
			||||||
#define	xt1    %xmm15
 | 
					#define	xt1    %xmm15
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
					#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
				
			||||||
#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
					#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlpd
 | 
					#define movsd		movlpd
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					@ -167,7 +167,7 @@
 | 
				
			||||||
#define a3     %xmm14
 | 
					#define a3     %xmm14
 | 
				
			||||||
#define	xt1    %xmm15
 | 
					#define	xt1    %xmm15
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
					#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
				
			||||||
#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
					#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlpd
 | 
					#define movsd		movlpd
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					@ -166,7 +166,7 @@
 | 
				
			||||||
#define	xt1    %xmm14
 | 
					#define	xt1    %xmm14
 | 
				
			||||||
#define	xt2    %xmm15
 | 
					#define	xt2    %xmm15
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
					#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
				
			||||||
#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
					#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -76,7 +76,7 @@
 | 
				
			||||||
#define movsd		movlpd
 | 
					#define movsd		movlpd
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA)  || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH	prefetch
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
#define PREFETCHW	prefetchw
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
#define PREFETCHSIZE	(16 * 16)
 | 
					#define PREFETCHSIZE	(16 * 16)
 | 
				
			||||||
| 
						 | 
					@ -166,7 +166,7 @@
 | 
				
			||||||
#define a3     %xmm14
 | 
					#define a3     %xmm14
 | 
				
			||||||
#define	xt1    %xmm15
 | 
					#define	xt1    %xmm15
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
 | 
					#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
 | 
				
			||||||
#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
					#define MOVDDUP(a, b, c)	movddup	a(b), c
 | 
				
			||||||
#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
					#define MOVDDUP2(a, b, c)	movddup	a##b, c
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,7 +86,7 @@
 | 
				
			||||||
#define BORIG	 72(%rsp)
 | 
					#define BORIG	 72(%rsp)
 | 
				
			||||||
#define BUFFER	128(%rsp)
 | 
					#define BUFFER	128(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHNTA  prefetchnta
 | 
					#define PREFETCHNTA  prefetchnta
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -95,7 +95,7 @@
 | 
				
			||||||
#define PREFETCHSIZE (8 * 6 + 4)
 | 
					#define PREFETCHSIZE (8 * 6 + 4)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHNTA  prefetchnta
 | 
					#define PREFETCHNTA  prefetchnta
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,7 +86,7 @@
 | 
				
			||||||
#define BORIG	 72(%rsp)
 | 
					#define BORIG	 72(%rsp)
 | 
				
			||||||
#define BUFFER	128(%rsp)
 | 
					#define BUFFER	128(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHNTA  prefetchnta
 | 
					#define PREFETCHNTA  prefetchnta
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -95,7 +95,7 @@
 | 
				
			||||||
#define PREFETCHSIZE (8 * 6 + 4)
 | 
					#define PREFETCHSIZE (8 * 6 + 4)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHNTA  prefetchnta
 | 
					#define PREFETCHNTA  prefetchnta
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,7 +86,7 @@
 | 
				
			||||||
#define BORIG	 72(%rsp)
 | 
					#define BORIG	 72(%rsp)
 | 
				
			||||||
#define BUFFER	128(%rsp)
 | 
					#define BUFFER	128(%rsp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHNTA  prefetchnta
 | 
					#define PREFETCHNTA  prefetchnta
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -95,7 +95,7 @@
 | 
				
			||||||
#define PREFETCHSIZE (8 * 6 + 4)
 | 
					#define PREFETCHSIZE (8 * 6 + 4)
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define PREFETCH     prefetch
 | 
					#define PREFETCH     prefetch
 | 
				
			||||||
#define PREFETCHW    prefetchw
 | 
					#define PREFETCHW    prefetchw
 | 
				
			||||||
#define PREFETCHNTA  prefetchnta
 | 
					#define PREFETCHNTA  prefetchnta
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -74,6 +74,13 @@
 | 
				
			||||||
#define ALIGNED_ACCESS
 | 
					#define ALIGNED_ACCESS
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef BULLDOZER
 | 
				
			||||||
 | 
					#define PREFETCH	prefetch
 | 
				
			||||||
 | 
					#define PREFETCHW	prefetchw
 | 
				
			||||||
 | 
					#define PREFETCHSIZE (128 *   5)
 | 
				
			||||||
 | 
					#define ALIGNED_ACCESS
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef NANO
 | 
					#ifdef NANO
 | 
				
			||||||
#define PREFETCH        prefetcht0
 | 
					#define PREFETCH        prefetcht0
 | 
				
			||||||
#define PREFETCHW       prefetcht0
 | 
					#define PREFETCHW       prefetcht0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -85,7 +85,7 @@
 | 
				
			||||||
#define movsd		movlps
 | 
					#define movsd		movlps
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
#define ALIGNED_ACCESS
 | 
					#define ALIGNED_ACCESS
 | 
				
			||||||
#define MOVUPS_A	movaps
 | 
					#define MOVUPS_A	movaps
 | 
				
			||||||
#define MOVUPS_XL	movaps
 | 
					#define MOVUPS_XL	movaps
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -66,7 +66,9 @@ static FLOAT dm1 = -1.;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define GEMM_PQ  MAX(GEMM_P, GEMM_Q)
 | 
					#define GEMM_PQ  MAX(GEMM_P, GEMM_Q)
 | 
				
			||||||
#define REAL_GEMM_R (GEMM_R - GEMM_PQ)
 | 
					
 | 
				
			||||||
 | 
					//leave some space for GEMM_ALIGN in sb2
 | 
				
			||||||
 | 
					#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if 0
 | 
					#if 0
 | 
				
			||||||
#define SHARED_ARRAY
 | 
					#define SHARED_ARRAY
 | 
				
			||||||
| 
						 | 
					@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 | 
				
			||||||
			sa,
 | 
								sa,
 | 
				
			||||||
			sb2,
 | 
								sb2,
 | 
				
			||||||
			a + (is + js * lda) * COMPSIZE, lda,
 | 
								a + (is + js * lda) * COMPSIZE, lda,
 | 
				
			||||||
			- is + js);
 | 
								is - js);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										2
									
								
								make.inc
								
								
								
								
							
							
						
						
									
										2
									
								
								make.inc
								
								
								
								
							| 
						 | 
					@ -4,7 +4,7 @@ DRVOPTS  = $(OPTS)
 | 
				
			||||||
LOADER   = $(FORTRAN)
 | 
					LOADER   = $(FORTRAN)
 | 
				
			||||||
TIMER     = NONE
 | 
					TIMER     = NONE
 | 
				
			||||||
ARCHFLAGS= -ru
 | 
					ARCHFLAGS= -ru
 | 
				
			||||||
RANLIB   = ranlib
 | 
					#RANLIB   = ranlib
 | 
				
			||||||
BLASLIB      = 
 | 
					BLASLIB      = 
 | 
				
			||||||
TMGLIB       = tmglib.a
 | 
					TMGLIB       = tmglib.a
 | 
				
			||||||
EIGSRCLIB    = eigsrc.a
 | 
					EIGSRCLIB    = eigsrc.a
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -48,7 +48,8 @@ typedef int blasint;
 | 
				
			||||||
/* C99 supports complex floating numbers natively, which GCC also offers as an
 | 
					/* C99 supports complex floating numbers natively, which GCC also offers as an
 | 
				
			||||||
   extension since version 3.0.  If neither are available, use a compatible
 | 
					   extension since version 3.0.  If neither are available, use a compatible
 | 
				
			||||||
   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
 | 
					   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
 | 
				
			||||||
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
 | 
					#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
 | 
				
			||||||
 | 
					     (__GNUC__ >= 3 && !defined(__cplusplus)))
 | 
				
			||||||
  #define OPENBLAS_COMPLEX_C99
 | 
					  #define OPENBLAS_COMPLEX_C99
 | 
				
			||||||
  #include <complex.h>
 | 
					  #include <complex.h>
 | 
				
			||||||
  typedef float _Complex openblas_complex_float;
 | 
					  typedef float _Complex openblas_complex_float;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										2
									
								
								param.h
								
								
								
								
							
							
						
						
									
										2
									
								
								param.h
								
								
								
								
							| 
						 | 
					@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
 | 
					#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define SNUMOPT		8
 | 
					#define SNUMOPT		8
 | 
				
			||||||
#define DNUMOPT		4
 | 
					#define DNUMOPT		4
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue