Merge branch 'develop'

This commit is contained in:
Zhang Xianyi 2013-03-02 14:24:23 +08:00
commit e5ac3007e0
81 changed files with 2899 additions and 454 deletions

View File

@ -1,4 +1,22 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.2.6
2-Mar-2013
common:
* Improved OpenMP performance slightly. (d744c9)
* Improved cblas.h compatibility with Intel MKL. (#185)
* Fixed the overflow bug in single-threaded Cholesky factorization.
* Fixed the buffer overflow bug in multithreaded hbmv and sbmv. (#174)
x86/x86-64:
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
We will tune the performance in the future.
* Auto-detect Intel Xeon E7540.
* Fixed the buffer overflow bug in gemv. (#173)
* Fixed the s/cdot bug that caused invalid reads of NaN on x86_64. (#189)
MIPS64:
==================================================================== ====================================================================
Version 0.2.5 Version 0.2.5
26-Nov-2012 26-Nov-2012

View File

@ -314,7 +314,7 @@ clean ::
#endif #endif
@$(MAKE) -C reference clean @$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d $(NETLIB_LAPACK_DIR); then \ @if test -d $(NETLIB_LAPACK_DIR); then \
echo deleting $(NETLIB_LAPACK_DIR); \ echo deleting $(NETLIB_LAPACK_DIR); \
rm -rf $(NETLIB_LAPACK_DIR) ;\ rm -rf $(NETLIB_LAPACK_DIR) ;\

View File

@ -1,3 +1,5 @@
# This is triggered by Makefile.system and runs before any of the code is built.
export BINARY export BINARY
export USE_OPENMP export USE_OPENMP
@ -15,7 +17,7 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99 EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif endif
all: getarch_2nd all: getarch_2nd cblas_noconst.h
./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF) ./getarch_2nd 1 >> $(TARGET_CONF)
@ -36,4 +38,7 @@ else
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
endif endif
cblas_noconst.h : cblas.h
perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
dummy: dummy:

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.2.5 VERSION = 0.2.6
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h # Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
ifndef TARGET_CORE ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf include $(TOPDIR)/Makefile.conf
@ -277,14 +277,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif endif
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif endif
endif endif

View File

@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
#### MIPS64: #### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.

View File

@ -29,6 +29,7 @@ BARCELONA
SHANGHAI SHANGHAI
ISTANBUL ISTANBUL
BOBCAT BOBCAT
BULLDOZER
c)VIA CPU: c)VIA CPU:
SSE_GENERIC SSE_GENERIC

448
cblas.h
View File

@ -1,291 +1,293 @@
#ifndef CBLAS_H #ifndef CBLAS_H
#define CBLAS_H #define CBLAS_H
#include <stddef.h>
#include "common.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
/* Assume C declarations for C++ */ /* Assume C declarations for C++ */
#endif /* __cplusplus */ #endif /* __cplusplus */
#include <stddef.h>
#include "common.h"
/*Set the number of threads on runtime.*/ /*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads); void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
#define CBLAS_INDEX size_t #define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
float cblas_sasum (blasint n, float *x, blasint incx); float cblas_sasum (const blasint n, const float *x, const blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx); double cblas_dasum (const blasint n, const double *x, const blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx); float cblas_scasum(const blasint n, const float *x, const blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx); double cblas_dzasum(const blasint n, const double *x, const blasint incx);
float cblas_snrm2 (blasint N, float *X, blasint incX); float cblas_snrm2 (const blasint N, const float *X, const blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX); double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX); float cblas_scnrm2(const blasint N, const float *X, const blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX); double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s);
void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s); void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
void cblas_sscal(blasint N, float alpha, float *X, blasint incX); void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX); void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX); void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy);
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda); const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda); const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
float *Y, blasint incY, float *A, blasint lda); const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
double *Y, blasint incY, double *A, blasint lda); const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY); const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY); const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...); void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif #endif

View File

@ -390,7 +390,8 @@ typedef int blasint;
/* C99 supports complex floating numbers natively, which GCC also offers as an /* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus)))
#define OPENBLAS_COMPLEX_C99 #define OPENBLAS_COMPLEX_C99
typedef float _Complex openblas_complex_float; typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double; typedef double _Complex openblas_complex_double;
@ -557,7 +558,8 @@ typedef struct {
#include "common_level3.h" #include "common_level3.h"
#include "common_lapack.h" #include "common_lapack.h"
#ifdef CBLAS #ifdef CBLAS
#include "cblas.h" /* This header file is generated from "cblas.h" (see Makefile.prebuild). */
#include "cblas_noconst.h"
#endif #endif
#ifndef ASSEMBLER #ifndef ASSEMBLER

View File

@ -125,7 +125,8 @@
#define HAVE_MISALIGNSSE (1 << 15) #define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16) #define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17) #define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18) #define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2

View File

@ -43,6 +43,8 @@
#ifdef NO_AVX #ifdef NO_AVX
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CORE_BULLDOZER CORE_BARCELONA
#endif #endif
#ifndef CPUIDEMU #ifndef CPUIDEMU
@ -116,8 +118,9 @@ static inline int have_excpuid(void){
#ifndef NO_AVX #ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){ static inline void xgetbv(int op, int * eax, int * edx){
//Encode xgetbv as raw opcode bytes instead of using the mnemonic
__asm__ __volatile__ __asm__ __volatile__
("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
} }
#endif #endif
@ -228,6 +231,9 @@ int get_cputype(int gettype){
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
#ifndef NO_AVX
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
#endif
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
} }
@ -1030,6 +1036,8 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 14:
// Xeon E7540
case 15: case 15:
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
@ -1075,8 +1083,12 @@ int get_cpuname(void){
return CPUTYPE_OPTERON; return CPUTYPE_OPTERON;
case 1: case 1:
case 10: case 10:
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS doesn't support AVX.
case 5: case 5:
return CPUTYPE_BOBCAT; return CPUTYPE_BOBCAT;
} }
@ -1398,6 +1410,8 @@ int get_coretype(void){
return CORE_SANDYBRIDGE; return CORE_SANDYBRIDGE;
else else
return CORE_NEHALEM; //OS doesn't support AVX return CORE_NEHALEM; //OS doesn't support AVX
case 14:
//Xeon E7540
case 15: case 15:
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM; return CORE_NEHALEM;
@ -1427,8 +1441,13 @@ int get_coretype(void){
if (family == 0xf){ if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT; else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series else if (exfamily == 6) {
else return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels.
}else return CORE_BARCELONA;
} }
} }
@ -1494,6 +1513,9 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_SIZE %d\n", info.size * 1024);
printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
} else {
//fall back for some virtual machines.
printf("#define DTB_DEFAULT_ENTRIES 32\n");
} }
features = get_cputype(GET_FEATURE); features = get_cputype(GET_FEATURE);
@ -1511,6 +1533,7 @@ void get_cpuconfig(void){
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
@ -1577,5 +1600,6 @@ void get_sse(void){
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
} }

View File

@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a = (FLOAT *)args -> a; a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b; x = (FLOAT *)args -> b;
y = (FLOAT *)args -> c;
lda = args -> lda; lda = args -> lda;
incx = args -> ldb; incx = args -> ldb;
@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
n_from = 0; n_from = 0;
n_to = n; n_to = n;
//Use the first n * COMPSIZE elements of each thread's sb buffer as y
y = buffer;
buffer += ((COMPSIZE * n + 1023) & ~1023);
if (range_m) { if (range_m) {
n_from = *(range_m + 0); n_from = *(range_m + 0);
n_to = *(range_m + 1); n_to = *(range_m + 1);
@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a += n_from * lda * COMPSIZE; a += n_from * lda * COMPSIZE;
} }
if (range_n) y += *range_n * COMPSIZE;
if (incx != 1) { if (incx != 1) {
COPY_K(n, x, incx, buffer, 1); COPY_K(n, x, incx, buffer, 1);
@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
if (num_cpu) { if (num_cpu) {
queue[0].sa = NULL; queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL; queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue); exec_blas(num_cpu, queue);
@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#else #else
ONE, ZERO, ONE, ZERO,
#endif #endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
} }
AXPYU_K(n, 0, 0, AXPYU_K(n, 0, 0,

View File

@ -1,7 +1,7 @@
TOPDIR = ../.. TOPDIR = ../..
include ../../Makefile.system include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_config.$(SUFFIX) : openblas_get_config.c
$(CC) $(CFLAGS) -c $< -o $(@F)
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)

View File

@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR

View File

@ -49,8 +49,12 @@
int blas_server_avail = 0; int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER];
void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {
int i=0;
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)
//set parameters for different number of threads. //set parameters for different number of threads.
blas_set_parameter(); blas_set_parameter();
@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){ int blas_thread_init(void){
int i=0;
blas_get_cpu_number(); blas_get_cpu_number();
blas_server_avail = 1; blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
}
return 0; return 0;
} }
int BLASFUNC(blas_thread_shutdown)(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i=0;
blas_server_avail = 0; blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
return 0; return 0;
} }
@ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
static void exec_threads(blas_queue_t *queue){ static void exec_threads(blas_queue_t *queue){
void *buffer, *sa, *sb; void *buffer, *sa, *sb;
int pos=0, release_flag=0;
buffer = NULL; buffer = NULL;
sa = queue -> sa; sa = queue -> sa;
@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
buffer = blas_memory_alloc(2); pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];
//fallback
if(buffer==NULL) {
buffer = blas_memory_alloc(2);
release_flag=1;
}
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
} }
@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){
} }
if (buffer != NULL) blas_memory_free(buffer); if (release_flag) blas_memory_free(buffer);
} }

View File

@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR

View File

@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX #ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
#else #else
//Use NEHALEM kernels for sandy bridge //Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#endif #endif
@ -78,8 +80,9 @@ extern gotoblas_t gotoblas_SANDYBRIDGE;
#ifndef NO_AVX #ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){ static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
__asm__ __volatile__ __asm__ __volatile__
("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
} }
#endif #endif
@ -163,7 +166,8 @@ static gotoblas_t *get_coretype(void){
//Intel Xeon Processor 5600 (Westmere-EP) //Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
if (model == 12 || model == 15) return &gotoblas_NEHALEM; //Xeon E7540
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5 //Intel Core i7-3000 / Xeon E5
@ -171,7 +175,7 @@ static gotoblas_t *get_coretype(void){
if(support_avx()) if(support_avx())
return &gotoblas_SANDYBRIDGE; return &gotoblas_SANDYBRIDGE;
else{ else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
@ -182,7 +186,7 @@ static gotoblas_t *get_coretype(void){
if(support_avx()) if(support_avx())
return &gotoblas_SANDYBRIDGE; return &gotoblas_SANDYBRIDGE;
else{ else{
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
@ -202,6 +206,14 @@ static gotoblas_t *get_coretype(void){
else return &gotoblas_OPTERON; else return &gotoblas_OPTERON;
} else if (exfamily == 5) { } else if (exfamily == 5) {
return &gotoblas_BOBCAT; return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
} else { } else {
return &gotoblas_BARCELONA; return &gotoblas_BARCELONA;
} }
@ -238,6 +250,7 @@ static char *corename[] = {
"Nano", "Nano",
"Sandybridge", "Sandybridge",
"Bobcat", "Bobcat",
"Bulldozer",
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -259,6 +272,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
return corename[0]; return corename[0];
} }

View File

@ -0,0 +1,59 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
/*
 * Build-time configuration summary.
 *
 * The string is assembled by the preprocessor from adjacent string
 * literals: every option macro that is defined when this file is compiled
 * contributes its own name followed by a single space.  When none of the
 * macros below is defined the result is the empty string "".
 */
static char* openblas_config_str=""
#ifdef USE64BITINT
  "USE64BITINT "
#endif
#ifdef NO_CBLAS
  "NO_CBLAS "
#endif
#ifdef NO_LAPACK
  "NO_LAPACK "
#endif
#ifdef NO_LAPACKE
  "NO_LAPACKE "
#endif
#ifdef DYNAMIC_ARCH
  "DYNAMIC_ARCH "
#endif
#ifdef NO_AFFINITY
  "NO_AFFINITY "
#endif
  ;

/*
 * Return the build-time configuration string described above.
 *
 * Takes no arguments; the parameter list is spelled (void) so that the
 * definition is a real prototype rather than an unspecified-argument
 * declaration.  The returned pointer refers to static storage backed by a
 * string literal: callers must neither modify nor free it.
 *
 * NOTE(review): CNAME is not defined in this file; presumably the build
 * system maps it to the exported symbol name - confirm against the
 * makefile rule that compiles this file.
 */
char* CNAME(void) {
  return openblas_config_str;
}

View File

@ -163,7 +163,7 @@ int get_L2_size(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)

View File

@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT)
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
EXTRALIB += -lgfortran EXTRALIB += -lgfortran
endif endif
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
EXTRALIB += -lgomp
endif
endif
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)

View File

@ -74,6 +74,7 @@
@misc_no_underscore_objs = ( @misc_no_underscore_objs = (
openblas_set_num_threads, goto_set_num_threads, openblas_set_num_threads, goto_set_num_threads,
openblas_get_config,
); );
@misc_underscore_objs = ( @misc_underscore_objs = (

View File

@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "OPTERON" #define CORENAME "OPTERON"
#endif #endif
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
#define ARCHITECTURE "X86" #define ARCHITECTURE "X86"
@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BOBCAT" #define CORENAME "BOBCAT"
#endif #endif
/*
 * Forced-target description for BULLDOZER: when FORCE_BULLDOZER is defined,
 * the macros below hard-code the architecture strings and the compiler
 * definitions (cache/TLB parameters and HAVE_* feature flags) for this core
 * instead of leaving them to be detected.
 *
 * NOTE(review): L2_SIZE is given as 1024000, which is not 1 MiB (1048576);
 * confirm whether the decimal value is intentional.
 * NOTE(review): FORCE_INTEL is defined here as well, mirroring the other
 * FORCE_* blocks visible around this one - confirm its meaning for an AMD
 * target.
 */
#if defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BULLDOZER"
/* ARCHCONFIG is one string built from adjacent literals; each piece ends
   with a space except the last one. */
#define ARCHCONFIG "-DBULLDOZER " \
 "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
 "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
 "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
 "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
 "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
 "-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER"
#endif
#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL

View File

@ -34,7 +34,7 @@ int main(int argc, char **argv) {
#ifdef USE64BITINT #ifdef USE64BITINT
printf("#define USE64BITINT\n"); printf("#define USE64BITINT\n");
#endif #endif
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
} }
return 0; return 0;

View File

@ -810,6 +810,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
/* BULLDOZER section: compiled only when BULLDOZER is defined. */
#ifdef BULLDOZER
#ifdef DEBUG
/* Debug builds announce the selected parameter set on stderr. */
fprintf(stderr, "Bulldozer\n");
#endif
/* Copy the compile-time *GEMM_DEFAULT_P values into the parameter table,
   one field per precision (s, d, c, z). */
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
/* The q and x fields are set only when EXPRECISION is defined. */
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO #ifdef NANO
#ifdef DEBUG #ifdef DEBUG

View File

@ -0,0 +1,59 @@
# Kernel source selection, newly added in this commit alongside the BULLDOZER
# target (the file name is not visible in this view - presumably the
# BULLDOZER kernel list; confirm).
# Every GEMM / GEMM3M kernel chosen here is a *_barcelona.S source and every
# TRSM kernel is an *_sse.S / *_sse2.S source; no Bulldozer-specific kernel
# file is referenced in this list.
#
# Naming pattern used below: <P>GEMMKERNEL is the compute kernel,
# <P>GEMM{IN,IT,ON,OT}COPY are the copy-routine sources and the matching
# *OBJ variables are the object names built from them.  An empty value
# leaves that copy routine unset for the precision.

# Single precision real (S): 4x4 kernel, only the ON/OT copy routines are set.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double precision real (D): 2x4 kernel, all four copy routines are set.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Single precision complex (C): 2x2 kernel, only the ON/OT copy routines are set.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double precision complex (Z): 1x2 kernel, all four copy routines are set.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels, one variable per variant (LN, LT, RN, RT).  In every
# precision the RN variant points to the same source file as LT.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
# NOTE(review): ZTRSMKERNEL_LN names the LT source, whereas the S, D and C
# precisions above each name a dedicated LN source - confirm this is
# intentional.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# GEMM3M kernels for the two complex precisions.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -596,7 +596,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 4 * SIZE(BB), %xmm2 movsd 4 * SIZE(BB), %xmm2
@ -842,7 +842,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1168,7 +1168,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1198,7 +1198,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1347,7 +1347,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -1531,7 +1531,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -1778,7 +1778,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -1793,7 +1793,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -1924,7 +1924,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -2069,7 +2069,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -89,17 +89,22 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 20 + STACKSIZE(%esp) #define A 20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 24 + STACKSIZE(%esp) #define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
#define STACK_X 28 + STACKSIZE(%esp) #define STACK_X 28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 32 + STACKSIZE(%esp) #define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
#define Y 36 + STACKSIZE(%esp) #define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define LDAX 12+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -114,6 +119,7 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -121,7 +127,34 @@
PROFCODE PROFCODE
movl Y,J
movl J,YY # backup Y
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # backup MM
.L0t:
xorl J,J
addl $1,J
sall $21,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@ -651,12 +684,22 @@
addss 0 * SIZE(X), %xmm0 addss 0 * SIZE(X), %xmm0
movss %xmm0, (Y1) movss %xmm0, (Y1)
ALIGN_3 ALIGN_3
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -76,17 +76,22 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE(%esp) #define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE(%esp) #define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -101,6 +106,8 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -108,6 +115,33 @@
PROFCODE PROFCODE
movl Y,J
movl J,YY # backup Y
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # backup MM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@ -677,10 +711,22 @@
ALIGN_3 ALIGN_3
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -89,17 +89,24 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 20 + STACKSIZE(%esp) #define A 20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 24 + STACKSIZE(%esp) #define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
#define STACK_X 28 + STACKSIZE(%esp) #define STACK_X 28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 32 + STACKSIZE(%esp) #define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
#define Y 36 + STACKSIZE(%esp) #define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+STACKSIZE(%esp)
#define NN 4+STACKSIZE(%esp)
#define AA 8+STACKSIZE(%esp)
#define LDAX 12+STACKSIZE(%esp)
#define XX 16+STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -114,6 +121,7 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -122,7 +130,42 @@
PROFCODE PROFCODE
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl STACK_X, X movl STACK_X, X
movl X,XX
movl N,J
movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # mov M to MMM
.L0t:
xorl J,J
addl $1,J
sall $22,J # J=2^22 elements; 2^22*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M # movl leaves the flags alone, so jge below still tests the subl result
jge .L00t # MMM >= 0: a full block of J is processed on this pass
ALIGN_4
movl MMM,%eax
addl J,%eax # undo the subtraction: %eax = count left before this pass
jle .L999x # nothing left: leave through the exit path at .L999x
movl %eax,M # last, partial block: M = remaining count
.L00t:
movl AA,%eax
movl %eax,A # mov AA to A
movl NN,%eax
movl %eax,N # reset N
movl LDAX, LDA # reset LDA
movl XX,X
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY
@ -198,6 +241,20 @@
jg .L06 jg .L06
ALIGN_4 ALIGN_4
//Padding zero to prevent loading the dirty number from buffer.
movl M, I
movl $8, J
andl $7, I
xorps %xmm0, %xmm0
subl I, J
ALIGN_2
.L07:
movss %xmm0, 0 * SIZE(Y1)
addl $SIZE, Y1
decl J
jg .L07
ALIGN_4
.L10: .L10:
movl Y, Y1 movl Y, Y1
@ -628,10 +685,22 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl XX,J
addl %eax,J
movl J,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -76,17 +76,23 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE(%esp) #define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE(%esp) #define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+STACKSIZE(%esp)
#define AA 4+STACKSIZE(%esp)
#define LDAX 8+STACKSIZE(%esp)
#define NN 12+STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -101,6 +107,8 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -108,7 +116,40 @@
PROFCODE PROFCODE
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl N,J
movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # mov M to MMM
.L0t:
xorl J,J
addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A # mov AA to A
movl NN,%eax
movl %eax,N # reset N
movl LDAX, LDA # reset LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY
@ -117,6 +158,7 @@
leal (,INCY, SIZE), INCY leal (,INCY, SIZE), INCY
leal (,LDA, SIZE), LDA leal (,LDA, SIZE), LDA
subl $-16 * SIZE, A subl $-16 * SIZE, A
cmpl $0, N cmpl $0, N
@ -560,10 +602,19 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -269,7 +269,7 @@
sarl $5, I sarl $5, I
jle .L113 jle .L113
#if defined(BARCELONA) #if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1 mulps -32 * SIZE(X), %xmm1

View File

@ -253,7 +253,7 @@
sarl $4, I sarl $4, I
jle .L113 jle .L113
#if defined(BARCELONA) #if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1 mulpd -16 * SIZE(X), %xmm1

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1697,7 +1697,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1848,7 +1848,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2429,7 +2429,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2952,7 +2952,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -3148,7 +3148,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3389,7 +3389,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1439,7 +1439,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1855,7 +1855,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2562,7 +2562,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2957,7 +2957,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -3280,7 +3280,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3515,7 +3515,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -1036,7 +1036,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -439,7 +439,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -758,7 +758,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -993,7 +993,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -1324,7 +1324,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2031,7 +2031,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2

View File

@ -74,7 +74,7 @@
#define BB %ecx #define BB %ecx
#define LDC %ebp #define LDC %ebp
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define movsd movlps #define movsd movlps
#endif #endif
@ -625,7 +625,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 4 * SIZE(BB), %xmm2 movsd 4 * SIZE(BB), %xmm2
@ -870,7 +870,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1173,7 +1173,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1203,7 +1203,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1359,7 +1359,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -1536,7 +1536,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -1794,7 +1794,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -1809,7 +1809,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -1936,7 +1936,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -2069,7 +2069,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -71,7 +71,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5) #define PREFETCHSIZE (16 * 5)

View File

@ -58,7 +58,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5) #define PREFETCHSIZE (8 * 5)

View File

@ -71,7 +71,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5) #define PREFETCHSIZE (16 * 5)

View File

@ -58,7 +58,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5) #define PREFETCHSIZE (8 * 5)

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -0,0 +1,62 @@
# Kernel selection list for one CPU target: each variable names the source
# file that implements a given BLAS kernel, and each *OBJ variable names the
# object file the corresponding copy routine is built into.
# NOTE(review): presumably this is the x86-64 Bulldozer kernel list (the only
# target-specific file referenced is dgemm_kernel_4x4_bulldozer.S) -- confirm
# against the actual file name, which is not visible here.
# $(TSUFFIX) and $(SUFFIX) are not defined in this fragment; they must be
# supplied by the including makefile.
#
# Complex double-precision GEMV (non-transposed / transposed) kernels.
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
#
# Single-precision real GEMM: 8x4 kernel from the Barcelona-named source,
# generic C copy routines for the 8-wide side, Opteron-named assembly copy
# routines for the 4-wide side.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Double-precision real GEMM: the only entry that points at a
# Bulldozer-specific source file.
# The IN/IT copy sources and objects are deliberately left empty.
# NOTE(review): presumably because the kernel is square (4x4), so the ON/OT
# copy routines serve both operands -- confirm against the consuming makefile.
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Single-precision complex GEMM: 4x2 kernel from the Barcelona-named source,
# generic C copy routines for the 4-wide side, assembly for the 2-wide side.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Double-precision complex GEMM: 2x2 kernel from the Barcelona-named source.
# IN/IT copy entries are left empty, mirroring the DGEMM section above.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
#
# Triangular-solve (TRSM) kernels, one per side/transpose variant
# (LN, LT, RN, RT) and precision.
# In every precision the RN variant is assigned the same source file as LT.
# NOTE(review): the pattern is consistent across S/D/C/Z so it looks
# intentional rather than a copy-paste slip -- confirm before changing.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
#
# GEMM3M kernels for single- and double-precision complex, both from
# Barcelona-named sources.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

File diff suppressed because it is too large Load Diff

View File

@ -530,7 +530,7 @@
#endif #endif
movsd -32 * SIZE(Y), %xmm8 movsd -32 * SIZE(Y), %xmm8
pshufd $0x39, %xmm4, %xmm5 pshufd $0x29, %xmm4, %xmm5
mulps %xmm8, %xmm5 mulps %xmm8, %xmm5
addps %xmm5, %xmm3 addps %xmm5, %xmm3
@ -750,7 +750,8 @@
xorps %xmm5, %xmm5 xorps %xmm5, %xmm5
movhlps %xmm4, %xmm5 movhlps %xmm4, %xmm5
mulps -32 * SIZE(Y), %xmm5 movlps -32 * SIZE(Y), %xmm4
mulps %xmm4, %xmm5
addps %xmm5, %xmm0 addps %xmm5, %xmm0
addq $2 * SIZE, X addq $2 * SIZE, X
@ -992,7 +993,7 @@
movsd -32 * SIZE(Y), %xmm8 movsd -32 * SIZE(Y), %xmm8
movss %xmm5, %xmm4 movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4 shufps $0x93, %xmm4, %xmm4
mulps %xmm8, %xmm4 mulps %xmm8, %xmm4
addps %xmm4, %xmm3 addps %xmm4, %xmm3

View File

@ -930,7 +930,7 @@
.L22: .L22:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movaps 4 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm9
@ -983,7 +983,7 @@
addps %xmm8, %xmm3 addps %xmm8, %xmm3
movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(AO), %xmm8
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm10, %xmm9 mulps %xmm10, %xmm9
@ -1178,7 +1178,7 @@
.L32: .L32:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movsd 4 * SIZE(BO), %xmm9 movsd 4 * SIZE(BO), %xmm9
@ -1423,7 +1423,7 @@
.L42: .L42:
mulss %xmm8, %xmm9 mulss %xmm8, %xmm9
addss %xmm9, %xmm0 addss %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movss 4 * SIZE(BO), %xmm9 movss 4 * SIZE(BO), %xmm9
@ -1765,7 +1765,7 @@
.L62: .L62:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
mulps 4 * SIZE(BO), %xmm8 mulps 4 * SIZE(BO), %xmm8
@ -1793,7 +1793,7 @@
addps %xmm8, %xmm5 addps %xmm8, %xmm5
movaps 32 * SIZE(AO), %xmm8 movaps 32 * SIZE(AO), %xmm8
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm10, %xmm11 mulps %xmm10, %xmm11
@ -1822,7 +1822,7 @@
addps %xmm10, %xmm5 addps %xmm10, %xmm5
movaps 48 * SIZE(AO), %xmm10 movaps 48 * SIZE(AO), %xmm10
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif #endif
mulps %xmm12, %xmm13 mulps %xmm12, %xmm13
@ -1851,7 +1851,7 @@
addps %xmm12, %xmm5 addps %xmm12, %xmm5
movaps 64 * SIZE(AO), %xmm12 movaps 64 * SIZE(AO), %xmm12
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif #endif
mulps %xmm14, %xmm15 mulps %xmm14, %xmm15
@ -2024,7 +2024,7 @@
.L72: .L72:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
@ -2208,7 +2208,7 @@
.L82: .L82:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movsd 4 * SIZE(BO), %xmm9 movsd 4 * SIZE(BO), %xmm9
@ -2395,7 +2395,7 @@
.L92: .L92:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
addps %xmm9, %xmm0 addps %xmm9, %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movss 4 * SIZE(BO), %xmm9 movss 4 * SIZE(BO), %xmm9
@ -2670,7 +2670,7 @@
.L112: .L112:
mulps %xmm9, %xmm8 mulps %xmm9, %xmm8
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
@ -2687,7 +2687,7 @@
addps %xmm9, %xmm4 addps %xmm9, %xmm4
movaps 8 * SIZE(BO), %xmm9 movaps 8 * SIZE(BO), %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm9, %xmm10 mulps %xmm9, %xmm10
@ -2704,7 +2704,7 @@
addps %xmm9, %xmm4 addps %xmm9, %xmm4
movaps 32 * SIZE(BO), %xmm9 movaps 32 * SIZE(BO), %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif #endif
mulps %xmm11, %xmm12 mulps %xmm11, %xmm12
@ -2721,7 +2721,7 @@
addps %xmm11, %xmm4 addps %xmm11, %xmm4
movaps 24 * SIZE(BO), %xmm11 movaps 24 * SIZE(BO), %xmm11
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif #endif
mulps %xmm11, %xmm14 mulps %xmm11, %xmm14
@ -2857,7 +2857,7 @@
.L122: .L122:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movaps -28 * SIZE(AO), %xmm8 movaps -28 * SIZE(AO), %xmm8
@ -2873,7 +2873,7 @@
addps %xmm8, %xmm3 addps %xmm8, %xmm3
movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(AO), %xmm8
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif #endif
mulps %xmm10, %xmm11 mulps %xmm10, %xmm11
@ -3003,7 +3003,7 @@
.L132: .L132:
mulps %xmm8, %xmm9 mulps %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movsd -30 * SIZE(AO), %xmm8 movsd -30 * SIZE(AO), %xmm8
@ -3150,7 +3150,7 @@
.L142: .L142:
mulss %xmm8, %xmm9 mulss %xmm8, %xmm9
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif #endif
movss -31 * SIZE(AO), %xmm8 movss -31 * SIZE(AO), %xmm8

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE (12 + 4) #define RPREFETCHSIZE (12 + 4)
#define WPREFETCHSIZE (48 + 4) #define WPREFETCHSIZE (48 + 4)
#define MOVNTQ MOVQ #define MOVNTQ MOVQ
@ -79,7 +79,7 @@
#define AO3 %r13 #define AO3 %r13
#define AO4 %rax #define AO4 %rax
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCH prefetch #define RPREFETCH prefetch
#else #else
#define RPREFETCH prefetch #define RPREFETCH prefetch

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE (12 + 4) #define RPREFETCHSIZE (12 + 4)
#define WPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4)
#define MOVNTQ MOVQ #define MOVNTQ MOVQ
@ -96,7 +96,7 @@
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCH prefetch #define RPREFETCH prefetch
#else #else
#define RPREFETCH prefetch #define RPREFETCH prefetch

View File

@ -469,7 +469,7 @@
ALIGN_4 ALIGN_4
.L71: .L71:
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
prefetch PREFETCHSIZE * SIZE(X) prefetch PREFETCHSIZE * SIZE(X)
#endif #endif

View File

@ -266,7 +266,7 @@
sarq $5, I sarq $5, I
jle .L113 jle .L113
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1 mulps -32 * SIZE(X), %xmm1

View File

@ -251,7 +251,7 @@
sarq $4, I sarq $4, I
jle .L113 jle .L113
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1 mulpd -16 * SIZE(X), %xmm1

View File

@ -1,4 +1,3 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
@ -47,7 +46,7 @@
#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI
#define STACKSIZE 64 #define STACKSIZE 128
#define OLD_M %rdi #define OLD_M %rdi
#define OLD_N %rsi #define OLD_N %rsi
@ -57,6 +56,10 @@
#define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp)
#define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp)
#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#else #else
@ -71,6 +74,10 @@
#define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_Y 72 + STACKSIZE(%rsp)
#define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp)
#define STACK_BUFFER 88 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp)
#define MMM 216(%rsp)
#define NN 224(%rsp)
#define AA 232(%rsp)
#define LDAX 240(%rsp)
#endif #endif
@ -127,29 +134,48 @@
movups %xmm14, 192(%rsp) movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp) movups %xmm15, 208(%rsp)
movq OLD_M, M movq OLD_M, MMM
movq OLD_N, N movq OLD_N, NN
movq OLD_A, A movq OLD_A, X
movq OLD_LDA, LDA movq X, AA
movq OLD_LDA, X
movq X, LDAX
movq OLD_X, X movq OLD_X, X
#else #else
movq OLD_M, M movq OLD_M, MMM
movq OLD_N, N movq OLD_N, NN
movq OLD_A, A movq OLD_A, AA
movq OLD_LDA, LDA movq OLD_LDA, LDAX
#endif #endif
movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER
#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI
pshufd $0, %xmm0, ALPHA pshufd $0, %xmm0, ALPHA
#else #else
pshufd $0, %xmm3, ALPHA pshufd $0, %xmm3, ALPHA
#endif #endif
.L0t:
xorq M,M
addq $1,M
salq $22,M
subq M,MMM
jge .L00t
ALIGN_4
movq MMM,%rax
addq M,%rax
jle .L999x
movq %rax,M
.L00t:
movq LDAX,LDA
movq NN,N
movq AA,A
movq STACK_INCX, INCX
movq STACK_Y, Y
movq STACK_INCY, INCY
movq STACK_BUFFER, BUFFER
leaq (,INCX, SIZE), INCX leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA leaq (,LDA, SIZE), LDA
@ -6341,6 +6367,12 @@
ALIGN_4 ALIGN_4
.L999: .L999:
leaq (,M,SIZE),%rax
addq %rax,AA
jmp .L0t
ALIGN_4
.L999x:
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
movq 16(%rsp), %r12 movq 16(%rsp), %r12

View File

@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -86,7 +86,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define movsd movlps #define movsd movlps

View File

@ -86,7 +86,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define movsd movlps #define movsd movlps

View File

@ -86,7 +86,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define movsd movlps #define movsd movlps

View File

@ -699,7 +699,7 @@
movsd -32 * SIZE(X), %xmm4 movsd -32 * SIZE(X), %xmm4
pshufd $0xb1, %xmm4, %xmm12 pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8 shufps $0x59, %xmm8, %xmm8
mulps %xmm8, %xmm4 mulps %xmm8, %xmm4
addps %xmm4, %xmm0 addps %xmm4, %xmm0
mulps %xmm8, %xmm12 mulps %xmm8, %xmm12
@ -1336,7 +1336,7 @@
movss %xmm9, %xmm8 movss %xmm9, %xmm8
pshufd $0xb1, %xmm4, %xmm12 pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm8, %xmm8 shufps $0x03, %xmm8, %xmm8
mulps %xmm8, %xmm4 mulps %xmm8, %xmm4
addps %xmm4, %xmm0 addps %xmm4, %xmm0
mulps %xmm8, %xmm12 mulps %xmm8, %xmm12
@ -1697,7 +1697,7 @@
movsd -32 * SIZE(Y), %xmm4 movsd -32 * SIZE(Y), %xmm4
pshufd $0xb1, %xmm4, %xmm12 pshufd $0xb1, %xmm4, %xmm12
shufps $0x39, %xmm8, %xmm8 shufps $0xa9, %xmm8, %xmm8
mulps %xmm8, %xmm4 mulps %xmm8, %xmm4
addps %xmm4, %xmm0 addps %xmm4, %xmm0
mulps %xmm8, %xmm12 mulps %xmm8, %xmm12
@ -2024,7 +2024,7 @@
movss %xmm9, %xmm8 movss %xmm9, %xmm8
pshufd $0xb1, %xmm4, %xmm12 pshufd $0xb1, %xmm4, %xmm12
shufps $0x93, %xmm8, %xmm8 shufps $0x03, %xmm8, %xmm8
mulps %xmm8, %xmm4 mulps %xmm8, %xmm4
addps %xmm4, %xmm0 addps %xmm4, %xmm0
mulps %xmm8, %xmm12 mulps %xmm8, %xmm12

View File

@ -85,7 +85,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE 32 #define RPREFETCHSIZE 32
#define WPREFETCHSIZE 48 #define WPREFETCHSIZE 48
#endif #endif

View File

@ -160,7 +160,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)
@ -167,7 +167,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)
@ -166,7 +166,7 @@
#define xt1 %xmm14 #define xt1 %xmm14
#define xt2 %xmm15 #define xt2 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)
@ -166,7 +166,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -74,6 +74,13 @@
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#endif #endif
#ifdef BULLDOZER
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (128 * 5)
#define ALIGNED_ACCESS
#endif
#ifdef NANO #ifdef NANO
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0

View File

@ -85,7 +85,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#define MOVUPS_A movaps #define MOVUPS_A movaps
#define MOVUPS_XL movaps #define MOVUPS_XL movaps

View File

@ -66,7 +66,9 @@ static FLOAT dm1 = -1.;
#endif #endif
#define GEMM_PQ MAX(GEMM_P, GEMM_Q) #define GEMM_PQ MAX(GEMM_P, GEMM_Q)
#define REAL_GEMM_R (GEMM_R - GEMM_PQ)
//leave some space for GEMM_ALIGN in sb2
#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ)
#if 0 #if 0
#define SHARED_ARRAY #define SHARED_ARRAY
@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
sa, sa,
sb2, sb2,
a + (is + js * lda) * COMPSIZE, lda, a + (is + js * lda) * COMPSIZE, lda,
- is + js); is - js);
#endif #endif
} }

View File

@ -4,7 +4,7 @@ DRVOPTS = $(OPTS)
LOADER = $(FORTRAN) LOADER = $(FORTRAN)
TIMER = NONE TIMER = NONE
ARCHFLAGS= -ru ARCHFLAGS= -ru
RANLIB = ranlib #RANLIB = ranlib
BLASLIB = BLASLIB =
TMGLIB = tmglib.a TMGLIB = tmglib.a
EIGSRCLIB = eigsrc.a EIGSRCLIB = eigsrc.a

View File

@ -48,7 +48,8 @@ typedef int blasint;
/* C99 supports complex floating numbers natively, which GCC also offers as an /* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus)))
#define OPENBLAS_COMPLEX_C99 #define OPENBLAS_COMPLEX_C99
#include <complex.h> #include <complex.h>
typedef float _Complex openblas_complex_float; typedef float _Complex openblas_complex_float;

View File

@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define SNUMOPT 8 #define SNUMOPT 8
#define DNUMOPT 4 #define DNUMOPT 4