Merge branch 'develop'
This commit is contained in:
commit
e5ac3007e0
|
@ -1,4 +1,22 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.2.6
|
||||||
|
2-Mar-2013
|
||||||
|
common:
|
||||||
|
* Improved OpenMP performance slightly. (d744c9)
|
||||||
|
* Improved cblas.h compatibility with Intel MKL. (#185)
|
||||||
|
* Fixed the overflow bug in single-threaded Cholesky factorization.
|
||||||
|
* Fixed the buffer overflow bug in multithreaded hbmv and sbmv. (#174)
|
||||||
|
|
||||||
|
x86/x86-64:
|
||||||
|
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
|
||||||
|
We will tune the performance in the future.
|
||||||
|
* Auto-detect Intel Xeon E7540.
|
||||||
|
* Fixed the buffer overflow bug in gemv. (#173)
|
||||||
|
* Fixed the s/cdot bug involving an invalid read of NaN on x86_64. (#189)
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.2.5
|
Version 0.2.5
|
||||||
26-Nov-2012
|
26-Nov-2012
|
||||||
|
|
2
Makefile
2
Makefile
|
@ -314,7 +314,7 @@ clean ::
|
||||||
#endif
|
#endif
|
||||||
@$(MAKE) -C reference clean
|
@$(MAKE) -C reference clean
|
||||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
||||||
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||||
@if test -d $(NETLIB_LAPACK_DIR); then \
|
@if test -d $(NETLIB_LAPACK_DIR); then \
|
||||||
echo deleting $(NETLIB_LAPACK_DIR); \
|
echo deleting $(NETLIB_LAPACK_DIR); \
|
||||||
rm -rf $(NETLIB_LAPACK_DIR) ;\
|
rm -rf $(NETLIB_LAPACK_DIR) ;\
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# This is triggered by Makefile.system and runs before any of the code is built.
|
||||||
|
|
||||||
export BINARY
|
export BINARY
|
||||||
export USE_OPENMP
|
export USE_OPENMP
|
||||||
|
|
||||||
|
@ -15,7 +17,7 @@ ifdef CPUIDEMU
|
||||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all: getarch_2nd
|
all: getarch_2nd cblas_noconst.h
|
||||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||||
|
|
||||||
|
@ -36,4 +38,7 @@ else
|
||||||
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
cblas_noconst.h : cblas.h
|
||||||
|
perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
|
||||||
|
|
||||||
dummy:
|
dummy:
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.2.5
|
VERSION = 0.2.6
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
|
|
@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE
|
||||||
export GOTOBLAS_MAKEFILE = 1
|
export GOTOBLAS_MAKEFILE = 1
|
||||||
|
|
||||||
# Generating Makefile.conf and config.h
|
# Generating Makefile.conf and config.h
|
||||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
|
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
|
||||||
|
|
||||||
ifndef TARGET_CORE
|
ifndef TARGET_CORE
|
||||||
include $(TOPDIR)/Makefile.conf
|
include $(TOPDIR)/Makefile.conf
|
||||||
|
@ -277,14 +277,14 @@ ifeq ($(ARCH), x86)
|
||||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt
|
||||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||||
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
|
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
|
||||||
|
|
||||||
#### MIPS64:
|
#### MIPS64:
|
||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||||
|
|
|
@ -29,6 +29,7 @@ BARCELONA
|
||||||
SHANGHAI
|
SHANGHAI
|
||||||
ISTANBUL
|
ISTANBUL
|
||||||
BOBCAT
|
BOBCAT
|
||||||
|
BULLDOZER
|
||||||
|
|
||||||
c)VIA CPU:
|
c)VIA CPU:
|
||||||
SSE_GENERIC
|
SSE_GENERIC
|
||||||
|
|
448
cblas.h
448
cblas.h
|
@ -1,291 +1,293 @@
|
||||||
#ifndef CBLAS_H
|
#ifndef CBLAS_H
|
||||||
#define CBLAS_H
|
#define CBLAS_H
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
/* Assume C declarations for C++ */
|
/* Assume C declarations for C++ */
|
||||||
#endif /* __cplusplus */
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include "common.h"
|
|
||||||
|
|
||||||
/*Set the number of threads on runtime.*/
|
/*Set the number of threads on runtime.*/
|
||||||
void openblas_set_num_threads(int num_threads);
|
void openblas_set_num_threads(int num_threads);
|
||||||
void goto_set_num_threads(int num_threads);
|
void goto_set_num_threads(int num_threads);
|
||||||
|
|
||||||
|
/*Get the build configuration at runtime.*/
|
||||||
|
char* openblas_get_config(void);
|
||||||
|
|
||||||
#define CBLAS_INDEX size_t
|
#define CBLAS_INDEX size_t
|
||||||
|
|
||||||
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
|
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
|
||||||
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114};
|
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
|
||||||
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
|
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
||||||
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
|
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
||||||
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
|
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
||||||
|
|
||||||
float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy);
|
float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||||
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
|
double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||||
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
|
float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||||
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
|
double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
|
||||||
|
|
||||||
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
|
openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||||
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
|
openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||||
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
|
openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
|
||||||
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
|
openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
|
||||||
|
|
||||||
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
|
void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
|
||||||
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
|
void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
|
||||||
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
|
void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
|
||||||
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
|
void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
|
||||||
|
|
||||||
float cblas_sasum (blasint n, float *x, blasint incx);
|
float cblas_sasum (const blasint n, const float *x, const blasint incx);
|
||||||
double cblas_dasum (blasint n, double *x, blasint incx);
|
double cblas_dasum (const blasint n, const double *x, const blasint incx);
|
||||||
float cblas_scasum(blasint n, float *x, blasint incx);
|
float cblas_scasum(const blasint n, const float *x, const blasint incx);
|
||||||
double cblas_dzasum(blasint n, double *x, blasint incx);
|
double cblas_dzasum(const blasint n, const double *x, const blasint incx);
|
||||||
|
|
||||||
float cblas_snrm2 (blasint N, float *X, blasint incX);
|
float cblas_snrm2 (const blasint N, const float *X, const blasint incX);
|
||||||
double cblas_dnrm2 (blasint N, double *X, blasint incX);
|
double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
|
||||||
float cblas_scnrm2(blasint N, float *X, blasint incX);
|
float cblas_scnrm2(const blasint N, const float *X, const blasint incX);
|
||||||
double cblas_dznrm2(blasint N, double *X, blasint incX);
|
double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
|
||||||
|
|
||||||
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
|
CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx);
|
||||||
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
|
CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
|
||||||
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
|
CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx);
|
||||||
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
|
CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
|
||||||
|
|
||||||
void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy);
|
void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
|
||||||
void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy);
|
void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
|
||||||
void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy);
|
void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
|
||||||
void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy);
|
void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
|
||||||
|
|
||||||
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
|
void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
|
||||||
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
|
void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
|
||||||
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
|
void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
|
||||||
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
|
void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
|
||||||
|
|
||||||
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
|
void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
|
||||||
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
|
void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
|
||||||
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
|
void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
|
||||||
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
|
void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
|
||||||
|
|
||||||
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
|
void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
|
||||||
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
|
void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s);
|
||||||
|
|
||||||
void cblas_srotg(float *a, float *b, float *c, float *s);
|
void cblas_srotg(float *a, float *b, float *c, float *s);
|
||||||
void cblas_drotg(double *a, double *b, double *c, double *s);
|
void cblas_drotg(double *a, double *b, double *c, double *s);
|
||||||
|
|
||||||
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
|
void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
|
||||||
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
|
void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
|
||||||
|
|
||||||
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
|
void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
|
||||||
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
|
void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
|
||||||
|
|
||||||
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
|
void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
|
||||||
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
|
void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
|
||||||
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
|
void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
|
||||||
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
|
void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
|
||||||
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
|
void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
|
||||||
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
|
void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||||
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
|
const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy);
|
||||||
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||||
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
|
const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy);
|
||||||
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||||
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
|
const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy);
|
||||||
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||||
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
|
const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy);
|
||||||
|
|
||||||
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||||
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||||
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||||
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||||
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||||
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||||
|
|
||||||
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
|
void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
|
||||||
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
|
void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
|
||||||
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
|
void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
|
||||||
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
|
void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
|
||||||
|
|
||||||
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
|
void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
|
||||||
blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||||
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
|
void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
|
||||||
blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||||
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
|
void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
|
||||||
float *Y, blasint incY, float *A, blasint lda);
|
const float *Y, const blasint incY, float *A, const blasint lda);
|
||||||
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
|
void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
|
||||||
double *Y, blasint incY, double *A, blasint lda);
|
const double *Y, const blasint incY, double *A, const blasint lda);
|
||||||
|
|
||||||
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||||
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
|
const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||||
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||||
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
|
const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||||
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||||
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||||
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||||
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||||
|
|
||||||
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
|
void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
|
||||||
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
|
const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||||
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
|
void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
|
||||||
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
|
const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||||
|
|
||||||
|
|
||||||
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||||
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, float *Ap, float *X, blasint incX);
|
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||||
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, double *Ap, double *X, blasint incX);
|
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||||
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, float *Ap, float *X, blasint incX);
|
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||||
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, double *Ap, double *X, blasint incX);
|
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, float *Ap, float *X, blasint incX);
|
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||||
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, double *Ap, double *X, blasint incX);
|
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||||
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, float *Ap, float *X, blasint incX);
|
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||||
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||||
blasint N, double *Ap, double *X, blasint incX);
|
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||||
|
|
||||||
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
|
void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
|
||||||
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
|
const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||||
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
|
void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
|
||||||
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
|
const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||||
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
|
void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
|
||||||
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||||
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
|
void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
|
||||||
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||||
|
|
||||||
|
|
||||||
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
|
void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
|
||||||
float *X, blasint incX, float beta, float *Y, blasint incY);
|
const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||||
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
|
void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
|
||||||
double *X, blasint incX, double beta, double *Y, blasint incY);
|
const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||||
|
|
||||||
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
|
void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
|
||||||
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
|
void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
|
||||||
|
|
||||||
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
|
void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
|
||||||
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
|
void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
|
||||||
|
|
||||||
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
|
void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
|
||||||
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
|
void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
|
||||||
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
|
void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
|
||||||
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
|
void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
|
||||||
|
|
||||||
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
|
void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
|
||||||
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||||
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
|
void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
|
||||||
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||||
|
|
||||||
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
|
void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
|
||||||
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||||
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
|
void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
|
||||||
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||||
|
|
||||||
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||||
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||||
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||||
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||||
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||||
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||||
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||||
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||||
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||||
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||||
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
|
const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
|
||||||
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
|
const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
|
||||||
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
|
const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
|
||||||
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
|
const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||||
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||||
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||||
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||||
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||||
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||||
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||||
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||||
|
|
||||||
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||||
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||||
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||||
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
|
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||||
|
|
||||||
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||||
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||||
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
|
const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
|
||||||
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||||
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
|
const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||||
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||||
|
|
||||||
void cblas_xerbla(blasint p, char *rout, char *form, ...);
|
void cblas_xerbla(blasint p, char *rout, char *form, ...);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* __cplusplus */
|
#endif /* __cplusplus */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
6
common.h
6
common.h
|
@ -390,7 +390,8 @@ typedef int blasint;
|
||||||
/* C99 supports complex floating numbers natively, which GCC also offers as an
|
/* C99 supports complex floating numbers natively, which GCC also offers as an
|
||||||
extension since version 3.0. If neither are available, use a compatible
|
extension since version 3.0. If neither are available, use a compatible
|
||||||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
|
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
|
||||||
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
|
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
|
||||||
|
(__GNUC__ >= 3 && !defined(__cplusplus)))
|
||||||
#define OPENBLAS_COMPLEX_C99
|
#define OPENBLAS_COMPLEX_C99
|
||||||
typedef float _Complex openblas_complex_float;
|
typedef float _Complex openblas_complex_float;
|
||||||
typedef double _Complex openblas_complex_double;
|
typedef double _Complex openblas_complex_double;
|
||||||
|
@ -557,7 +558,8 @@ typedef struct {
|
||||||
#include "common_level3.h"
|
#include "common_level3.h"
|
||||||
#include "common_lapack.h"
|
#include "common_lapack.h"
|
||||||
#ifdef CBLAS
|
#ifdef CBLAS
|
||||||
#include "cblas.h"
|
/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
|
||||||
|
#include "cblas_noconst.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
|
|
1
cpuid.h
1
cpuid.h
|
@ -126,6 +126,7 @@
|
||||||
#define HAVE_128BITFPU (1 << 16)
|
#define HAVE_128BITFPU (1 << 16)
|
||||||
#define HAVE_FASTMOVU (1 << 17)
|
#define HAVE_FASTMOVU (1 << 17)
|
||||||
#define HAVE_AVX (1 << 18)
|
#define HAVE_AVX (1 << 18)
|
||||||
|
#define HAVE_FMA4 (1 << 19)
|
||||||
|
|
||||||
#define CACHE_INFO_L1_I 1
|
#define CACHE_INFO_L1_I 1
|
||||||
#define CACHE_INFO_L1_D 2
|
#define CACHE_INFO_L1_D 2
|
||||||
|
|
32
cpuid_x86.c
32
cpuid_x86.c
|
@ -43,6 +43,8 @@
|
||||||
#ifdef NO_AVX
|
#ifdef NO_AVX
|
||||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||||
|
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||||
|
#define CORE_BULLDOZER CORE_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPUIDEMU
|
#ifndef CPUIDEMU
|
||||||
|
@ -116,8 +118,9 @@ static inline int have_excpuid(void){
|
||||||
|
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
static inline void xgetbv(int op, int * eax, int * edx){
|
static inline void xgetbv(int op, int * eax, int * edx){
|
||||||
|
//Emit xgetbv as raw opcode bytes (0x0f 0x01 0xd0) instead of using the mnemonic
|
||||||
__asm__ __volatile__
|
__asm__ __volatile__
|
||||||
("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -228,6 +231,9 @@ int get_cputype(int gettype){
|
||||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||||
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
|
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
|
||||||
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
|
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
|
||||||
|
#ifndef NO_AVX
|
||||||
|
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
|
||||||
|
#endif
|
||||||
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
|
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
|
||||||
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
|
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
|
||||||
}
|
}
|
||||||
|
@ -1030,6 +1036,8 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
|
case 14:
|
||||||
|
// Xeon E7540
|
||||||
case 15:
|
case 15:
|
||||||
//Xeon Processor E7 (Westmere-EX)
|
//Xeon Processor E7 (Westmere-EX)
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
|
@ -1075,8 +1083,12 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_OPTERON;
|
return CPUTYPE_OPTERON;
|
||||||
case 1:
|
case 1:
|
||||||
case 10:
|
case 10:
|
||||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
|
||||||
return CPUTYPE_BARCELONA;
|
return CPUTYPE_BARCELONA;
|
||||||
|
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_BULLDOZER;
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
case 5:
|
case 5:
|
||||||
return CPUTYPE_BOBCAT;
|
return CPUTYPE_BOBCAT;
|
||||||
}
|
}
|
||||||
|
@ -1398,6 +1410,8 @@ int get_coretype(void){
|
||||||
return CORE_SANDYBRIDGE;
|
return CORE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CORE_NEHALEM; //OS doesn't support AVX
|
return CORE_NEHALEM; //OS doesn't support AVX
|
||||||
|
case 14:
|
||||||
|
//Xeon E7540
|
||||||
case 15:
|
case 15:
|
||||||
//Xeon Processor E7 (Westmere-EX)
|
//Xeon Processor E7 (Westmere-EX)
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
|
@ -1427,8 +1441,13 @@ int get_coretype(void){
|
||||||
if (family == 0xf){
|
if (family == 0xf){
|
||||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||||
else if (exfamily == 5) return CORE_BOBCAT;
|
else if (exfamily == 5) return CORE_BOBCAT;
|
||||||
else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
else if (exfamily == 6) {
|
||||||
else return CORE_BARCELONA;
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_BULLDOZER;
|
||||||
|
else
|
||||||
|
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
|
||||||
|
}else return CORE_BARCELONA;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1494,6 +1513,9 @@ void get_cpuconfig(void){
|
||||||
printf("#define DTB_SIZE %d\n", info.size * 1024);
|
printf("#define DTB_SIZE %d\n", info.size * 1024);
|
||||||
printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
|
printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
|
||||||
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
|
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
|
||||||
|
} else {
|
||||||
|
//fall back for some virtual machines.
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 32\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
features = get_cputype(GET_FEATURE);
|
features = get_cputype(GET_FEATURE);
|
||||||
|
@ -1511,6 +1533,7 @@ void get_cpuconfig(void){
|
||||||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||||
|
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||||
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
||||||
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
||||||
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
||||||
|
@ -1577,5 +1600,6 @@ void get_sse(void){
|
||||||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||||
|
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
|
|
||||||
a = (FLOAT *)args -> a;
|
a = (FLOAT *)args -> a;
|
||||||
x = (FLOAT *)args -> b;
|
x = (FLOAT *)args -> b;
|
||||||
y = (FLOAT *)args -> c;
|
|
||||||
|
|
||||||
lda = args -> lda;
|
lda = args -> lda;
|
||||||
incx = args -> ldb;
|
incx = args -> ldb;
|
||||||
|
@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
n_from = 0;
|
n_from = 0;
|
||||||
n_to = n;
|
n_to = n;
|
||||||
|
|
||||||
|
//Use y as each thread's n* COMPSIZE elements in sb buffer
|
||||||
|
y = buffer;
|
||||||
|
buffer += ((COMPSIZE * n + 1023) & ~1023);
|
||||||
|
|
||||||
if (range_m) {
|
if (range_m) {
|
||||||
n_from = *(range_m + 0);
|
n_from = *(range_m + 0);
|
||||||
n_to = *(range_m + 1);
|
n_to = *(range_m + 1);
|
||||||
|
@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
a += n_from * lda * COMPSIZE;
|
a += n_from * lda * COMPSIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (range_n) y += *range_n * COMPSIZE;
|
|
||||||
|
|
||||||
if (incx != 1) {
|
if (incx != 1) {
|
||||||
COPY_K(n, x, incx, buffer, 1);
|
COPY_K(n, x, incx, buffer, 1);
|
||||||
|
@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
|
|
||||||
if (num_cpu) {
|
if (num_cpu) {
|
||||||
queue[0].sa = NULL;
|
queue[0].sa = NULL;
|
||||||
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
|
queue[0].sb = buffer;
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
exec_blas(num_cpu, queue);
|
exec_blas(num_cpu, queue);
|
||||||
|
@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
#else
|
#else
|
||||||
ONE, ZERO,
|
ONE, ZERO,
|
||||||
#endif
|
#endif
|
||||||
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
|
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
AXPYU_K(n, 0, 0,
|
AXPYU_K(n, 0, 0,
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
TOPDIR = ../..
|
TOPDIR = ../..
|
||||||
include ../../Makefile.system
|
include ../../Makefile.system
|
||||||
|
|
||||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
|
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
|
||||||
|
|
||||||
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||||
|
|
||||||
|
@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
|
||||||
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
|
@ -49,8 +49,12 @@
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
|
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads) {
|
void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
|
int i=0;
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||||
|
@ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
omp_set_num_threads(blas_cpu_number);
|
omp_set_num_threads(blas_cpu_number);
|
||||||
|
|
||||||
|
//adjust buffer for each thread
|
||||||
|
for(i=0; i<blas_cpu_number; i++){
|
||||||
|
if(blas_thread_buffer[i]==NULL){
|
||||||
|
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(; i<MAX_CPU_NUMBER; i++){
|
||||||
|
if(blas_thread_buffer[i]!=NULL){
|
||||||
|
blas_memory_free(blas_thread_buffer[i]);
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
#if defined(ARCH_MIPS64)
|
#if defined(ARCH_MIPS64)
|
||||||
//set parameters for different number of threads.
|
//set parameters for different number of threads.
|
||||||
blas_set_parameter();
|
blas_set_parameter();
|
||||||
|
@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
|
|
||||||
|
int i=0;
|
||||||
|
|
||||||
blas_get_cpu_number();
|
blas_get_cpu_number();
|
||||||
|
|
||||||
blas_server_avail = 1;
|
blas_server_avail = 1;
|
||||||
|
|
||||||
|
for(i=0; i<blas_num_threads; i++){
|
||||||
|
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
for(; i<MAX_CPU_NUMBER; i++){
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BLASFUNC(blas_thread_shutdown)(void){
|
int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
|
int i=0;
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
|
|
||||||
|
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||||
|
if(blas_thread_buffer[i]!=NULL){
|
||||||
|
blas_memory_free(blas_thread_buffer[i]);
|
||||||
|
blas_thread_buffer[i]=NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
static void exec_threads(blas_queue_t *queue){
|
static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
void *buffer, *sa, *sb;
|
void *buffer, *sa, *sb;
|
||||||
|
int pos=0, release_flag=0;
|
||||||
|
|
||||||
buffer = NULL;
|
buffer = NULL;
|
||||||
sa = queue -> sa;
|
sa = queue -> sa;
|
||||||
|
@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||||
|
|
||||||
|
pos = omp_get_thread_num();
|
||||||
|
buffer = blas_thread_buffer[pos];
|
||||||
|
|
||||||
|
//fallback
|
||||||
|
if(buffer==NULL) {
|
||||||
buffer = blas_memory_alloc(2);
|
buffer = blas_memory_alloc(2);
|
||||||
|
release_flag=1;
|
||||||
|
}
|
||||||
|
|
||||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||||
|
|
||||||
|
@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (buffer != NULL) blas_memory_free(buffer);
|
if (release_flag) blas_memory_free(buffer);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
queue->sb=sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
|
@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA;
|
||||||
extern gotoblas_t gotoblas_BOBCAT;
|
extern gotoblas_t gotoblas_BOBCAT;
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
|
extern gotoblas_t gotoblas_BULLDOZER;
|
||||||
#else
|
#else
|
||||||
//Use NEHALEM kernels for sandy bridge
|
//Use NEHALEM kernels for sandy bridge
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||||
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -78,8 +80,9 @@ extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
|
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
static inline void xgetbv(int op, int * eax, int * edx){
|
static inline void xgetbv(int op, int * eax, int * edx){
|
||||||
|
//Use binary code for xgetbv
|
||||||
__asm__ __volatile__
|
__asm__ __volatile__
|
||||||
("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -163,7 +166,8 @@ static gotoblas_t *get_coretype(void){
|
||||||
|
|
||||||
//Intel Xeon Processor 5600 (Westmere-EP)
|
//Intel Xeon Processor 5600 (Westmere-EP)
|
||||||
//Xeon Processor E7 (Westmere-EX)
|
//Xeon Processor E7 (Westmere-EX)
|
||||||
if (model == 12 || model == 15) return &gotoblas_NEHALEM;
|
//Xeon E7540
|
||||||
|
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
|
||||||
|
|
||||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||||
//Intel Core i7-3000 / Xeon E5
|
//Intel Core i7-3000 / Xeon E5
|
||||||
|
@ -171,7 +175,7 @@ static gotoblas_t *get_coretype(void){
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_SANDYBRIDGE;
|
return &gotoblas_SANDYBRIDGE;
|
||||||
else{
|
else{
|
||||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -182,7 +186,7 @@ static gotoblas_t *get_coretype(void){
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_SANDYBRIDGE;
|
return &gotoblas_SANDYBRIDGE;
|
||||||
else{
|
else{
|
||||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -202,6 +206,14 @@ static gotoblas_t *get_coretype(void){
|
||||||
else return &gotoblas_OPTERON;
|
else return &gotoblas_OPTERON;
|
||||||
} else if (exfamily == 5) {
|
} else if (exfamily == 5) {
|
||||||
return &gotoblas_BOBCAT;
|
return &gotoblas_BOBCAT;
|
||||||
|
} else if (exfamily == 6) {
|
||||||
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_BULLDOZER;
|
||||||
|
else{
|
||||||
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
||||||
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
return &gotoblas_BARCELONA;
|
return &gotoblas_BARCELONA;
|
||||||
}
|
}
|
||||||
|
@ -238,6 +250,7 @@ static char *corename[] = {
|
||||||
"Nano",
|
"Nano",
|
||||||
"Sandybridge",
|
"Sandybridge",
|
||||||
"Bobcat",
|
"Bobcat",
|
||||||
|
"Bulldozer",
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -259,6 +272,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
if (gotoblas == &gotoblas_NANO) return corename[15];
|
||||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||||
|
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||||
|
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
static char* openblas_config_str=""
|
||||||
|
#ifdef USE64BITINT
|
||||||
|
"USE64BITINT "
|
||||||
|
#endif
|
||||||
|
#ifdef NO_CBLAS
|
||||||
|
"NO_CBLAS "
|
||||||
|
#endif
|
||||||
|
#ifdef NO_LAPACK
|
||||||
|
"NO_LAPACK "
|
||||||
|
#endif
|
||||||
|
#ifdef NO_LAPACKE
|
||||||
|
"NO_LAPACKE "
|
||||||
|
#endif
|
||||||
|
#ifdef DYNAMIC_ARCH
|
||||||
|
"DYNAMIC_ARCH "
|
||||||
|
#endif
|
||||||
|
#ifdef NO_AFFINITY
|
||||||
|
"NO_AFFINITY "
|
||||||
|
#endif
|
||||||
|
;
|
||||||
|
|
||||||
|
char* CNAME() {
|
||||||
|
return openblas_config_str;
|
||||||
|
}
|
||||||
|
|
|
@ -163,7 +163,7 @@ int get_L2_size(void){
|
||||||
|
|
||||||
int eax, ebx, ecx, edx;
|
int eax, ebx, ecx, edx;
|
||||||
|
|
||||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
|
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
|
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT)
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
EXTRALIB += -lgfortran
|
EXTRALIB += -lgfortran
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
ifeq ($(C_COMPILER), GCC)
|
||||||
|
EXTRALIB += -lgomp
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
|
|
|
@ -74,6 +74,7 @@
|
||||||
|
|
||||||
@misc_no_underscore_objs = (
|
@misc_no_underscore_objs = (
|
||||||
openblas_set_num_threads, goto_set_num_threads,
|
openblas_set_num_threads, goto_set_num_threads,
|
||||||
|
openblas_get_config,
|
||||||
);
|
);
|
||||||
|
|
||||||
@misc_underscore_objs = (
|
@misc_underscore_objs = (
|
||||||
|
|
18
getarch.c
18
getarch.c
|
@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "OPTERON"
|
#define CORENAME "OPTERON"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
|
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
#define ARCHITECTURE "X86"
|
#define ARCHITECTURE "X86"
|
||||||
|
@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "BOBCAT"
|
#define CORENAME "BOBCAT"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined (FORCE_BULLDOZER)
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "BULLDOZER"
|
||||||
|
#define ARCHCONFIG "-DBULLDOZER " \
|
||||||
|
"-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
|
||||||
|
"-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
|
||||||
|
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
|
||||||
|
"-DHAVE_AVX -DHAVE_FMA4"
|
||||||
|
#define LIBNAME "bulldozer"
|
||||||
|
#define CORENAME "BULLDOZER"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_SSE_GENERIC
|
#ifdef FORCE_SSE_GENERIC
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
|
|
@ -34,7 +34,7 @@ int main(int argc, char **argv) {
|
||||||
#ifdef USE64BITINT
|
#ifdef USE64BITINT
|
||||||
printf("#define USE64BITINT\n");
|
printf("#define USE64BITINT\n");
|
||||||
#endif
|
#endif
|
||||||
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
|
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -810,6 +810,22 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef BULLDOZER
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "Bulldozer\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NANO
|
#ifdef NANO
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
|
||||||
|
SGEMMINCOPY =
|
||||||
|
SGEMMITCOPY =
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
SGEMMINCOPYOBJ =
|
||||||
|
SGEMMITCOPYOBJ =
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||||
|
CGEMMINCOPY =
|
||||||
|
CGEMMITCOPY =
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ =
|
||||||
|
CGEMMITCOPYOBJ =
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
|
||||||
|
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
|
||||||
|
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
|
||||||
|
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
|
||||||
|
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
|
||||||
|
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
|
||||||
|
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
|
|
@ -596,7 +596,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 4 * SIZE(BB), %xmm2
|
movsd 4 * SIZE(BB), %xmm2
|
||||||
|
@ -842,7 +842,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1168,7 +1168,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1198,7 +1198,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -1347,7 +1347,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1531,7 +1531,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1778,7 +1778,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -1793,7 +1793,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -1924,7 +1924,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -2069,7 +2069,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
|
|
@ -89,17 +89,22 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 16
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA 16 + STACKSIZE(%esp)
|
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 20 + STACKSIZE(%esp)
|
#define A 20 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 24 + STACKSIZE(%esp)
|
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 28 + STACKSIZE(%esp)
|
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 32 + STACKSIZE(%esp)
|
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 36 + STACKSIZE(%esp)
|
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 40 + STACKSIZE(%esp)
|
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 44 + STACKSIZE(%esp)
|
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
#define MMM 0+ARGS(%esp)
|
||||||
|
#define YY 4+ARGS(%esp)
|
||||||
|
#define AA 8+ARGS(%esp)
|
||||||
|
#define LDAX 12+ARGS(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -114,6 +119,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -121,7 +127,34 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
movl Y,J
|
||||||
|
movl J,YY # backup Y
|
||||||
|
movl A,J
|
||||||
|
movl J,AA # backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM # backup M
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $21,J
|
||||||
|
subl J,MMM
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A
|
||||||
|
|
||||||
|
movl YY,J
|
||||||
|
movl J,Y
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
|
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
|
|
||||||
|
@ -651,12 +684,22 @@
|
||||||
addss 0 * SIZE(X), %xmm0
|
addss 0 * SIZE(X), %xmm0
|
||||||
movss %xmm0, (Y1)
|
movss %xmm0, (Y1)
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,J
|
||||||
|
leal (,J,SIZE),%eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl YY,J
|
||||||
|
addl %eax,J
|
||||||
|
movl J,YY
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -76,17 +76,22 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 16
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA 16 + STACKSIZE(%esp)
|
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 24 + STACKSIZE(%esp)
|
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 32 + STACKSIZE(%esp)
|
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 40 + STACKSIZE(%esp)
|
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 48 + STACKSIZE(%esp)
|
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+ARGS(%esp)
|
||||||
|
#define YY 4+ARGS(%esp)
|
||||||
|
#define AA 8+ARGS(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -101,6 +106,8 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -108,6 +115,33 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
movl Y,J
|
||||||
|
movl J,YY # backup Y
|
||||||
|
movl A,J
|
||||||
|
movl J,AA # backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM # backup M
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $20,J
|
||||||
|
subl J,MMM
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A
|
||||||
|
|
||||||
|
movl YY,J
|
||||||
|
movl J,Y
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
|
@ -677,10 +711,22 @@
|
||||||
ALIGN_3
|
ALIGN_3
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,J
|
||||||
|
leal (,J,SIZE),%eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl YY,J
|
||||||
|
addl %eax,J
|
||||||
|
movl J,YY
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
|
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -89,17 +89,24 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 20
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA 16 + STACKSIZE(%esp)
|
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 20 + STACKSIZE(%esp)
|
#define A 20 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 24 + STACKSIZE(%esp)
|
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 28 + STACKSIZE(%esp)
|
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 32 + STACKSIZE(%esp)
|
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 36 + STACKSIZE(%esp)
|
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 40 + STACKSIZE(%esp)
|
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 44 + STACKSIZE(%esp)
|
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+STACKSIZE(%esp)
|
||||||
|
#define NN 4+STACKSIZE(%esp)
|
||||||
|
#define AA 8+STACKSIZE(%esp)
|
||||||
|
#define LDAX 12+STACKSIZE(%esp)
|
||||||
|
#define XX 16+STACKSIZE(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -114,6 +121,7 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -122,7 +130,42 @@
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
|
movl LDA,LDAX # backup LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
|
movl X,XX
|
||||||
|
movl N,J
|
||||||
|
movl J,NN # backup N
|
||||||
|
movl A,J
|
||||||
|
movl J,AA # backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM # mov M to MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $22,J # J=2^22 floats; 2^22*sizeof(float)=buffer size(16MB)
|
||||||
|
subl $8, J # Don't use the last 8 floats in the buffer.
|
||||||
|
# Now, split M by block J
|
||||||
|
subl J,MMM # MMM=MMM-J
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A # mov AA to A
|
||||||
|
|
||||||
|
movl NN,%eax
|
||||||
|
movl %eax,N # reset N
|
||||||
|
|
||||||
|
|
||||||
|
movl LDAX, LDA # reset LDA
|
||||||
|
movl XX,X
|
||||||
|
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
|
||||||
|
@ -198,6 +241,20 @@
|
||||||
jg .L06
|
jg .L06
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
|
// Pad with zeros to prevent loading stale values from the buffer.
|
||||||
|
movl M, I
|
||||||
|
movl $8, J
|
||||||
|
andl $7, I
|
||||||
|
xorps %xmm0, %xmm0
|
||||||
|
subl I, J
|
||||||
|
ALIGN_2
|
||||||
|
.L07:
|
||||||
|
movss %xmm0, 0 * SIZE(Y1)
|
||||||
|
addl $SIZE, Y1
|
||||||
|
decl J
|
||||||
|
jg .L07
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
.L10:
|
.L10:
|
||||||
movl Y, Y1
|
movl Y, Y1
|
||||||
|
|
||||||
|
@ -628,10 +685,22 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,J
|
||||||
|
leal (,J,SIZE),%eax
|
||||||
|
addl %eax,AA
|
||||||
|
movl XX,J
|
||||||
|
addl %eax,J
|
||||||
|
movl J,XX
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -76,17 +76,23 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define STACKSIZE 16
|
#define STACKSIZE 16
|
||||||
|
#define ARGS 16
|
||||||
|
|
||||||
#define M 4 + STACKSIZE(%esp)
|
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||||
#define N 8 + STACKSIZE(%esp)
|
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||||
#define ALPHA 16 + STACKSIZE(%esp)
|
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||||
#define A 24 + STACKSIZE(%esp)
|
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_X 32 + STACKSIZE(%esp)
|
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||||
#define Y 40 + STACKSIZE(%esp)
|
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||||
#define BUFFER 48 + STACKSIZE(%esp)
|
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||||
|
|
||||||
|
#define MMM 0+STACKSIZE(%esp)
|
||||||
|
#define AA 4+STACKSIZE(%esp)
|
||||||
|
#define LDAX 8+STACKSIZE(%esp)
|
||||||
|
#define NN 12+STACKSIZE(%esp)
|
||||||
|
|
||||||
#define I %eax
|
#define I %eax
|
||||||
#define J %ebx
|
#define J %ebx
|
||||||
|
@ -101,6 +107,8 @@
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
|
||||||
|
subl $ARGS,%esp
|
||||||
|
|
||||||
pushl %ebp
|
pushl %ebp
|
||||||
pushl %edi
|
pushl %edi
|
||||||
pushl %esi
|
pushl %esi
|
||||||
|
@ -108,7 +116,40 @@
|
||||||
|
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
|
|
||||||
movl STACK_LDA, LDA
|
movl STACK_LDA, LDA
|
||||||
|
movl LDA,LDAX # backup LDA
|
||||||
|
movl N,J
|
||||||
|
movl J,NN # backup N
|
||||||
|
movl A,J
|
||||||
|
movl J,AA # backup A
|
||||||
|
movl M,J
|
||||||
|
movl J,MMM # mov M to MMM
|
||||||
|
.L0t:
|
||||||
|
xorl J,J
|
||||||
|
addl $1,J
|
||||||
|
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
|
||||||
|
subl $4, J # Don't use the last 4 doubles in the buffer.
|
||||||
|
# Now, split M by block J
|
||||||
|
subl J,MMM # MMM=MMM-J
|
||||||
|
movl J,M
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movl MMM,%eax
|
||||||
|
addl J,%eax
|
||||||
|
jle .L999x
|
||||||
|
movl %eax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movl AA,%eax
|
||||||
|
movl %eax,A # mov AA to A
|
||||||
|
|
||||||
|
movl NN,%eax
|
||||||
|
movl %eax,N # reset N
|
||||||
|
|
||||||
|
|
||||||
|
movl LDAX, LDA # reset LDA
|
||||||
movl STACK_X, X
|
movl STACK_X, X
|
||||||
movl STACK_INCX, INCX
|
movl STACK_INCX, INCX
|
||||||
movl STACK_INCY, INCY
|
movl STACK_INCY, INCY
|
||||||
|
@ -117,6 +158,7 @@
|
||||||
leal (,INCY, SIZE), INCY
|
leal (,INCY, SIZE), INCY
|
||||||
leal (,LDA, SIZE), LDA
|
leal (,LDA, SIZE), LDA
|
||||||
|
|
||||||
|
|
||||||
subl $-16 * SIZE, A
|
subl $-16 * SIZE, A
|
||||||
|
|
||||||
cmpl $0, N
|
cmpl $0, N
|
||||||
|
@ -560,10 +602,19 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
movl M,J
|
||||||
|
leal (,J,SIZE),%eax
|
||||||
|
addl %eax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
popl %ebx
|
popl %ebx
|
||||||
popl %esi
|
popl %esi
|
||||||
popl %edi
|
popl %edi
|
||||||
popl %ebp
|
popl %ebp
|
||||||
|
|
||||||
|
addl $ARGS,%esp
|
||||||
ret
|
ret
|
||||||
|
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
|
@ -269,7 +269,7 @@
|
||||||
sarl $5, I
|
sarl $5, I
|
||||||
jle .L113
|
jle .L113
|
||||||
|
|
||||||
#if defined(BARCELONA)
|
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||||
|
|
||||||
movaps %xmm0, %xmm1
|
movaps %xmm0, %xmm1
|
||||||
mulps -32 * SIZE(X), %xmm1
|
mulps -32 * SIZE(X), %xmm1
|
||||||
|
|
|
@ -253,7 +253,7 @@
|
||||||
sarl $4, I
|
sarl $4, I
|
||||||
jle .L113
|
jle .L113
|
||||||
|
|
||||||
#if defined(BARCELONA)
|
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||||
|
|
||||||
movaps %xmm0, %xmm1
|
movaps %xmm0, %xmm1
|
||||||
mulpd -16 * SIZE(X), %xmm1
|
mulpd -16 * SIZE(X), %xmm1
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -488,7 +488,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1697,7 +1697,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1727,7 +1727,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -437,7 +437,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -833,7 +833,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1848,7 +1848,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2109,7 +2109,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2429,7 +2429,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2459,7 +2459,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2952,7 +2952,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -3148,7 +3148,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3389,7 +3389,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -3404,7 +3404,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -910,7 +910,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -959,7 +959,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1439,7 +1439,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1469,7 +1469,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -872,7 +872,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1316,7 +1316,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1855,7 +1855,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1885,7 +1885,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2249,7 +2249,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2562,7 +2562,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2957,7 +2957,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -2972,7 +2972,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -3280,7 +3280,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3515,7 +3515,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -1036,7 +1036,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1066,7 +1066,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
@ -2224,7 +2224,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -2273,7 +2273,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -454,7 +454,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -758,7 +758,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -993,7 +993,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -1324,7 +1324,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1354,7 +1354,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -1718,7 +1718,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2031,7 +2031,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2859,7 +2859,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -3303,7 +3303,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
|
|
@ -74,7 +74,7 @@
|
||||||
#define BB %ecx
|
#define BB %ecx
|
||||||
#define LDC %ebp
|
#define LDC %ebp
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -625,7 +625,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 4 * SIZE(BB), %xmm2
|
movsd 4 * SIZE(BB), %xmm2
|
||||||
|
@ -870,7 +870,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1173,7 +1173,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1203,7 +1203,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -1359,7 +1359,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1536,7 +1536,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1794,7 +1794,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -1809,7 +1809,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -1936,7 +1936,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -2069,7 +2069,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
|
|
@ -71,7 +71,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef BARCELONA
|
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetchnta
|
#define PREFETCH prefetchnta
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 5)
|
#define PREFETCHSIZE (16 * 5)
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef BARCELONA
|
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetchnta
|
#define PREFETCH prefetchnta
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (8 * 5)
|
#define PREFETCHSIZE (8 * 5)
|
||||||
|
|
|
@ -71,7 +71,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef BARCELONA
|
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetchnta
|
#define PREFETCH prefetchnta
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 5)
|
#define PREFETCHSIZE (16 * 5)
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef BARCELONA
|
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetchnta
|
#define PREFETCH prefetchnta
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (8 * 5)
|
#define PREFETCHSIZE (8 * 5)
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -533,7 +533,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -994,7 +994,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -1820,7 +1820,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
SGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
|
||||||
|
DGEMMINCOPY =
|
||||||
|
DGEMMITCOPY =
|
||||||
|
DGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||||
|
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||||
|
DGEMMINCOPYOBJ =
|
||||||
|
DGEMMITCOPYOBJ =
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPY = zgemm_ncopy_2.S
|
||||||
|
CGEMMOTCOPY = zgemm_tcopy_2.S
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||||
|
ZGEMMINCOPY =
|
||||||
|
ZGEMMITCOPY =
|
||||||
|
ZGEMMONCOPY = zgemm_ncopy_2.S
|
||||||
|
ZGEMMOTCOPY = zgemm_tcopy_2.S
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
|
||||||
|
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
|
||||||
|
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
|
||||||
|
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
|
||||||
|
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
|
||||||
|
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
|
||||||
|
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
|
||||||
|
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
|
||||||
|
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
|
||||||
|
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
|
||||||
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
File diff suppressed because it is too large
Load Diff
|
@ -530,7 +530,7 @@
|
||||||
#endif
|
#endif
|
||||||
movsd -32 * SIZE(Y), %xmm8
|
movsd -32 * SIZE(Y), %xmm8
|
||||||
|
|
||||||
pshufd $0x39, %xmm4, %xmm5
|
pshufd $0x29, %xmm4, %xmm5
|
||||||
|
|
||||||
mulps %xmm8, %xmm5
|
mulps %xmm8, %xmm5
|
||||||
addps %xmm5, %xmm3
|
addps %xmm5, %xmm3
|
||||||
|
@ -750,7 +750,8 @@
|
||||||
xorps %xmm5, %xmm5
|
xorps %xmm5, %xmm5
|
||||||
movhlps %xmm4, %xmm5
|
movhlps %xmm4, %xmm5
|
||||||
|
|
||||||
mulps -32 * SIZE(Y), %xmm5
|
movlps -32 * SIZE(Y), %xmm4
|
||||||
|
mulps %xmm4, %xmm5
|
||||||
addps %xmm5, %xmm0
|
addps %xmm5, %xmm0
|
||||||
|
|
||||||
addq $2 * SIZE, X
|
addq $2 * SIZE, X
|
||||||
|
@ -992,7 +993,7 @@
|
||||||
movsd -32 * SIZE(Y), %xmm8
|
movsd -32 * SIZE(Y), %xmm8
|
||||||
|
|
||||||
movss %xmm5, %xmm4
|
movss %xmm5, %xmm4
|
||||||
shufps $0x93, %xmm5, %xmm4
|
shufps $0x93, %xmm4, %xmm4
|
||||||
|
|
||||||
mulps %xmm8, %xmm4
|
mulps %xmm8, %xmm4
|
||||||
addps %xmm4, %xmm3
|
addps %xmm4, %xmm3
|
||||||
|
|
|
@ -930,7 +930,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
addps %xmm9, %xmm0
|
addps %xmm9, %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BO), %xmm9
|
movaps 4 * SIZE(BO), %xmm9
|
||||||
|
@ -983,7 +983,7 @@
|
||||||
addps %xmm8, %xmm3
|
addps %xmm8, %xmm3
|
||||||
movaps 0 * SIZE(AO), %xmm8
|
movaps 0 * SIZE(AO), %xmm8
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm10, %xmm9
|
mulps %xmm10, %xmm9
|
||||||
|
@ -1178,7 +1178,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
addps %xmm9, %xmm0
|
addps %xmm9, %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movsd 4 * SIZE(BO), %xmm9
|
movsd 4 * SIZE(BO), %xmm9
|
||||||
|
@ -1423,7 +1423,7 @@
|
||||||
.L42:
|
.L42:
|
||||||
mulss %xmm8, %xmm9
|
mulss %xmm8, %xmm9
|
||||||
addss %xmm9, %xmm0
|
addss %xmm9, %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BO), %xmm9
|
movss 4 * SIZE(BO), %xmm9
|
||||||
|
@ -1765,7 +1765,7 @@
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BO), %xmm8
|
mulps 4 * SIZE(BO), %xmm8
|
||||||
|
@ -1793,7 +1793,7 @@
|
||||||
addps %xmm8, %xmm5
|
addps %xmm8, %xmm5
|
||||||
movaps 32 * SIZE(AO), %xmm8
|
movaps 32 * SIZE(AO), %xmm8
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm10, %xmm11
|
mulps %xmm10, %xmm11
|
||||||
|
@ -1822,7 +1822,7 @@
|
||||||
addps %xmm10, %xmm5
|
addps %xmm10, %xmm5
|
||||||
movaps 48 * SIZE(AO), %xmm10
|
movaps 48 * SIZE(AO), %xmm10
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm12, %xmm13
|
mulps %xmm12, %xmm13
|
||||||
|
@ -1851,7 +1851,7 @@
|
||||||
addps %xmm12, %xmm5
|
addps %xmm12, %xmm5
|
||||||
movaps 64 * SIZE(AO), %xmm12
|
movaps 64 * SIZE(AO), %xmm12
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm14, %xmm15
|
mulps %xmm14, %xmm15
|
||||||
|
@ -2024,7 +2024,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2208,7 +2208,7 @@
|
||||||
.L82:
|
.L82:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
addps %xmm9, %xmm0
|
addps %xmm9, %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movsd 4 * SIZE(BO), %xmm9
|
movsd 4 * SIZE(BO), %xmm9
|
||||||
|
@ -2395,7 +2395,7 @@
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
addps %xmm9, %xmm0
|
addps %xmm9, %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BO), %xmm9
|
movss 4 * SIZE(BO), %xmm9
|
||||||
|
@ -2670,7 +2670,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulps %xmm9, %xmm8
|
mulps %xmm9, %xmm8
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2687,7 +2687,7 @@
|
||||||
addps %xmm9, %xmm4
|
addps %xmm9, %xmm4
|
||||||
movaps 8 * SIZE(BO), %xmm9
|
movaps 8 * SIZE(BO), %xmm9
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm9, %xmm10
|
mulps %xmm9, %xmm10
|
||||||
|
@ -2704,7 +2704,7 @@
|
||||||
addps %xmm9, %xmm4
|
addps %xmm9, %xmm4
|
||||||
movaps 32 * SIZE(BO), %xmm9
|
movaps 32 * SIZE(BO), %xmm9
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm11, %xmm12
|
mulps %xmm11, %xmm12
|
||||||
|
@ -2721,7 +2721,7 @@
|
||||||
addps %xmm11, %xmm4
|
addps %xmm11, %xmm4
|
||||||
movaps 24 * SIZE(BO), %xmm11
|
movaps 24 * SIZE(BO), %xmm11
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm11, %xmm14
|
mulps %xmm11, %xmm14
|
||||||
|
@ -2857,7 +2857,7 @@
|
||||||
|
|
||||||
.L122:
|
.L122:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movaps -28 * SIZE(AO), %xmm8
|
movaps -28 * SIZE(AO), %xmm8
|
||||||
|
@ -2873,7 +2873,7 @@
|
||||||
addps %xmm8, %xmm3
|
addps %xmm8, %xmm3
|
||||||
movaps 0 * SIZE(AO), %xmm8
|
movaps 0 * SIZE(AO), %xmm8
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm10, %xmm11
|
mulps %xmm10, %xmm11
|
||||||
|
@ -3003,7 +3003,7 @@
|
||||||
|
|
||||||
.L132:
|
.L132:
|
||||||
mulps %xmm8, %xmm9
|
mulps %xmm8, %xmm9
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movsd -30 * SIZE(AO), %xmm8
|
movsd -30 * SIZE(AO), %xmm8
|
||||||
|
@ -3150,7 +3150,7 @@
|
||||||
|
|
||||||
.L142:
|
.L142:
|
||||||
mulss %xmm8, %xmm9
|
mulss %xmm8, %xmm9
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||||
#endif
|
#endif
|
||||||
movss -31 * SIZE(AO), %xmm8
|
movss -31 * SIZE(AO), %xmm8
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define RPREFETCHSIZE (12 + 4)
|
#define RPREFETCHSIZE (12 + 4)
|
||||||
#define WPREFETCHSIZE (48 + 4)
|
#define WPREFETCHSIZE (48 + 4)
|
||||||
#define MOVNTQ MOVQ
|
#define MOVNTQ MOVQ
|
||||||
|
@ -79,7 +79,7 @@
|
||||||
#define AO3 %r13
|
#define AO3 %r13
|
||||||
#define AO4 %rax
|
#define AO4 %rax
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define RPREFETCH prefetch
|
#define RPREFETCH prefetch
|
||||||
#else
|
#else
|
||||||
#define RPREFETCH prefetch
|
#define RPREFETCH prefetch
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define RPREFETCHSIZE (12 + 4)
|
#define RPREFETCHSIZE (12 + 4)
|
||||||
#define WPREFETCHSIZE (12 + 4)
|
#define WPREFETCHSIZE (12 + 4)
|
||||||
#define MOVNTQ MOVQ
|
#define MOVNTQ MOVQ
|
||||||
|
@ -96,7 +96,7 @@
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define RPREFETCH prefetch
|
#define RPREFETCH prefetch
|
||||||
#else
|
#else
|
||||||
#define RPREFETCH prefetch
|
#define RPREFETCH prefetch
|
||||||
|
|
|
@ -469,7 +469,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L71:
|
.L71:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
prefetch PREFETCHSIZE * SIZE(X)
|
prefetch PREFETCHSIZE * SIZE(X)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -266,7 +266,7 @@
|
||||||
sarq $5, I
|
sarq $5, I
|
||||||
jle .L113
|
jle .L113
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
|
|
||||||
movaps %xmm0, %xmm1
|
movaps %xmm0, %xmm1
|
||||||
mulps -32 * SIZE(X), %xmm1
|
mulps -32 * SIZE(X), %xmm1
|
||||||
|
|
|
@ -251,7 +251,7 @@
|
||||||
sarq $4, I
|
sarq $4, I
|
||||||
jle .L113
|
jle .L113
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
|
|
||||||
movaps %xmm0, %xmm1
|
movaps %xmm0, %xmm1
|
||||||
mulpd -16 * SIZE(X), %xmm1
|
mulpd -16 * SIZE(X), %xmm1
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
|
@ -47,7 +46,7 @@
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define STACKSIZE 64
|
#define STACKSIZE 128
|
||||||
|
|
||||||
#define OLD_M %rdi
|
#define OLD_M %rdi
|
||||||
#define OLD_N %rsi
|
#define OLD_N %rsi
|
||||||
|
@ -57,6 +56,10 @@
|
||||||
#define STACK_Y 16 + STACKSIZE(%rsp)
|
#define STACK_Y 16 + STACKSIZE(%rsp)
|
||||||
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
||||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||||
|
#define MMM 56(%rsp)
|
||||||
|
#define NN 64(%rsp)
|
||||||
|
#define AA 72(%rsp)
|
||||||
|
#define LDAX 80(%rsp)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
@ -71,6 +74,10 @@
|
||||||
#define STACK_Y 72 + STACKSIZE(%rsp)
|
#define STACK_Y 72 + STACKSIZE(%rsp)
|
||||||
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
||||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||||
|
#define MMM 216(%rsp)
|
||||||
|
#define NN 224(%rsp)
|
||||||
|
#define AA 232(%rsp)
|
||||||
|
#define LDAX 240(%rsp)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -127,29 +134,48 @@
|
||||||
movups %xmm14, 192(%rsp)
|
movups %xmm14, 192(%rsp)
|
||||||
movups %xmm15, 208(%rsp)
|
movups %xmm15, 208(%rsp)
|
||||||
|
|
||||||
movq OLD_M, M
|
movq OLD_M, MMM
|
||||||
movq OLD_N, N
|
movq OLD_N, NN
|
||||||
movq OLD_A, A
|
movq OLD_A, X
|
||||||
movq OLD_LDA, LDA
|
movq X, AA
|
||||||
|
movq OLD_LDA, X
|
||||||
|
movq X, LDAX
|
||||||
movq OLD_X, X
|
movq OLD_X, X
|
||||||
#else
|
#else
|
||||||
movq OLD_M, M
|
movq OLD_M, MMM
|
||||||
movq OLD_N, N
|
movq OLD_N, NN
|
||||||
movq OLD_A, A
|
movq OLD_A, AA
|
||||||
movq OLD_LDA, LDA
|
movq OLD_LDA, LDAX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
movq STACK_INCX, INCX
|
|
||||||
movq STACK_Y, Y
|
|
||||||
movq STACK_INCY, INCY
|
|
||||||
movq STACK_BUFFER, BUFFER
|
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
pshufd $0, %xmm0, ALPHA
|
pshufd $0, %xmm0, ALPHA
|
||||||
#else
|
#else
|
||||||
pshufd $0, %xmm3, ALPHA
|
pshufd $0, %xmm3, ALPHA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
.L0t:
|
||||||
|
xorq M,M
|
||||||
|
addq $1,M
|
||||||
|
salq $22,M
|
||||||
|
subq M,MMM
|
||||||
|
jge .L00t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
movq MMM,%rax
|
||||||
|
addq M,%rax
|
||||||
|
jle .L999x
|
||||||
|
movq %rax,M
|
||||||
|
|
||||||
|
.L00t:
|
||||||
|
movq LDAX,LDA
|
||||||
|
movq NN,N
|
||||||
|
movq AA,A
|
||||||
|
movq STACK_INCX, INCX
|
||||||
|
movq STACK_Y, Y
|
||||||
|
movq STACK_INCY, INCY
|
||||||
|
movq STACK_BUFFER, BUFFER
|
||||||
|
|
||||||
leaq (,INCX, SIZE), INCX
|
leaq (,INCX, SIZE), INCX
|
||||||
leaq (,INCY, SIZE), INCY
|
leaq (,INCY, SIZE), INCY
|
||||||
leaq (,LDA, SIZE), LDA
|
leaq (,LDA, SIZE), LDA
|
||||||
|
@ -6341,6 +6367,12 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L999:
|
.L999:
|
||||||
|
leaq (,M,SIZE),%rax
|
||||||
|
addq %rax,AA
|
||||||
|
jmp .L0t
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999x:
|
||||||
movq 0(%rsp), %rbx
|
movq 0(%rsp), %rbx
|
||||||
movq 8(%rsp), %rbp
|
movq 8(%rsp), %rbp
|
||||||
movq 16(%rsp), %r12
|
movq 16(%rsp), %r12
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
|
|
|
@ -699,7 +699,7 @@
|
||||||
movsd -32 * SIZE(X), %xmm4
|
movsd -32 * SIZE(X), %xmm4
|
||||||
|
|
||||||
pshufd $0xb1, %xmm4, %xmm12
|
pshufd $0xb1, %xmm4, %xmm12
|
||||||
shufps $0x39, %xmm8, %xmm8
|
shufps $0x59, %xmm8, %xmm8
|
||||||
mulps %xmm8, %xmm4
|
mulps %xmm8, %xmm4
|
||||||
addps %xmm4, %xmm0
|
addps %xmm4, %xmm0
|
||||||
mulps %xmm8, %xmm12
|
mulps %xmm8, %xmm12
|
||||||
|
@ -1336,7 +1336,7 @@
|
||||||
|
|
||||||
movss %xmm9, %xmm8
|
movss %xmm9, %xmm8
|
||||||
pshufd $0xb1, %xmm4, %xmm12
|
pshufd $0xb1, %xmm4, %xmm12
|
||||||
shufps $0x93, %xmm8, %xmm8
|
shufps $0x03, %xmm8, %xmm8
|
||||||
mulps %xmm8, %xmm4
|
mulps %xmm8, %xmm4
|
||||||
addps %xmm4, %xmm0
|
addps %xmm4, %xmm0
|
||||||
mulps %xmm8, %xmm12
|
mulps %xmm8, %xmm12
|
||||||
|
@ -1697,7 +1697,7 @@
|
||||||
movsd -32 * SIZE(Y), %xmm4
|
movsd -32 * SIZE(Y), %xmm4
|
||||||
|
|
||||||
pshufd $0xb1, %xmm4, %xmm12
|
pshufd $0xb1, %xmm4, %xmm12
|
||||||
shufps $0x39, %xmm8, %xmm8
|
shufps $0xa9, %xmm8, %xmm8
|
||||||
mulps %xmm8, %xmm4
|
mulps %xmm8, %xmm4
|
||||||
addps %xmm4, %xmm0
|
addps %xmm4, %xmm0
|
||||||
mulps %xmm8, %xmm12
|
mulps %xmm8, %xmm12
|
||||||
|
@ -2024,7 +2024,7 @@
|
||||||
|
|
||||||
movss %xmm9, %xmm8
|
movss %xmm9, %xmm8
|
||||||
pshufd $0xb1, %xmm4, %xmm12
|
pshufd $0xb1, %xmm4, %xmm12
|
||||||
shufps $0x93, %xmm8, %xmm8
|
shufps $0x03, %xmm8, %xmm8
|
||||||
mulps %xmm8, %xmm4
|
mulps %xmm8, %xmm4
|
||||||
addps %xmm4, %xmm0
|
addps %xmm4, %xmm0
|
||||||
mulps %xmm8, %xmm12
|
mulps %xmm8, %xmm12
|
||||||
|
|
|
@ -85,7 +85,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define RPREFETCHSIZE 32
|
#define RPREFETCHSIZE 32
|
||||||
#define WPREFETCHSIZE 48
|
#define WPREFETCHSIZE 48
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -160,7 +160,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
@ -167,7 +167,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
@ -166,7 +166,7 @@
|
||||||
#define xt1 %xmm14
|
#define xt1 %xmm14
|
||||||
#define xt2 %xmm15
|
#define xt2 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
@ -166,7 +166,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define BORIG 72(%rsp)
|
#define BORIG 72(%rsp)
|
||||||
#define BUFFER 128(%rsp)
|
#define BUFFER 128(%rsp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -95,7 +95,7 @@
|
||||||
#define PREFETCHSIZE (8 * 6 + 4)
|
#define PREFETCHSIZE (8 * 6 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define BORIG 72(%rsp)
|
#define BORIG 72(%rsp)
|
||||||
#define BUFFER 128(%rsp)
|
#define BUFFER 128(%rsp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -95,7 +95,7 @@
|
||||||
#define PREFETCHSIZE (8 * 6 + 4)
|
#define PREFETCHSIZE (8 * 6 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define BORIG 72(%rsp)
|
#define BORIG 72(%rsp)
|
||||||
#define BUFFER 128(%rsp)
|
#define BUFFER 128(%rsp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -95,7 +95,7 @@
|
||||||
#define PREFETCHSIZE (8 * 6 + 4)
|
#define PREFETCHSIZE (8 * 6 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -74,6 +74,13 @@
|
||||||
#define ALIGNED_ACCESS
|
#define ALIGNED_ACCESS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef BULLDOZER
|
||||||
|
#define PREFETCH prefetch
|
||||||
|
#define PREFETCHW prefetchw
|
||||||
|
#define PREFETCHSIZE (128 * 5)
|
||||||
|
#define ALIGNED_ACCESS
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NANO
|
#ifdef NANO
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
|
|
|
@ -85,7 +85,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
#define ALIGNED_ACCESS
|
#define ALIGNED_ACCESS
|
||||||
#define MOVUPS_A movaps
|
#define MOVUPS_A movaps
|
||||||
#define MOVUPS_XL movaps
|
#define MOVUPS_XL movaps
|
||||||
|
|
|
@ -66,7 +66,9 @@ static FLOAT dm1 = -1.;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
|
#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
|
||||||
#define REAL_GEMM_R (GEMM_R - GEMM_PQ)
|
|
||||||
|
//leave some space for GEMM_ALIGN in sb2
|
||||||
|
#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ)
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
#define SHARED_ARRAY
|
#define SHARED_ARRAY
|
||||||
|
@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
||||||
sa,
|
sa,
|
||||||
sb2,
|
sb2,
|
||||||
a + (is + js * lda) * COMPSIZE, lda,
|
a + (is + js * lda) * COMPSIZE, lda,
|
||||||
- is + js);
|
is - js);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
2
make.inc
2
make.inc
|
@ -4,7 +4,7 @@ DRVOPTS = $(OPTS)
|
||||||
LOADER = $(FORTRAN)
|
LOADER = $(FORTRAN)
|
||||||
TIMER = NONE
|
TIMER = NONE
|
||||||
ARCHFLAGS= -ru
|
ARCHFLAGS= -ru
|
||||||
RANLIB = ranlib
|
#RANLIB = ranlib
|
||||||
BLASLIB =
|
BLASLIB =
|
||||||
TMGLIB = tmglib.a
|
TMGLIB = tmglib.a
|
||||||
EIGSRCLIB = eigsrc.a
|
EIGSRCLIB = eigsrc.a
|
||||||
|
|
|
@ -48,7 +48,8 @@ typedef int blasint;
|
||||||
/* C99 supports complex floating numbers natively, which GCC also offers as an
|
/* C99 supports complex floating numbers natively, which GCC also offers as an
|
||||||
extension since version 3.0. If neither are available, use a compatible
|
extension since version 3.0. If neither are available, use a compatible
|
||||||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
|
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
|
||||||
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
|
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
|
||||||
|
(__GNUC__ >= 3 && !defined(__cplusplus)))
|
||||||
#define OPENBLAS_COMPLEX_C99
|
#define OPENBLAS_COMPLEX_C99
|
||||||
#include <complex.h>
|
#include <complex.h>
|
||||||
typedef float _Complex openblas_complex_float;
|
typedef float _Complex openblas_complex_float;
|
||||||
|
|
2
param.h
2
param.h
|
@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||||
|
|
||||||
#define SNUMOPT 8
|
#define SNUMOPT 8
|
||||||
#define DNUMOPT 4
|
#define DNUMOPT 4
|
||||||
|
|
Loading…
Reference in New Issue