Merge branch 'develop'
This commit is contained in:
commit
e5ac3007e0
|
@ -1,4 +1,22 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.6
|
||||
2-Mar-2013
|
||||
common:
|
||||
* Improved OpenMP performance slightly. (d744c9)
|
||||
* Improved cblas.h compatibility with Intel MKL. (#185)
|
||||
* Fixed the overflow bug in single-threaded Cholesky factorization.
|
||||
* Fixed the buffer overflow bug in multithreaded hbmv and sbmv. (#174)
|
||||
|
||||
x86/x86-64:
|
||||
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
|
||||
We will tune the performance in the future.
|
||||
* Auto-detect Intel Xeon E7540.
|
||||
* Fixed the buffer overflow bug in gemv. (#173)
|
||||
* Fixed the s/cdot bug involving an invalid read of NaN on x86_64. (#189)
|
||||
|
||||
MIPS64:
|
||||
|
||||
====================================================================
|
||||
Version 0.2.5
|
||||
26-Nov-2012
|
||||
|
|
2
Makefile
2
Makefile
|
@ -314,7 +314,7 @@ clean ::
|
|||
#endif
|
||||
@$(MAKE) -C reference clean
|
||||
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
|
||||
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
|
||||
@if test -d $(NETLIB_LAPACK_DIR); then \
|
||||
echo deleting $(NETLIB_LAPACK_DIR); \
|
||||
rm -rf $(NETLIB_LAPACK_DIR) ;\
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# This is triggered by Makefile.system and runs before any of the code is built.
|
||||
|
||||
export BINARY
|
||||
export USE_OPENMP
|
||||
|
||||
|
@ -15,7 +17,7 @@ ifdef CPUIDEMU
|
|||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||
endif
|
||||
|
||||
all: getarch_2nd
|
||||
all: getarch_2nd cblas_noconst.h
|
||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
||||
|
@ -36,4 +38,7 @@ else
|
|||
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
||||
endif
|
||||
|
||||
cblas_noconst.h : cblas.h
|
||||
perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
|
||||
|
||||
dummy:
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.5
|
||||
VERSION = 0.2.6
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE
|
|||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
||||
# Generating Makefile.conf and config.h
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
|
||||
|
||||
ifndef TARGET_CORE
|
||||
include $(TOPDIR)/Makefile.conf
|
||||
|
@ -277,14 +277,14 @@ ifeq ($(ARCH), x86)
|
|||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||
ifneq ($(NO_AVX), 1)
|
||||
DYNAMIC_CORE += SANDYBRIDGE
|
||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), x86_64)
|
||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||
ifneq ($(NO_AVX), 1)
|
||||
DYNAMIC_CORE += SANDYBRIDGE
|
||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt
|
|||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
|
||||
|
||||
#### MIPS64:
|
||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2.
|
||||
|
|
|
@ -29,6 +29,7 @@ BARCELONA
|
|||
SHANGHAI
|
||||
ISTANBUL
|
||||
BOBCAT
|
||||
BULLDOZER
|
||||
|
||||
c)VIA CPU:
|
||||
SSE_GENERIC
|
||||
|
|
448
cblas.h
448
cblas.h
|
@ -1,291 +1,293 @@
|
|||
#ifndef CBLAS_H
|
||||
#define CBLAS_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
/* Assume C declarations for C++ */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#include <stddef.h>
|
||||
#include "common.h"
|
||||
|
||||
/*Set the number of threads at runtime.*/
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
void goto_set_num_threads(int num_threads);
|
||||
|
||||
/*Get the build configuration at runtime.*/
|
||||
char* openblas_get_config(void);
|
||||
|
||||
#define CBLAS_INDEX size_t
|
||||
|
||||
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
|
||||
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114};
|
||||
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
|
||||
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
|
||||
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
|
||||
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
|
||||
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
|
||||
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
||||
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
||||
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
||||
|
||||
float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy);
|
||||
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||
double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||
float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||
double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
|
||||
|
||||
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||
openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
|
||||
openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
|
||||
openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
|
||||
|
||||
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
|
||||
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
|
||||
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
|
||||
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
|
||||
void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
|
||||
void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
|
||||
void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
|
||||
void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
|
||||
|
||||
float cblas_sasum (blasint n, float *x, blasint incx);
|
||||
double cblas_dasum (blasint n, double *x, blasint incx);
|
||||
float cblas_scasum(blasint n, float *x, blasint incx);
|
||||
double cblas_dzasum(blasint n, double *x, blasint incx);
|
||||
float cblas_sasum (const blasint n, const float *x, const blasint incx);
|
||||
double cblas_dasum (const blasint n, const double *x, const blasint incx);
|
||||
float cblas_scasum(const blasint n, const float *x, const blasint incx);
|
||||
double cblas_dzasum(const blasint n, const double *x, const blasint incx);
|
||||
|
||||
float cblas_snrm2 (blasint N, float *X, blasint incX);
|
||||
double cblas_dnrm2 (blasint N, double *X, blasint incX);
|
||||
float cblas_scnrm2(blasint N, float *X, blasint incX);
|
||||
double cblas_dznrm2(blasint N, double *X, blasint incX);
|
||||
float cblas_snrm2 (const blasint N, const float *X, const blasint incX);
|
||||
double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
|
||||
float cblas_scnrm2(const blasint N, const float *X, const blasint incX);
|
||||
double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
|
||||
|
||||
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
|
||||
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
|
||||
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
|
||||
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
|
||||
CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx);
|
||||
CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
|
||||
CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx);
|
||||
CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
|
||||
|
||||
void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy);
|
||||
void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy);
|
||||
void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy);
|
||||
void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy);
|
||||
void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
|
||||
void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
|
||||
void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
|
||||
void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
|
||||
|
||||
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
|
||||
void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
|
||||
void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
|
||||
void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
|
||||
|
||||
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
|
||||
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
|
||||
void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
|
||||
void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
|
||||
void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
|
||||
void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
|
||||
|
||||
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
|
||||
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
|
||||
void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
|
||||
void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s);
|
||||
|
||||
void cblas_srotg(float *a, float *b, float *c, float *s);
|
||||
void cblas_drotg(double *a, double *b, double *c, double *s);
|
||||
|
||||
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
|
||||
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
|
||||
void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
|
||||
void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
|
||||
|
||||
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
|
||||
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
|
||||
void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
|
||||
void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
|
||||
|
||||
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
|
||||
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
|
||||
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
|
||||
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
|
||||
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
|
||||
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
|
||||
void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
|
||||
void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
|
||||
void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
|
||||
void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
|
||||
void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
|
||||
void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
|
||||
|
||||
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
||||
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
|
||||
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
||||
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
|
||||
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
||||
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
|
||||
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
|
||||
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
|
||||
void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||
const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy);
|
||||
void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||
const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy);
|
||||
void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||
const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy);
|
||||
void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
|
||||
const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy);
|
||||
|
||||
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
||||
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
||||
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
||||
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
||||
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
||||
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
||||
void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||
void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||
void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||
void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||
void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||
void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||
|
||||
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
|
||||
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
|
||||
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
|
||||
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
|
||||
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
|
||||
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
|
||||
void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
|
||||
void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
|
||||
void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
|
||||
void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
|
||||
|
||||
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
|
||||
blasint incX, float *Y, blasint incY, float *A, blasint lda);
|
||||
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
|
||||
blasint incX, double *Y, blasint incY, double *A, blasint lda);
|
||||
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
|
||||
float *Y, blasint incY, float *A, blasint lda);
|
||||
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
|
||||
double *Y, blasint incY, double *A, blasint lda);
|
||||
void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
|
||||
const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
|
||||
void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
|
||||
const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
|
||||
void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
|
||||
const float *Y, const blasint incY, float *A, const blasint lda);
|
||||
void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
|
||||
const double *Y, const blasint incY, double *A, const blasint lda);
|
||||
|
||||
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
||||
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
|
||||
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
||||
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
|
||||
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
||||
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
||||
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
|
||||
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
||||
void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||
const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||
void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||
const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||
void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||
const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||
void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
|
||||
const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||
|
||||
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
|
||||
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
|
||||
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
|
||||
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
|
||||
void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
|
||||
const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||
void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
|
||||
const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||
|
||||
|
||||
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
|
||||
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
|
||||
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
|
||||
void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
|
||||
void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
|
||||
|
||||
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, float *Ap, float *X, blasint incX);
|
||||
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, double *Ap, double *X, blasint incX);
|
||||
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, float *Ap, float *X, blasint incX);
|
||||
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, double *Ap, double *X, blasint incX);
|
||||
void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||
void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||
void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||
void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||
|
||||
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, float *Ap, float *X, blasint incX);
|
||||
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, double *Ap, double *X, blasint incX);
|
||||
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, float *Ap, float *X, blasint incX);
|
||||
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint N, double *Ap, double *X, blasint incX);
|
||||
void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||
void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||
void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const float *Ap, float *X, const blasint incX);
|
||||
void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
|
||||
const blasint N, const double *Ap, double *X, const blasint incX);
|
||||
|
||||
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
|
||||
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
|
||||
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
|
||||
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
|
||||
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
|
||||
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
||||
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
|
||||
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
||||
void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
|
||||
const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||
void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
|
||||
const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||
void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
|
||||
const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||
void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
|
||||
const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||
|
||||
|
||||
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
|
||||
float *X, blasint incX, float beta, float *Y, blasint incY);
|
||||
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
|
||||
double *X, blasint incX, double beta, double *Y, blasint incY);
|
||||
void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
|
||||
const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
|
||||
void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
|
||||
const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
|
||||
|
||||
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
|
||||
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
|
||||
void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
|
||||
void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
|
||||
|
||||
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
|
||||
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
|
||||
void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
|
||||
void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
|
||||
|
||||
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
|
||||
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
|
||||
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
|
||||
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
|
||||
void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
|
||||
void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
|
||||
void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
|
||||
void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
|
||||
|
||||
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
|
||||
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
||||
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
|
||||
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
||||
void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
|
||||
const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||
void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
|
||||
const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||
|
||||
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
|
||||
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
|
||||
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
|
||||
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
|
||||
void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
|
||||
const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
|
||||
void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
|
||||
const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
|
||||
|
||||
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
||||
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
||||
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
||||
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
||||
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
||||
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
|
||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
||||
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||
const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||
const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||
void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
|
||||
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
||||
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
||||
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
||||
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
||||
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
||||
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
||||
void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||
const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||
void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||
const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||
void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||
void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
|
||||
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
|
||||
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
|
||||
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
|
||||
void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
|
||||
void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
|
||||
void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
|
||||
void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
||||
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
||||
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
||||
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
|
||||
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
||||
void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||
void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||
void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||
void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
|
||||
const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
|
||||
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
|
||||
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
|
||||
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
|
||||
void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||
void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||
void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||
void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||
|
||||
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
|
||||
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
|
||||
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
|
||||
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
|
||||
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
|
||||
void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||
void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||
void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
|
||||
void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
|
||||
|
||||
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
|
||||
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
|
||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
|
||||
void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
|
||||
void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
|
||||
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
||||
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
|
||||
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
||||
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
|
||||
void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||
const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
|
||||
void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||
const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
||||
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
|
||||
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
|
||||
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
|
||||
void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
|
||||
void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
|
||||
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
|
||||
|
||||
void cblas_xerbla(blasint p, char *rout, char *form, ...);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif
|
||||
|
|
6
common.h
6
common.h
|
@ -390,7 +390,8 @@ typedef int blasint;
|
|||
/* C99 supports complex floating numbers natively, which GCC also offers as an
|
||||
extension since version 3.0. If neither are available, use a compatible
|
||||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
|
||||
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
|
||||
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
|
||||
(__GNUC__ >= 3 && !defined(__cplusplus)))
|
||||
#define OPENBLAS_COMPLEX_C99
|
||||
typedef float _Complex openblas_complex_float;
|
||||
typedef double _Complex openblas_complex_double;
|
||||
|
@ -557,7 +558,8 @@ typedef struct {
|
|||
#include "common_level3.h"
|
||||
#include "common_lapack.h"
|
||||
#ifdef CBLAS
|
||||
#include "cblas.h"
|
||||
/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
|
||||
#include "cblas_noconst.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
|
3
cpuid.h
3
cpuid.h
|
@ -125,7 +125,8 @@
|
|||
#define HAVE_MISALIGNSSE (1 << 15)
|
||||
#define HAVE_128BITFPU (1 << 16)
|
||||
#define HAVE_FASTMOVU (1 << 17)
|
||||
#define HAVE_AVX (1 << 18)
|
||||
#define HAVE_AVX (1 << 18)
|
||||
#define HAVE_FMA4 (1 << 19)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
|
|
32
cpuid_x86.c
32
cpuid_x86.c
|
@ -43,6 +43,8 @@
|
|||
#ifdef NO_AVX
|
||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||
#define CORE_BULLDOZER CORE_BARCELONA
|
||||
#endif
|
||||
|
||||
#ifndef CPUIDEMU
|
||||
|
@ -116,8 +118,9 @@ static inline int have_excpuid(void){
|
|||
|
||||
#ifndef NO_AVX
|
||||
static inline void xgetbv(int op, int * eax, int * edx){
|
||||
//Use binary code for xgetbv
|
||||
__asm__ __volatile__
|
||||
("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -228,6 +231,9 @@ int get_cputype(int gettype){
|
|||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
|
||||
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
|
||||
#ifndef NO_AVX
|
||||
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
|
||||
#endif
|
||||
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
|
||||
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
|
||||
}
|
||||
|
@ -1030,6 +1036,8 @@ int get_cpuname(void){
|
|||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14:
|
||||
// Xeon E7540
|
||||
case 15:
|
||||
//Xeon Processor E7 (Westmere-EX)
|
||||
return CPUTYPE_NEHALEM;
|
||||
|
@ -1075,8 +1083,12 @@ int get_cpuname(void){
|
|||
return CPUTYPE_OPTERON;
|
||||
case 1:
|
||||
case 10:
|
||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
return CPUTYPE_BARCELONA;
|
||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
if(support_avx())
|
||||
return CPUTYPE_BULLDOZER;
|
||||
else
|
||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||
case 5:
|
||||
return CPUTYPE_BOBCAT;
|
||||
}
|
||||
|
@ -1398,6 +1410,8 @@ int get_coretype(void){
|
|||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM; //OS doesn't support AVX
|
||||
case 14:
|
||||
//Xeon E7540
|
||||
case 15:
|
||||
//Xeon Processor E7 (Westmere-EX)
|
||||
return CORE_NEHALEM;
|
||||
|
@ -1427,8 +1441,13 @@ int get_coretype(void){
|
|||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
else return CORE_BARCELONA;
|
||||
else if (exfamily == 6) {
|
||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
if(support_avx())
|
||||
return CORE_BULLDOZER;
|
||||
else
|
||||
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
|
||||
}else return CORE_BARCELONA;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1494,6 +1513,9 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_SIZE %d\n", info.size * 1024);
|
||||
printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
|
||||
} else {
|
||||
//fall back for some virtual machines.
|
||||
printf("#define DTB_DEFAULT_ENTRIES 32\n");
|
||||
}
|
||||
|
||||
features = get_cputype(GET_FEATURE);
|
||||
|
@ -1511,6 +1533,7 @@ void get_cpuconfig(void){
|
|||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
||||
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
||||
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
||||
|
@ -1577,5 +1600,6 @@ void get_sse(void){
|
|||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||
|
||||
}
|
||||
|
|
|
@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
|
||||
a = (FLOAT *)args -> a;
|
||||
x = (FLOAT *)args -> b;
|
||||
y = (FLOAT *)args -> c;
|
||||
|
||||
lda = args -> lda;
|
||||
incx = args -> ldb;
|
||||
|
@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
n_from = 0;
|
||||
n_to = n;
|
||||
|
||||
//Use the first n * COMPSIZE elements of each thread's sb buffer as that thread's y
|
||||
y = buffer;
|
||||
buffer += ((COMPSIZE * n + 1023) & ~1023);
|
||||
|
||||
if (range_m) {
|
||||
n_from = *(range_m + 0);
|
||||
n_to = *(range_m + 1);
|
||||
|
@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
a += n_from * lda * COMPSIZE;
|
||||
}
|
||||
|
||||
if (range_n) y += *range_n * COMPSIZE;
|
||||
|
||||
if (incx != 1) {
|
||||
COPY_K(n, x, incx, buffer, 1);
|
||||
|
@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
|||
|
||||
if (num_cpu) {
|
||||
queue[0].sa = NULL;
|
||||
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
|
||||
queue[0].sb = buffer;
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
exec_blas(num_cpu, queue);
|
||||
|
@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
|||
#else
|
||||
ONE, ZERO,
|
||||
#endif
|
||||
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
|
||||
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
|
||||
}
|
||||
|
||||
AXPYU_K(n, 0, 0,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
TOPDIR = ../..
|
||||
include ../../Makefile.system
|
||||
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
|
||||
|
||||
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||
|
||||
|
@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
|
|||
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_get_config.$(SUFFIX) : openblas_get_config.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
|
|
@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
|
|||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
}
|
||||
}
|
||||
queue->sb=sb;
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
|
|
|
@ -49,8 +49,12 @@
|
|||
|
||||
int blas_server_avail = 0;
|
||||
|
||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
int i=0;
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||
|
@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) {
|
|||
blas_cpu_number = num_threads;
|
||||
|
||||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
|
||||
//adjust buffer for each thread
|
||||
for(i=0; i<blas_cpu_number; i++){
|
||||
if(blas_thread_buffer[i]==NULL){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
}
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
}
|
||||
}
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
|
@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
|
|||
|
||||
/*
 * Initialize the BLAS thread layer.
 *
 * Calls blas_get_cpu_number(), marks the server as available, and fills the
 * blas_thread_buffer table: slots [0, blas_num_threads) each receive a buffer
 * from blas_memory_alloc(2), the remaining slots up to MAX_CPU_NUMBER are set
 * to NULL.
 *
 * Always returns 0.
 */
int blas_thread_init(void){
|
||||
|
||||
int i=0;
|
||||
|
||||
// NOTE(review): presumably this is what sets blas_num_threads used below -- confirm.
blas_get_cpu_number();
|
||||
|
||||
blas_server_avail = 1;
|
||||
|
||||
// One buffer per thread slot. The return value of blas_memory_alloc is not checked here.
// NOTE(review): nothing in this function bounds blas_num_threads by MAX_CPU_NUMBER;
// presumably blas_get_cpu_number() guarantees it -- confirm.
for(i=0; i<blas_num_threads; i++){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
}
|
||||
// Continue from the same index: every slot that did not get a buffer is marked empty.
for(; i<MAX_CPU_NUMBER; i++){
|
||||
blas_thread_buffer[i]=NULL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Shut down the BLAS thread layer.
 *
 * Marks the server as unavailable and releases every non-NULL entry of
 * blas_thread_buffer with blas_memory_free(), resetting the slot to NULL.
 *
 * Always returns 0.
 */
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
int i=0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
// Walk the whole table, not just the active thread count, so no slot is left allocated.
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
// Reset the slot so a later pass over the table sees it as empty.
blas_thread_buffer[i]=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
static void exec_threads(blas_queue_t *queue){
|
||||
|
||||
void *buffer, *sa, *sb;
|
||||
|
||||
int pos=0, release_flag=0;
|
||||
|
||||
buffer = NULL;
|
||||
sa = queue -> sa;
|
||||
sb = queue -> sb;
|
||||
|
@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){
|
|||
|
||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||
|
||||
buffer = blas_memory_alloc(2);
|
||||
pos = omp_get_thread_num();
|
||||
buffer = blas_thread_buffer[pos];
|
||||
|
||||
//fallback
|
||||
if(buffer==NULL) {
|
||||
buffer = blas_memory_alloc(2);
|
||||
release_flag=1;
|
||||
}
|
||||
|
||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
|
||||
|
@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){
|
|||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
}
|
||||
}
|
||||
queue->sb=sb;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){
|
|||
|
||||
}
|
||||
|
||||
if (buffer != NULL) blas_memory_free(buffer);
|
||||
if (release_flag) blas_memory_free(buffer);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
|||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
}
|
||||
}
|
||||
queue->sb=sb;
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
|
@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads)
|
|||
void openblas_set_num_threads(int num)
|
||||
{
|
||||
goto_set_num_threads(num);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA;
|
|||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#ifndef NO_AVX
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -78,8 +80,9 @@ extern gotoblas_t gotoblas_SANDYBRIDGE;
|
|||
|
||||
#ifndef NO_AVX
|
||||
static inline void xgetbv(int op, int * eax, int * edx){
|
||||
//Use binary code for xgetbv
|
||||
__asm__ __volatile__
|
||||
("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -163,7 +166,8 @@ static gotoblas_t *get_coretype(void){
|
|||
|
||||
//Intel Xeon Processor 5600 (Westmere-EP)
|
||||
//Xeon Processor E7 (Westmere-EX)
|
||||
if (model == 12 || model == 15) return &gotoblas_NEHALEM;
|
||||
//Xeon E7540
|
||||
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||
//Intel Core i7-3000 / Xeon E5
|
||||
|
@ -171,7 +175,7 @@ static gotoblas_t *get_coretype(void){
|
|||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
|
||||
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
|
@ -182,7 +186,7 @@ static gotoblas_t *get_coretype(void){
|
|||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
|
||||
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
|
@ -202,6 +206,14 @@ static gotoblas_t *get_coretype(void){
|
|||
else return &gotoblas_OPTERON;
|
||||
} else if (exfamily == 5) {
|
||||
return &gotoblas_BOBCAT;
|
||||
} else if (exfamily == 6) {
|
||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
if(support_avx())
|
||||
return &gotoblas_BULLDOZER;
|
||||
else{
|
||||
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
} else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
|
@ -238,6 +250,7 @@ static char *corename[] = {
|
|||
"Nano",
|
||||
"Sandybridge",
|
||||
"Bobcat",
|
||||
"Bulldozer",
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
@ -259,6 +272,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||
|
||||
return corename[0];
|
||||
}
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||
be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Build-configuration string, assembled at compile time.
 *
 * It starts as the empty literal ""; each #ifdef below appends one adjacent
 * string literal (the option name followed by a single space) when the
 * corresponding macro is defined, and the compiler concatenates them.
 * With none of the macros defined the result is the empty string.
 */
static char* openblas_config_str=""
|
||||
#ifdef USE64BITINT
|
||||
"USE64BITINT "
|
||||
#endif
|
||||
#ifdef NO_CBLAS
|
||||
"NO_CBLAS "
|
||||
#endif
|
||||
#ifdef NO_LAPACK
|
||||
"NO_LAPACK "
|
||||
#endif
|
||||
#ifdef NO_LAPACKE
|
||||
"NO_LAPACKE "
|
||||
#endif
|
||||
#ifdef DYNAMIC_ARCH
|
||||
"DYNAMIC_ARCH "
|
||||
#endif
|
||||
#ifdef NO_AFFINITY
|
||||
"NO_AFFINITY "
|
||||
#endif
|
||||
/* Terminates the declaration started above. */
;
|
||||
|
||||
/*
 * Return the build-configuration string.
 *
 * The returned pointer refers to the file-static openblas_config_str, which
 * points at a string literal: callers must treat it as read-only.
 *
 * NOTE(review): CNAME is a macro defined outside this file; presumably it
 * expands to the exported openblas_get_config symbol -- confirm.
 */
char* CNAME() {
|
||||
return openblas_config_str;
|
||||
}
|
||||
|
|
@ -163,7 +163,7 @@ int get_L2_size(void){
|
|||
|
||||
int eax, ebx, ecx, edx;
|
||||
|
||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
|
||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
|
||||
|
||||
|
|
|
@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT)
|
|||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
EXTRALIB += -lgfortran
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
EXTRALIB += -lgomp
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
|
|
|
@ -74,6 +74,7 @@
|
|||
|
||||
@misc_no_underscore_objs = (
|
||||
openblas_set_num_threads, goto_set_num_threads,
|
||||
openblas_get_config,
|
||||
);
|
||||
|
||||
@misc_underscore_objs = (
|
||||
|
|
18
getarch.c
18
getarch.c
|
@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "OPTERON"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
|
||||
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
|
@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "BOBCAT"
|
||||
#endif
|
||||
|
||||
#if defined (FORCE_BULLDOZER)
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "BULLDOZER"
|
||||
#define ARCHCONFIG "-DBULLDOZER " \
|
||||
"-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
|
||||
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
|
||||
"-DHAVE_AVX -DHAVE_FMA4"
|
||||
#define LIBNAME "bulldozer"
|
||||
#define CORENAME "BULLDOZER"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_SSE_GENERIC
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
|
|
|
@ -34,7 +34,7 @@ int main(int argc, char **argv) {
|
|||
#ifdef USE64BITINT
|
||||
printf("#define USE64BITINT\n");
|
||||
#endif
|
||||
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
|
||||
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -810,6 +810,22 @@ static void init_parameter(void) {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef BULLDOZER
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "Bulldozer\n");
|
||||
#endif
|
||||
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
#ifdef EXPRECISION
|
||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef NANO
|
||||
|
||||
#ifdef DEBUG
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
# Kernel source selection list: maps each GEMM / TRSM / GEMM3M kernel slot to
# the source file that implements it, and names the object files produced by
# the copy routines.
# NOTE(review): this file is added by the commit that introduces BULLDOZER
# support, so it is presumably the Bulldozer kernel list -- confirm the path.
# Every GEMM and GEMM3M kernel named here is a *_barcelona.S source and every
# TRSM kernel is a *_sse.S / *_sse2.S source; no Bulldozer-specific source
# file is referenced.

# SGEMM: 4x4 kernel. The IN/IT copy slots (and their objects) are left empty;
# only the ON/OT copy routines are set.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# DGEMM: 2x4 kernel with separate IN/IT (width 2) and ON/OT (width 4) copy routines.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# CGEMM: 2x2 kernel. As for SGEMM, the IN/IT copy slots are left empty.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||
CGEMMINCOPY =
|
||||
CGEMMITCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ =
|
||||
CGEMMITCOPYOBJ =
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# ZGEMM: 1x2 kernel with separate IN/IT (width 1) and ON/OT (width 2) copy routines.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
# TRSM kernels. In every group below the RN slot points at the LT source file.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
|
||||
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
|
||||
|
||||
# NOTE(review): unlike the three groups above, the LN slot here also points at
# the LT source file -- confirm this is intentional.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
|
||||
|
||||
# GEMM3M kernels for the two complex precisions.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
|
|
@ -596,7 +596,7 @@
|
|||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 4 * SIZE(BB), %xmm2
|
||||
|
@ -842,7 +842,7 @@
|
|||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
|
@ -1168,7 +1168,7 @@
|
|||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
|
@ -1198,7 +1198,7 @@
|
|||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
|
@ -1347,7 +1347,7 @@
|
|||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
|
@ -1531,7 +1531,7 @@
|
|||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
|
@ -1778,7 +1778,7 @@
|
|||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
|
@ -1793,7 +1793,7 @@
|
|||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
|
@ -1924,7 +1924,7 @@
|
|||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
|
@ -2069,7 +2069,7 @@
|
|||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
|
|
|
@ -89,17 +89,22 @@
|
|||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 16
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA 16 + STACKSIZE(%esp)
|
||||
#define A 20 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 24 + STACKSIZE(%esp)
|
||||
#define STACK_X 28 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 32 + STACKSIZE(%esp)
|
||||
#define Y 36 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 40 + STACKSIZE(%esp)
|
||||
#define BUFFER 44 + STACKSIZE(%esp)
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||
#define A 20 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define YY 4+ARGS(%esp)
|
||||
#define AA 8+ARGS(%esp)
|
||||
#define LDAX 12+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
|
@ -114,6 +119,7 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
|
@ -121,7 +127,34 @@
|
|||
|
||||
PROFCODE
|
||||
|
||||
movl Y,J
|
||||
movl J,YY # backup Y
|
||||
movl A,J
|
||||
movl J,AA # backup A
|
||||
movl M,J
|
||||
movl J,MMM # backup MM
|
||||
.L0t:
|
||||
xorl J,J
|
||||
addl $1,J
|
||||
sall $21,J
|
||||
subl J,MMM
|
||||
movl J,M
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movl MMM,%eax
|
||||
addl J,%eax
|
||||
jle .L999x
|
||||
movl %eax,M
|
||||
|
||||
.L00t:
|
||||
movl AA,%eax
|
||||
movl %eax,A
|
||||
|
||||
movl YY,J
|
||||
movl J,Y
|
||||
movl STACK_LDA, LDA
|
||||
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
|
||||
|
@ -651,12 +684,22 @@
|
|||
addss 0 * SIZE(X), %xmm0
|
||||
movss %xmm0, (Y1)
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl YY,J
|
||||
addl %eax,J
|
||||
movl J,YY
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $ARGS,%esp
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
|
|
@ -76,17 +76,22 @@
|
|||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 16
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA 16 + STACKSIZE(%esp)
|
||||
#define A 24 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
||||
#define STACK_X 32 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
||||
#define Y 40 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
||||
#define BUFFER 48 + STACKSIZE(%esp)
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+ARGS(%esp)
|
||||
#define YY 4+ARGS(%esp)
|
||||
#define AA 8+ARGS(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
|
@ -101,6 +106,8 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
|
@ -108,6 +115,33 @@
|
|||
|
||||
PROFCODE
|
||||
|
||||
movl Y,J
|
||||
movl J,YY # backup Y
|
||||
movl A,J
|
||||
movl J,AA # backup A
|
||||
movl M,J
|
||||
movl J,MMM # backup MM
|
||||
.L0t:
|
||||
xorl J,J
|
||||
addl $1,J
|
||||
sall $20,J
|
||||
subl J,MMM
|
||||
movl J,M
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movl MMM,%eax
|
||||
addl J,%eax
|
||||
jle .L999x
|
||||
movl %eax,M
|
||||
|
||||
.L00t:
|
||||
movl AA,%eax
|
||||
movl %eax,A
|
||||
|
||||
movl YY,J
|
||||
movl J,Y
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
|
@ -677,10 +711,22 @@
|
|||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl YY,J
|
||||
addl %eax,J
|
||||
movl J,YY
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
addl $ARGS,%esp
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
|
|
@ -89,17 +89,24 @@
|
|||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 20
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA 16 + STACKSIZE(%esp)
|
||||
#define A 20 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 24 + STACKSIZE(%esp)
|
||||
#define STACK_X 28 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 32 + STACKSIZE(%esp)
|
||||
#define Y 36 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 40 + STACKSIZE(%esp)
|
||||
#define BUFFER 44 + STACKSIZE(%esp)
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||
#define A 20 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 36 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+STACKSIZE(%esp)
|
||||
#define NN 4+STACKSIZE(%esp)
|
||||
#define AA 8+STACKSIZE(%esp)
|
||||
#define LDAX 12+STACKSIZE(%esp)
|
||||
#define XX 16+STACKSIZE(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
|
@ -114,6 +121,7 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
|
@ -122,7 +130,42 @@
|
|||
PROFCODE
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl LDA,LDAX # backup LDA
|
||||
movl STACK_X, X
|
||||
movl X,XX
|
||||
movl N,J
|
||||
movl J,NN # backup N
|
||||
movl A,J
|
||||
movl J,AA # backup A
|
||||
movl M,J
|
||||
movl J,MMM # mov M to MMM
|
||||
.L0t:
|
||||
xorl J,J
|
||||
addl $1,J
|
||||
sall $22,J # J=2^22; J*sizeof(float)=buffer size(16MB)
|
||||
subl $8, J # Don't use last 8 float in the buffer.
|
||||
# Now, split M by block J
|
||||
subl J,MMM # MMM=MMM-J
|
||||
movl J,M
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movl MMM,%eax
|
||||
addl J,%eax
|
||||
jle .L999x
|
||||
movl %eax,M
|
||||
|
||||
.L00t:
|
||||
movl AA,%eax
|
||||
movl %eax,A # mov AA to A
|
||||
|
||||
movl NN,%eax
|
||||
movl %eax,N # reset N
|
||||
|
||||
|
||||
movl LDAX, LDA # reset LDA
|
||||
movl XX,X
|
||||
|
||||
movl STACK_INCX, INCX
|
||||
movl STACK_INCY, INCY
|
||||
|
||||
|
@ -198,6 +241,20 @@
|
|||
jg .L06
|
||||
ALIGN_4
|
||||
|
||||
//Pad with zeros so that stale values are never loaded from the buffer.
|
||||
movl M, I
|
||||
movl $8, J
|
||||
andl $7, I
|
||||
xorps %xmm0, %xmm0
|
||||
subl I, J
|
||||
ALIGN_2
|
||||
.L07:
|
||||
movss %xmm0, 0 * SIZE(Y1)
|
||||
addl $SIZE, Y1
|
||||
decl J
|
||||
jg .L07
|
||||
ALIGN_4
|
||||
|
||||
.L10:
|
||||
movl Y, Y1
|
||||
|
||||
|
@ -628,10 +685,22 @@
|
|||
ALIGN_4
|
||||
|
||||
.L999:
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
movl XX,J
|
||||
addl %eax,J
|
||||
movl J,XX
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
|
||||
addl $ARGS,%esp
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
|
|
@ -76,18 +76,24 @@
|
|||
#endif
|
||||
|
||||
#define STACKSIZE 16
|
||||
#define ARGS 16
|
||||
|
||||
#define M 4 + STACKSIZE+ARGS(%esp)
|
||||
#define N 8 + STACKSIZE+ARGS(%esp)
|
||||
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
|
||||
#define A 24 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
|
||||
#define Y 40 + STACKSIZE+ARGS(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
|
||||
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
|
||||
|
||||
#define MMM 0+STACKSIZE(%esp)
|
||||
#define AA 4+STACKSIZE(%esp)
|
||||
#define LDAX 8+STACKSIZE(%esp)
|
||||
#define NN 12+STACKSIZE(%esp)
|
||||
|
||||
#define M 4 + STACKSIZE(%esp)
|
||||
#define N 8 + STACKSIZE(%esp)
|
||||
#define ALPHA 16 + STACKSIZE(%esp)
|
||||
#define A 24 + STACKSIZE(%esp)
|
||||
#define STACK_LDA 28 + STACKSIZE(%esp)
|
||||
#define STACK_X 32 + STACKSIZE(%esp)
|
||||
#define STACK_INCX 36 + STACKSIZE(%esp)
|
||||
#define Y 40 + STACKSIZE(%esp)
|
||||
#define STACK_INCY 44 + STACKSIZE(%esp)
|
||||
#define BUFFER 48 + STACKSIZE(%esp)
|
||||
|
||||
#define I %eax
|
||||
#define J %ebx
|
||||
|
||||
|
@ -101,6 +107,8 @@
|
|||
|
||||
PROLOGUE
|
||||
|
||||
subl $ARGS,%esp
|
||||
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
pushl %esi
|
||||
|
@ -108,7 +116,40 @@
|
|||
|
||||
PROFCODE
|
||||
|
||||
|
||||
movl STACK_LDA, LDA
|
||||
movl LDA,LDAX # backup LDA
|
||||
movl N,J
|
||||
movl J,NN # backup N
|
||||
movl A,J
|
||||
movl J,AA # backup A
|
||||
movl M,J
|
||||
movl J,MMM # mov M to MMM
|
||||
.L0t:
|
||||
xorl J,J
|
||||
addl $1,J
|
||||
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
|
||||
subl $4, J # Don't use last 4 double in the buffer.
|
||||
# Now, split M by block J
|
||||
subl J,MMM # MMM=MMM-J
|
||||
movl J,M
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movl MMM,%eax
|
||||
addl J,%eax
|
||||
jle .L999x
|
||||
movl %eax,M
|
||||
|
||||
.L00t:
|
||||
movl AA,%eax
|
||||
movl %eax,A # mov AA to A
|
||||
|
||||
movl NN,%eax
|
||||
movl %eax,N # reset N
|
||||
|
||||
|
||||
movl LDAX, LDA # reset LDA
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
movl STACK_INCY, INCY
|
||||
|
@ -117,6 +158,7 @@
|
|||
leal (,INCY, SIZE), INCY
|
||||
leal (,LDA, SIZE), LDA
|
||||
|
||||
|
||||
subl $-16 * SIZE, A
|
||||
|
||||
cmpl $0, N
|
||||
|
@ -560,10 +602,19 @@
|
|||
ALIGN_4
|
||||
|
||||
.L999:
|
||||
movl M,J
|
||||
leal (,J,SIZE),%eax
|
||||
addl %eax,AA
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
popl %edi
|
||||
popl %ebp
|
||||
|
||||
addl $ARGS,%esp
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
|
|
@ -269,7 +269,7 @@
|
|||
sarl $5, I
|
||||
jle .L113
|
||||
|
||||
#if defined(BARCELONA)
|
||||
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||
|
||||
movaps %xmm0, %xmm1
|
||||
mulps -32 * SIZE(X), %xmm1
|
||||
|
|
|
@ -253,7 +253,7 @@
|
|||
sarl $4, I
|
||||
jle .L113
|
||||
|
||||
#if defined(BARCELONA)
|
||||
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||
|
||||
movaps %xmm0, %xmm1
|
||||
mulpd -16 * SIZE(X), %xmm1
|
||||
|
|
|
@ -69,7 +69,7 @@
|
|||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHSIZE (8 * 10 + 4)
|
||||
#endif
|
||||
|
@ -439,7 +439,7 @@
|
|||
.L22:
|
||||
mulsd %xmm0, %xmm2
|
||||
addsd %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movlpd 2 * SIZE(BB), %xmm2
|
||||
|
@ -488,7 +488,7 @@
|
|||
movlpd 40 * SIZE(BB), %xmm3
|
||||
addsd %xmm0, %xmm7
|
||||
movlpd 8 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulsd %xmm1, %xmm2
|
||||
|
@ -1697,7 +1697,7 @@
|
|||
|
||||
.L42:
|
||||
mulpd %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulpd 2 * SIZE(BB), %xmm0
|
||||
|
@ -1727,7 +1727,7 @@
|
|||
addpd %xmm0, %xmm7
|
||||
movapd 16 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulpd %xmm1, %xmm2
|
||||
|
|
|
@ -64,7 +64,7 @@
|
|||
#define BORIG 60(%esp)
|
||||
#define BUFFER 128(%esp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
|
@ -437,7 +437,7 @@
|
|||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
|
@ -833,7 +833,7 @@
|
|||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(BB), %xmm2
|
||||
|
@ -1848,7 +1848,7 @@
|
|||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
|
@ -2109,7 +2109,7 @@
|
|||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
|
@ -2429,7 +2429,7 @@
|
|||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
|
@ -2459,7 +2459,7 @@
|
|||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
|
@ -2952,7 +2952,7 @@
|
|||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
|
@ -3148,7 +3148,7 @@
|
|||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
|
@ -3389,7 +3389,7 @@
|
|||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
|
@ -3404,7 +3404,7 @@
|
|||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
|
|
|
@ -69,7 +69,7 @@
|
|||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHSIZE (8 * 10 + 4)
|
||||
#endif
|
||||
|
@ -910,7 +910,7 @@
|
|||
.L22:
|
||||
mulsd %xmm0, %xmm2
|
||||
addsd %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movlpd 2 * SIZE(BB), %xmm2
|
||||
|
@ -959,7 +959,7 @@
|
|||
movlpd 40 * SIZE(BB), %xmm3
|
||||
addsd %xmm0, %xmm7
|
||||
movlpd 8 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulsd %xmm1, %xmm2
|
||||
|
@ -1439,7 +1439,7 @@
|
|||
|
||||
.L42:
|
||||
mulpd %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulpd 2 * SIZE(BB), %xmm0
|
||||
|
@ -1469,7 +1469,7 @@
|
|||
addpd %xmm0, %xmm7
|
||||
movapd 16 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulpd %xmm1, %xmm2
|
||||
|
|
|
@ -64,7 +64,7 @@
|
|||
#define BORIG 60(%esp)
|
||||
#define BUFFER 128(%esp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
|
@ -872,7 +872,7 @@
|
|||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(BB), %xmm2
|
||||
|
@ -1316,7 +1316,7 @@
|
|||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
|
@ -1855,7 +1855,7 @@
|
|||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
|
@ -1885,7 +1885,7 @@
|
|||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
|
@ -2249,7 +2249,7 @@
|
|||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
|
@ -2562,7 +2562,7 @@
|
|||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
|
@ -2957,7 +2957,7 @@
|
|||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
|
@ -2972,7 +2972,7 @@
|
|||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
|
@ -3280,7 +3280,7 @@
|
|||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
|
@ -3515,7 +3515,7 @@
|
|||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
|
|
|
@ -69,7 +69,7 @@
|
|||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHSIZE (8 * 10 + 4)
|
||||
#endif
|
||||
|
@ -1036,7 +1036,7 @@
|
|||
|
||||
.L42:
|
||||
mulpd %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulpd 2 * SIZE(BB), %xmm0
|
||||
|
@ -1066,7 +1066,7 @@
|
|||
addpd %xmm0, %xmm7
|
||||
movapd 16 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulpd %xmm1, %xmm2
|
||||
|
@ -2224,7 +2224,7 @@
|
|||
.L22:
|
||||
mulsd %xmm0, %xmm2
|
||||
addsd %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movlpd 2 * SIZE(BB), %xmm2
|
||||
|
@ -2273,7 +2273,7 @@
|
|||
movlpd 40 * SIZE(BB), %xmm3
|
||||
addsd %xmm0, %xmm7
|
||||
movlpd 8 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||
#endif
|
||||
mulsd %xmm1, %xmm2
|
||||
|
|
|
@ -64,7 +64,7 @@
|
|||
#define BORIG 60(%esp)
|
||||
#define BUFFER 128(%esp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
|
@ -439,7 +439,7 @@
|
|||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
|
@ -454,7 +454,7 @@
|
|||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
|
@ -758,7 +758,7 @@
|
|||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
|
@ -993,7 +993,7 @@
|
|||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
|
@ -1324,7 +1324,7 @@
|
|||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
|
@ -1354,7 +1354,7 @@
|
|||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
|
@ -1718,7 +1718,7 @@
|
|||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
|
@ -2031,7 +2031,7 @@
|
|||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
|
@ -2859,7 +2859,7 @@
|
|||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(BB), %xmm2
|
||||
|
@ -3303,7 +3303,7 @@
|
|||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
|
|
|
@ -74,7 +74,7 @@
|
|||
#define BB %ecx
|
||||
#define LDC %ebp
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
|
@ -625,7 +625,7 @@
|
|||
.L22:
|
||||
mulps %xmm0, %xmm2
|
||||
addps %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 4 * SIZE(BB), %xmm2
|
||||
|
@ -870,7 +870,7 @@
|
|||
.L32:
|
||||
mulss %xmm0, %xmm2
|
||||
addss %xmm2, %xmm4
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 4 * SIZE(BB), %xmm2
|
||||
|
@ -1173,7 +1173,7 @@
|
|||
|
||||
.L52:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulps 4 * SIZE(BB), %xmm0
|
||||
|
@ -1203,7 +1203,7 @@
|
|||
addps %xmm0, %xmm5
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm2
|
||||
|
@ -1359,7 +1359,7 @@
|
|||
ALIGN_4
|
||||
|
||||
.L62:
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
|
||||
|
@ -1536,7 +1536,7 @@
|
|||
|
||||
.L72:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
mulss 4 * SIZE(BB), %xmm0
|
||||
|
@ -1794,7 +1794,7 @@
|
|||
|
||||
.L92:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movaps 4 * SIZE(AA), %xmm0
|
||||
|
@ -1809,7 +1809,7 @@
|
|||
mulps 12 * SIZE(BB), %xmm0
|
||||
addps %xmm0, %xmm7
|
||||
movaps 32 * SIZE(AA), %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
mulps %xmm1, %xmm3
|
||||
|
@ -1936,7 +1936,7 @@
|
|||
|
||||
.L102:
|
||||
mulps %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movsd 2 * SIZE(AA), %xmm0
|
||||
|
@ -2069,7 +2069,7 @@
|
|||
|
||||
.L112:
|
||||
mulss %xmm0, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||
#endif
|
||||
movss 1 * SIZE(AA), %xmm0
|
||||
|
|
|
@ -71,7 +71,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#ifdef BARCELONA
|
||||
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetchnta
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 5)
|
||||
|
|
|
@ -58,7 +58,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#ifdef BARCELONA
|
||||
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetchnta
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (8 * 5)
|
||||
|
|
|
@ -71,7 +71,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#ifdef BARCELONA
|
||||
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetchnta
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 5)
|
||||
|
|
|
@ -58,7 +58,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#ifdef BARCELONA
|
||||
#if defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetchnta
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (8 * 5)
|
||||
|
|
|
@ -75,7 +75,7 @@
|
|||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
#define WPREFETCHSIZE 112
|
||||
#define PREFETCH prefetch
|
||||
|
@ -533,7 +533,7 @@
|
|||
addps %xmm0, %xmm7
|
||||
movsd 16 * SIZE(AA), %xmm0
|
||||
mulps %xmm1, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
addps %xmm2, %xmm4
|
||||
|
|
|
@ -75,7 +75,7 @@
|
|||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
#define WPREFETCHSIZE 112
|
||||
#define PREFETCH prefetch
|
||||
|
@ -994,7 +994,7 @@
|
|||
addps %xmm0, %xmm7
|
||||
movsd 16 * SIZE(AA), %xmm0
|
||||
mulps %xmm1, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
addps %xmm2, %xmm4
|
||||
|
|
|
@ -75,7 +75,7 @@
|
|||
#define STACK_ALIGN 4096
|
||||
#define STACK_OFFSET 1024
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCHSIZE (16 * 10 + 8)
|
||||
#define WPREFETCHSIZE 112
|
||||
#define PREFETCH prefetch
|
||||
|
@ -1820,7 +1820,7 @@
|
|||
addps %xmm0, %xmm7
|
||||
movsd 16 * SIZE(AA), %xmm0
|
||||
mulps %xmm1, %xmm2
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||
#endif
|
||||
addps %xmm2, %xmm4
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# ZGEMV kernel sources: both the N and the T variant select the "_dup" implementations.
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||
|
||||
# SGEMM: the compute kernel is the 8x4 Barcelona source; no Bulldozer-specific SGEMM file is selected here.
# The IN/IT copy sources are the generic C "_8" routines; the ON/OT copy sources are the "_4_opteron" assembly routines.
# The *COPYOBJ variables are built from $(TSUFFIX) and $(SUFFIX), which are defined outside this file
# (presumably they name the object files produced from the copy sources above - confirm in the including Makefile).
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# DGEMM: the only group in this file that selects a Bulldozer-specific source (dgemm_kernel_4x4_bulldozer.S).
# The IN/IT copy sources and their *OBJ counterparts are left empty; only the ON/OT copy routines are set,
# and they use the "_4_opteron" assembly sources.
# NOTE(review): presumably the empty IN/IT entries mean no separate copy routine is built for this
# 4x4 kernel - confirm against the Makefile that consumes these variables.
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
DGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||
DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# CGEMM: the compute kernel is the 4x2 Barcelona zgemm source.
# The IN/IT copy sources are the generic C "zgemm_*copy_4" routines; the ON/OT copy sources are the
# "zgemm_*copy_2" assembly routines.
# The *COPYOBJ variables are built from $(TSUFFIX) and $(SUFFIX), which are defined outside this file.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = zgemm_ncopy_2.S
|
||||
CGEMMOTCOPY = zgemm_tcopy_2.S
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
# ZGEMM: the compute kernel is the 2x2 Barcelona zgemm source.
# The IN/IT copy sources and their *OBJ counterparts are left empty; only the ON/OT copy routines are set,
# and they use the same "zgemm_*copy_2" assembly sources that the CGEMM ON/OT entries name.
# NOTE(review): presumably the empty IN/IT entries mean no separate copy routine is built for this
# 2x2 kernel - confirm against the Makefile that consumes these variables.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
ZGEMMONCOPY = zgemm_ncopy_2.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_2.S
|
||||
ZGEMMINCOPYOBJ =
|
||||
ZGEMMITCOPYOBJ =
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
# STRSM: 8x4 SSE kernel sources for the LN, LT, RN and RT variants.
# RN names the same source file as LT (trsm_kernel_LT_8x4_sse.S); no separate RN file is referenced.
# NOTE(review): the DTRSM, CTRSM and ZTRSM groups in this file follow the same RN -> LT pattern,
# so this looks deliberate rather than a copy/paste slip - confirm.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
|
||||
|
||||
# DTRSM: 4x4 Barcelona kernel sources for the LN, LT, RN and RT variants.
# RN names the same source file as LT (trsm_kernel_LT_4x4_barcelona.S); no separate RN file is referenced.
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
|
||||
|
||||
# CTRSM: 4x2 SSE ztrsm kernel sources for the LN, LT, RN and RT variants.
# RN names the same source file as LT (ztrsm_kernel_LT_4x2_sse.S); no separate RN file is referenced.
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
|
||||
|
||||
# ZTRSM: 2x2 SSE2 ztrsm kernel sources for the LN, LT, RN and RT variants.
# RN names the same source file as LT (ztrsm_kernel_LT_2x2_sse2.S); no separate RN file is referenced.
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
|
||||
|
||||
# GEMM3M kernel sources: CGEMM3M uses the 8x4 and ZGEMM3M the 4x4 Barcelona zgemm3m implementation.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
File diff suppressed because it is too large
Load Diff
|
@ -530,7 +530,7 @@
|
|||
#endif
|
||||
movsd -32 * SIZE(Y), %xmm8
|
||||
|
||||
pshufd $0x39, %xmm4, %xmm5
|
||||
pshufd $0x29, %xmm4, %xmm5
|
||||
|
||||
mulps %xmm8, %xmm5
|
||||
addps %xmm5, %xmm3
|
||||
|
@ -750,7 +750,8 @@
|
|||
xorps %xmm5, %xmm5
|
||||
movhlps %xmm4, %xmm5
|
||||
|
||||
mulps -32 * SIZE(Y), %xmm5
|
||||
movlps -32 * SIZE(Y), %xmm4
|
||||
mulps %xmm4, %xmm5
|
||||
addps %xmm5, %xmm0
|
||||
|
||||
addq $2 * SIZE, X
|
||||
|
@ -992,7 +993,7 @@
|
|||
movsd -32 * SIZE(Y), %xmm8
|
||||
|
||||
movss %xmm5, %xmm4
|
||||
shufps $0x93, %xmm5, %xmm4
|
||||
shufps $0x93, %xmm4, %xmm4
|
||||
|
||||
mulps %xmm8, %xmm4
|
||||
addps %xmm4, %xmm3
|
||||
|
|
|
@ -930,7 +930,7 @@
|
|||
.L22:
|
||||
mulps %xmm8, %xmm9
|
||||
addps %xmm9, %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movaps 4 * SIZE(BO), %xmm9
|
||||
|
@ -983,7 +983,7 @@
|
|||
addps %xmm8, %xmm3
|
||||
movaps 0 * SIZE(AO), %xmm8
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm10, %xmm9
|
||||
|
@ -1178,7 +1178,7 @@
|
|||
.L32:
|
||||
mulps %xmm8, %xmm9
|
||||
addps %xmm9, %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movsd 4 * SIZE(BO), %xmm9
|
||||
|
@ -1423,7 +1423,7 @@
|
|||
.L42:
|
||||
mulss %xmm8, %xmm9
|
||||
addss %xmm9, %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movss 4 * SIZE(BO), %xmm9
|
||||
|
@ -1765,7 +1765,7 @@
|
|||
|
||||
.L62:
|
||||
mulps %xmm8, %xmm9
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
mulps 4 * SIZE(BO), %xmm8
|
||||
|
@ -1793,7 +1793,7 @@
|
|||
addps %xmm8, %xmm5
|
||||
movaps 32 * SIZE(AO), %xmm8
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm10, %xmm11
|
||||
|
@ -1822,7 +1822,7 @@
|
|||
addps %xmm10, %xmm5
|
||||
movaps 48 * SIZE(AO), %xmm10
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm12, %xmm13
|
||||
|
@ -1851,7 +1851,7 @@
|
|||
addps %xmm12, %xmm5
|
||||
movaps 64 * SIZE(AO), %xmm12
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm14, %xmm15
|
||||
|
@ -2024,7 +2024,7 @@
|
|||
|
||||
.L72:
|
||||
mulps %xmm8, %xmm9
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
|
||||
|
@ -2208,7 +2208,7 @@
|
|||
.L82:
|
||||
mulps %xmm8, %xmm9
|
||||
addps %xmm9, %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movsd 4 * SIZE(BO), %xmm9
|
||||
|
@ -2395,7 +2395,7 @@
|
|||
.L92:
|
||||
mulps %xmm8, %xmm9
|
||||
addps %xmm9, %xmm0
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movss 4 * SIZE(BO), %xmm9
|
||||
|
@ -2670,7 +2670,7 @@
|
|||
|
||||
.L112:
|
||||
mulps %xmm9, %xmm8
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
|
||||
|
@ -2687,7 +2687,7 @@
|
|||
addps %xmm9, %xmm4
|
||||
movaps 8 * SIZE(BO), %xmm9
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm9, %xmm10
|
||||
|
@ -2704,7 +2704,7 @@
|
|||
addps %xmm9, %xmm4
|
||||
movaps 32 * SIZE(BO), %xmm9
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm11, %xmm12
|
||||
|
@ -2721,7 +2721,7 @@
|
|||
addps %xmm11, %xmm4
|
||||
movaps 24 * SIZE(BO), %xmm11
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm11, %xmm14
|
||||
|
@ -2857,7 +2857,7 @@
|
|||
|
||||
.L122:
|
||||
mulps %xmm8, %xmm9
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movaps -28 * SIZE(AO), %xmm8
|
||||
|
@ -2873,7 +2873,7 @@
|
|||
addps %xmm8, %xmm3
|
||||
movaps 0 * SIZE(AO), %xmm8
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
|
||||
#endif
|
||||
mulps %xmm10, %xmm11
|
||||
|
@ -3003,7 +3003,7 @@
|
|||
|
||||
.L132:
|
||||
mulps %xmm8, %xmm9
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movsd -30 * SIZE(AO), %xmm8
|
||||
|
@ -3150,7 +3150,7 @@
|
|||
|
||||
.L142:
|
||||
mulss %xmm8, %xmm9
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
|
||||
#endif
|
||||
movss -31 * SIZE(AO), %xmm8
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define RPREFETCHSIZE (12 + 4)
|
||||
#define WPREFETCHSIZE (48 + 4)
|
||||
#define MOVNTQ MOVQ
|
||||
|
@ -79,7 +79,7 @@
|
|||
#define AO3 %r13
|
||||
#define AO4 %rax
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define RPREFETCH prefetch
|
||||
#else
|
||||
#define RPREFETCH prefetch
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define RPREFETCHSIZE (12 + 4)
|
||||
#define WPREFETCHSIZE (12 + 4)
|
||||
#define MOVNTQ MOVQ
|
||||
|
@ -96,7 +96,7 @@
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define RPREFETCH prefetch
|
||||
#else
|
||||
#define RPREFETCH prefetch
|
||||
|
|
|
@ -469,7 +469,7 @@
|
|||
ALIGN_4
|
||||
|
||||
.L71:
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
prefetch PREFETCHSIZE * SIZE(X)
|
||||
#endif
|
||||
|
||||
|
|
|
@ -266,7 +266,7 @@
|
|||
sarq $5, I
|
||||
jle .L113
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
|
||||
movaps %xmm0, %xmm1
|
||||
mulps -32 * SIZE(X), %xmm1
|
||||
|
|
|
@ -251,7 +251,7 @@
|
|||
sarq $4, I
|
||||
jle .L113
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
|
||||
movaps %xmm0, %xmm1
|
||||
mulpd -16 * SIZE(X), %xmm1
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
|
@ -47,7 +46,7 @@
|
|||
|
||||
#ifndef WINDOWS_ABI
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define STACKSIZE 128
|
||||
|
||||
#define OLD_M %rdi
|
||||
#define OLD_N %rsi
|
||||
|
@ -57,6 +56,10 @@
|
|||
#define STACK_Y 16 + STACKSIZE(%rsp)
|
||||
#define STACK_INCY 24 + STACKSIZE(%rsp)
|
||||
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
|
||||
#define MMM 56(%rsp)
|
||||
#define NN 64(%rsp)
|
||||
#define AA 72(%rsp)
|
||||
#define LDAX 80(%rsp)
|
||||
|
||||
#else
|
||||
|
||||
|
@ -71,6 +74,10 @@
|
|||
#define STACK_Y 72 + STACKSIZE(%rsp)
|
||||
#define STACK_INCY 80 + STACKSIZE(%rsp)
|
||||
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
|
||||
#define MMM 216(%rsp)
|
||||
#define NN 224(%rsp)
|
||||
#define AA 232(%rsp)
|
||||
#define LDAX 240(%rsp)
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -127,29 +134,48 @@
|
|||
movups %xmm14, 192(%rsp)
|
||||
movups %xmm15, 208(%rsp)
|
||||
|
||||
movq OLD_M, M
|
||||
movq OLD_N, N
|
||||
movq OLD_A, A
|
||||
movq OLD_LDA, LDA
|
||||
movq OLD_M, MMM
|
||||
movq OLD_N, NN
|
||||
movq OLD_A, X
|
||||
movq X, AA
|
||||
movq OLD_LDA, X
|
||||
movq X, LDAX
|
||||
movq OLD_X, X
|
||||
#else
|
||||
movq OLD_M, M
|
||||
movq OLD_N, N
|
||||
movq OLD_A, A
|
||||
movq OLD_LDA, LDA
|
||||
movq OLD_M, MMM
|
||||
movq OLD_N, NN
|
||||
movq OLD_A, AA
|
||||
movq OLD_LDA, LDAX
|
||||
#endif
|
||||
|
||||
movq STACK_INCX, INCX
|
||||
movq STACK_Y, Y
|
||||
movq STACK_INCY, INCY
|
||||
movq STACK_BUFFER, BUFFER
|
||||
|
||||
#ifndef WINDOWS_ABI
|
||||
pshufd $0, %xmm0, ALPHA
|
||||
#else
|
||||
pshufd $0, %xmm3, ALPHA
|
||||
#endif
|
||||
|
||||
|
||||
.L0t:
|
||||
xorq M,M
|
||||
addq $1,M
|
||||
salq $22,M
|
||||
subq M,MMM
|
||||
jge .L00t
|
||||
ALIGN_4
|
||||
|
||||
movq MMM,%rax
|
||||
addq M,%rax
|
||||
jle .L999x
|
||||
movq %rax,M
|
||||
|
||||
.L00t:
|
||||
movq LDAX,LDA
|
||||
movq NN,N
|
||||
movq AA,A
|
||||
movq STACK_INCX, INCX
|
||||
movq STACK_Y, Y
|
||||
movq STACK_INCY, INCY
|
||||
movq STACK_BUFFER, BUFFER
|
||||
|
||||
leaq (,INCX, SIZE), INCX
|
||||
leaq (,INCY, SIZE), INCY
|
||||
leaq (,LDA, SIZE), LDA
|
||||
|
@ -6341,6 +6367,12 @@
|
|||
ALIGN_4
|
||||
|
||||
.L999:
|
||||
leaq (,M,SIZE),%rax
|
||||
addq %rax,AA
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
.L999x:
|
||||
movq 0(%rsp), %rbx
|
||||
movq 8(%rsp), %rbp
|
||||
movq 16(%rsp), %r12
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
#define PREFETCHW prefetcht0
|
||||
#endif
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define movsd movlps
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
#define PREFETCHW prefetcht0
|
||||
#endif
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define movsd movlps
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
#define PREFETCHW prefetcht0
|
||||
#endif
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define movsd movlps
|
||||
|
|
|
@ -699,7 +699,7 @@
|
|||
movsd -32 * SIZE(X), %xmm4
|
||||
|
||||
pshufd $0xb1, %xmm4, %xmm12
|
||||
shufps $0x39, %xmm8, %xmm8
|
||||
shufps $0x59, %xmm8, %xmm8
|
||||
mulps %xmm8, %xmm4
|
||||
addps %xmm4, %xmm0
|
||||
mulps %xmm8, %xmm12
|
||||
|
@ -1336,7 +1336,7 @@
|
|||
|
||||
movss %xmm9, %xmm8
|
||||
pshufd $0xb1, %xmm4, %xmm12
|
||||
shufps $0x93, %xmm8, %xmm8
|
||||
shufps $0x03, %xmm8, %xmm8
|
||||
mulps %xmm8, %xmm4
|
||||
addps %xmm4, %xmm0
|
||||
mulps %xmm8, %xmm12
|
||||
|
@ -1697,7 +1697,7 @@
|
|||
movsd -32 * SIZE(Y), %xmm4
|
||||
|
||||
pshufd $0xb1, %xmm4, %xmm12
|
||||
shufps $0x39, %xmm8, %xmm8
|
||||
shufps $0xa9, %xmm8, %xmm8
|
||||
mulps %xmm8, %xmm4
|
||||
addps %xmm4, %xmm0
|
||||
mulps %xmm8, %xmm12
|
||||
|
@ -2024,7 +2024,7 @@
|
|||
|
||||
movss %xmm9, %xmm8
|
||||
pshufd $0xb1, %xmm4, %xmm12
|
||||
shufps $0x93, %xmm8, %xmm8
|
||||
shufps $0x03, %xmm8, %xmm8
|
||||
mulps %xmm8, %xmm4
|
||||
addps %xmm4, %xmm0
|
||||
mulps %xmm8, %xmm12
|
||||
|
|
|
@ -85,7 +85,7 @@
|
|||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define RPREFETCHSIZE 32
|
||||
#define WPREFETCHSIZE 48
|
||||
#endif
|
||||
|
|
|
@ -160,7 +160,7 @@
|
|||
#define a3 %xmm14
|
||||
#define xt1 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
@ -167,7 +167,7 @@
|
|||
#define a3 %xmm14
|
||||
#define xt1 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
@ -166,7 +166,7 @@
|
|||
#define xt1 %xmm14
|
||||
#define xt2 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
|
|
@ -76,7 +76,7 @@
|
|||
#define movsd movlpd
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (16 * 16)
|
||||
|
@ -166,7 +166,7 @@
|
|||
#define a3 %xmm14
|
||||
#define xt1 %xmm15
|
||||
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||
#else
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
#define BORIG 72(%rsp)
|
||||
#define BUFFER 128(%rsp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHNTA prefetchnta
|
||||
|
|
|
@ -95,7 +95,7 @@
|
|||
#define PREFETCHSIZE (8 * 6 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHNTA prefetchnta
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
#define BORIG 72(%rsp)
|
||||
#define BUFFER 128(%rsp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHNTA prefetchnta
|
||||
|
|
|
@ -95,7 +95,7 @@
|
|||
#define PREFETCHSIZE (8 * 6 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHNTA prefetchnta
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
#define BORIG 72(%rsp)
|
||||
#define BUFFER 128(%rsp)
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHNTA prefetchnta
|
||||
|
|
|
@ -95,7 +95,7 @@
|
|||
#define PREFETCHSIZE (8 * 6 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHNTA prefetchnta
|
||||
|
|
|
@ -74,6 +74,13 @@
|
|||
#define ALIGNED_ACCESS
|
||||
#endif
|
||||
|
||||
#ifdef BULLDOZER
|
||||
#define PREFETCH prefetch
|
||||
#define PREFETCHW prefetchw
|
||||
#define PREFETCHSIZE (128 * 5)
|
||||
#define ALIGNED_ACCESS
|
||||
#endif
|
||||
|
||||
#ifdef NANO
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHW prefetcht0
|
||||
|
|
|
@ -85,7 +85,7 @@
|
|||
#define movsd movlps
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
#define ALIGNED_ACCESS
|
||||
#define MOVUPS_A movaps
|
||||
#define MOVUPS_XL movaps
|
||||
|
|
|
@ -66,7 +66,9 @@ static FLOAT dm1 = -1.;
|
|||
#endif
|
||||
|
||||
#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
|
||||
#define REAL_GEMM_R (GEMM_R - GEMM_PQ)
|
||||
|
||||
//leave some space for GEMM_ALIGN in sb2
|
||||
#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ)
|
||||
|
||||
#if 0
|
||||
#define SHARED_ARRAY
|
||||
|
@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
|
|||
sa,
|
||||
sb2,
|
||||
a + (is + js * lda) * COMPSIZE, lda,
|
||||
- is + js);
|
||||
is - js);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
2
make.inc
2
make.inc
|
@ -4,7 +4,7 @@ DRVOPTS = $(OPTS)
|
|||
LOADER = $(FORTRAN)
|
||||
TIMER = NONE
|
||||
ARCHFLAGS= -ru
|
||||
RANLIB = ranlib
|
||||
#RANLIB = ranlib
|
||||
BLASLIB =
|
||||
TMGLIB = tmglib.a
|
||||
EIGSRCLIB = eigsrc.a
|
||||
|
|
|
@ -48,7 +48,8 @@ typedef int blasint;
|
|||
/* C99 supports complex floating numbers natively, which GCC also offers as an
|
||||
extension since version 3.0. If neither are available, use a compatible
|
||||
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
|
||||
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
|
||||
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
|
||||
(__GNUC__ >= 3 && !defined(__cplusplus)))
|
||||
#define OPENBLAS_COMPLEX_C99
|
||||
#include <complex.h>
|
||||
typedef float _Complex openblas_complex_float;
|
||||
|
|
2
param.h
2
param.h
|
@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
||||
|
||||
#define SNUMOPT 8
|
||||
#define DNUMOPT 4
|
||||
|
|
Loading…
Reference in New Issue