Merge branch 'release-0.2.0'

This commit is contained in:
Zhang Xianyi 2012-06-26 07:45:23 +08:00
commit 47860cf002
106 changed files with 17268 additions and 522 deletions

7
.gitignore vendored
View File

@ -1,16 +1,23 @@
*.obj *.obj
*.lib *.lib
*.dll *.dll
*.dylib
*.def *.def
*.o *.o
lapack-3.1.1 lapack-3.1.1
lapack-3.1.1.tgz lapack-3.1.1.tgz
lapack-3.4.1
lapack-3.4.1.tgz
*.so *.so
*.a *.a
.svn .svn
*~ *~
lib.grd
nohup.out
config.h config.h
Makefile.conf Makefile.conf
Makefile.conf_last
config_last.h
getarch getarch
getarch_2nd getarch_2nd
utest/openblas_utest utest/openblas_utest

View File

@ -1,4 +1,17 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.2.0
26-Jun-2012
common:
* Removed the limitation (64) of numbers of CPU cores.
Now, it supports 256 cores at max.
* Supported clang compiler.
* Fixed some build bugs on FreeBSD
x86/x86-64:
* Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions.
Please use gcc >= 4.6 or clang >=3.1.
* Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes.
==================================================================== ====================================================================
Version 0.1.1 Version 0.1.1
29-Apr-2012 29-Apr-2012
@ -7,6 +20,8 @@ common:
* Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia)
* Fixed the build bug (MD5 and download) on Mac OSX. * Fixed the build bug (MD5 and download) on Mac OSX.
* Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
* Fixed the compatibility issue for compilers without C99 complex number
(e.g. Visual Studio)
x86/x86_64: x86/x86_64:
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
* Test alpha=Nan in dscale. * Test alpha=Nan in dscale.

View File

@ -90,6 +90,15 @@
number of threads will consume extra resource. I recommend you to number of threads will consume extra resource. I recommend you to
specify minimum number of threads. specify minimum number of threads.
1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong?
A This may be related to a bug in the Linux kernel 2.6.32. Try applying
the patch segfaults.patch using
patch < segfaults.patch
and see if the crashes persist. Note that this patch will lead to many
compiler warnings.
2. Architecture Specific issue or Implementation 2. Architecture Specific issue or Implementation

View File

@ -256,12 +256,17 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz
lapack-3.4.1.tgz : lapack-3.4.1.tgz :
ifndef NOFORTRAN ifndef NOFORTRAN
ifeq ($(OSNAME), Darwin) #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL) curl -O $(LAPACK_URL)
else
ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL)
else else
wget $(LAPACK_URL) wget $(LAPACK_URL)
endif endif
endif endif
endif
large.tgz : large.tgz :
ifndef NOFORTRAN ifndef NOFORTRAN

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.1.1 VERSION = 0.2.0
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -94,8 +94,8 @@ VERSION = 0.1.1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading # with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4. # in small matrix sizes. The default value is 50.
# GEMM_MULTITHREAD_THRESHOLD = 4 # GEMM_MULTITHREAD_THRESHOLD = 50
# If you need santy check by comparing reference BLAS. It'll be very # If you need santy check by comparing reference BLAS. It'll be very
# slow (Not implemented yet). # slow (Not implemented yet).

View File

@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1
endif endif
# Default C compiler # Default C compiler
# - Only set if not specified on the command line or inherited from the environment.
# - CC is an implicit variable so neither '?=' or 'ifndef' can be used.
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
CC = gcc CC = gcc
endif
# Default Fortran compiler (FC) is selected by f_check.
ifndef MAKEFILE_RULE ifndef MAKEFILE_RULE
include $(TOPDIR)/Makefile.rule include $(TOPDIR)/Makefile.rule
@ -45,7 +53,7 @@ GETARCH_FLAGS += -DUSE64BITINT
endif endif
ifndef GEMM_MULTITHREAD_THRESHOLD ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4 GEMM_MULTITHREAD_THRESHOLD=50
endif endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
@ -108,6 +116,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
ifeq ($(OSNAME), FreeBSD)
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), NetBSD)
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
EXTRALIB += -lm EXTRALIB += -lm
endif endif
@ -231,11 +247,11 @@ endif
ifdef DYNAMIC_ARCH ifdef DYNAMIC_ARCH
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
endif endif
ifndef DYNAMIC_CORE ifndef DYNAMIC_CORE
@ -754,6 +770,7 @@ export HAVE_SSE4_1
export HAVE_SSE4_2 export HAVE_SSE4_2
export HAVE_SSE4A export HAVE_SSE4A
export HAVE_SSE5 export HAVE_SSE5
export HAVE_AVX
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE

84
README
View File

@ -1,84 +0,0 @@
OpenBLAS Readme
1.Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn)
Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki).
2.Intallation
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or,
check out codes from git://github.com/xianyi/OpenBLAS.git
1)Normal compile
(a) type "make" to detect the CPU automatically.
or
(b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
2)Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
examples:
On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
3)Debug version
make DEBUG=1
4)Intall to the directory (Optional)
e.g.
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
3.Support CPU & OS
Please read GotoBLAS_01Readme.txt
Additional support CPU:
x86_64:
Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
MIPS64:
ICT Loongson 3A //Level 3 BLAS subroutines are optimized.
4.Usages
Link with libopenblas.a or -lopenblas for shared library.
4.1 Set the number of threads with environment variables. for example,
export OPENBLAS_NUM_THREADS=4
or
export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
4.2 Set the number of threads with calling functions. for example,
void goto_set_num_threads(int num_threads);
or
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
5.Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
6.To-Do List:
Optimization on ICT Loongson 3A CPU
7.Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
8.ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
9.Known Issues
* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS.
10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This a main branch to reflect a production-ready state.
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

110
README.md Normal file
View File

@ -0,0 +1,110 @@
# OpenBLAS
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
## Installation
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
### Normal compile
* type "make" to detect the CPU automatically.
or
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Examples:
On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
### Debug version
make DEBUG=1
### Install to the directory (Optional)
Example:
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
- **FreeBSD**: Supported by the community. We didn't test the library on this OS.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
### Set the number of threads with environment variables.
Examples:
export OPENBLAS_NUM_THREADS=4
or
export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
### Set the number of threads at runtime.
We provide the functions below to control the number of threads at runtime. So far, changing the number of threads is not supported on Windows; on Windows, these functions are dummies that do nothing.
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please open an issue at https://github.com/xianyi/OpenBLAS/issues
## Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
## ChangeLog
Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version.
## Troubleshooting
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should be less than or equal to 256.
* On Loongson 3A, "make test" may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case runs correctly when you launch it directly from the shell.
## Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This is the main branch, reflecting a production-ready state.
* The develop branch. This is the main branch reflecting the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

View File

@ -18,6 +18,7 @@ CORE2
PENRYN PENRYN
DUNNINGTON DUNNINGTON
NEHALEM NEHALEM
SANDYBRIDGE
ATOM ATOM
b)AMD CPU: b)AMD CPU:
@ -27,6 +28,7 @@ OPTERON_SSE3
BARCELONA BARCELONA
SHANGHAI SHANGHAI
ISTANBUL ISTANBUL
BOBCAT
c)VIA CPU: c)VIA CPU:
SSE_GENERIC SSE_GENERIC
@ -47,6 +49,7 @@ CELL
3.MIPS64 CPU: 3.MIPS64 CPU:
SICORTEX SICORTEX
LOONGSON3A LOONGSON3A
LOONGSON3B
4.IA64 CPU: 4.IA64 CPU:
ITANIUM2 ITANIUM2

10
c_check
View File

@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/);
$compiler = GCC if ($compiler eq ""); $compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/); $os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FreeBSD/); $os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NetBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/);
$os = Darwin if ($data =~ /OS_Darwin/); $os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SunOS/); $os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/); $os = AIX if ($data =~ /OS_AIX/);
$os = osf if ($data =~ /OS_OSF/); $os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/); $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/); $os = Interix if ($data =~ /OS_INTERIX/);
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);

View File

@ -9,6 +9,10 @@ extern "C" {
#include <stddef.h> #include <stddef.h>
#include "common.h" #include "common.h"
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
#define CBLAS_INDEX size_t #define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};

View File

@ -68,7 +68,7 @@ extern "C" {
#define SMP #define SMP
#endif #endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define WINDOWS_ABI #define WINDOWS_ABI
#define OS_WINDOWS #define OS_WINDOWS
@ -89,7 +89,7 @@ extern "C" {
#include <sched.h> #include <sched.h>
#endif #endif
#ifdef OS_DARWIN #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
#include <sched.h> #include <sched.h>
#endif #endif

View File

@ -45,6 +45,8 @@ extern "C" {
int BLASFUNC(xerbla)(char *, blasint *info, blasint); int BLASFUNC(xerbla)(char *, blasint *info, blasint);
void BLASFUNC(openblas_set_num_threads)(int *);
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);

View File

@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) {
int openmp_nthreads=0; int openmp_nthreads=0;
#endif #endif
if ((blas_cpu_number == 1) if (blas_cpu_number == 1
#ifdef USE_OPENMP #ifdef USE_OPENMP
|| omp_in_parallel() || omp_in_parallel()

View File

@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define PROFCODE #define PROFCODE
#endif #endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define SAVEREGISTERS \ #define SAVEREGISTERS \
subl $32, %esp;\ subl $32, %esp;\
movups %xmm6, 0(%esp);\ movups %xmm6, 0(%esp);\
@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define RESTOREREGISTERS #define RESTOREREGISTERS
#endif #endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 16; \ .align 16; \
@ -282,7 +282,7 @@ REALNAME:
#define EPILOGUE .end REALNAME #define EPILOGUE .end REALNAME
#endif #endif
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 16; \ .align 16; \
@ -356,4 +356,11 @@ REALNAME:
#ifndef ALIGN_6 #ifndef ALIGN_6
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif #endif

View File

@ -353,7 +353,7 @@ REALNAME:
#define EPILOGUE .end REALNAME #define EPILOGUE .end REALNAME
#endif #endif
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 512; \ .align 512; \
@ -425,6 +425,7 @@ REALNAME:
#define ALIGN_2 .align 2 #define ALIGN_2 .align 2
#define ALIGN_3 .align 3 #define ALIGN_3 .align 3
#define ALIGN_4 .align 4 #define ALIGN_4 .align 4
#define ALIGN_5 .align 5
#define ffreep fstp #define ffreep fstp
#endif #endif
@ -448,4 +449,10 @@ REALNAME:
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
#endif #endif
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif #endif

View File

@ -103,6 +103,8 @@
#define CORE_NEHALEM 17 #define CORE_NEHALEM 17
#define CORE_ATOM 18 #define CORE_ATOM 18
#define CORE_NANO 19 #define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -122,6 +124,7 @@
#define HAVE_MISALIGNSSE (1 << 15) #define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16) #define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17) #define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@ -188,4 +191,6 @@ typedef struct {
#define CPUTYPE_NSGEODE 41 #define CPUTYPE_NSGEODE 41
#define CPUTYPE_VIAC3 42 #define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43 #define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#endif #endif

View File

@ -189,6 +189,7 @@ int get_cputype(int gettype){
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX;
if (have_excpuid() >= 0x01) { if (have_excpuid() >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
@ -983,13 +984,13 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 10: case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CPUTYPE_NEHALEM; return CPUTYPE_SANDYBRIDGE;
case 12: case 12:
//Xeon Processor 5600 (Westmere-EP) //Xeon Processor 5600 (Westmere-EP)
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 13: case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CPUTYPE_NEHALEM; return CPUTYPE_SANDYBRIDGE;
case 15: case 15:
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
@ -1027,6 +1028,8 @@ int get_cpuname(void){
case 1: case 1:
case 10: case 10:
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCAT;
} }
break; break;
} }
@ -1146,6 +1149,8 @@ static char *cpuname[] = {
"NSGEODE", "NSGEODE",
"VIAC3", "VIAC3",
"NANO", "NANO",
"SANDYBRIDGE",
"BOBCAT",
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1192,6 +1197,8 @@ static char *lowercpuname[] = {
"tms3x00", "tms3x00",
"nsgeode", "nsgeode",
"nano", "nano",
"sandybridge",
"bobcat",
}; };
static char *corename[] = { static char *corename[] = {
@ -1215,6 +1222,8 @@ static char *corename[] = {
"NEHALEM", "NEHALEM",
"ATOM", "ATOM",
"NANO", "NANO",
"SANDYBRIDGE",
"BOBCAT",
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1238,6 +1247,8 @@ static char *corename_lower[] = {
"nehalem", "nehalem",
"atom", "atom",
"nano", "nano",
"sandybridge",
"bobcat",
}; };
@ -1321,13 +1332,13 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
case 10: case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CORE_NEHALEM; return CORE_SANDYBRIDGE;
case 12: case 12:
//Xeon Processor 5600 (Westmere-EP) //Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM; return CORE_NEHALEM;
case 13: case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CORE_NEHALEM; return CORE_SANDYBRIDGE;
case 15: case 15:
//Xeon Processor E7 (Westmere-EX) //Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM; return CORE_NEHALEM;
@ -1346,7 +1357,9 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486; if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON; if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){ if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else return CORE_BARCELONA;
} }
} }
@ -1426,6 +1439,7 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n");
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
@ -1491,6 +1505,7 @@ void get_sse(void){
if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n");
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");

10
ctest.c
View File

@ -35,19 +35,19 @@ OS_LINUX
#endif #endif
#if defined(__FreeBSD__) #if defined(__FreeBSD__)
OS_FreeBSD OS_FREEBSD
#endif #endif
#if defined(__NetBSD__) #if defined(__NetBSD__)
OS_NetBSD OS_NETBSD
#endif #endif
#if defined(__sun) #if defined(__sun)
OS_SunOS OS_SUNOS
#endif #endif
#if defined(__APPLE__) #if defined(__APPLE__)
OS_Darwin OS_DARWIN
#endif #endif
#if defined(_AIX) #if defined(_AIX)
@ -63,7 +63,7 @@ OS_WINNT
#endif #endif
#if defined(__CYGWIN__) #if defined(__CYGWIN__)
OS_CYGWIN OS_CYGWIN_NT
#endif #endif
#if defined(__INTERIX) #if defined(__INTERIX)

View File

@ -1,12 +1,12 @@
TOPDIR = ../.. TOPDIR = ../..
include ../../Makefile.system include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP ifdef SMP
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
ifndef NO_AFFINITY ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX) COMMONOBJS += init.$(SUFFIX)
endif endif

View File

@ -63,6 +63,14 @@ static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER]; static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER];
void goto_set_num_threads(int num)
{
}
void openblas_set_num_threads(int num)
{
}
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){ if (!(mode & BLAS_COMPLEX)){

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define MAX_NODES 16 #define MAX_NODES 16
#define MAX_CPUS 256 #define MAX_CPUS 256
#define NCPUBITS (8*sizeof(unsigned long))
#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS)
#define CPUELT(cpu) ((cpu) / NCPUBITS)
#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS))
#define SH_MAGIC 0x510510 #define SH_MAGIC 0x510510
@ -103,10 +108,10 @@ typedef struct {
int num_nodes; int num_nodes;
int num_procs; int num_procs;
int final_num_procs; int final_num_procs;
unsigned long avail; unsigned long avail [MAX_BITMASK_LEN];
int avail_count;
unsigned long cpu_info [MAX_CPUS]; unsigned long cpu_info [MAX_CPUS];
unsigned long node_info [MAX_NODES]; unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN];
int cpu_use[MAX_CPUS]; int cpu_use[MAX_CPUS];
} shm_t; } shm_t;
@ -126,7 +131,8 @@ static shm_t *common = (void *)-1;
static int shmid, pshmid; static int shmid, pshmid;
static void *paddr; static void *paddr;
static unsigned long lprocmask, lnodemask; static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask;
static int lprocmask_count = 0;
static int numprocs = 1; static int numprocs = 1;
static int numnodes = 1; static int numnodes = 1;
@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) {
than sizeof(unsigned long). On 64 bits, the limit than sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32. is 64. On 32 bits, it is 32.
***/ ***/
static inline unsigned long get_cpumap(int node) { static inline void get_cpumap(int node, unsigned long * node_info) {
int infile; int infile;
unsigned long affinity; unsigned long affinity[32];
char name[160]; char name[160];
char cpumap[160]; char cpumap[160];
char *p, *dummy; char *dummy;
int i=0; int i=0;
int count=0;
int k=0;
sprintf(name, CPUMAP_NAME, node); sprintf(name, CPUMAP_NAME, node);
infile = open(name, O_RDONLY); infile = open(name, O_RDONLY);
for(i=0; i<32; i++){
affinity = 0; affinity[i] = 0;
}
if (infile != -1) { if (infile != -1) {
read(infile, cpumap, sizeof(cpumap)); read(infile, cpumap, sizeof(cpumap));
p = cpumap;
while (*p != '\n' && i<160){ for(i=0; i<160; i++){
if(*p != ',') { if(cpumap[i] == '\n')
name[i++]=*p; break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
//Enough data for Hex
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
} }
p++;
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
node_info[i]=affinity[count-i-1];
} }
p = name;
// while ((*p == '0') || (*p == ',')) p++;
affinity = strtoul(p, &dummy, 16);
close(infile); close(infile);
} }
return affinity; return ;
} }
static inline unsigned long get_share(int cpu, int level) { static inline void get_share(int cpu, int level, unsigned long * share) {
int infile; int infile;
unsigned long affinity; unsigned long affinity[32];
char cpumap[160];
char name[160]; char name[160];
char *p; char *dummy;
int count=0;
int i=0,k=0;
int bitmask_idx = 0;
sprintf(name, SHARE_NAME, cpu, level); sprintf(name, SHARE_NAME, cpu, level);
infile = open(name, O_RDONLY); infile = open(name, O_RDONLY);
affinity = (1UL << cpu); // Init share
for(i=0; i<MAX_BITMASK_LEN; i++){
share[i]=0;
}
bitmask_idx = CPUELT(cpu);
share[bitmask_idx] = CPUMASK(cpu);
if (infile != -1) { if (infile != -1) {
read(infile, name, sizeof(name)); read(infile, cpumap, sizeof(cpumap));
p = name; for(i=0; i<160; i++){
if(cpumap[i] == '\n')
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
while ((*p == '0') || (*p == ',')) p++; //Enough data
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
}
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
share[i]=affinity[count-i-1];
}
affinity = strtol(p, &p, 16);
close(infile); close(infile);
} }
return affinity; return ;
} }
static int numa_check(void) { static int numa_check(void) {
@ -248,6 +298,7 @@ static int numa_check(void) {
DIR *dp; DIR *dp;
struct dirent *dir; struct dirent *dir;
int node; int node;
int j;
common -> num_nodes = 0; common -> num_nodes = 0;
@ -258,7 +309,9 @@ static int numa_check(void) {
return 0; return 0;
} }
for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; for (node = 0; node < MAX_NODES; node ++) {
for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0;
}
while ((dir = readdir(dp)) != NULL) { while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
@ -266,12 +319,12 @@ static int numa_check(void) {
node = atoi(&dir -> d_name[4]); node = atoi(&dir -> d_name[4]);
if (node > MAX_NODES) { if (node > MAX_NODES) {
fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
exit(1); exit(1);
} }
common -> num_nodes ++; common -> num_nodes ++;
common -> node_info[node] = get_cpumap(node); get_cpumap(node, common->node_info[node]);
} }
} }
@ -284,7 +337,7 @@ static int numa_check(void) {
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
for (node = 0; node < common -> num_nodes; node ++) for (node = 0; node < common -> num_nodes; node ++)
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]);
#endif #endif
return common -> num_nodes; return common -> num_nodes;
@ -296,11 +349,13 @@ static void numa_mapping(void) {
int i, j, h; int i, j, h;
unsigned long work, bit; unsigned long work, bit;
int count = 0; int count = 0;
int bitmask_idx = 0;
for (node = 0; node < common -> num_nodes; node ++) { for (node = 0; node < common -> num_nodes; node ++) {
core = 0; core = 0;
for (cpu = 0; cpu < common -> num_procs; cpu ++) { for (cpu = 0; cpu < common -> num_procs; cpu ++) {
if (common -> node_info[node] & common -> avail & (1UL << cpu)) { bitmask_idx = CPUELT(cpu);
if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) {
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
count ++; count ++;
core ++; core ++;
@ -357,58 +412,89 @@ static void numa_mapping(void) {
static void disable_hyperthread(void) { static void disable_hyperthread(void) {
unsigned long share; unsigned long share[MAX_BITMASK_LEN];
int cpu; int cpu;
int bitmask_idx = 0;
int i=0, count=0;
bitmask_idx = CPUELT(common -> num_procs);
if(common->num_procs > 64){ for(i=0; i< bitmask_idx; i++){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL;
exit(1); }
}else if(common->num_procs == 64){ if(CPUMASK(common -> num_procs) != 1){
common -> avail = 0xFFFFFFFFFFFFFFFFUL; common -> avail[count++] = CPUMASK(common -> num_procs) - 1;
}else }
common -> avail = (1UL << common -> num_procs) - 1; common -> avail_count = count;
/* if(common->num_procs > 64){ */
/* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */
/* exit(1); */
/* }else if(common->num_procs == 64){ */
/* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */
/* }else */
/* common -> avail = (1UL << common -> num_procs) - 1; */
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); fprintf(stderr, "\nAvail CPUs : ");
for(i=0; i<count; i++)
fprintf(stderr, "%04lx ", common -> avail[i]);
fprintf(stderr, ".\n");
#endif #endif
for (cpu = 0; cpu < common -> num_procs; cpu ++) { for (cpu = 0; cpu < common -> num_procs; cpu ++) {
share = (get_share(cpu, 1) & common -> avail); get_share(cpu, 1, share);
if (popcount(share) > 1) { //When the shared cpu are in different element of share & avail array, this may be a bug.
for (i = 0; i < count ; i++){
if (popcount(share[i]) > 1) {
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
cpu, share & ~(1UL << cpu)); cpu, share[i] & ~(CPUMASK(cpu)));
#endif #endif
common -> avail &= ~((share & ~(1UL << cpu))); common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
}
} }
} }
} }
static void disable_affinity(void) { static void disable_affinity(void) {
int i=0;
int bitmask_idx=0;
int count=0;
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]);
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
#endif #endif
if(common->final_num_procs > 64){ /* if(common->final_num_procs > 64){ */
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */
exit(1); /* exit(1); */
}else if(common->final_num_procs == 64){ /* }else if(common->final_num_procs == 64){ */
lprocmask = 0xFFFFFFFFFFFFFFFFUL; /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */
}else /* }else */
lprocmask = (1UL << common -> final_num_procs) - 1; /* lprocmask = (1UL << common -> final_num_procs) - 1; */
bitmask_idx = CPUELT(common -> final_num_procs);
for(i=0; i< bitmask_idx; i++){
lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL;
}
if(CPUMASK(common -> final_num_procs) != 1){
lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1;
}
lprocmask_count = count;
#ifndef USE_OPENMP #ifndef USE_OPENMP
lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; for(i=0; i< count; i++){
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
}
#endif #endif
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]);
#endif #endif
} }
@ -498,7 +584,7 @@ static void create_pshmem(void) {
static void local_cpu_map(void) { static void local_cpu_map(void) {
int cpu, id, mapping; int cpu, id, mapping;
int bitmask_idx = 0;
cpu = 0; cpu = 0;
mapping = 0; mapping = 0;
@ -509,7 +595,8 @@ static void local_cpu_map(void) {
if (is_dead(id)) common -> cpu_use[cpu] = 0; if (is_dead(id)) common -> cpu_use[cpu] = 0;
} }
if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { bitmask_idx = CPUELT(cpu);
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
common -> cpu_use[cpu] = pshmid; common -> cpu_use[cpu] = pshmid;
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) {
#ifndef USE_OPENMP #ifndef USE_OPENMP
cpu_set_t cpu_mask; cpu_set_t cpu_mask;
#endif #endif
int i;
if (initialized) return; if (initialized) return;
@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) {
common -> num_procs = get_nprocs(); common -> num_procs = get_nprocs();
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
exit(1);
}
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
numa_check(); numa_check();
@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping(); if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = popcount(common -> avail); common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) {
disable_affinity(); disable_affinity();
num_avail = popcount(lprocmask); num_avail = 0;
for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]);
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;

View File

@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#if defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
@ -185,7 +185,7 @@ int get_num_procs(void) {
#endif #endif
#if defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_FREEBSD) || defined(OS_DARWIN)
int get_num_procs(void) { int get_num_procs(void) {
@ -215,7 +215,7 @@ int goto_get_num_procs (void) {
int blas_get_cpu_number(void){ int blas_get_cpu_number(void){
char *p; char *p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
int max_num; int max_num;
#endif #endif
int blas_goto_num = 0; int blas_goto_num = 0;
@ -223,7 +223,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads; if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
max_num = get_num_procs(); max_num = get_num_procs();
#endif #endif
@ -250,7 +250,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER; else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
if (blas_num_threads > max_num) blas_num_threads = max_num; if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif #endif

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#ifdef SMP_SERVER #ifdef SMP_SERVER
#ifdef OS_LINUX
extern void openblas_set_num_threads(int num_threads) ; extern void openblas_set_num_threads(int num_threads) ;
@ -41,5 +40,13 @@ void NAME(int* num_threads){
openblas_set_num_threads(*num_threads); openblas_set_num_threads(*num_threads);
} }
#endif #else
//Single thread
void openblas_set_num_threads(int num_threads) {
}
void NAME(int* num_threads){
}
#endif #endif

View File

@ -163,9 +163,9 @@ int get_L2_size(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -384,6 +384,17 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif
#if defined(SANDYBRIDGE)
sgemm_p = 1024;
dgemm_p = 512;
cgemm_p = 512;
zgemm_p = 256;
#ifdef EXPRECISION
qgemm_p = 256;
xgemm_p = 128;
#endif
#endif
#if defined(CORE_PRESCOTT) || defined(GENERIC) #if defined(CORE_PRESCOTT) || defined(GENERIC)
size >>= 6; size >>= 6;
@ -435,7 +446,7 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif
#if defined(CORE_BARCELONA) #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
size >>= 8; size >>= 8;
sgemm_p = 232 * size; sgemm_p = 232 * size;

View File

@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll
$(RANLIB) ../$(LIBNAME) $(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1) ifeq ($(BINARY32), 1)
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
-lib /machine:i386 /def:libopenblas.def -lib /machine:i386 /def:libopenblas.def
else else
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
-lib /machine:X64 /def:libopenblas.def -lib /machine:X64 /def:libopenblas.def
endif endif
@ -121,7 +121,7 @@ so : ../$(LIBSONAME)
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(EXTRALIB) -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest rm -f linktest

File diff suppressed because it is too large Load Diff

View File

@ -32,11 +32,12 @@ if ($compiler eq "") {
"pgf95", "pgf90", "pgf77", "pgf95", "pgf90", "pgf77",
"ifort"); "ifort");
OUTER:
foreach $lists (@lists) { foreach $lists (@lists) {
foreach $path (@path) { foreach $path (@path) {
if (-f $path . "/" . $lists) { if (-x $path . "/" . $lists) {
$compiler = $lists; $compiler = $lists;
break; last OUTER;
} }
} }
} }

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_BARCELONA */ /* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */ /* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */ /* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_SSE_GENERIC */ /* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */ /* #define FORCE_VIAC3 */
/* #define FORCE_NANO */ /* #define FORCE_NANO */
@ -278,6 +279,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "NEHALEM" #define CORENAME "NEHALEM"
#endif #endif
#ifdef FORCE_SANDYBRIDGE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -349,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BARCELONA" #define CORENAME "BARCELONA"
#endif #endif
#if defined(FORCE_BOBCAT)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BOBCAT"
#define ARCHCONFIG "-DBOBCAT " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
#define LIBNAME "bobcat"
#define CORENAME "BOBCAT"
#endif
#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL

View File

@ -0,0 +1,235 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` adjacent source columns into `dest`.
 *
 * Every element is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a complex value -- confirm against the caller).
 * Element r of column c is read from src[c*2*srcdim + 2*r] and
 * src[c*2*srcdim + 2*r + 1].  The panel is written row by row: for each
 * row, the `width` pairs of that row are stored back to back, so the
 * panel occupies exactly row * width * 2 FLOATs of `dest`.
 */
static void ncopy4_pack_panel(BLASLONG row, BLASLONG width,
                              FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    BLASLONG r, c;
    FLOAT *elem;

    for (r = 0; r < row; r++) {
        for (c = 0; c < width; c++) {
            elem = src + c * 2 * srcdim + 2 * r;
            dest[0] = elem[0];
            dest[1] = elem[1];
            dest += 2;
        }
    }
}

/*
 * Copy a row x col block of FLOAT pairs from `src` into the packed buffer
 * `dest`, grouping columns into panels of 4, then (for the remainder) one
 * panel of 2 and one panel of 1.
 *
 *   row    - number of pair-elements per source column
 *   col    - number of source columns
 *   src    - source block; consecutive columns are 2*srcdim FLOATs apart
 *   srcdim - column stride of `src`, counted in pair-elements
 *   dest   - output buffer; receives row * col * 2 FLOATs in total
 *
 * Panels are laid out one after another in `dest`; inside a panel the data
 * is row-major (see ncopy4_pack_panel).  Always returns 0.
 *
 * A non-positive `row` or `col` is treated as an empty block and nothing is
 * copied.  (The hand-unrolled predecessor would run its `&2` / `&1` tails
 * and move `dest` backwards for negative sizes.)
 */
int CNAME(BLASLONG row, BLASLONG col, FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    BLASLONG j;

    if (row <= 0 || col <= 0) return 0;

    /* Full panels of 4 columns. */
    for (j = 0; j < col / 4; j++) {
        ncopy4_pack_panel(row, 4, src, srcdim, dest);
        src  += 4 * 2 * srcdim;
        dest += row * 4 * 2;
    }

    /* Remaining 2-column panel, if any. */
    if (col & 2) {
        ncopy4_pack_panel(row, 2, src, srcdim, dest);
        src  += 2 * 2 * srcdim;
        dest += row * 2 * 2;
    }

    /* Remaining single column, if any. */
    if (col & 1) {
        ncopy4_pack_panel(row, 1, src, srcdim, dest);
    }

    return 0;
}

View File

@ -0,0 +1,401 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` adjacent source columns into `dest`.
 *
 * Every element is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a complex value -- confirm against the caller).
 * Element r of column c is read from src[c*2*srcdim + 2*r] and
 * src[c*2*srcdim + 2*r + 1].  The panel is written row by row: for each
 * row, the `width` pairs of that row are stored back to back, so the
 * panel occupies exactly row * width * 2 FLOATs of `dest`.
 */
static void ncopy8_pack_panel(BLASLONG row, BLASLONG width,
                              FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    BLASLONG r, c;
    FLOAT *elem;

    for (r = 0; r < row; r++) {
        for (c = 0; c < width; c++) {
            elem = src + c * 2 * srcdim + 2 * r;
            dest[0] = elem[0];
            dest[1] = elem[1];
            dest += 2;
        }
    }
}

/*
 * Copy a row x col block of FLOAT pairs from `src` into the packed buffer
 * `dest`, grouping columns into panels of 8, then (for the remainder) at
 * most one panel each of 4, 2 and 1 columns, in that order.
 *
 *   row    - number of pair-elements per source column
 *   col    - number of source columns
 *   src    - source block; consecutive columns are 2*srcdim FLOATs apart
 *   srcdim - column stride of `src`, counted in pair-elements
 *   dest   - output buffer; receives row * col * 2 FLOATs in total
 *
 * Panels are laid out one after another in `dest`; inside a panel the data
 * is row-major (see ncopy8_pack_panel).  Always returns 0.
 *
 * A non-positive `row` or `col` is treated as an empty block and nothing is
 * copied.  (The hand-unrolled predecessor would run its remainder tails
 * and move `dest` backwards for negative sizes.)
 */
int CNAME(BLASLONG row, BLASLONG col, FLOAT *src, BLASLONG srcdim, FLOAT *dest)
{
    BLASLONG j;

    if (row <= 0 || col <= 0) return 0;

    /* Full panels of 8 columns. */
    for (j = 0; j < col / 8; j++) {
        ncopy8_pack_panel(row, 8, src, srcdim, dest);
        src  += 8 * 2 * srcdim;
        dest += row * 8 * 2;
    }

    /* Remaining 4-column panel, if any. */
    if (col & 4) {
        ncopy8_pack_panel(row, 4, src, srcdim, dest);
        src  += 4 * 2 * srcdim;
        dest += row * 4 * 2;
    }

    /* Remaining 2-column panel, if any. */
    if (col & 2) {
        ncopy8_pack_panel(row, 2, src, srcdim, dest);
        src  += 2 * 2 * srcdim;
        dest += row * 2 * 2;
    }

    /* Remaining single column, if any. */
    if (col & 1) {
        ncopy8_pack_panel(row, 1, src, srcdim, dest);
    }

    return 0;
}

View File

@ -0,0 +1,237 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copies a row x col block of two-FLOAT elements (presumably the real and
 * imaginary parts of complex numbers -- confirm against the build rules)
 * from src into the packed buffer dest.
 *
 *   row, col - block size, counted in elements
 *   src      - first source row; consecutive rows are 2*srcdim FLOATs apart
 *   srcdim   - source row pitch, counted in elements
 *   dest     - output buffer; the code writes 2*row*col FLOATs in total
 *
 * Columns are taken in groups of 4, then one group of 2 (if col&2), then
 * one group of 1 (if col&1).  All rows of one column group are stored
 * back to back, so the n-th full group starts at dest + n*row*8, the
 * 2-wide tail at dest + (col&-4)*2*row and the 1-wide tail at
 * dest + (col&-2)*2*row.  Rows are walked in panels of 4, then 2, then 1.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG panel_height[3];   /* rows per panel, tallest first              */
  BLASLONG panel_count[3];    /* number of panels of each height            */
  FLOAT *line[4];             /* read cursor for every source row in panel  */
  FLOAT *wide;                /* write cursor for the full 4-column groups  */
  FLOAT *pair_out;            /* write cursor for the 2-column tail group   */
  FLOAT *single_out;          /* write cursor for the 1-column tail group   */
  BLASLONG kind, panel, blk, r, k, height;

  panel_height[0] = 4;  panel_count[0] = row/4;
  panel_height[1] = 2;  panel_count[1] = (row&2) ? 1 : 0;
  panel_height[2] = 1;  panel_count[2] = (row&1) ? 1 : 0;

  pair_out   = dest + (col&-4)*(2*row);
  single_out = dest + (col&-2)*(2*row);

  for (kind = 0; kind < 3; kind++)
    {
      height = panel_height[kind];
      for (panel = 0; panel < panel_count[kind]; panel++)
        {
          /* Pick up the next `height` source rows and step src past them. */
          for (r = 0; r < height; r++)
            line[r] = src + r*(2*srcdim);
          src += height*(2*srcdim);

          /* This panel owns height*8 FLOATs inside every full column group. */
          wide = dest;
          dest += height*8;

          for (blk = 0; blk < col/4; blk++)
            {
              for (r = 0; r < height; r++)
                {
                  for (k = 0; k < 8; k++)
                    wide[r*8 + k] = line[r][k];
                  line[r] += 8;
                }
              /* Jump to the same panel slot in the next column group. */
              wide += row*8;
            }

          if (col&2)
            {
              for (r = 0; r < height; r++)
                {
                  for (k = 0; k < 4; k++)
                    pair_out[r*4 + k] = line[r][k];
                  line[r] += 4;
                }
              pair_out += height*4;
            }

          if (col&1)
            {
              for (r = 0; r < height; r++)
                {
                  single_out[r*2]     = line[r][0];
                  single_out[r*2 + 1] = line[r][1];
                  line[r] += 2;
                }
              single_out += height*2;
            }
        }
    }
  return 0;
}

View File

@ -0,0 +1,370 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copies a row x col block of two-FLOAT elements (presumably the real and
 * imaginary parts of complex numbers -- confirm against the build rules)
 * from src into the packed buffer dest.
 *
 *   row, col - block size, counted in elements
 *   src      - first source row; consecutive rows are 2*srcdim FLOATs apart
 *   srcdim   - source row pitch, counted in elements
 *   dest     - output buffer; the code writes 2*row*col FLOATs in total
 *
 * Columns are taken in groups of 8, then at most one group each of 4, 2
 * and 1 (selected by col&4, col&2, col&1).  All rows of one column group
 * are stored back to back, so the n-th full group starts at
 * dest + n*row*16 and the tails start at dest + (col&-8)*2*row,
 * dest + (col&-4)*2*row and dest + (col&-2)*2*row respectively.
 * Rows are walked in panels of 4, then 2, then 1.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG panel_height[3];   /* rows per panel, tallest first              */
  BLASLONG panel_count[3];    /* number of panels of each height            */
  FLOAT *line[4];             /* read cursor for every source row in panel  */
  FLOAT *wide;                /* write cursor for the full 8-column groups  */
  FLOAT *quad_out;            /* write cursor for the 4-column tail group   */
  FLOAT *pair_out;            /* write cursor for the 2-column tail group   */
  FLOAT *single_out;          /* write cursor for the 1-column tail group   */
  BLASLONG kind, panel, blk, r, k, height;

  panel_height[0] = 4;  panel_count[0] = row/4;
  panel_height[1] = 2;  panel_count[1] = (row&2) ? 1 : 0;
  panel_height[2] = 1;  panel_count[2] = (row&1) ? 1 : 0;

  quad_out   = dest + (col&-8)*(2*row);
  pair_out   = dest + (col&-4)*(2*row);
  single_out = dest + (col&-2)*(2*row);

  for (kind = 0; kind < 3; kind++)
    {
      height = panel_height[kind];
      for (panel = 0; panel < panel_count[kind]; panel++)
        {
          /* Pick up the next `height` source rows and step src past them. */
          for (r = 0; r < height; r++)
            line[r] = src + r*(2*srcdim);
          src += height*(2*srcdim);

          /* This panel owns height*16 FLOATs inside every full column group. */
          wide = dest;
          dest += height*16;

          for (blk = 0; blk < col/8; blk++)
            {
              for (r = 0; r < height; r++)
                {
                  for (k = 0; k < 16; k++)
                    wide[r*16 + k] = line[r][k];
                  line[r] += 16;
                }
              /* Jump to the same panel slot in the next column group. */
              wide += row*16;
            }

          if (col&4)
            {
              for (r = 0; r < height; r++)
                {
                  for (k = 0; k < 8; k++)
                    quad_out[r*8 + k] = line[r][k];
                  line[r] += 8;
                }
              quad_out += height*8;
            }

          if (col&2)
            {
              for (r = 0; r < height; r++)
                {
                  for (k = 0; k < 4; k++)
                    pair_out[r*4 + k] = line[r][k];
                  line[r] += 4;
                }
              pair_out += height*4;
            }

          if (col&1)
            {
              for (r = 0; r < height; r++)
                {
                  single_out[r*2]     = line[r][0];
                  single_out[r*2 + 1] = line[r][1];
                  line[r] += 2;
                }
              single_out += height*2;
            }
        }
    }
  return 0;
}

View File

@ -746,6 +746,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
/* SANDYBRIDGE build: seed the table's *gemm_p fields with the compile-time
   xGEMM_DEFAULT_P values. */
#ifdef SANDYBRIDGE
#ifdef DEBUG
/* Debug builds announce on stderr which target branch was compiled in. */
fprintf(stderr, "Sandybridge\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
/* The q/x entries are only assigned when EXPRECISION is defined. */
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef OPTERON #ifdef OPTERON
#ifdef DEBUG #ifdef DEBUG
@ -778,6 +794,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
/* BOBCAT build: seed the table's *gemm_p fields with the compile-time
   xGEMM_DEFAULT_P values. */
#ifdef BOBCAT
#ifdef DEBUG
/* Debug builds announce on stderr which target branch was compiled in.
   The message previously read "Bobcate", which did not match the target. */
fprintf(stderr, "Bobcat\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
/* The q/x entries are only assigned when EXPRECISION is defined. */
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO #ifdef NANO
#ifdef DEBUG #ifdef DEBUG

59
kernel/x86/KERNEL.BOBCAT Normal file
View File

@ -0,0 +1,59 @@
# Kernel source selection for the BOBCAT target (kernel/x86).
# Every GEMM and GEMM3M kernel named here is a *_barcelona.S source; the TRSM
# kernels are the *_sse.S / *_sse2.S sources.  All assignments are recursive
# (`=`), so $(TSUFFIX) and $(SUFFIX) are expanded where the *OBJ names are used.

# Single-precision real GEMM.
# INCOPY/ITCOPY and their OBJ names are deliberately assigned empty values;
# presumably the 4x4 kernel needs only the 4-wide "O" copy routines -- confirm
# against the makefile that consumes these variables.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-precision real GEMM: 2x4 kernel, with 2-wide "I" copies and 4-wide
# "O" copies.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Single-precision complex GEMM: 2x2 kernel; "I" copies left empty as for SGEMM.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-precision complex GEMM: 1x2 kernel, with 1-wide "I" copies and 2-wide
# "O" copies.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels, one variable per variant (LN, LT, RN, RT).
# In every precision below the RN variant names the same source as LT.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
# NOTE(review): unlike S/D/C above, ZTRSM's LN variant also names the LT
# source rather than a dedicated LN file -- confirm this is intentional.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# GEMM3M kernels for the two complex precisions.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -0,0 +1 @@
# This target defines no kernel list of its own: it pulls in the PENRYN
# selection from $(KERNELDIR) unchanged.
include $(KERNELDIR)/KERNEL.PENRYN

View File

@ -76,6 +76,12 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif
#ifndef PREFETCH #ifndef PREFETCH
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#endif #endif

View File

@ -69,6 +69,12 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (16 * 1 - 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif
#ifndef PREFETCH #ifndef PREFETCH
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#endif #endif
@ -262,7 +268,7 @@
movaps -16 * SIZE(AA), %xmm0 movaps -16 * SIZE(AA), %xmm0
addps %xmm2, %xmm7 addps %xmm2, %xmm7
#ifndef NEHALEM #if !(defined(NEHALEM) || defined(SANDYBRIDGE))
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
pshufd $0x93, %xmm1, %xmm2 pshufd $0x93, %xmm1, %xmm2

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 4)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 4)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1697,7 +1697,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1848,7 +1848,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2429,7 +2429,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2952,7 +2952,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -3148,7 +3148,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3389,7 +3389,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1439,7 +1439,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1855,7 +1855,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2562,7 +2562,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2957,7 +2957,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -3280,7 +3280,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3515,7 +3515,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -1036,7 +1036,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -439,7 +439,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -758,7 +758,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -993,7 +993,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -1324,7 +1324,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2031,7 +2031,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2

View File

@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0

View File

@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHSIZE (16 * 1 + 8)
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2) #define PREFETCHSIZE (16 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2) #define PREFETCHSIZE (16 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)

View File

@ -55,7 +55,7 @@
#define XX %edi #define XX %edi
#define FLAG %ebp #define FLAG %ebp
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@ -697,7 +697,7 @@
cmpl $2 * SIZE, INCX cmpl $2 * SIZE, INCX
jne .L120 jne .L120
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm0, %xmm6)
PSHUFD2($0, %xmm1, %xmm1) PSHUFD2($0, %xmm1, %xmm1)

View File

@ -57,7 +57,7 @@
#include "l1param.h" #include "l1param.h"
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@ -860,7 +860,7 @@
cmpl $2 * SIZE, INCX cmpl $2 * SIZE, INCX
jne .L220 jne .L220
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
#ifdef HAVE_SSE3 #ifdef HAVE_SSE3
movddup %xmm0, %xmm6 movddup %xmm0, %xmm6

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -0,0 +1,62 @@
# Kernel source selection for one CPU target. Each variable names the source
# file (assembly .S, or C under ../generic) used to build one kernel, or the
# object name that kernel is built into.
# NOTE(review): the file name is not visible here; the Barcelona/Opteron
# sources suggest this is the AMD Bobcat list mentioned in the ChangeLog
# ("Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes") - confirm.
# All assignments use recursive "=", so $(TSUFFIX) and $(SUFFIX) are expanded
# at use time and may be defined by the including makefile.
#
# ZGEMV kernels (N and T variants).
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
# SGEMM: 8x4 Barcelona kernel. Inner (IN/IT) copy routines are the generic C
# 8-wide versions; outer (ON/OT) copy routines are the Opteron 4-wide assembly.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 4x4 Barcelona kernel. The inner copy sources and objects are left
# empty on purpose; only the outer copy routines are built.
# NOTE(review): presumably empty because the kernel is square (4x4), so the
# outer copies serve both operands - confirm against the kernel Makefile.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 4x2 Barcelona kernel, generic C inner copies, assembly outer copies.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 2x2 Barcelona kernel; inner copies left empty, as for DGEMM above.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels. In every group below the RN entry names the same source file
# as the LT entry; there is no separate *_RN_* source listed.
# NOTE(review): presumably the shared source is built with different
# preprocessor defines per variant - confirm in the kernel Makefile.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
# GEMM3M kernels (Barcelona assembly).
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

View File

@ -0,0 +1,84 @@
# Kernel source selection for one CPU target. Each variable names the source
# file (assembly .S, or C under ../generic) used to build one kernel, or the
# object name that kernel is built into.
# NOTE(review): the file name is not visible here; the *_sandy sources suggest
# this is the Intel Sandy Bridge list (the ChangeLog mentions AVX-optimized
# Level-3 BLAS for Sandy Bridge x86-64) - confirm.
# All assignments use recursive "=", so $(TSUFFIX) and $(SUFFIX) are expanded
# at use time and may be defined by the including makefile.
# Lines that begin with "#" followed by a variable name are disabled
# alternative assignments that were already present; they have no effect.
#
# SGEMM: 8x8 Sandy Bridge kernel. Inner (IN/IT) copy sources and objects are
# left empty; only the outer (ON/OT) generic C 8-wide copies are built.
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 4x8 Sandy Bridge kernel with generic C copies (8-wide inner, 4-wide
# outer). The assembly outer-copy alternatives are disabled.
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
#DGEMMONCOPY = gemm_ncopy_4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 4x8 Sandy Bridge kernel; the Nehalem kernel is the disabled
# alternative. All four copy routines use the *_sandy.c generic C sources.
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 4x4 Sandy Bridge kernel; the Nehalem kernel is the disabled
# alternative. Inner copies are left empty; outer copies are generic C.
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM: the Nehalem assembly kernels below are all disabled.
# NOTE(review): presumably because their register blocking (4x8, 2x8, 2x4,
# 1x4 in the file names) does not match the GEMM kernels selected above -
# confirm before re-enabling any of them.
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
# Active TRSM selection: the same four generic C sources (LN/LT/RN/RT) are
# used for every precision prefix (S, D, C, Z).
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# GEMM3M kernels still use the Nehalem assembly sources.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -45,6 +45,12 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef MOVAPS #ifndef MOVAPS
#define MOVAPS movaps #define MOVAPS movaps
#endif #endif

View File

@ -45,7 +45,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE 16 #define PREFETCHSIZE 16
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0

View File

@ -45,6 +45,12 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef MOVAPS #ifndef MOVAPS
#define MOVAPS movaps #define MOVAPS movaps
#endif #endif

View File

@ -52,6 +52,13 @@
#define MOVUPS_A movups #define MOVUPS_A movups
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define MOVUPS_A movups
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCHSIZE 16 #define PREFETCHSIZE 16
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0

View File

@ -51,6 +51,12 @@
#define MOVUPS_A movups #define MOVUPS_A movups
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define MOVUPS_A movups
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCHSIZE 16 #define PREFETCHSIZE 16
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0

View File

@ -46,6 +46,13 @@
#define MOVUPS_A movups #define MOVUPS_A movups
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define MOVUPS_A movups
#endif
#ifdef MOVUPS_A #ifdef MOVUPS_A
#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS

View File

@ -46,6 +46,13 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI
#define M ARG1 /* rdi */ #define M ARG1 /* rdi */

View File

@ -46,7 +46,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12 #define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define WPREFETCHSIZE (RPREFETCHSIZE * 4)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0

View File

@ -46,6 +46,13 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI
#define M ARG1 /* rdi */ #define M ARG1 /* rdi */

View File

@ -46,7 +46,7 @@
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12 #define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define WPREFETCHSIZE (RPREFETCHSIZE * 4)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0

File diff suppressed because it is too large Load Diff

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12) #define PREFETCHSIZE (16 * 12)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

File diff suppressed because it is too large Load Diff

View File

@ -685,7 +685,7 @@
cmpq $2 * SIZE, INCX cmpq $2 * SIZE, INCX
jne .L120 jne .L120
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm0, %xmm14
pshufd $0, %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1

View File

@ -55,7 +55,7 @@
#include "l1param.h" #include "l1param.h"
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@ -803,7 +803,7 @@
cmpq $2 * SIZE, INCX cmpq $2 * SIZE, INCX
jne .L220 jne .L220
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
movddup %xmm0, %xmm14 movddup %xmm0, %xmm14
pxor %xmm15, %xmm15 pxor %xmm15, %xmm15

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
@ -160,7 +160,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24) #define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

Some files were not shown because too many files have changed in this diff Show More