Merge branch 'release-0.2.0'

This commit is contained in:
Zhang Xianyi 2012-06-26 07:45:23 +08:00
commit 47860cf002
106 changed files with 17268 additions and 522 deletions

7
.gitignore vendored
View File

@ -1,16 +1,23 @@
*.obj
*.lib
*.dll
*.dylib
*.def
*.o
lapack-3.1.1
lapack-3.1.1.tgz
lapack-3.4.1
lapack-3.4.1.tgz
*.so
*.a
.svn
*~
lib.grd
nohup.out
config.h
Makefile.conf
Makefile.conf_last
config_last.h
getarch
getarch_2nd
utest/openblas_utest

View File

@ -1,4 +1,17 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.0
26-Jun-2012
common:
* Removed the limitation (64) of numbers of CPU cores.
Now, it supports 256 cores at max.
* Supported clang compiler.
* Fixed some build bugs on FreeBSD
x86/x86-64:
* Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions.
Please use gcc >= 4.6 or clang >=3.1.
* Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes.
====================================================================
Version 0.1.1
29-Apr-2012
@ -7,6 +20,8 @@ common:
* Supported LAPACKE, a C interface to LAPACK. (Thanks to Zaheer Chothia)
* Fixed the build bug (MD5 and download) on Mac OSX.
* Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
* Fixed the compatibility issue for compilers without C99 complex number
(e.g. Visual Studio)
x86/x86_64:
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
* Test alpha=Nan in dscale.

View File

@ -90,6 +90,15 @@
number of threads will consume extra resource. I recommend you to
specify minimum number of threads.
1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong?
A This may be related to a bug in the Linux kernel 2.6.32. Try applying
the patch segfaults.patch using
patch < segfaults.patch
and see if the crashes persist. Note that this patch will lead to many
compiler warnings.
2. Architecture Specific issue or Implementation

View File

@ -256,12 +256,17 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz
lapack-3.4.1.tgz :
ifndef NOFORTRAN
ifeq ($(OSNAME), Darwin)
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL)
else
ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL)
else
wget $(LAPACK_URL)
endif
endif
endif
large.tgz :
ifndef NOFORTRAN

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.1.1
VERSION = 0.2.0
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -94,8 +94,8 @@ VERSION = 0.1.1
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4
# in small matrix sizes. The default value is 50.
# GEMM_MULTITHREAD_THRESHOLD = 50
# If you need a sanity check by comparing with reference BLAS. It'll be very
# slow (Not implemented yet).

View File

@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1
endif
# Default C compiler
# - Only set if not specified on the command line or inherited from the environment.
# - CC is an implicit variable so neither '?=' nor 'ifndef' can be used.
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
CC = gcc
endif
# Default Fortran compiler (FC) is selected by f_check.
ifndef MAKEFILE_RULE
include $(TOPDIR)/Makefile.rule
@ -45,7 +53,7 @@ GETARCH_FLAGS += -DUSE64BITINT
endif
ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
GEMM_MULTITHREAD_THRESHOLD=50
endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
@ -108,6 +116,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), FreeBSD)
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), NetBSD)
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), Linux)
EXTRALIB += -lm
endif
@ -231,11 +247,11 @@ endif
ifdef DYNAMIC_ARCH
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
endif
ifndef DYNAMIC_CORE
@ -754,6 +770,7 @@ export HAVE_SSE4_1
export HAVE_SSE4_2
export HAVE_SSE4A
export HAVE_SSE5
export HAVE_AVX
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE

84
README
View File

@ -1,84 +0,0 @@
OpenBLAS Readme
1.Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn)
Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki).
2.Intallation
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or,
check out codes from git://github.com/xianyi/OpenBLAS.git
1)Normal compile
(a) type "make" to detect the CPU automatically.
or
(b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
2)Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
examples:
On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
3)Debug version
make DEBUG=1
4)Intall to the directory (Optional)
e.g.
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
3.Support CPU & OS
Please read GotoBLAS_01Readme.txt
Additional support CPU:
x86_64:
Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
MIPS64:
ICT Loongson 3A //Level 3 BLAS subroutines are optimized.
4.Usages
Link with libopenblas.a or -lopenblas for shared library.
4.1 Set the number of threads with environment variables. for example,
export OPENBLAS_NUM_THREADS=4
or
export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
4.2 Set the number of threads with calling functions. for example,
void goto_set_num_threads(int num_threads);
or
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
5.Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
6.To-Do List:
Optimization on ICT Loongson 3A CPU
7.Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
8.ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
9.Known Issues
* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS.
10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This a main branch to reflect a production-ready state.
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

110
README.md Normal file
View File

@ -0,0 +1,110 @@
# OpenBLAS
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
## Installation
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
### Normal compile
* type "make" to detect the CPU automatically.
or
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Examples:
On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
### Debug version
make DEBUG=1
### Install to the directory (Optional)
Example:
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2.
- **ICT Loongson 3B**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
- **FreeBSD**: Supported by the community. We didn't test the library on this OS.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
### Set the number of threads with environment variables.
Examples:
export OPENBLAS_NUM_THREADS=4
or
export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
### Set the number of threads on runtime.
We provide the functions below to control the number of threads at runtime. So far, changing the number of threads is not supported on Windows. On Windows, these functions are dummies.
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please add an issue at https://github.com/xianyi/OpenBLAS/issues
## Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
## ChangeLog
Please see Changelog.txt to obtain the differences from the GotoBLAS2 1.13 BSD version.
## Troubleshooting
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
* The number of CPUs/Cores should be less than or equal to 256.
* On Loongson 3A, make test may fail because of a pthread_create error. The error code is EAGAIN. However, the same test case passes when you run it directly from the shell.
## Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This is the main branch, reflecting a production-ready state.
* The develop branch. This is the main branch reflecting the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

View File

@ -18,6 +18,7 @@ CORE2
PENRYN
DUNNINGTON
NEHALEM
SANDYBRIDGE
ATOM
b)AMD CPU:
@ -27,6 +28,7 @@ OPTERON_SSE3
BARCELONA
SHANGHAI
ISTANBUL
BOBCAT
c)VIA CPU:
SSE_GENERIC
@ -47,6 +49,7 @@ CELL
3.MIPS64 CPU:
SICORTEX
LOONGSON3A
LOONGSON3B
4.IA64 CPU:
ITANIUM2

10
c_check
View File

@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/);
$compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FreeBSD/);
$os = NetBSD if ($data =~ /OS_NetBSD/);
$os = Darwin if ($data =~ /OS_Darwin/);
$os = SunOS if ($data =~ /OS_SunOS/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/);
$os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/);
$os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$architecture = x86 if ($data =~ /ARCH_X86/);

View File

@ -9,6 +9,10 @@ extern "C" {
#include <stddef.h>
#include "common.h"
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
#define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};

View File

@ -68,7 +68,7 @@ extern "C" {
#define SMP
#endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix)
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define WINDOWS_ABI
#define OS_WINDOWS
@ -89,7 +89,7 @@ extern "C" {
#include <sched.h>
#endif
#ifdef OS_DARWIN
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
#include <sched.h>
#endif

View File

@ -45,6 +45,8 @@ extern "C" {
int BLASFUNC(xerbla)(char *, blasint *info, blasint);
void BLASFUNC(openblas_set_num_threads)(int *);
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);

View File

@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) {
int openmp_nthreads=0;
#endif
if ((blas_cpu_number == 1)
if (blas_cpu_number == 1
#ifdef USE_OPENMP
|| omp_in_parallel()

View File

@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define PROFCODE
#endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX)
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define SAVEREGISTERS \
subl $32, %esp;\
movups %xmm6, 0(%esp);\
@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define RESTOREREGISTERS
#endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX)
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define PROLOGUE \
.text; \
.align 16; \
@ -282,7 +282,7 @@ REALNAME:
#define EPILOGUE .end REALNAME
#endif
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__)
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
#define PROLOGUE \
.text; \
.align 16; \
@ -356,4 +356,11 @@ REALNAME:
#ifndef ALIGN_6
#define ALIGN_6 .align 64
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif

View File

@ -353,7 +353,7 @@ REALNAME:
#define EPILOGUE .end REALNAME
#endif
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI)
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \
.text; \
.align 512; \
@ -425,6 +425,7 @@ REALNAME:
#define ALIGN_2 .align 2
#define ALIGN_3 .align 3
#define ALIGN_4 .align 4
#define ALIGN_5 .align 5
#define ffreep fstp
#endif
@ -448,4 +449,10 @@ REALNAME:
#define ALIGN_6 .align 64
#endif
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif

View File

@ -103,6 +103,8 @@
#define CORE_NEHALEM 17
#define CORE_ATOM 18
#define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@ -122,6 +124,7 @@
#define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@ -188,4 +191,6 @@ typedef struct {
#define CPUTYPE_NSGEODE 41
#define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#endif

View File

@ -189,6 +189,7 @@ int get_cputype(int gettype){
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX;
if (have_excpuid() >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
@ -983,13 +984,13 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CPUTYPE_NEHALEM;
return CPUTYPE_SANDYBRIDGE;
case 12:
//Xeon Processor 5600 (Westmere-EP)
return CPUTYPE_NEHALEM;
case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CPUTYPE_NEHALEM;
return CPUTYPE_SANDYBRIDGE;
case 15:
//Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM;
@ -1027,6 +1028,8 @@ int get_cpuname(void){
case 1:
case 10:
return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCAT;
}
break;
}
@ -1146,6 +1149,8 @@ static char *cpuname[] = {
"NSGEODE",
"VIAC3",
"NANO",
"SANDYBRIDGE",
"BOBCAT",
};
static char *lowercpuname[] = {
@ -1192,6 +1197,8 @@ static char *lowercpuname[] = {
"tms3x00",
"nsgeode",
"nano",
"sandybridge",
"bobcat",
};
static char *corename[] = {
@ -1215,6 +1222,8 @@ static char *corename[] = {
"NEHALEM",
"ATOM",
"NANO",
"SANDYBRIDGE",
"BOBCAT",
};
static char *corename_lower[] = {
@ -1238,6 +1247,8 @@ static char *corename_lower[] = {
"nehalem",
"atom",
"nano",
"sandybridge",
"bobcat",
};
@ -1321,13 +1332,13 @@ int get_coretype(void){
return CORE_NEHALEM;
case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CORE_NEHALEM;
return CORE_SANDYBRIDGE;
case 12:
//Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM;
case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CORE_NEHALEM;
return CORE_SANDYBRIDGE;
case 15:
//Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM;
@ -1346,7 +1357,9 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA;
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else return CORE_BARCELONA;
}
}
@ -1426,6 +1439,7 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n");
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
@ -1491,6 +1505,7 @@ void get_sse(void){
if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n");
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");

10
ctest.c
View File

@ -35,19 +35,19 @@ OS_LINUX
#endif
#if defined(__FreeBSD__)
OS_FreeBSD
OS_FREEBSD
#endif
#if defined(__NetBSD__)
OS_NetBSD
OS_NETBSD
#endif
#if defined(__sun)
OS_SunOS
OS_SUNOS
#endif
#if defined(__APPLE__)
OS_Darwin
OS_DARWIN
#endif
#if defined(_AIX)
@ -63,7 +63,7 @@ OS_WINNT
#endif
#if defined(__CYGWIN__)
OS_CYGWIN
OS_CYGWIN_NT
#endif
#if defined(__INTERIX)

View File

@ -1,12 +1,12 @@
TOPDIR = ../..
include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX)
endif

View File

@ -63,6 +63,14 @@ static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
void goto_set_num_threads(int num)
{
}
void openblas_set_num_threads(int num)
{
}
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){

View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define MAX_NODES 16
#define MAX_CPUS 256
#define NCPUBITS (8*sizeof(unsigned long))
#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS)
#define CPUELT(cpu) ((cpu) / NCPUBITS)
#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS))
#define SH_MAGIC 0x510510
@ -103,10 +108,10 @@ typedef struct {
int num_nodes;
int num_procs;
int final_num_procs;
unsigned long avail;
unsigned long avail [MAX_BITMASK_LEN];
int avail_count;
unsigned long cpu_info [MAX_CPUS];
unsigned long node_info [MAX_NODES];
unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN];
int cpu_use[MAX_CPUS];
} shm_t;
@ -126,7 +131,8 @@ static shm_t *common = (void *)-1;
static int shmid, pshmid;
static void *paddr;
static unsigned long lprocmask, lnodemask;
static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask;
static int lprocmask_count = 0;
static int numprocs = 1;
static int numnodes = 1;
@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) {
than sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
***/
static inline unsigned long get_cpumap(int node) {
static inline void get_cpumap(int node, unsigned long * node_info) {
int infile;
unsigned long affinity;
unsigned long affinity[32];
char name[160];
char cpumap[160];
char *p, *dummy;
char *dummy;
int i=0;
int count=0;
int k=0;
sprintf(name, CPUMAP_NAME, node);
infile = open(name, O_RDONLY);
affinity = 0;
for(i=0; i<32; i++){
affinity[i] = 0;
}
if (infile != -1) {
read(infile, cpumap, sizeof(cpumap));
p = cpumap;
while (*p != '\n' && i<160){
if(*p != ',') {
name[i++]=*p;
for(i=0; i<160; i++){
if(cpumap[i] == '\n')
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
//Enough data for Hex
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
p++;
}
p = name;
// while ((*p == '0') || (*p == ',')) p++;
affinity = strtoul(p, &dummy, 16);
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
node_info[i]=affinity[count-i-1];
}
close(infile);
}
return affinity;
return ;
}
static inline unsigned long get_share(int cpu, int level) {
static inline void get_share(int cpu, int level, unsigned long * share) {
int infile;
unsigned long affinity;
unsigned long affinity[32];
char cpumap[160];
char name[160];
char *p;
char *dummy;
int count=0;
int i=0,k=0;
int bitmask_idx = 0;
sprintf(name, SHARE_NAME, cpu, level);
infile = open(name, O_RDONLY);
affinity = (1UL << cpu);
// Init share
for(i=0; i<MAX_BITMASK_LEN; i++){
share[i]=0;
}
bitmask_idx = CPUELT(cpu);
share[bitmask_idx] = CPUMASK(cpu);
if (infile != -1) {
read(infile, name, sizeof(name));
read(infile, cpumap, sizeof(cpumap));
p = name;
for(i=0; i<160; i++){
if(cpumap[i] == '\n')
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
while ((*p == '0') || (*p == ',')) p++;
//Enough data
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
}
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
share[i]=affinity[count-i-1];
}
affinity = strtol(p, &p, 16);
close(infile);
}
return affinity;
return ;
}
static int numa_check(void) {
@ -248,6 +298,7 @@ static int numa_check(void) {
DIR *dp;
struct dirent *dir;
int node;
int j;
common -> num_nodes = 0;
@ -258,7 +309,9 @@ static int numa_check(void) {
return 0;
}
for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0;
for (node = 0; node < MAX_NODES; node ++) {
for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0;
}
while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
@ -266,12 +319,12 @@ static int numa_check(void) {
node = atoi(&dir -> d_name[4]);
if (node > MAX_NODES) {
fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n");
fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
exit(1);
}
common -> num_nodes ++;
common -> node_info[node] = get_cpumap(node);
get_cpumap(node, common->node_info[node]);
}
}
@ -284,7 +337,7 @@ static int numa_check(void) {
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
for (node = 0; node < common -> num_nodes; node ++)
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]);
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]);
#endif
return common -> num_nodes;
@ -296,11 +349,13 @@ static void numa_mapping(void) {
int i, j, h;
unsigned long work, bit;
int count = 0;
int bitmask_idx = 0;
for (node = 0; node < common -> num_nodes; node ++) {
core = 0;
for (cpu = 0; cpu < common -> num_procs; cpu ++) {
if (common -> node_info[node] & common -> avail & (1UL << cpu)) {
bitmask_idx = CPUELT(cpu);
if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) {
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
count ++;
core ++;
@ -357,58 +412,89 @@ static void numa_mapping(void) {
static void disable_hyperthread(void) {
unsigned long share;
unsigned long share[MAX_BITMASK_LEN];
int cpu;
int bitmask_idx = 0;
int i=0, count=0;
bitmask_idx = CPUELT(common -> num_procs);
if(common->num_procs > 64){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs);
exit(1);
}else if(common->num_procs == 64){
common -> avail = 0xFFFFFFFFFFFFFFFFUL;
}else
common -> avail = (1UL << common -> num_procs) - 1;
for(i=0; i< bitmask_idx; i++){
common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL;
}
if(CPUMASK(common -> num_procs) != 1){
common -> avail[count++] = CPUMASK(common -> num_procs) - 1;
}
common -> avail_count = count;
/* if(common->num_procs > 64){ */
/* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */
/* exit(1); */
/* }else if(common->num_procs == 64){ */
/* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */
/* }else */
/* common -> avail = (1UL << common -> num_procs) - 1; */
#ifdef DEBUG
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail);
fprintf(stderr, "\nAvail CPUs : ");
for(i=0; i<count; i++)
fprintf(stderr, "%04lx ", common -> avail[i]);
fprintf(stderr, ".\n");
#endif
for (cpu = 0; cpu < common -> num_procs; cpu ++) {
share = (get_share(cpu, 1) & common -> avail);
get_share(cpu, 1, share);
if (popcount(share) > 1) {
//When the shared cpu are in different element of share & avail array, this may be a bug.
for (i = 0; i < count ; i++){
if (popcount(share[i]) > 1) {
#ifdef DEBUG
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
cpu, share & ~(1UL << cpu));
cpu, share[i] & ~(CPUMASK(cpu)));
#endif
common -> avail &= ~((share & ~(1UL << cpu)));
common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
}
}
}
}
static void disable_affinity(void) {
int i=0;
int bitmask_idx=0;
int count=0;
#ifdef DEBUG
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail);
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]);
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
#endif
if(common->final_num_procs > 64){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs);
exit(1);
}else if(common->final_num_procs == 64){
lprocmask = 0xFFFFFFFFFFFFFFFFUL;
}else
lprocmask = (1UL << common -> final_num_procs) - 1;
/* if(common->final_num_procs > 64){ */
/* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */
/* exit(1); */
/* }else if(common->final_num_procs == 64){ */
/* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */
/* }else */
/* lprocmask = (1UL << common -> final_num_procs) - 1; */
bitmask_idx = CPUELT(common -> final_num_procs);
for(i=0; i< bitmask_idx; i++){
lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL;
}
if(CPUMASK(common -> final_num_procs) != 1){
lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1;
}
lprocmask_count = count;
#ifndef USE_OPENMP
lprocmask &= *(unsigned long *)&cpu_orig_mask[0];
for(i=0; i< count; i++){
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
}
#endif
#ifdef DEBUG
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask);
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]);
#endif
}
@ -498,7 +584,7 @@ static void create_pshmem(void) {
static void local_cpu_map(void) {
int cpu, id, mapping;
int bitmask_idx = 0;
cpu = 0;
mapping = 0;
@ -509,7 +595,8 @@ static void local_cpu_map(void) {
if (is_dead(id)) common -> cpu_use[cpu] = 0;
}
if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) {
bitmask_idx = CPUELT(cpu);
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
common -> cpu_use[cpu] = pshmid;
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) {
#ifndef USE_OPENMP
cpu_set_t cpu_mask;
#endif
int i;
if (initialized) return;
@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) {
common -> num_procs = get_nprocs();
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
exit(1);
}
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
numa_check();
@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = popcount(common -> avail);
common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) {
disable_affinity();
num_avail = popcount(lprocmask);
num_avail = 0;
for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]);
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;

View File

@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/syscall.h>
#endif
#if defined(OS_FreeBSD) || defined(OS_Darwin)
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#endif
@ -185,7 +185,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FreeBSD) || defined(OS_Darwin)
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
int get_num_procs(void) {
@ -215,7 +215,7 @@ int goto_get_num_procs (void) {
int blas_get_cpu_number(void){
char *p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
int max_num;
#endif
int blas_goto_num = 0;
@ -223,7 +223,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
max_num = get_num_procs();
#endif
@ -250,7 +250,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif

View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#ifdef SMP_SERVER
#ifdef OS_LINUX
extern void openblas_set_num_threads(int num_threads) ;
@ -41,5 +40,13 @@ void NAME(int* num_threads){
openblas_set_num_threads(*num_threads);
}
#endif
#else
//Single thread
void openblas_set_num_threads(int num_threads) {
}
void NAME(int* num_threads){
}
#endif

View File

@ -163,9 +163,9 @@ int get_L2_size(void){
int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC)
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -384,6 +384,17 @@ void blas_set_parameter(void){
#endif
#endif
/* Sandy Bridge: the P blocking factors are assigned as fixed constants in
   this branch; unlike the neighbouring branches they are not computed from
   `size`.  The extended-precision values (qgemm_p / xgemm_p) are only set
   when EXPRECISION is defined. */
#if defined(SANDYBRIDGE)
sgemm_p = 1024;
dgemm_p = 512;
cgemm_p = 512;
zgemm_p = 256;
#ifdef EXPRECISION
qgemm_p = 256;
xgemm_p = 128;
#endif
#endif
#if defined(CORE_PRESCOTT) || defined(GENERIC)
size >>= 6;
@ -435,7 +446,7 @@ void blas_set_parameter(void){
#endif
#endif
#if defined(CORE_BARCELONA)
#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
size >>= 8;
sgemm_p = 232 * size;

View File

@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
-lib /machine:i386 /def:libopenblas.def
else
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
-lib /machine:X64 /def:libopenblas.def
endif
@ -121,7 +121,7 @@ so : ../$(LIBSONAME)
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(EXTRALIB)
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest

File diff suppressed because it is too large Load Diff

View File

@ -32,11 +32,12 @@ if ($compiler eq "") {
"pgf95", "pgf90", "pgf77",
"ifort");
OUTER:
foreach $lists (@lists) {
foreach $path (@path) {
if (-f $path . "/" . $lists) {
if (-x $path . "/" . $lists) {
$compiler = $lists;
break;
last OUTER;
}
}
}

View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */
/* #define FORCE_NANO */
@ -278,6 +279,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "NEHALEM"
#endif
/* Forced target: Sandy Bridge.  When FORCE_SANDYBRIDGE is defined this block
   defines FORCE and FORCE_INTEL and supplies the identification strings
   (ARCHITECTURE, SUBARCHITECTURE, LIBNAME, CORENAME) together with the
   ARCHCONFIG flag string: -DSANDYBRIDGE, the L1 data / L2 size and line-size
   values, the DTB values and the HAVE_* feature macros up to HAVE_AVX. */
#ifdef FORCE_SANDYBRIDGE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
@ -349,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BARCELONA"
#endif
/* Forced target: Bobcat.  When FORCE_BOBCAT is defined this block defines
   FORCE and FORCE_INTEL and supplies the identification strings
   (ARCHITECTURE, SUBARCHITECTURE, LIBNAME, CORENAME) together with the
   ARCHCONFIG flag string: -DBOBCAT, the L1 data / L2 size and line-size
   values, the DTB values and the HAVE_* feature macros.
   NOTE(review): FORCE_INTEL is defined here although the ChangeLog describes
   Bobcat as an AMD core; presumably the macro selects the x86 family rather
   than the vendor -- confirm against where FORCE_INTEL is consumed. */
#if defined(FORCE_BOBCAT)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BOBCAT"
#define ARCHCONFIG "-DBOBCAT " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
#define LIBNAME "bobcat"
#define CORENAME "BOBCAT"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL

View File

@ -0,0 +1,235 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs a row x col array of FLOAT pairs from src into dest, one column
 * panel at a time.
 *
 * Input layout:  element (r, c) is the pair
 *                    src[2*(c*srcdim + r)], src[2*(c*srcdim + r) + 1].
 *                (The pairs are presumably the real/imaginary parts of
 *                complex numbers -- confirm against the callers.)
 *
 * Output layout: the columns are split into col/4 panels of width 4,
 *                followed by one panel of width 2 if (col & 2) and one panel
 *                of width 1 if (col & 1).  A panel of width w occupies
 *                2*w*row consecutive FLOATs; inside a panel the pairs are
 *                stored row by row, and inside a row in column order.
 *
 * row and col are expected to be non-negative; the code assumes src and dest
 * do not overlap.  Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG width,panels,p,r,c;
  FLOAT *in,*out;

  for (width=4; width>0; width>>=1)
  {
    /* col/4 full-width panels, then at most one panel per narrower width */
    panels = (width==4) ? col/4 : ((col&width) ? 1 : 0);

    for (p=0; p<panels; p+=1)
    {
      out = dest;
      for (r=0; r<row; r+=1)
      {
        for (c=0; c<width; c+=1)
        {
          /* pair (r, c) of the current panel */
          in = src+2*(c*srcdim+r);
          out[0] = in[0];
          out[1] = in[1];
          out = out+2;
        }
      }
      /* step past the `width` columns just consumed / the panel just written */
      src = src+2*width*srcdim;
      dest = dest+2*width*row;
    }
  }
  return 0;
}

View File

@ -0,0 +1,401 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs a row x col array of FLOAT pairs from src into dest, one column
 * panel at a time.
 *
 * Input layout:  element (r, c) is the pair
 *                    src[2*(c*srcdim + r)], src[2*(c*srcdim + r) + 1].
 *                (The pairs are presumably the real/imaginary parts of
 *                complex numbers -- confirm against the callers.)
 *
 * Output layout: the columns are split into col/8 panels of width 8,
 *                followed by one panel of width 4 if (col & 4), one of
 *                width 2 if (col & 2) and one of width 1 if (col & 1).
 *                A panel of width w occupies 2*w*row consecutive FLOATs;
 *                inside a panel the pairs are stored row by row, and inside
 *                a row in column order.
 *
 * row and col are expected to be non-negative; the code assumes src and dest
 * do not overlap.  Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG width,panels,p,r,c;
  FLOAT *in,*out;

  for (width=8; width>0; width>>=1)
  {
    /* col/8 full-width panels, then at most one panel per narrower width */
    panels = (width==8) ? col/8 : ((col&width) ? 1 : 0);

    for (p=0; p<panels; p+=1)
    {
      out = dest;
      for (r=0; r<row; r+=1)
      {
        for (c=0; c<width; c+=1)
        {
          /* pair (r, c) of the current panel */
          in = src+2*(c*srcdim+r);
          out[0] = in[0];
          out[1] = in[1];
          out = out+2;
        }
      }
      /* step past the `width` columns just consumed / the panel just written */
      src = src+2*width*srcdim;
      dest = dest+2*width*row;
    }
  }
  return 0;
}

View File

@ -0,0 +1,237 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs `row` source lines of `col` FLOAT pairs each from src into dest.
 *
 * Input layout:  line l starts at src + 2*l*srcdim and holds 2*col
 *                consecutive FLOATs.  (The pairs are presumably the
 *                real/imaginary parts of complex numbers -- confirm against
 *                the callers.)
 *
 * Output layout: every line is cut into col/4 pieces of 4 pairs, followed by
 *                one piece of 2 pairs if (col & 2) and one piece of 1 pair if
 *                (col & 1).
 *                  - The k-th 4-pair piece of line l is written to
 *                    dest + k*8*row + 8*l.
 *                  - The 2-pair pieces are written back to back, in line
 *                    order, starting at dest + (col & -4)*2*row.
 *                  - The 1-pair pieces are written back to back, in line
 *                    order, starting at dest + (col & -2)*2*row.
 *
 * row and col are expected to be non-negative; the code assumes src and dest
 * do not overlap.  Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG line,blk,k;
  FLOAT *in,*out;
  FLOAT *tail2,*tail1;

  /* start of the region that receives the 2-pair leftovers */
  tail2 = dest+(col&-4)*(2*row);
  /* start of the region that receives the 1-pair leftovers */
  tail1 = dest+(col&-2)*(2*row);

  for (line=0; line<row; line+=1)
  {
    in = src;
    out = dest;

    /* full pieces: 4 pairs (8 FLOATs) each, one output panel apart */
    for (blk=0; blk<col/4; blk+=1)
    {
      for (k=0; k<8; k+=1) out[k] = in[k];
      in = in+8;
      out = out+8*row;
    }
    if (col&2)
    {
      for (k=0; k<4; k+=1) tail2[k] = in[k];
      in = in+4;
      tail2 = tail2+4;
    }
    if (col&1)
    {
      tail1[0] = in[0];
      tail1[1] = in[1];
      tail1 = tail1+2;
    }

    /* next source line; its 4-pair pieces sit 8 FLOATs further in each panel */
    src = src+2*srcdim;
    dest = dest+8;
  }
  return 0;
}

View File

@ -0,0 +1,370 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs `row` source lines of `col` FLOAT pairs each from src into dest.
 *
 * Input layout:  line l starts at src + 2*l*srcdim and holds 2*col
 *                consecutive FLOATs.  (The pairs are presumably the
 *                real/imaginary parts of complex numbers -- confirm against
 *                the callers.)
 *
 * Output layout: every line is cut into col/8 pieces of 8 pairs, followed by
 *                one piece of 4 pairs if (col & 4), one of 2 pairs if
 *                (col & 2) and one of 1 pair if (col & 1).
 *                  - The k-th 8-pair piece of line l is written to
 *                    dest + k*16*row + 16*l.
 *                  - The 4-pair pieces are written back to back, in line
 *                    order, starting at dest + (col & -8)*2*row.
 *                  - The 2-pair pieces are written back to back, in line
 *                    order, starting at dest + (col & -4)*2*row.
 *                  - The 1-pair pieces are written back to back, in line
 *                    order, starting at dest + (col & -2)*2*row.
 *
 * row and col are expected to be non-negative; the code assumes src and dest
 * do not overlap.  Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG line,blk,k;
  FLOAT *in,*out;
  FLOAT *tail4,*tail2,*tail1;

  /* start of the region that receives the 4-pair leftovers */
  tail4 = dest+(col&-8)*(2*row);
  /* start of the region that receives the 2-pair leftovers */
  tail2 = dest+(col&-4)*(2*row);
  /* start of the region that receives the 1-pair leftovers */
  tail1 = dest+(col&-2)*(2*row);

  for (line=0; line<row; line+=1)
  {
    in = src;
    out = dest;

    /* full pieces: 8 pairs (16 FLOATs) each, one output panel apart */
    for (blk=0; blk<col/8; blk+=1)
    {
      for (k=0; k<16; k+=1) out[k] = in[k];
      in = in+16;
      out = out+16*row;
    }
    if (col&4)
    {
      for (k=0; k<8; k+=1) tail4[k] = in[k];
      in = in+8;
      tail4 = tail4+8;
    }
    if (col&2)
    {
      for (k=0; k<4; k+=1) tail2[k] = in[k];
      in = in+4;
      tail2 = tail2+4;
    }
    if (col&1)
    {
      tail1[0] = in[0];
      tail1[1] = in[1];
      tail1 = tail1+2;
    }

    /* next source line; its 8-pair pieces sit 16 FLOATs further in each panel */
    src = src+2*srcdim;
    dest = dest+16;
  }
  return 0;
}

View File

@ -746,6 +746,22 @@ static void init_parameter(void) {
#endif
#endif
/* SANDYBRIDGE builds: fill the gemm_p entry of TABLE_NAME for each precision
   from the corresponding compile-time *_DEFAULT_P macro. */
#ifdef SANDYBRIDGE
#ifdef DEBUG
/* Debug builds only: report on stderr which target section was compiled in. */
fprintf(stderr, "Sandybridge\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
/* The q/x entries are only assigned when EXPRECISION is defined. */
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef OPTERON
#ifdef DEBUG
@ -778,6 +794,22 @@ static void init_parameter(void) {
#endif
#endif
/* BOBCAT builds: fill the gemm_p entry of TABLE_NAME for each precision
   from the corresponding compile-time *_DEFAULT_P macro. */
#ifdef BOBCAT
#ifdef DEBUG
/* Debug builds only: report on stderr which target section was compiled in.
   The target is spelled "Bobcat" (the message previously read "Bobcate"). */
fprintf(stderr, "Bobcat\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
/* The q/x entries are only assigned when EXPRECISION is defined. */
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO
#ifdef DEBUG

59
kernel/x86/KERNEL.BOBCAT Normal file
View File

@ -0,0 +1,59 @@
# Kernel source selection for the 32-bit x86 BOBCAT target
# (kernel/x86/KERNEL.BOBCAT).
#
# Every GEMM and GEMM3M kernel named below is a *_barcelona.S file; the TRSM
# kernels are the SSE / SSE2 ones.  For each precision (S, D, C, Z) the file
# sets the GEMM kernel source, the four copy sources (INCOPY, ITCOPY, ONCOPY,
# OTCOPY) and the object names built from them.
# TSUFFIX and SUFFIX are not set here; they must come from whatever makefile
# reads this file.
# NOTE(review): do not append trailing comments to the assignment lines --
# in make, whitespace before a trailing '#' becomes part of the value.

# --- SGEMM: 4x4 kernel.  INCOPY/ITCOPY (and their objects) are left empty;
# only the size-4 ON/OT copies are listed.
# NOTE(review): presumably the IN/IT copies are unnecessary when both kernel
# dimensions are equal -- confirm against the makefile that consumes these.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- DGEMM: 2x4 kernel with size-2 IN/IT copies and size-4 ON/OT copies.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- CGEMM: 2x2 kernel (zgemm_* source file); IN/IT copies left empty,
# size-2 complex ON/OT copies.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- ZGEMM: 1x2 kernel with size-1 IN/IT copies and size-2 ON/OT copies.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- TRSM kernels, four variants (LN, LT, RN, RT) per precision.
# In every precision the RN variant names the LT source file, and for ZTRSM
# the LN variant names the LT file as well.
# NOTE(review): this sharing looks deliberate (it repeats in all four
# precisions) -- presumably the variant is chosen by build-time defines;
# confirm before "fixing" the file names.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# --- GEMM3M kernels: 4x4 for the C variant and 2x4 for the Z variant, the
# same sizes as the SGEMM and DGEMM kernels above.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -0,0 +1 @@
# This file defines no kernel list of its own: it reads the PENRYN kernel
# selection from $(KERNELDIR) and uses it unchanged.
include $(KERNELDIR)/KERNEL.PENRYN

View File

@ -76,6 +76,12 @@
#define PREFETCHB prefetcht0
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif
#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif

View File

@ -69,6 +69,12 @@
#define PREFETCHB prefetcht0
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (16 * 1 - 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif
#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif
@ -262,7 +268,7 @@
movaps -16 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
#ifndef NEHALEM
#if !(defined(NEHALEM) || defined(SANDYBRIDGE))
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
#endif
pshufd $0x93, %xmm1, %xmm2

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7)

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7)

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@ -439,7 +439,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@ -1697,7 +1697,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@ -437,7 +437,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@ -833,7 +833,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@ -1848,7 +1848,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@ -2109,7 +2109,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@ -2429,7 +2429,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@ -2952,7 +2952,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@ -3148,7 +3148,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@ -3389,7 +3389,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@ -910,7 +910,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@ -1439,7 +1439,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@ -872,7 +872,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@ -1316,7 +1316,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@ -1855,7 +1855,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@ -2249,7 +2249,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@ -2562,7 +2562,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@ -2957,7 +2957,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@ -3280,7 +3280,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@ -3515,7 +3515,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@ -1036,7 +1036,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
@ -2224,7 +2224,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@ -439,7 +439,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@ -758,7 +758,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@ -993,7 +993,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@ -1324,7 +1324,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@ -1718,7 +1718,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@ -2031,7 +2031,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@ -2859,7 +2859,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@ -3303,7 +3303,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2

View File

@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0

View File

@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (16 * 1 + 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7)

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2)
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7)

View File

@ -55,7 +55,7 @@
#define XX %edi
#define FLAG %ebp
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON)
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD
#else
#define USE_PSHUFD_HALF
@ -697,7 +697,7 @@
cmpl $2 * SIZE, INCX
jne .L120
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
PSHUFD2($0, %xmm0, %xmm6)
PSHUFD2($0, %xmm1, %xmm1)

View File

@ -57,7 +57,7 @@
#include "l1param.h"
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON)
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD
#else
#define USE_PSHUFD_HALF
@ -860,7 +860,7 @@
cmpl $2 * SIZE, INCX
jne .L220
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
#ifdef HAVE_SSE3
movddup %xmm0, %xmm6

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@ -533,7 +533,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@ -994,7 +994,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@ -0,0 +1,62 @@
# Kernel source selection built on the Barcelona GEMM/GEMM3M kernels.
# NOTE(review): the file name is not visible next to this listing; presumably
# this is the x86_64 BOBCAT kernel list -- confirm.
#
# For each precision (S, D, C, Z) the file sets the GEMM kernel source, the
# four copy sources (INCOPY, ITCOPY, ONCOPY, OTCOPY) and the object names
# built from them.  TSUFFIX and SUFFIX are not set here; they must come from
# whatever makefile reads this file.
# NOTE(review): do not append trailing comments to the assignment lines --
# in make, whitespace before a trailing '#' becomes part of the value.

# --- ZGEMV: the "dup" variants of the N and T kernels.
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
# --- SGEMM: 8x4 kernel; generic C size-8 IN/IT copies, Opteron assembly
# size-4 ON/OT copies.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- DGEMM: 4x4 kernel.  INCOPY/ITCOPY (and their objects) are left empty;
# only the size-4 ON/OT copies are listed.
# NOTE(review): presumably the IN/IT copies are unnecessary when both kernel
# dimensions are equal -- confirm against the makefile that consumes these.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- CGEMM: 4x2 kernel (zgemm_* source file); generic C size-4 IN/IT copies,
# assembly size-2 ON/OT copies.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- ZGEMM: 2x2 kernel; IN/IT copies left empty, assembly size-2 ON/OT copies.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- TRSM kernels, four variants (LN, LT, RN, RT) per precision.
# S, C and Z use SSE/SSE2 sources, D uses Barcelona sources.  In every
# precision the RN variant names the LT source file.
# NOTE(review): the RN/LT sharing looks deliberate (it repeats in all four
# precisions) -- presumably the variant is chosen by build-time defines;
# confirm before "fixing" the file names.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
# --- GEMM3M kernels: 8x4 for the C variant and 4x4 for the Z variant, the
# same sizes as the SGEMM and DGEMM kernels above.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

View File

@ -0,0 +1,84 @@
# Kernel source selection using the *_sandy.S GEMM kernels.
# NOTE(review): the file name is not visible next to this listing; presumably
# this is the x86_64 SANDYBRIDGE kernel list -- confirm.
#
# For each precision (S, D, C, Z) the file sets the GEMM kernel source, the
# four copy sources (INCOPY, ITCOPY, ONCOPY, OTCOPY) and the object names
# built from them.  TSUFFIX and SUFFIX are not set here; they must come from
# whatever makefile reads this file.
# Lines starting with '#' followed by an assignment are disabled alternatives
# that were kept for reference; only the uncommented assignment is active.
# NOTE(review): do not append trailing comments to the assignment lines --
# in make, whitespace before a trailing '#' becomes part of the value.

# --- SGEMM: 8x8 Sandy Bridge kernel.  INCOPY/ITCOPY (and their objects) are
# left empty; only the generic C size-8 ON/OT copies are listed.
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- DGEMM: Sandy Bridge kernel (file named 4x8) with size-8 IN/IT copies and
# size-4 ON/OT copies.  The assembly ON/OT copies (gemm_ncopy_4.S,
# gemm_tcopy_4.S) are disabled; the generic C versions are used instead.
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
#DGEMMONCOPY = gemm_ncopy_4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- CGEMM: Sandy Bridge kernel (the Nehalem 2x4 kernel is disabled).  All
# four copy sources are the *_sandy.c variants: size 8 for IN/IT, size 4 for
# ON/OT.
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- ZGEMM: 4x4 Sandy Bridge kernel (the Nehalem 1x4 kernel is disabled).
# IN/IT copies left empty; generic C size-4 ON/OT copies.
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- TRSM: the Nehalem assembly kernels below are all disabled.
# NOTE(review): presumably because their sizes (4x8, 2x8, 2x4, 1x4) do not
# match the Sandy Bridge GEMM kernels selected above -- confirm.
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
# --- TRSM (active): all four precisions use the same generic C sources, one
# file per variant (LN, LT, RN, RT).
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# --- GEMM3M: still the Nehalem kernels (4x8 for C, 2x8 for Z); no *_sandy
# GEMM3M kernel is referenced in this file.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -45,6 +45,12 @@
#define PREFETCHW prefetcht0
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef MOVAPS
#define MOVAPS movaps
#endif

View File

@ -45,7 +45,7 @@
#define PREFETCHW prefetcht0
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0

View File

@ -45,6 +45,12 @@
#define PREFETCHW prefetcht0
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef MOVAPS
#define MOVAPS movaps
#endif

View File

@ -52,6 +52,13 @@
#define MOVUPS_A movups
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define MOVUPS_A movups
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0

View File

@ -51,6 +51,12 @@
#define MOVUPS_A movups
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define MOVUPS_A movups
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0

View File

@ -46,6 +46,13 @@
#define MOVUPS_A movups
#endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define MOVUPS_A movups
#endif
#ifdef MOVUPS_A
#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS

View File

@ -46,6 +46,13 @@
#define PREFETCHW prefetcht0
#endif
#if defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef WINDOWS_ABI
#define M ARG1 /* rdi */

View File

@ -46,7 +46,7 @@
#define PREFETCHW prefetcht0
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
#define PREFETCH prefetcht0

View File

@ -46,6 +46,13 @@
#define PREFETCHW prefetcht0
#endif
#if defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif
#ifndef WINDOWS_ABI
#define M ARG1 /* rdi */

View File

@ -46,7 +46,7 @@
#define PREFETCHW prefetcht0
#endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define RPREFETCHSIZE 12
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
#define PREFETCH prefetcht0

File diff suppressed because it is too large Load Diff

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
@ -76,7 +76,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
@ -76,7 +76,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 12)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

File diff suppressed because it is too large Load Diff

View File

@ -685,7 +685,7 @@
cmpq $2 * SIZE, INCX
jne .L120
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
pshufd $0, %xmm0, %xmm14
pshufd $0, %xmm1, %xmm1

View File

@ -55,7 +55,7 @@
#include "l1param.h"
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO)
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE)
#define USE_PSHUFD
#else
#define USE_PSHUFD_HALF
@ -803,7 +803,7 @@
cmpq $2 * SIZE, INCX
jne .L220
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
movddup %xmm0, %xmm14
pxor %xmm15, %xmm15

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
@ -160,7 +160,7 @@
#define a3 %xmm14
#define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -57,7 +57,7 @@
#define PREFETCHSIZE (16 * 24)
#endif
#ifdef NEHALEM
#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp)
#define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4)
#endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp)
#define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4)
#endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta

Some files were not shown because too many files have changed in this diff Show More