Merge branch 'release-0.2.0'
This commit is contained in:
commit
47860cf002
|
@ -1,16 +1,23 @@
|
||||||
*.obj
|
*.obj
|
||||||
*.lib
|
*.lib
|
||||||
*.dll
|
*.dll
|
||||||
|
*.dylib
|
||||||
*.def
|
*.def
|
||||||
*.o
|
*.o
|
||||||
lapack-3.1.1
|
lapack-3.1.1
|
||||||
lapack-3.1.1.tgz
|
lapack-3.1.1.tgz
|
||||||
|
lapack-3.4.1
|
||||||
|
lapack-3.4.1.tgz
|
||||||
*.so
|
*.so
|
||||||
*.a
|
*.a
|
||||||
.svn
|
.svn
|
||||||
*~
|
*~
|
||||||
|
lib.grd
|
||||||
|
nohup.out
|
||||||
config.h
|
config.h
|
||||||
Makefile.conf
|
Makefile.conf
|
||||||
|
Makefile.conf_last
|
||||||
|
config_last.h
|
||||||
getarch
|
getarch
|
||||||
getarch_2nd
|
getarch_2nd
|
||||||
utest/openblas_utest
|
utest/openblas_utest
|
||||||
|
|
|
@ -1,4 +1,17 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.2.0
|
||||||
|
26-Jun-2012
|
||||||
|
common:
|
||||||
|
* Removed the limit (64) on the number of CPU cores.
|
||||||
|
Now, it supports 256 cores at max.
|
||||||
|
* Supported clang compiler.
|
||||||
|
* Fixed some build bugs on FreeBSD
|
||||||
|
x86/x86-64:
|
||||||
|
* Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions.
|
||||||
|
Please use gcc >= 4.6 or clang >=3.1.
|
||||||
|
* Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes.
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.1.1
|
Version 0.1.1
|
||||||
29-Apr-2012
|
29-Apr-2012
|
||||||
|
@ -7,6 +20,8 @@ common:
|
||||||
* Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia)
|
* Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia)
|
||||||
* Fixed the build bug (MD5 and download) on Mac OSX.
|
* Fixed the build bug (MD5 and download) on Mac OSX.
|
||||||
* Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
|
* Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
|
||||||
|
* Fixed the compatibility issue for compilers without C99 complex number
|
||||||
|
(e.g. Visual Studio)
|
||||||
x86/x86_64:
|
x86/x86_64:
|
||||||
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
|
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
|
||||||
* Test alpha=Nan in dscale.
|
* Test alpha=Nan in dscale.
|
||||||
|
|
|
@ -90,6 +90,15 @@
|
||||||
number of threads will consume extra resource. I recommend you to
|
number of threads will consume extra resource. I recommend you to
|
||||||
specify minimum number of threads.
|
specify minimum number of threads.
|
||||||
|
|
||||||
|
1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong?
|
||||||
|
|
||||||
|
A This may be related to a bug in the Linux kernel 2.6.32. Try applying
|
||||||
|
the patch segfaults.patch using
|
||||||
|
|
||||||
|
patch < segfaults.patch
|
||||||
|
|
||||||
|
and see if the crashes persist. Note that this patch will lead to many
|
||||||
|
compiler warnings.
|
||||||
|
|
||||||
2. Architecture Specific issue or Implementation
|
2. Architecture Specific issue or Implementation
|
||||||
|
|
||||||
|
|
7
Makefile
7
Makefile
|
@ -256,12 +256,17 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz
|
||||||
|
|
||||||
lapack-3.4.1.tgz :
|
lapack-3.4.1.tgz :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
ifeq ($(OSNAME), Darwin)
|
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||||
|
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
|
||||||
curl -O $(LAPACK_URL)
|
curl -O $(LAPACK_URL)
|
||||||
|
else
|
||||||
|
ifeq ($(OSNAME), FreeBSD)
|
||||||
|
fetch $(LAPACK_URL)
|
||||||
else
|
else
|
||||||
wget $(LAPACK_URL)
|
wget $(LAPACK_URL)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
large.tgz :
|
large.tgz :
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.1.1
|
VERSION = 0.2.0
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -94,8 +94,8 @@ VERSION = 0.1.1
|
||||||
|
|
||||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
||||||
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
||||||
# in small matrix sizes. The default value is 4.
|
# in small matrix sizes. The default value is 50.
|
||||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
# GEMM_MULTITHREAD_THRESHOLD = 50
|
||||||
|
|
||||||
# If you need santy check by comparing reference BLAS. It'll be very
|
# If you need santy check by comparing reference BLAS. It'll be very
|
||||||
# slow (Not implemented yet).
|
# slow (Not implemented yet).
|
||||||
|
|
|
@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Default C compiler
|
# Default C compiler
|
||||||
|
# - Only set if not specified on the command line or inherited from the environment.
|
||||||
|
# - CC is an implicit variable so neither '?=' nor 'ifndef' can be used.
|
||||||
|
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
||||||
|
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
||||||
|
ifeq ($(origin CC),default)
|
||||||
CC = gcc
|
CC = gcc
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Default Fortran compiler (FC) is selected by f_check.
|
||||||
|
|
||||||
ifndef MAKEFILE_RULE
|
ifndef MAKEFILE_RULE
|
||||||
include $(TOPDIR)/Makefile.rule
|
include $(TOPDIR)/Makefile.rule
|
||||||
|
@ -45,7 +53,7 @@ GETARCH_FLAGS += -DUSE64BITINT
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef GEMM_MULTITHREAD_THRESHOLD
|
ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||||
GEMM_MULTITHREAD_THRESHOLD=4
|
GEMM_MULTITHREAD_THRESHOLD=50
|
||||||
endif
|
endif
|
||||||
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
|
||||||
|
|
||||||
|
@ -108,6 +116,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2
|
||||||
MD5SUM = md5 -r
|
MD5SUM = md5 -r
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(OSNAME), FreeBSD)
|
||||||
|
MD5SUM = md5 -r
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(OSNAME), NetBSD)
|
||||||
|
MD5SUM = md5 -r
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), Linux)
|
ifeq ($(OSNAME), Linux)
|
||||||
EXTRALIB += -lm
|
EXTRALIB += -lm
|
||||||
endif
|
endif
|
||||||
|
@ -231,11 +247,11 @@ endif
|
||||||
ifdef DYNAMIC_ARCH
|
ifdef DYNAMIC_ARCH
|
||||||
ifeq ($(ARCH), x86)
|
ifeq ($(ARCH), x86)
|
||||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
|
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef DYNAMIC_CORE
|
ifndef DYNAMIC_CORE
|
||||||
|
@ -754,6 +770,7 @@ export HAVE_SSE4_1
|
||||||
export HAVE_SSE4_2
|
export HAVE_SSE4_2
|
||||||
export HAVE_SSE4A
|
export HAVE_SSE4A
|
||||||
export HAVE_SSE5
|
export HAVE_SSE5
|
||||||
|
export HAVE_AVX
|
||||||
export KERNELDIR
|
export KERNELDIR
|
||||||
export FUNCTION_PROFILE
|
export FUNCTION_PROFILE
|
||||||
export TARGET_CORE
|
export TARGET_CORE
|
||||||
|
|
84
README
84
README
|
@ -1,84 +0,0 @@
|
||||||
OpenBLAS Readme
|
|
||||||
|
|
||||||
1.Introduction
|
|
||||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn)
|
|
||||||
Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki).
|
|
||||||
|
|
||||||
2.Intallation
|
|
||||||
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
|
||||||
Or,
|
|
||||||
check out codes from git://github.com/xianyi/OpenBLAS.git
|
|
||||||
1)Normal compile
|
|
||||||
(a) type "make" to detect the CPU automatically.
|
|
||||||
or
|
|
||||||
(b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
|
|
||||||
|
|
||||||
2)Cross compile
|
|
||||||
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
|
|
||||||
|
|
||||||
examples:
|
|
||||||
On X86 box, compile this library for loongson3a CPU.
|
|
||||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
|
||||||
|
|
||||||
3)Debug version
|
|
||||||
make DEBUG=1
|
|
||||||
|
|
||||||
4)Intall to the directory (Optional)
|
|
||||||
e.g.
|
|
||||||
make install PREFIX=your_installation_directory
|
|
||||||
The default directory is /opt/OpenBLAS
|
|
||||||
|
|
||||||
3.Support CPU & OS
|
|
||||||
Please read GotoBLAS_01Readme.txt
|
|
||||||
|
|
||||||
Additional support CPU:
|
|
||||||
x86_64:
|
|
||||||
Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
|
|
||||||
MIPS64:
|
|
||||||
ICT Loongson 3A //Level 3 BLAS subroutines are optimized.
|
|
||||||
|
|
||||||
4.Usages
|
|
||||||
Link with libopenblas.a or -lopenblas for shared library.
|
|
||||||
|
|
||||||
4.1 Set the number of threads with environment variables. for example,
|
|
||||||
export OPENBLAS_NUM_THREADS=4
|
|
||||||
or
|
|
||||||
export GOTO_NUM_THREADS=4
|
|
||||||
or
|
|
||||||
export OMP_NUM_THREADS=4
|
|
||||||
|
|
||||||
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
|
|
||||||
|
|
||||||
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
|
|
||||||
|
|
||||||
4.2 Set the number of threads with calling functions. for example,
|
|
||||||
void goto_set_num_threads(int num_threads);
|
|
||||||
or
|
|
||||||
void openblas_set_num_threads(int num_threads);
|
|
||||||
|
|
||||||
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
|
|
||||||
|
|
||||||
5.Report Bugs
|
|
||||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
|
||||||
|
|
||||||
6.To-Do List:
|
|
||||||
Optimization on ICT Loongson 3A CPU
|
|
||||||
|
|
||||||
7.Contact
|
|
||||||
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
|
|
||||||
|
|
||||||
8.ChangeLog
|
|
||||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
|
||||||
|
|
||||||
9.Known Issues
|
|
||||||
* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
|
|
||||||
is 64. On 32 bits, it is 32.
|
|
||||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS.
|
|
||||||
|
|
||||||
10. Specification of Git Branches
|
|
||||||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
|
||||||
Now, there are 4 branches in github.com.
|
|
||||||
* The master branch. This a main branch to reflect a production-ready state.
|
|
||||||
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
|
|
||||||
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
|
|
||||||
* The gh-pages branch. This is for web pages
|
|
|
@ -0,0 +1,110 @@
|
||||||
|
# OpenBLAS
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
|
||||||
|
|
||||||
|
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
||||||
|
|
||||||
|
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
||||||
|
### Normal compile
|
||||||
|
* type "make" to detect the CPU automatically.
|
||||||
|
or
|
||||||
|
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
|
||||||
|
|
||||||
|
### Cross compile
|
||||||
|
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
On X86 box, compile this library for loongson3a CPU.
|
||||||
|
|
||||||
|
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||||
|
|
||||||
|
### Debug version
|
||||||
|
|
||||||
|
make DEBUG=1
|
||||||
|
|
||||||
|
### Install to the directory (Optional)
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
make install PREFIX=your_installation_directory
|
||||||
|
|
||||||
|
The default directory is /opt/OpenBLAS
|
||||||
|
|
||||||
|
## Support CPU & OS
|
||||||
|
Please read GotoBLAS_01Readme.txt
|
||||||
|
|
||||||
|
### Additional support CPU:
|
||||||
|
|
||||||
|
#### x86/x86-64:
|
||||||
|
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||||
|
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
|
||||||
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
|
|
||||||
|
#### MIPS64:
|
||||||
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||||
|
- **ICT Loongson 3B**: Experimental
|
||||||
|
|
||||||
|
### Support OS:
|
||||||
|
- **GNU/Linux**
|
||||||
|
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||||
|
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
|
||||||
|
- **FreeBSD**: Supported by the community. We didn't test the library on this OS.
|
||||||
|
|
||||||
|
## Usages
|
||||||
|
Link with libopenblas.a or -lopenblas for shared library.
|
||||||
|
|
||||||
|
### Set the number of threads with environment variables.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
export OPENBLAS_NUM_THREADS=4
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
export GOTO_NUM_THREADS=4
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
export OMP_NUM_THREADS=4
|
||||||
|
|
||||||
|
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
|
||||||
|
|
||||||
|
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
|
||||||
|
|
||||||
|
### Set the number of threads on runtime.
|
||||||
|
|
||||||
|
We provide the functions below to control the number of threads at runtime. So far, changing the number of threads is not supported on Windows. On Windows, these functions are dummies.
|
||||||
|
|
||||||
|
void goto_set_num_threads(int num_threads);
|
||||||
|
|
||||||
|
void openblas_set_num_threads(int num_threads);
|
||||||
|
|
||||||
|
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
|
||||||
|
|
||||||
|
## Report Bugs
|
||||||
|
Please add an issue at https://github.com/xianyi/OpenBLAS/issues
|
||||||
|
|
||||||
|
## Contact
|
||||||
|
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
|
||||||
|
|
||||||
|
## ChangeLog
|
||||||
|
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||||
|
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
|
||||||
|
* The number of CPUs/Cores should be less than or equal to 256.
|
||||||
|
* On Loongson 3A, make test would fail because of a pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same test case from the shell.
|
||||||
|
|
||||||
|
## Specification of Git Branches
|
||||||
|
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
||||||
|
Now, there are 4 branches in github.com.
|
||||||
|
* The master branch. This is a main branch to reflect a production-ready state.
|
||||||
|
* The develop branch. This is a main branch to reflect a state with the latest delivered development changes for the next release.
|
||||||
|
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
|
||||||
|
* The gh-pages branch. This is for web pages
|
|
@ -18,6 +18,7 @@ CORE2
|
||||||
PENRYN
|
PENRYN
|
||||||
DUNNINGTON
|
DUNNINGTON
|
||||||
NEHALEM
|
NEHALEM
|
||||||
|
SANDYBRIDGE
|
||||||
ATOM
|
ATOM
|
||||||
|
|
||||||
b)AMD CPU:
|
b)AMD CPU:
|
||||||
|
@ -27,6 +28,7 @@ OPTERON_SSE3
|
||||||
BARCELONA
|
BARCELONA
|
||||||
SHANGHAI
|
SHANGHAI
|
||||||
ISTANBUL
|
ISTANBUL
|
||||||
|
BOBCAT
|
||||||
|
|
||||||
c)VIA CPU:
|
c)VIA CPU:
|
||||||
SSE_GENERIC
|
SSE_GENERIC
|
||||||
|
@ -47,6 +49,7 @@ CELL
|
||||||
3.MIPS64 CPU:
|
3.MIPS64 CPU:
|
||||||
SICORTEX
|
SICORTEX
|
||||||
LOONGSON3A
|
LOONGSON3A
|
||||||
|
LOONGSON3B
|
||||||
|
|
||||||
4.IA64 CPU:
|
4.IA64 CPU:
|
||||||
ITANIUM2
|
ITANIUM2
|
||||||
|
|
10
c_check
10
c_check
|
@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/);
|
||||||
$compiler = GCC if ($compiler eq "");
|
$compiler = GCC if ($compiler eq "");
|
||||||
|
|
||||||
$os = Linux if ($data =~ /OS_LINUX/);
|
$os = Linux if ($data =~ /OS_LINUX/);
|
||||||
$os = FreeBSD if ($data =~ /OS_FreeBSD/);
|
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||||
$os = NetBSD if ($data =~ /OS_NetBSD/);
|
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||||
$os = Darwin if ($data =~ /OS_Darwin/);
|
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||||
$os = SunOS if ($data =~ /OS_SunOS/);
|
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||||
$os = AIX if ($data =~ /OS_AIX/);
|
$os = AIX if ($data =~ /OS_AIX/);
|
||||||
$os = osf if ($data =~ /OS_OSF/);
|
$os = osf if ($data =~ /OS_OSF/);
|
||||||
$os = WINNT if ($data =~ /OS_WINNT/);
|
$os = WINNT if ($data =~ /OS_WINNT/);
|
||||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/);
|
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||||
|
|
||||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||||
|
|
4
cblas.h
4
cblas.h
|
@ -9,6 +9,10 @@ extern "C" {
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
/*Set the number of threads on runtime.*/
|
||||||
|
void openblas_set_num_threads(int num_threads);
|
||||||
|
void goto_set_num_threads(int num_threads);
|
||||||
|
|
||||||
#define CBLAS_INDEX size_t
|
#define CBLAS_INDEX size_t
|
||||||
|
|
||||||
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
|
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
|
||||||
|
|
4
common.h
4
common.h
|
@ -68,7 +68,7 @@ extern "C" {
|
||||||
#define SMP
|
#define SMP
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix)
|
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
|
||||||
#define WINDOWS_ABI
|
#define WINDOWS_ABI
|
||||||
#define OS_WINDOWS
|
#define OS_WINDOWS
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ extern "C" {
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OS_DARWIN
|
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
|
||||||
#include <sched.h>
|
#include <sched.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,8 @@ extern "C" {
|
||||||
|
|
||||||
int BLASFUNC(xerbla)(char *, blasint *info, blasint);
|
int BLASFUNC(xerbla)(char *, blasint *info, blasint);
|
||||||
|
|
||||||
|
void BLASFUNC(openblas_set_num_threads)(int *);
|
||||||
|
|
||||||
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
|
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
|
||||||
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);
|
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);
|
||||||
|
|
||||||
|
|
|
@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) {
|
||||||
int openmp_nthreads=0;
|
int openmp_nthreads=0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if ((blas_cpu_number == 1)
|
if (blas_cpu_number == 1
|
||||||
|
|
||||||
#ifdef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
|| omp_in_parallel()
|
|| omp_in_parallel()
|
||||||
|
|
13
common_x86.h
13
common_x86.h
|
@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#define PROFCODE
|
#define PROFCODE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX)
|
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
|
||||||
#define SAVEREGISTERS \
|
#define SAVEREGISTERS \
|
||||||
subl $32, %esp;\
|
subl $32, %esp;\
|
||||||
movups %xmm6, 0(%esp);\
|
movups %xmm6, 0(%esp);\
|
||||||
|
@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#define RESTOREREGISTERS
|
#define RESTOREREGISTERS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX)
|
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.text; \
|
.text; \
|
||||||
.align 16; \
|
.align 16; \
|
||||||
|
@ -282,7 +282,7 @@ REALNAME:
|
||||||
#define EPILOGUE .end REALNAME
|
#define EPILOGUE .end REALNAME
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__)
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.text; \
|
.text; \
|
||||||
.align 16; \
|
.align 16; \
|
||||||
|
@ -356,4 +356,11 @@ REALNAME:
|
||||||
|
|
||||||
#ifndef ALIGN_6
|
#ifndef ALIGN_6
|
||||||
#define ALIGN_6 .align 64
|
#define ALIGN_6 .align 64
|
||||||
|
|
||||||
|
// ffreep %st(0).
|
||||||
|
// Because Clang didn't support ffreep, we directly use the opcode.
|
||||||
|
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
|
||||||
|
#ifndef ffreep
|
||||||
|
#define ffreep .byte 0xdf, 0xc0 #
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -353,7 +353,7 @@ REALNAME:
|
||||||
#define EPILOGUE .end REALNAME
|
#define EPILOGUE .end REALNAME
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI)
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.text; \
|
.text; \
|
||||||
.align 512; \
|
.align 512; \
|
||||||
|
@ -425,6 +425,7 @@ REALNAME:
|
||||||
#define ALIGN_2 .align 2
|
#define ALIGN_2 .align 2
|
||||||
#define ALIGN_3 .align 3
|
#define ALIGN_3 .align 3
|
||||||
#define ALIGN_4 .align 4
|
#define ALIGN_4 .align 4
|
||||||
|
#define ALIGN_5 .align 5
|
||||||
#define ffreep fstp
|
#define ffreep fstp
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -448,4 +449,10 @@ REALNAME:
|
||||||
#define ALIGN_6 .align 64
|
#define ALIGN_6 .align 64
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// ffreep %st(0).
|
||||||
|
// Because Clang didn't support ffreep, we directly use the opcode.
|
||||||
|
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
|
||||||
|
#ifndef ffreep
|
||||||
|
#define ffreep .byte 0xdf, 0xc0 #
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
5
cpuid.h
5
cpuid.h
|
@ -103,6 +103,8 @@
|
||||||
#define CORE_NEHALEM 17
|
#define CORE_NEHALEM 17
|
||||||
#define CORE_ATOM 18
|
#define CORE_ATOM 18
|
||||||
#define CORE_NANO 19
|
#define CORE_NANO 19
|
||||||
|
#define CORE_SANDYBRIDGE 20
|
||||||
|
#define CORE_BOBCAT 21
|
||||||
|
|
||||||
#define HAVE_SSE (1 << 0)
|
#define HAVE_SSE (1 << 0)
|
||||||
#define HAVE_SSE2 (1 << 1)
|
#define HAVE_SSE2 (1 << 1)
|
||||||
|
@ -122,6 +124,7 @@
|
||||||
#define HAVE_MISALIGNSSE (1 << 15)
|
#define HAVE_MISALIGNSSE (1 << 15)
|
||||||
#define HAVE_128BITFPU (1 << 16)
|
#define HAVE_128BITFPU (1 << 16)
|
||||||
#define HAVE_FASTMOVU (1 << 17)
|
#define HAVE_FASTMOVU (1 << 17)
|
||||||
|
#define HAVE_AVX (1 << 18)
|
||||||
|
|
||||||
#define CACHE_INFO_L1_I 1
|
#define CACHE_INFO_L1_I 1
|
||||||
#define CACHE_INFO_L1_D 2
|
#define CACHE_INFO_L1_D 2
|
||||||
|
@ -188,4 +191,6 @@ typedef struct {
|
||||||
#define CPUTYPE_NSGEODE 41
|
#define CPUTYPE_NSGEODE 41
|
||||||
#define CPUTYPE_VIAC3 42
|
#define CPUTYPE_VIAC3 42
|
||||||
#define CPUTYPE_NANO 43
|
#define CPUTYPE_NANO 43
|
||||||
|
#define CPUTYPE_SANDYBRIDGE 44
|
||||||
|
#define CPUTYPE_BOBCAT 45
|
||||||
#endif
|
#endif
|
||||||
|
|
25
cpuid_x86.c
25
cpuid_x86.c
|
@ -189,6 +189,7 @@ int get_cputype(int gettype){
|
||||||
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
|
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
|
||||||
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
|
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
|
||||||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
||||||
|
if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX;
|
||||||
|
|
||||||
if (have_excpuid() >= 0x01) {
|
if (have_excpuid() >= 0x01) {
|
||||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||||
|
@ -983,13 +984,13 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 10:
|
case 10:
|
||||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
case 12:
|
case 12:
|
||||||
//Xeon Processor 5600 (Westmere-EP)
|
//Xeon Processor 5600 (Westmere-EP)
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 13:
|
case 13:
|
||||||
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
|
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
case 15:
|
case 15:
|
||||||
//Xeon Processor E7 (Westmere-EX)
|
//Xeon Processor E7 (Westmere-EX)
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
|
@ -1027,6 +1028,8 @@ int get_cpuname(void){
|
||||||
case 1:
|
case 1:
|
||||||
case 10:
|
case 10:
|
||||||
return CPUTYPE_BARCELONA;
|
return CPUTYPE_BARCELONA;
|
||||||
|
case 5:
|
||||||
|
return CPUTYPE_BOBCAT;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1146,6 +1149,8 @@ static char *cpuname[] = {
|
||||||
"NSGEODE",
|
"NSGEODE",
|
||||||
"VIAC3",
|
"VIAC3",
|
||||||
"NANO",
|
"NANO",
|
||||||
|
"SANDYBRIDGE",
|
||||||
|
"BOBCAT",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1192,6 +1197,8 @@ static char *lowercpuname[] = {
|
||||||
"tms3x00",
|
"tms3x00",
|
||||||
"nsgeode",
|
"nsgeode",
|
||||||
"nano",
|
"nano",
|
||||||
|
"sandybridge",
|
||||||
|
"bobcat",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
@ -1215,6 +1222,8 @@ static char *corename[] = {
|
||||||
"NEHALEM",
|
"NEHALEM",
|
||||||
"ATOM",
|
"ATOM",
|
||||||
"NANO",
|
"NANO",
|
||||||
|
"SANDYBRIDGE",
|
||||||
|
"BOBCAT",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1238,6 +1247,8 @@ static char *corename_lower[] = {
|
||||||
"nehalem",
|
"nehalem",
|
||||||
"atom",
|
"atom",
|
||||||
"nano",
|
"nano",
|
||||||
|
"sandybridge",
|
||||||
|
"bobcat",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1321,13 +1332,13 @@ int get_coretype(void){
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
case 10:
|
case 10:
|
||||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||||
return CORE_NEHALEM;
|
return CORE_SANDYBRIDGE;
|
||||||
case 12:
|
case 12:
|
||||||
//Xeon Processor 5600 (Westmere-EP)
|
//Xeon Processor 5600 (Westmere-EP)
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
case 13:
|
case 13:
|
||||||
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
|
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
|
||||||
return CORE_NEHALEM;
|
return CORE_SANDYBRIDGE;
|
||||||
case 15:
|
case 15:
|
||||||
//Xeon Processor E7 (Westmere-EX)
|
//Xeon Processor E7 (Westmere-EX)
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
|
@ -1346,7 +1357,9 @@ int get_coretype(void){
|
||||||
if (family <= 0x5) return CORE_80486;
|
if (family <= 0x5) return CORE_80486;
|
||||||
if (family <= 0xe) return CORE_ATHLON;
|
if (family <= 0xe) return CORE_ATHLON;
|
||||||
if (family == 0xf){
|
if (family == 0xf){
|
||||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA;
|
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||||
|
else if (exfamily == 5) return CORE_BOBCAT;
|
||||||
|
else return CORE_BARCELONA;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1426,6 +1439,7 @@ void get_cpuconfig(void){
|
||||||
if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n");
|
if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n");
|
||||||
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
|
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
|
||||||
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
|
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
|
||||||
|
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||||
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
||||||
|
@ -1491,6 +1505,7 @@ void get_sse(void){
|
||||||
if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n");
|
if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n");
|
||||||
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
|
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
|
||||||
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
|
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
|
||||||
|
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||||
|
|
||||||
|
|
10
ctest.c
10
ctest.c
|
@ -35,19 +35,19 @@ OS_LINUX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__FreeBSD__)
|
#if defined(__FreeBSD__)
|
||||||
OS_FreeBSD
|
OS_FREEBSD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__NetBSD__)
|
#if defined(__NetBSD__)
|
||||||
OS_NetBSD
|
OS_NETBSD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__sun)
|
#if defined(__sun)
|
||||||
OS_SunOS
|
OS_SUNOS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
OS_Darwin
|
OS_DARWIN
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_AIX)
|
#if defined(_AIX)
|
||||||
|
@ -63,7 +63,7 @@ OS_WINNT
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__CYGWIN__)
|
#if defined(__CYGWIN__)
|
||||||
OS_CYGWIN
|
OS_CYGWIN_NT
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__INTERIX)
|
#if defined(__INTERIX)
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
TOPDIR = ../..
|
TOPDIR = ../..
|
||||||
include ../../Makefile.system
|
include ../../Makefile.system
|
||||||
|
|
||||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX)
|
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
|
||||||
|
|
||||||
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||||
|
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
|
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
|
||||||
ifndef NO_AFFINITY
|
ifndef NO_AFFINITY
|
||||||
COMMONOBJS += init.$(SUFFIX)
|
COMMONOBJS += init.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -63,6 +63,14 @@ static blas_pool_t pool;
|
||||||
static HANDLE blas_threads [MAX_CPU_NUMBER];
|
static HANDLE blas_threads [MAX_CPU_NUMBER];
|
||||||
static DWORD blas_threads_id[MAX_CPU_NUMBER];
|
static DWORD blas_threads_id[MAX_CPU_NUMBER];
|
||||||
|
|
||||||
|
void goto_set_num_threads(int num)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void openblas_set_num_threads(int num)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
|
|
||||||
if (!(mode & BLAS_COMPLEX)){
|
if (!(mode & BLAS_COMPLEX)){
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*****************************************************************************
|
/*****************************************************************************
|
||||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define MAX_NODES 16
|
#define MAX_NODES 16
|
||||||
#define MAX_CPUS 256
|
#define MAX_CPUS 256
|
||||||
|
#define NCPUBITS (8*sizeof(unsigned long))
|
||||||
|
#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS)
|
||||||
|
#define CPUELT(cpu) ((cpu) / NCPUBITS)
|
||||||
|
#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS))
|
||||||
|
|
||||||
|
|
||||||
#define SH_MAGIC 0x510510
|
#define SH_MAGIC 0x510510
|
||||||
|
|
||||||
|
@ -103,10 +108,10 @@ typedef struct {
|
||||||
int num_nodes;
|
int num_nodes;
|
||||||
int num_procs;
|
int num_procs;
|
||||||
int final_num_procs;
|
int final_num_procs;
|
||||||
unsigned long avail;
|
unsigned long avail [MAX_BITMASK_LEN];
|
||||||
|
int avail_count;
|
||||||
unsigned long cpu_info [MAX_CPUS];
|
unsigned long cpu_info [MAX_CPUS];
|
||||||
unsigned long node_info [MAX_NODES];
|
unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN];
|
||||||
int cpu_use[MAX_CPUS];
|
int cpu_use[MAX_CPUS];
|
||||||
|
|
||||||
} shm_t;
|
} shm_t;
|
||||||
|
@ -126,7 +131,8 @@ static shm_t *common = (void *)-1;
|
||||||
static int shmid, pshmid;
|
static int shmid, pshmid;
|
||||||
static void *paddr;
|
static void *paddr;
|
||||||
|
|
||||||
static unsigned long lprocmask, lnodemask;
|
static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask;
|
||||||
|
static int lprocmask_count = 0;
|
||||||
static int numprocs = 1;
|
static int numprocs = 1;
|
||||||
static int numnodes = 1;
|
static int numnodes = 1;
|
||||||
|
|
||||||
|
@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) {
|
||||||
than sizeof(unsigned long). On 64 bits, the limit
|
than sizeof(unsigned long). On 64 bits, the limit
|
||||||
is 64. On 32 bits, it is 32.
|
is 64. On 32 bits, it is 32.
|
||||||
***/
|
***/
|
||||||
static inline unsigned long get_cpumap(int node) {
|
static inline void get_cpumap(int node, unsigned long * node_info) {
|
||||||
|
|
||||||
int infile;
|
int infile;
|
||||||
unsigned long affinity;
|
unsigned long affinity[32];
|
||||||
char name[160];
|
char name[160];
|
||||||
char cpumap[160];
|
char cpumap[160];
|
||||||
char *p, *dummy;
|
char *dummy;
|
||||||
int i=0;
|
int i=0;
|
||||||
|
int count=0;
|
||||||
|
int k=0;
|
||||||
|
|
||||||
sprintf(name, CPUMAP_NAME, node);
|
sprintf(name, CPUMAP_NAME, node);
|
||||||
|
|
||||||
infile = open(name, O_RDONLY);
|
infile = open(name, O_RDONLY);
|
||||||
|
for(i=0; i<32; i++){
|
||||||
affinity = 0;
|
affinity[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (infile != -1) {
|
if (infile != -1) {
|
||||||
|
|
||||||
read(infile, cpumap, sizeof(cpumap));
|
read(infile, cpumap, sizeof(cpumap));
|
||||||
p = cpumap;
|
|
||||||
while (*p != '\n' && i<160){
|
for(i=0; i<160; i++){
|
||||||
if(*p != ',') {
|
if(cpumap[i] == '\n')
|
||||||
name[i++]=*p;
|
break;
|
||||||
|
if(cpumap[i] != ','){
|
||||||
|
name[k++]=cpumap[i];
|
||||||
|
|
||||||
|
// Collected enough hex digits (NCPUBITS/4) to fill one unsigned long
|
||||||
|
if(k >= NCPUBITS/4){
|
||||||
|
affinity[count++] = strtoul(name, &dummy, 16);
|
||||||
|
k=0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
p++;
|
|
||||||
|
}
|
||||||
|
if(k!=0){
|
||||||
|
name[k]='\0';
|
||||||
|
affinity[count++] = strtoul(name, &dummy, 16);
|
||||||
|
k=0;
|
||||||
|
}
|
||||||
|
// bits 0-63 -> node_info[0], bits 64-127 -> node_info[1], ...
|
||||||
|
// reverse the sequence
|
||||||
|
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
|
||||||
|
node_info[i]=affinity[count-i-1];
|
||||||
}
|
}
|
||||||
p = name;
|
|
||||||
|
|
||||||
// while ((*p == '0') || (*p == ',')) p++;
|
|
||||||
|
|
||||||
affinity = strtoul(p, &dummy, 16);
|
|
||||||
|
|
||||||
close(infile);
|
close(infile);
|
||||||
}
|
}
|
||||||
|
|
||||||
return affinity;
|
return ;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned long get_share(int cpu, int level) {
|
static inline void get_share(int cpu, int level, unsigned long * share) {
|
||||||
|
|
||||||
int infile;
|
int infile;
|
||||||
unsigned long affinity;
|
unsigned long affinity[32];
|
||||||
|
char cpumap[160];
|
||||||
char name[160];
|
char name[160];
|
||||||
char *p;
|
char *dummy;
|
||||||
|
int count=0;
|
||||||
|
int i=0,k=0;
|
||||||
|
int bitmask_idx = 0;
|
||||||
|
|
||||||
sprintf(name, SHARE_NAME, cpu, level);
|
sprintf(name, SHARE_NAME, cpu, level);
|
||||||
|
|
||||||
infile = open(name, O_RDONLY);
|
infile = open(name, O_RDONLY);
|
||||||
|
|
||||||
affinity = (1UL << cpu);
|
// Init share
|
||||||
|
for(i=0; i<MAX_BITMASK_LEN; i++){
|
||||||
|
share[i]=0;
|
||||||
|
}
|
||||||
|
bitmask_idx = CPUELT(cpu);
|
||||||
|
share[bitmask_idx] = CPUMASK(cpu);
|
||||||
|
|
||||||
if (infile != -1) {
|
if (infile != -1) {
|
||||||
|
|
||||||
read(infile, name, sizeof(name));
|
read(infile, cpumap, sizeof(cpumap));
|
||||||
|
|
||||||
p = name;
|
for(i=0; i<160; i++){
|
||||||
|
if(cpumap[i] == '\n')
|
||||||
|
break;
|
||||||
|
if(cpumap[i] != ','){
|
||||||
|
name[k++]=cpumap[i];
|
||||||
|
|
||||||
while ((*p == '0') || (*p == ',')) p++;
|
// Collected enough hex digits (NCPUBITS/4) to fill one unsigned long
|
||||||
|
if(k >= NCPUBITS/4){
|
||||||
|
affinity[count++] = strtoul(name, &dummy, 16);
|
||||||
|
k=0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
if(k!=0){
|
||||||
|
name[k]='\0';
|
||||||
|
affinity[count++] = strtoul(name, &dummy, 16);
|
||||||
|
k=0;
|
||||||
|
}
|
||||||
|
// bits 0-63 -> share[0], bits 64-127 -> share[1], ...
|
||||||
|
// reverse the sequence
|
||||||
|
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
|
||||||
|
share[i]=affinity[count-i-1];
|
||||||
|
}
|
||||||
|
|
||||||
affinity = strtol(p, &p, 16);
|
|
||||||
|
|
||||||
close(infile);
|
close(infile);
|
||||||
}
|
}
|
||||||
|
|
||||||
return affinity;
|
return ;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int numa_check(void) {
|
static int numa_check(void) {
|
||||||
|
@ -248,6 +298,7 @@ static int numa_check(void) {
|
||||||
DIR *dp;
|
DIR *dp;
|
||||||
struct dirent *dir;
|
struct dirent *dir;
|
||||||
int node;
|
int node;
|
||||||
|
int j;
|
||||||
|
|
||||||
common -> num_nodes = 0;
|
common -> num_nodes = 0;
|
||||||
|
|
||||||
|
@ -258,7 +309,9 @@ static int numa_check(void) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0;
|
for (node = 0; node < MAX_NODES; node ++) {
|
||||||
|
for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
while ((dir = readdir(dp)) != NULL) {
|
while ((dir = readdir(dp)) != NULL) {
|
||||||
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
|
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
|
||||||
|
@ -266,12 +319,12 @@ static int numa_check(void) {
|
||||||
node = atoi(&dir -> d_name[4]);
|
node = atoi(&dir -> d_name[4]);
|
||||||
|
|
||||||
if (node > MAX_NODES) {
|
if (node > MAX_NODES) {
|
||||||
fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n");
|
fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
common -> num_nodes ++;
|
common -> num_nodes ++;
|
||||||
common -> node_info[node] = get_cpumap(node);
|
get_cpumap(node, common->node_info[node]);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -284,7 +337,7 @@ static int numa_check(void) {
|
||||||
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
|
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
|
||||||
|
|
||||||
for (node = 0; node < common -> num_nodes; node ++)
|
for (node = 0; node < common -> num_nodes; node ++)
|
||||||
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]);
|
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return common -> num_nodes;
|
return common -> num_nodes;
|
||||||
|
@ -296,11 +349,13 @@ static void numa_mapping(void) {
|
||||||
int i, j, h;
|
int i, j, h;
|
||||||
unsigned long work, bit;
|
unsigned long work, bit;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
int bitmask_idx = 0;
|
||||||
|
|
||||||
for (node = 0; node < common -> num_nodes; node ++) {
|
for (node = 0; node < common -> num_nodes; node ++) {
|
||||||
core = 0;
|
core = 0;
|
||||||
for (cpu = 0; cpu < common -> num_procs; cpu ++) {
|
for (cpu = 0; cpu < common -> num_procs; cpu ++) {
|
||||||
if (common -> node_info[node] & common -> avail & (1UL << cpu)) {
|
bitmask_idx = CPUELT(cpu);
|
||||||
|
if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) {
|
||||||
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
|
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
|
||||||
count ++;
|
count ++;
|
||||||
core ++;
|
core ++;
|
||||||
|
@ -357,58 +412,89 @@ static void numa_mapping(void) {
|
||||||
|
|
||||||
static void disable_hyperthread(void) {
|
static void disable_hyperthread(void) {
|
||||||
|
|
||||||
unsigned long share;
|
unsigned long share[MAX_BITMASK_LEN];
|
||||||
int cpu;
|
int cpu;
|
||||||
|
int bitmask_idx = 0;
|
||||||
|
int i=0, count=0;
|
||||||
|
bitmask_idx = CPUELT(common -> num_procs);
|
||||||
|
|
||||||
if(common->num_procs > 64){
|
for(i=0; i< bitmask_idx; i++){
|
||||||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs);
|
common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL;
|
||||||
exit(1);
|
}
|
||||||
}else if(common->num_procs == 64){
|
if(CPUMASK(common -> num_procs) != 1){
|
||||||
common -> avail = 0xFFFFFFFFFFFFFFFFUL;
|
common -> avail[count++] = CPUMASK(common -> num_procs) - 1;
|
||||||
}else
|
}
|
||||||
common -> avail = (1UL << common -> num_procs) - 1;
|
common -> avail_count = count;
|
||||||
|
|
||||||
|
/* if(common->num_procs > 64){ */
|
||||||
|
/* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */
|
||||||
|
/* exit(1); */
|
||||||
|
/* }else if(common->num_procs == 64){ */
|
||||||
|
/* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */
|
||||||
|
/* }else */
|
||||||
|
/* common -> avail = (1UL << common -> num_procs) - 1; */
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail);
|
fprintf(stderr, "\nAvail CPUs : ");
|
||||||
|
for(i=0; i<count; i++)
|
||||||
|
fprintf(stderr, "%04lx ", common -> avail[i]);
|
||||||
|
fprintf(stderr, ".\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (cpu = 0; cpu < common -> num_procs; cpu ++) {
|
for (cpu = 0; cpu < common -> num_procs; cpu ++) {
|
||||||
|
|
||||||
share = (get_share(cpu, 1) & common -> avail);
|
get_share(cpu, 1, share);
|
||||||
|
|
||||||
if (popcount(share) > 1) {
|
// NOTE: each element of share/avail is checked independently, so CPUs that share a core but fall in different array elements are not detected here; this may be a bug.
|
||||||
|
for (i = 0; i < count ; i++){
|
||||||
|
if (popcount(share[i]) > 1) {
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
|
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
|
||||||
cpu, share & ~(1UL << cpu));
|
cpu, share[i] & ~(CPUMASK(cpu)));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
common -> avail &= ~((share & ~(1UL << cpu)));
|
common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void disable_affinity(void) {
|
static void disable_affinity(void) {
|
||||||
|
int i=0;
|
||||||
|
int bitmask_idx=0;
|
||||||
|
int count=0;
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail);
|
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]);
|
||||||
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
|
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if(common->final_num_procs > 64){
|
/* if(common->final_num_procs > 64){ */
|
||||||
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs);
|
/* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */
|
||||||
exit(1);
|
/* exit(1); */
|
||||||
}else if(common->final_num_procs == 64){
|
/* }else if(common->final_num_procs == 64){ */
|
||||||
lprocmask = 0xFFFFFFFFFFFFFFFFUL;
|
/* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */
|
||||||
}else
|
/* }else */
|
||||||
lprocmask = (1UL << common -> final_num_procs) - 1;
|
/* lprocmask = (1UL << common -> final_num_procs) - 1; */
|
||||||
|
|
||||||
|
bitmask_idx = CPUELT(common -> final_num_procs);
|
||||||
|
|
||||||
|
for(i=0; i< bitmask_idx; i++){
|
||||||
|
lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL;
|
||||||
|
}
|
||||||
|
if(CPUMASK(common -> final_num_procs) != 1){
|
||||||
|
lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1;
|
||||||
|
}
|
||||||
|
lprocmask_count = count;
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifndef USE_OPENMP
|
||||||
lprocmask &= *(unsigned long *)&cpu_orig_mask[0];
|
for(i=0; i< count; i++){
|
||||||
|
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask);
|
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -498,7 +584,7 @@ static void create_pshmem(void) {
|
||||||
static void local_cpu_map(void) {
|
static void local_cpu_map(void) {
|
||||||
|
|
||||||
int cpu, id, mapping;
|
int cpu, id, mapping;
|
||||||
|
int bitmask_idx = 0;
|
||||||
cpu = 0;
|
cpu = 0;
|
||||||
mapping = 0;
|
mapping = 0;
|
||||||
|
|
||||||
|
@ -509,7 +595,8 @@ static void local_cpu_map(void) {
|
||||||
if (is_dead(id)) common -> cpu_use[cpu] = 0;
|
if (is_dead(id)) common -> cpu_use[cpu] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) {
|
bitmask_idx = CPUELT(cpu);
|
||||||
|
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
|
||||||
|
|
||||||
common -> cpu_use[cpu] = pshmid;
|
common -> cpu_use[cpu] = pshmid;
|
||||||
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
|
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
|
||||||
|
@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) {
|
||||||
#ifndef USE_OPENMP
|
#ifndef USE_OPENMP
|
||||||
cpu_set_t cpu_mask;
|
cpu_set_t cpu_mask;
|
||||||
#endif
|
#endif
|
||||||
|
int i;
|
||||||
|
|
||||||
if (initialized) return;
|
if (initialized) return;
|
||||||
|
|
||||||
|
@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) {
|
||||||
|
|
||||||
common -> num_procs = get_nprocs();
|
common -> num_procs = get_nprocs();
|
||||||
|
|
||||||
|
if(common -> num_procs > MAX_CPUS) {
|
||||||
|
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
|
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
|
||||||
|
|
||||||
numa_check();
|
numa_check();
|
||||||
|
@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) {
|
||||||
|
|
||||||
if (common -> num_nodes > 1) numa_mapping();
|
if (common -> num_nodes > 1) numa_mapping();
|
||||||
|
|
||||||
common -> final_num_procs = popcount(common -> avail);
|
common -> final_num_procs = 0;
|
||||||
|
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
|
||||||
|
|
||||||
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
|
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
|
||||||
|
|
||||||
|
@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) {
|
||||||
|
|
||||||
disable_affinity();
|
disable_affinity();
|
||||||
|
|
||||||
num_avail = popcount(lprocmask);
|
num_avail = 0;
|
||||||
|
for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]);
|
||||||
|
|
||||||
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;
|
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;
|
||||||
|
|
||||||
|
|
|
@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_FreeBSD) || defined(OS_Darwin)
|
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||||
#include <sys/sysctl.h>
|
#include <sys/sysctl.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -185,7 +185,7 @@ int get_num_procs(void) {
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_FreeBSD) || defined(OS_Darwin)
|
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||||
|
|
||||||
int get_num_procs(void) {
|
int get_num_procs(void) {
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ int goto_get_num_procs (void) {
|
||||||
|
|
||||||
int blas_get_cpu_number(void){
|
int blas_get_cpu_number(void){
|
||||||
char *p;
|
char *p;
|
||||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
|
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||||
int max_num;
|
int max_num;
|
||||||
#endif
|
#endif
|
||||||
int blas_goto_num = 0;
|
int blas_goto_num = 0;
|
||||||
|
@ -223,7 +223,7 @@ int blas_get_cpu_number(void){
|
||||||
|
|
||||||
if (blas_num_threads) return blas_num_threads;
|
if (blas_num_threads) return blas_num_threads;
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
|
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||||
max_num = get_num_procs();
|
max_num = get_num_procs();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -250,7 +250,7 @@ int blas_get_cpu_number(void){
|
||||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||||
else blas_num_threads = MAX_CPU_NUMBER;
|
else blas_num_threads = MAX_CPU_NUMBER;
|
||||||
|
|
||||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin)
|
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*****************************************************************************
|
/*****************************************************************************
|
||||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef SMP_SERVER
|
#ifdef SMP_SERVER
|
||||||
#ifdef OS_LINUX
|
|
||||||
|
|
||||||
extern void openblas_set_num_threads(int num_threads) ;
|
extern void openblas_set_num_threads(int num_threads) ;
|
||||||
|
|
||||||
|
@ -41,5 +40,13 @@ void NAME(int* num_threads){
|
||||||
openblas_set_num_threads(*num_threads);
|
openblas_set_num_threads(*num_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#else
|
||||||
|
//Single thread
|
||||||
|
|
||||||
|
void openblas_set_num_threads(int num_threads) {
|
||||||
|
}
|
||||||
|
|
||||||
|
void NAME(int* num_threads){
|
||||||
|
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -163,9 +163,9 @@ int get_L2_size(void){
|
||||||
|
|
||||||
int eax, ebx, ecx, edx;
|
int eax, ebx, ecx, edx;
|
||||||
|
|
||||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \
|
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
|
||||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||||
defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC)
|
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
|
||||||
|
|
||||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||||
|
|
||||||
|
@ -384,6 +384,17 @@ void blas_set_parameter(void){
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(SANDYBRIDGE)
|
||||||
|
sgemm_p = 1024;
|
||||||
|
dgemm_p = 512;
|
||||||
|
cgemm_p = 512;
|
||||||
|
zgemm_p = 256;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
qgemm_p = 256;
|
||||||
|
xgemm_p = 128;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(CORE_PRESCOTT) || defined(GENERIC)
|
#if defined(CORE_PRESCOTT) || defined(GENERIC)
|
||||||
size >>= 6;
|
size >>= 6;
|
||||||
|
|
||||||
|
@ -435,7 +446,7 @@ void blas_set_parameter(void){
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE_BARCELONA)
|
#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
|
||||||
size >>= 8;
|
size >>= 8;
|
||||||
|
|
||||||
sgemm_p = 232 * size;
|
sgemm_p = 232 * size;
|
||||||
|
|
|
@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll
|
||||||
$(RANLIB) ../$(LIBNAME)
|
$(RANLIB) ../$(LIBNAME)
|
||||||
ifeq ($(BINARY32), 1)
|
ifeq ($(BINARY32), 1)
|
||||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
||||||
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
|
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
|
||||||
-lib /machine:i386 /def:libopenblas.def
|
-lib /machine:i386 /def:libopenblas.def
|
||||||
else
|
else
|
||||||
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
|
||||||
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
|
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
|
||||||
-lib /machine:X64 /def:libopenblas.def
|
-lib /machine:X64 /def:libopenblas.def
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ so : ../$(LIBSONAME)
|
||||||
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
|
||||||
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
|
||||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
|
||||||
-Wl,--retain-symbols-file=linux.def $(EXTRALIB)
|
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
|
||||||
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
|
||||||
rm -f linktest
|
rm -f linktest
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
5
f_check
5
f_check
|
@ -32,11 +32,12 @@ if ($compiler eq "") {
|
||||||
"pgf95", "pgf90", "pgf77",
|
"pgf95", "pgf90", "pgf77",
|
||||||
"ifort");
|
"ifort");
|
||||||
|
|
||||||
|
OUTER:
|
||||||
foreach $lists (@lists) {
|
foreach $lists (@lists) {
|
||||||
foreach $path (@path) {
|
foreach $path (@path) {
|
||||||
if (-f $path . "/" . $lists) {
|
if (-x $path . "/" . $lists) {
|
||||||
$compiler = $lists;
|
$compiler = $lists;
|
||||||
break;
|
last OUTER;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
32
getarch.c
32
getarch.c
|
@ -1,5 +1,5 @@
|
||||||
/*****************************************************************************
|
/*****************************************************************************
|
||||||
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/* #define FORCE_BARCELONA */
|
/* #define FORCE_BARCELONA */
|
||||||
/* #define FORCE_SHANGHAI */
|
/* #define FORCE_SHANGHAI */
|
||||||
/* #define FORCE_ISTANBUL */
|
/* #define FORCE_ISTANBUL */
|
||||||
|
/* #define FORCE_BOBCAT */
|
||||||
/* #define FORCE_SSE_GENERIC */
|
/* #define FORCE_SSE_GENERIC */
|
||||||
/* #define FORCE_VIAC3 */
|
/* #define FORCE_VIAC3 */
|
||||||
/* #define FORCE_NANO */
|
/* #define FORCE_NANO */
|
||||||
|
@ -278,6 +279,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "NEHALEM"
|
#define CORENAME "NEHALEM"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef FORCE_SANDYBRIDGE
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||||
|
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||||
|
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
|
||||||
|
#define LIBNAME "sandybridge"
|
||||||
|
#define CORENAME "SANDYBRIDGE"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_ATOM
|
#ifdef FORCE_ATOM
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
@ -349,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "BARCELONA"
|
#define CORENAME "BARCELONA"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(FORCE_BOBCAT)
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "BOBCAT"
|
||||||
|
#define ARCHCONFIG "-DBOBCAT " \
|
||||||
|
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||||
|
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
|
||||||
|
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
|
||||||
|
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
|
||||||
|
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
|
||||||
|
#define LIBNAME "bobcat"
|
||||||
|
#define CORENAME "BOBCAT"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_SSE_GENERIC
|
#ifdef FORCE_SSE_GENERIC
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
|
|
@ -0,0 +1,235 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Pack `col` source vectors of `row` two-FLOAT elements each into `dest`.
 *
 * Consecutive source vectors start 2*srcdim FLOATs apart.  Vectors are
 * consumed in panels of 4; a 2-wide and then a 1-wide tail panel follow
 * when bit 1 / bit 0 of `col` is set.  Within a panel of width w the output
 * is written element by element: for each element index, the two FLOATs of
 * vector 0, then of vector 1, ... up to vector w-1.  A panel therefore
 * occupies row*2*w FLOATs of `dest`.
 *
 * Elements are walked in chunks of 4, then 2, then 1 (bits 1 and 0 of
 * `row`), which reproduces the store order of a fully unrolled copy.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  FLOAT *vec[4];              /* read cursors, one per vector of the panel */
  FLOAT *out;                 /* write cursor inside the current panel     */
  BLASLONG width, panels, p;
  BLASLONG chunk, chunks, n;
  BLASLONG r, v;

  for (width = 4; width >= 1; width >>= 1)
    {
      /* Only the widest panel repeats; each narrower one occurs at most once. */
      panels = (width == 4) ? col/4 : ((col & width) ? 1 : 0);

      for (p = 0; p < panels; p++)
        {
          for (v = 0; v < width; v++)
            vec[v] = src + v*2*srcdim;
          src += width*2*srcdim;

          /* Reserve this panel's slice of dest before filling it. */
          out = dest;
          dest += row*2*width;

          for (chunk = 4; chunk >= 1; chunk >>= 1)
            {
              chunks = (chunk == 4) ? row/4 : ((row & chunk) ? 1 : 0);

              for (n = 0; n < chunks; n++)
                {
                  for (r = 0; r < chunk; r++)
                    for (v = 0; v < width; v++)
                      {
                        *out++ = vec[v][2*r];
                        *out++ = vec[v][2*r+1];
                      }

                  for (v = 0; v < width; v++)
                    vec[v] += 2*chunk;
                }
            }
        }
    }

  return 0;
}
|
|
@ -0,0 +1,401 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Pack `col` source vectors of `row` two-FLOAT elements each into `dest`.
 *
 * Consecutive source vectors start 2*srcdim FLOATs apart.  Vectors are
 * consumed in panels of 8; 4-, 2- and 1-wide tail panels follow when the
 * matching bit of `col` (4, 2, 1) is set.  Within a panel of width w the
 * output is written element by element: for each element index, the two
 * FLOATs of vector 0, then of vector 1, ... up to vector w-1.  A panel
 * therefore occupies row*2*w FLOATs of `dest`.
 *
 * Elements are walked in chunks of 4, then 2, then 1 (bits 1 and 0 of
 * `row`), which reproduces the store order of a fully unrolled copy.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  FLOAT *vec[8];              /* read cursors, one per vector of the panel */
  FLOAT *out;                 /* write cursor inside the current panel     */
  BLASLONG width, panels, p;
  BLASLONG chunk, chunks, n;
  BLASLONG r, v;

  for (width = 8; width >= 1; width >>= 1)
    {
      /* Only the widest panel repeats; each narrower one occurs at most once. */
      panels = (width == 8) ? col/8 : ((col & width) ? 1 : 0);

      for (p = 0; p < panels; p++)
        {
          for (v = 0; v < width; v++)
            vec[v] = src + v*2*srcdim;
          src += width*2*srcdim;

          /* Reserve this panel's slice of dest before filling it. */
          out = dest;
          dest += row*2*width;

          for (chunk = 4; chunk >= 1; chunk >>= 1)
            {
              chunks = (chunk == 4) ? row/4 : ((row & chunk) ? 1 : 0);

              for (n = 0; n < chunks; n++)
                {
                  for (r = 0; r < chunk; r++)
                    for (v = 0; v < width; v++)
                      {
                        *out++ = vec[v][2*r];
                        *out++ = vec[v][2*r+1];
                      }

                  for (v = 0; v < width; v++)
                    vec[v] += 2*chunk;
                }
            }
        }
    }

  return 0;
}
|
|
@ -0,0 +1,237 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Pack `row` source lines of `col` two-FLOAT elements each into `dest`.
 *
 * Consecutive source lines start 2*srcdim FLOATs apart and are consumed in
 * groups of 4, then 2, then 1 (bits 1 and 0 of `row`).  `dest` is split
 * into three regions:
 *   - the main region, for elements taken 4 at a time (8 FLOATs per line);
 *     successive 4-element slabs of one group are row*8 FLOATs apart and
 *     each group starts group*8 FLOATs after the previous one;
 *   - a 2-element tail region starting at (col & -4)*2*row, used when bit 1
 *     of `col` is set;
 *   - a 1-element tail region starting at (col & -2)*2*row, used when bit 0
 *     of `col` is set.
 * Inside every region the lines of a group are stored one after another.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  FLOAT *line[4];                                /* read cursors, one per line of the group */
  FLOAT *out;                                    /* write cursor in the main region          */
  FLOAT *tail2 = dest + (col & -4)*(2*row);      /* write cursor, 2-element tail region      */
  FLOAT *tail1 = dest + (col & -2)*(2*row);      /* write cursor, 1-element tail region      */
  BLASLONG group, groups, g;
  BLASLONG q, k, e;

  for (group = 4; group >= 1; group >>= 1)
    {
      /* Only the 4-line group repeats; the 2- and 1-line groups occur at most once. */
      groups = (group == 4) ? row/4 : ((row & group) ? 1 : 0);

      for (g = 0; g < groups; g++)
        {
          for (k = 0; k < group; k++)
            line[k] = src + k*2*srcdim;
          src += group*2*srcdim;

          out = dest;
          dest += group*8;

          /* Main region: 4 elements (8 FLOATs) from every line per step. */
          for (q = 0; q < col/4; q++)
            {
              for (k = 0; k < group; k++)
                {
                  for (e = 0; e < 8; e++)
                    out[k*8 + e] = line[k][e];
                  line[k] += 8;
                }
              out += row*8;
            }

          /* 2-element tail: 4 FLOATs from every line. */
          if (col & 2)
            {
              for (k = 0; k < group; k++)
                {
                  for (e = 0; e < 4; e++)
                    tail2[k*4 + e] = line[k][e];
                  line[k] += 4;
                }
              tail2 += group*4;
            }

          /* 1-element tail: 2 FLOATs from every line. */
          if (col & 1)
            {
              for (k = 0; k < group; k++)
                {
                  tail1[k*2]     = line[k][0];
                  tail1[k*2 + 1] = line[k][1];
                }
              tail1 += group*2;
            }
        }
    }

  return 0;
}
|
|
@ -0,0 +1,370 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the ISCAS nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software
|
||||||
|
without specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Copy `width` column entries from each of the first `nrows` row cursors into
 * `out`, row after row, then move every cursor past what was consumed.
 * Each column entry occupies two consecutive FLOATs (presumably the real and
 * imaginary parts of a complex value -- confirm against the caller).
 */
static void tcopy_pack_tile(FLOAT **rows, BLASLONG nrows, BLASLONG width, FLOAT *out)
{
  BLASLONG r, k;
  BLASLONG span = 2 * width;

  for (r = 0; r < nrows; r++)
    {
      FLOAT *in = rows[r];

      for (k = 0; k < span; k++)
        out[r * span + k] = in[k];

      rows[r] = in + span;
    }
}

/*
 * Pack one band of `nrows` (4, 2 or 1) consecutive source rows.
 *
 * Full groups of 8 columns are written starting at `band_dest`, stepping
 * (row << 4) FLOATs between successive groups.  The leftover 4-, 2- and
 * 1-column pieces are appended at *tail4, *tail2 and *tail1 respectively,
 * and those cursors are advanced so the next band continues after them.
 */
static void tcopy_pack_band(BLASLONG nrows, BLASLONG row, BLASLONG col,
                            FLOAT *band_src, BLASLONG srcdim, FLOAT *band_dest,
                            FLOAT **tail4, FLOAT **tail2, FLOAT **tail1)
{
  FLOAT *rows[4];
  BLASLONG r, blk;

  /* Consecutive source rows are 2*srcdim FLOATs apart. */
  rows[0] = band_src;
  for (r = 1; r < nrows; r++)
    rows[r] = rows[r - 1] + 2 * srcdim;

  for (blk = 0; blk < col / 8; blk++)
    {
      tcopy_pack_tile(rows, nrows, 8, band_dest);
      band_dest = band_dest + (row << 4);
    }

  if (col & 4)
    {
      tcopy_pack_tile(rows, nrows, 4, *tail4);
      *tail4 = *tail4 + nrows * 8;
    }

  if (col & 2)
    {
      tcopy_pack_tile(rows, nrows, 2, *tail2);
      *tail2 = *tail2 + nrows * 4;
    }

  if (col & 1)
    {
      tcopy_pack_tile(rows, nrows, 1, *tail1);
      *tail1 = *tail1 + nrows * 2;
    }
}

/*
 * Repack a `row` x `col` source panel (rows 2*srcdim FLOATs apart, two FLOATs
 * per column entry) into `dest`.
 *
 * The destination is split into four regions: the 8-column groups come first,
 * followed by the areas for the leftover 4-, 2- and 1-column pieces, located
 * at (col & -8), (col & -4) and (col & -2) times 2*row FLOATs from `dest`.
 * Source rows are consumed in bands of 4, then an optional band of 2, then an
 * optional single row.  Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
  BLASLONG band;
  FLOAT *tail4 = dest + (col & -8) * (2 * row);
  FLOAT *tail2 = dest + (col & -4) * (2 * row);
  FLOAT *tail1 = dest + (col & -2) * (2 * row);

  for (band = 0; band < row / 4; band++)
    {
      tcopy_pack_band(4, row, col, src, srcdim, dest, &tail4, &tail2, &tail1);
      src  = src + 4 * (2 * srcdim);
      dest = dest + (4 << 4);   /* 4 rows * 8 columns * 2 FLOATs */
    }

  if (row & 2)
    {
      tcopy_pack_band(2, row, col, src, srcdim, dest, &tail4, &tail2, &tail1);
      src  = src + 2 * (2 * srcdim);
      dest = dest + (2 << 4);   /* 2 rows * 8 columns * 2 FLOATs */
    }

  if (row & 1)
    tcopy_pack_band(1, row, col, src, srcdim, dest, &tail4, &tail2, &tail1);

  return 0;
}
|
|
@ -746,6 +746,22 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "Sandybridge\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef OPTERON
|
#ifdef OPTERON
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
@ -778,6 +794,22 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef BOBCAT
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
/* Debug trace naming the CPU target selected by the enclosing #ifdef BOBCAT. */
fprintf(stderr, "Bobcat\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NANO
|
#ifdef NANO
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
# Kernel source selection.  Every GEMM/GEMM3M kernel named here is a
# *_barcelona.S file; the copy routines come from ../generic.
# Recursive (=) assignment is kept on purpose: $(TSUFFIX) and $(SUFFIX) are
# not defined in this fragment and may only be set after it is read.

# --- SGEMM: kernel, copy-routine sources, and their object names ------------
# The "I" copy variables are left empty for this precision.
SGEMMKERNEL    = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY    =
SGEMMITCOPY    =
SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- DGEMM -------------------------------------------------------------------
DGEMMKERNEL    = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY    = ../generic/gemm_ncopy_2.c
DGEMMITCOPY    = ../generic/gemm_tcopy_2.c
DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- CGEMM: built from the zgemm_* sources; "I" copy variables left empty ----
CGEMMKERNEL    = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY    =
CGEMMITCOPY    =
CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- ZGEMM -------------------------------------------------------------------
ZGEMMKERNEL    = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY    = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY    = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- TRSM kernels ------------------------------------------------------------
# NOTE(review): every *_RN entry (and ZTRSMKERNEL_LN) points at an *_LT_*
# source file.  This looks intentional (one source serving several variants),
# but confirm before "correcting" any of these names.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S

ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S

# --- GEMM3M kernels ----------------------------------------------------------
CGEMM3MKERNEL  = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL  = zgemm3m_kernel_2x4_barcelona.S
|
|
@ -0,0 +1 @@
|
||||||
|
# This file defines nothing itself: it pulls in KERNEL.PENRYN from $(KERNELDIR).
include $(KERNELDIR)/KERNEL.PENRYN
|
|
@ -76,6 +76,12 @@
|
||||||
#define PREFETCHB prefetcht0
|
#define PREFETCHB prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE (8 * 1 - 4)
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#define PREFETCHB prefetcht0
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef PREFETCH
|
#ifndef PREFETCH
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -69,6 +69,12 @@
|
||||||
#define PREFETCHB prefetcht0
|
#define PREFETCHB prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE (16 * 1 - 8)
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#define PREFETCHB prefetcht0
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef PREFETCH
|
#ifndef PREFETCH
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
@ -262,7 +268,7 @@
|
||||||
movaps -16 * SIZE(AA), %xmm0
|
movaps -16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
addps %xmm2, %xmm7
|
addps %xmm2, %xmm7
|
||||||
#ifndef NEHALEM
|
#if !(defined(NEHALEM) || defined(SANDYBRIDGE))
|
||||||
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
pshufd $0x93, %xmm1, %xmm2
|
pshufd $0x93, %xmm1, %xmm2
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define PREFETCHSIZE (16 * 4)
|
#define PREFETCHSIZE (16 * 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 7)
|
#define PREFETCHSIZE (16 * 7)
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
#define PREFETCHSIZE (8 * 2)
|
#define PREFETCHSIZE (8 * 2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 7)
|
#define PREFETCHSIZE (8 * 7)
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define PREFETCHSIZE (16 * 4)
|
#define PREFETCHSIZE (16 * 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 7)
|
#define PREFETCHSIZE (16 * 7)
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
#define PREFETCHSIZE (8 * 2)
|
#define PREFETCHSIZE (8 * 2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 7)
|
#define PREFETCHSIZE (8 * 7)
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -488,7 +488,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1697,7 +1697,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1727,7 +1727,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -437,7 +437,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -833,7 +833,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1848,7 +1848,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2109,7 +2109,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2429,7 +2429,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2459,7 +2459,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2952,7 +2952,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -3148,7 +3148,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3389,7 +3389,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -3404,7 +3404,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -910,7 +910,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -959,7 +959,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1439,7 +1439,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1469,7 +1469,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -872,7 +872,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1316,7 +1316,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1855,7 +1855,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1885,7 +1885,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2249,7 +2249,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2562,7 +2562,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2957,7 +2957,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -2972,7 +2972,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -3280,7 +3280,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3515,7 +3515,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -1036,7 +1036,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1066,7 +1066,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
@ -2224,7 +2224,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -2273,7 +2273,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 21 + 4)
|
#define PREFETCHSIZE (8 * 21 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -454,7 +454,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -758,7 +758,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -993,7 +993,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -1324,7 +1324,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1354,7 +1354,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -1718,7 +1718,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2031,7 +2031,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2859,7 +2859,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -3303,7 +3303,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define PREFETCHB prefetcht0
|
#define PREFETCHB prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCHSIZE (8 * 1 - 4)
|
#define PREFETCHSIZE (8 * 1 - 4)
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHB prefetcht0
|
#define PREFETCHB prefetcht0
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define PREFETCHB prefetcht0
|
#define PREFETCHB prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCHSIZE (16 * 1 + 8)
|
#define PREFETCHSIZE (16 * 1 + 8)
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHB prefetcht0
|
#define PREFETCHB prefetcht0
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define PREFETCHSIZE (16 * 2)
|
#define PREFETCHSIZE (16 * 2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 7)
|
#define PREFETCHSIZE (16 * 7)
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
#define PREFETCHSIZE (8 * 2)
|
#define PREFETCHSIZE (8 * 2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 7)
|
#define PREFETCHSIZE (8 * 7)
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define PREFETCHSIZE (16 * 2)
|
#define PREFETCHSIZE (16 * 2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 7)
|
#define PREFETCHSIZE (16 * 7)
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
#define PREFETCHSIZE (8 * 2)
|
#define PREFETCHSIZE (8 * 2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (8 * 7)
|
#define PREFETCHSIZE (8 * 7)
|
||||||
|
|
|
@ -55,7 +55,7 @@
|
||||||
#define XX %edi
|
#define XX %edi
|
||||||
#define FLAG %ebp
|
#define FLAG %ebp
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON)
|
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
|
||||||
#define USE_PSHUFD
|
#define USE_PSHUFD
|
||||||
#else
|
#else
|
||||||
#define USE_PSHUFD_HALF
|
#define USE_PSHUFD_HALF
|
||||||
|
@ -697,7 +697,7 @@
|
||||||
cmpl $2 * SIZE, INCX
|
cmpl $2 * SIZE, INCX
|
||||||
jne .L120
|
jne .L120
|
||||||
|
|
||||||
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
|
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
|
||||||
|
|
||||||
PSHUFD2($0, %xmm0, %xmm6)
|
PSHUFD2($0, %xmm0, %xmm6)
|
||||||
PSHUFD2($0, %xmm1, %xmm1)
|
PSHUFD2($0, %xmm1, %xmm1)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
|
|
||||||
#include "l1param.h"
|
#include "l1param.h"
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON)
|
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
|
||||||
#define USE_PSHUFD
|
#define USE_PSHUFD
|
||||||
#else
|
#else
|
||||||
#define USE_PSHUFD_HALF
|
#define USE_PSHUFD_HALF
|
||||||
|
@ -860,7 +860,7 @@
|
||||||
cmpl $2 * SIZE, INCX
|
cmpl $2 * SIZE, INCX
|
||||||
jne .L220
|
jne .L220
|
||||||
|
|
||||||
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
|
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
|
||||||
|
|
||||||
#ifdef HAVE_SSE3
|
#ifdef HAVE_SSE3
|
||||||
movddup %xmm0, %xmm6
|
movddup %xmm0, %xmm6
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -533,7 +533,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -63,7 +63,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -994,7 +994,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -63,7 +63,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht1
|
#define PREFETCH prefetcht1
|
||||||
#define PREFETCHSIZE 84
|
#define PREFETCHSIZE 84
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -1820,7 +1820,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
# Source files assigned to the double-complex GEMV kernel variables
# (one for the N variant, one for the T variant).
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
# Single-precision GEMM: the kernel source, the four copy-routine sources
# (IN/IT/ON/OT) and the object file names paired with each copy routine.
# $(TSUFFIX) and $(SUFFIX) are not set anywhere in this file, so they must be
# supplied by whatever makefile includes it.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
SGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
# Double-precision GEMM: kernel source plus ON/OT copy routines only.
# The IN/IT copy sources and their object names are deliberately assigned
# empty values here.
# NOTE(review): presumably the empty IN/IT entries mean the ON/OT routines are
# reused for a square 4x4 kernel -- confirm against the consuming makefile.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
|
||||||
|
DGEMMINCOPY =
|
||||||
|
DGEMMITCOPY =
|
||||||
|
DGEMMONCOPY = gemm_ncopy_4_opteron.S
|
||||||
|
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
|
||||||
|
DGEMMINCOPYOBJ =
|
||||||
|
DGEMMITCOPYOBJ =
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
# Single-complex GEMM: kernel source, all four copy-routine sources
# (IN/IT from ../generic, ON/OT from assembly files in this directory) and
# the matching object names.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPY = zgemm_ncopy_2.S
|
||||||
|
CGEMMOTCOPY = zgemm_tcopy_2.S
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
# Double-complex GEMM: kernel source plus ON/OT copy routines only; the IN/IT
# copy sources and their object names are assigned empty values, mirroring
# the DGEMM entries in this file.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||||
|
ZGEMMINCOPY =
|
||||||
|
ZGEMMITCOPY =
|
||||||
|
ZGEMMONCOPY = zgemm_ncopy_2.S
|
||||||
|
ZGEMMOTCOPY = zgemm_tcopy_2.S
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
# TRSM kernel sources for the four precisions (S, D, C, Z), one variable per
# LN / LT / RN / RT case.
# In every precision the RN variable is assigned the same source file as LT;
# only LN, LT and RT have distinct files.
# NOTE(review): the RN == LT pairing is consistent across all four groups, so
# it looks intentional -- confirm before "fixing" it.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
|
||||||
|
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
|
||||||
|
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
|
||||||
|
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
|
||||||
|

||||||
|
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
|
||||||
|
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
|
||||||
|
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
|
||||||
|
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
|
||||||
|

||||||
|
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
|
||||||
|
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
|
||||||
|
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
|
||||||
|
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
|
||||||
|

||||||
|
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
|
||||||
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
|
||||||
|
|
||||||
|
# GEMM3M kernel sources for the single-complex (C) and double-complex (Z)
# variants.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
|
@ -0,0 +1,84 @@
|
||||||
|
# Single-precision GEMM: Sandy Bridge kernel source plus ON/OT copy routines
# taken from ../generic. The IN/IT copy sources and their object names are
# assigned empty values.
# $(TSUFFIX) and $(SUFFIX) are not set anywhere in this file, so they must be
# supplied by whatever makefile includes it.
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
|
||||||
|
SGEMMINCOPY =
|
||||||
|
SGEMMITCOPY =
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
SGEMMINCOPYOBJ =
|
||||||
|
SGEMMITCOPYOBJ =
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
# Double-precision GEMM: Sandy Bridge kernel source with all four copy
# routines taken from the generic C implementations (8-wide for IN/IT,
# 4-wide for ON/OT, going by the file names).
# The two commented-out lines keep the assembly ON/OT copy routines
# (gemm_ncopy_4.S / gemm_tcopy_4.S) as a disabled alternative.
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
#DGEMMONCOPY = gemm_ncopy_4.S
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
#DGEMMOTCOPY = gemm_tcopy_4.S
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
# Single-complex GEMM: Sandy Bridge kernel source; the previous Nehalem kernel
# assignment is kept commented out directly below. All four copy routines use
# the *_sandy.c sources under ../generic.
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
|
||||||
|
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
# Double-complex GEMM: Sandy Bridge kernel source; the previous Nehalem kernel
# assignment is kept commented out directly below. Only ON/OT copy routines
# are set (generic C); the IN/IT sources and object names are assigned empty
# values.
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
|
||||||
|
ZGEMMINCOPY =
|
||||||
|
ZGEMMITCOPY =
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
|
||||||
|
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
|
||||||
|
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
||||||
|
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
|
||||||
|
|
||||||
|
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
|
||||||
|
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
|
||||||
|
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
|
||||||
|
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
|
||||||
|
|
||||||
|
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
|
||||||
|
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
|
||||||
|
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
|
||||||
|
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
|
||||||
|
|
||||||
|
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
|
||||||
|
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
|
||||||
|
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
|
||||||
|
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
|
||||||
|
# Active TRSM kernel selection: every precision (S, D, C, Z) uses the generic
# C sources ../generic/trsm_kernel_{LN,LT,RN,RT}.c, and each of the four cases
# has its own source file (RN is not aliased to LT here).
# NOTE(review): the complex (C/Z) variables name the same trsm_kernel_*.c
# files as the real ones; presumably those sources handle complex data via
# compile-time defines -- confirm against the generic sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|

||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|

||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|

||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# GEMM3M kernel sources for the single-complex (C) and double-complex (Z)
# variants.
# NOTE(review): these still name Nehalem kernels (4x8 / 2x8) while the GEMM
# kernels in this file are the *_sandy.S ones; confirm the tile sizes agree
# with the unroll parameters used for this target.
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -45,6 +45,12 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE 16
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef MOVAPS
|
#ifndef MOVAPS
|
||||||
#define MOVAPS movaps
|
#define MOVAPS movaps
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCHSIZE 16
|
#define PREFETCHSIZE 16
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
|
|
|
@ -45,6 +45,12 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE 12
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef MOVAPS
|
#ifndef MOVAPS
|
||||||
#define MOVAPS movaps
|
#define MOVAPS movaps
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -52,6 +52,13 @@
|
||||||
#define MOVUPS_A movups
|
#define MOVUPS_A movups
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE 12
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#define MOVUPS_A movups
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
|
||||||
#define PREFETCHSIZE 16
|
#define PREFETCHSIZE 16
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
|
|
|
@ -51,6 +51,12 @@
|
||||||
#define MOVUPS_A movups
|
#define MOVUPS_A movups
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE 12
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define MOVUPS_A movups
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
|
||||||
#define PREFETCHSIZE 16
|
#define PREFETCHSIZE 16
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
|
|
|
@ -46,6 +46,13 @@
|
||||||
#define MOVUPS_A movups
|
#define MOVUPS_A movups
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SANDYBRIDGE
|
||||||
|
#define PREFETCHSIZE 16
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#define MOVUPS_A movups
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef MOVUPS_A
|
#ifdef MOVUPS_A
|
||||||
#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
|
#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
|
||||||
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS
|
#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS
|
||||||
|
|
|
@ -46,6 +46,13 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(SANDYBRIDGE)
|
||||||
|
#define RPREFETCHSIZE 12
|
||||||
|
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define M ARG1 /* rdi */
|
#define M ARG1 /* rdi */
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define RPREFETCHSIZE 12
|
#define RPREFETCHSIZE 12
|
||||||
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
|
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
|
|
|
@ -46,6 +46,13 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(SANDYBRIDGE)
|
||||||
|
#define RPREFETCHSIZE 12
|
||||||
|
#define WPREFETCHSIZE (RPREFETCHSIZE * 2)
|
||||||
|
#define PREFETCH prefetcht0
|
||||||
|
#define PREFETCHW prefetcht0
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef WINDOWS_ABI
|
#ifndef WINDOWS_ABI
|
||||||
|
|
||||||
#define M ARG1 /* rdi */
|
#define M ARG1 /* rdi */
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
|
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define RPREFETCHSIZE 12
|
#define RPREFETCHSIZE 12
|
||||||
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
|
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 12)
|
#define PREFETCHSIZE (16 * 12)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -685,7 +685,7 @@
|
||||||
cmpq $2 * SIZE, INCX
|
cmpq $2 * SIZE, INCX
|
||||||
jne .L120
|
jne .L120
|
||||||
|
|
||||||
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
|
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
|
||||||
|
|
||||||
pshufd $0, %xmm0, %xmm14
|
pshufd $0, %xmm0, %xmm14
|
||||||
pshufd $0, %xmm1, %xmm1
|
pshufd $0, %xmm1, %xmm1
|
||||||
|
|
|
@ -55,7 +55,7 @@
|
||||||
|
|
||||||
#include "l1param.h"
|
#include "l1param.h"
|
||||||
|
|
||||||
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO)
|
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE)
|
||||||
#define USE_PSHUFD
|
#define USE_PSHUFD
|
||||||
#else
|
#else
|
||||||
#define USE_PSHUFD_HALF
|
#define USE_PSHUFD_HALF
|
||||||
|
@ -803,7 +803,7 @@
|
||||||
cmpq $2 * SIZE, INCX
|
cmpq $2 * SIZE, INCX
|
||||||
jne .L220
|
jne .L220
|
||||||
|
|
||||||
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM)
|
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
|
||||||
|
|
||||||
movddup %xmm0, %xmm14
|
movddup %xmm0, %xmm14
|
||||||
pxor %xmm15, %xmm15
|
pxor %xmm15, %xmm15
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
@ -160,7 +160,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -57,7 +57,7 @@
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef NEHALEM
|
#if defined(NEHALEM) || defined(SANDYBRIDGE)
|
||||||
#define PREFETCH prefetcht0
|
#define PREFETCH prefetcht0
|
||||||
#define PREFETCHW prefetcht0
|
#define PREFETCHW prefetcht0
|
||||||
#define PREFETCHSIZE (16 * 24)
|
#define PREFETCHSIZE (16 * 24)
|
||||||
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define BORIG 72(%rsp)
|
#define BORIG 72(%rsp)
|
||||||
#define BUFFER 128(%rsp)
|
#define BUFFER 128(%rsp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -95,7 +95,7 @@
|
||||||
#define PREFETCHSIZE (8 * 6 + 4)
|
#define PREFETCHSIZE (8 * 6 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -86,7 +86,7 @@
|
||||||
#define BORIG 72(%rsp)
|
#define BORIG 72(%rsp)
|
||||||
#define BUFFER 128(%rsp)
|
#define BUFFER 128(%rsp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
|
@ -95,7 +95,7 @@
|
||||||
#define PREFETCHSIZE (8 * 6 + 4)
|
#define PREFETCHSIZE (8 * 6 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHNTA prefetchnta
|
#define PREFETCHNTA prefetchnta
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue