Merge branch 'loongson3a' into loongson3b

This commit is contained in:
Xianyi Zhang 2013-04-08 14:56:39 +08:00
commit d692ee07f7
173 changed files with 29723 additions and 1684 deletions

9
.gitignore vendored
View File

@ -1,16 +1,25 @@
*.obj *.obj
*.lib *.lib
*.dll *.dll
*.dylib
*.def *.def
*.o *.o
lapack-3.1.1 lapack-3.1.1
lapack-3.1.1.tgz lapack-3.1.1.tgz
lapack-3.4.1
lapack-3.4.1.tgz
lapack-3.4.2
lapack-3.4.2.tgz
*.so *.so
*.a *.a
.svn .svn
*~ *~
lib.grd
nohup.out
config.h config.h
Makefile.conf Makefile.conf
Makefile.conf_last
config_last.h
getarch getarch
getarch_2nd getarch_2nd
utest/openblas_utest utest/openblas_utest

View File

@ -1,4 +1,115 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.2.6
2-Mar-2013
common:
* Improved OpenMP performance slightly. (d744c9)
* Improved cblas.h compatibility with Intel MKL.(#185)
* Fixed the overflowing bug in single thread cholesky factorization.
* Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174)
x86/x86-64:
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
We will tune the performance in future.
* Auto-detect Intel Xeon E7540.
* Fixed the overflowing buffer bug of gemv. (#173)
* Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189)
MIPS64:
====================================================================
Version 0.2.5
26-Nov-2012
common:
* Added NO_SHARED flag to disable generating the shared library.
* Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158)
* Export LAPACK 3.4.2 symbols in shared library. (#147)
* Only detect the number of physical CPU cores on Mac OSX. (#157)
* Fixed NetBSD build. (#155)
* Fixed compilation with TARGET=GENERIC. (#160)
x86/x86-64:
* Restore the original CPU affinity when calling
openblas_set_num_threads(1) (#153)
* Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154)
MIPS64:
====================================================================
Version 0.2.4
8-Oct-2012
common:
* Upgraded LAPACK to 3.4.2 version. (#145)
* Provided support for passing CFLAGS, FFLAGS, PFLAGS,
FPFLAGS to make. (#137)
* f77blas.h:compatibility for compilers without C99 complex
number support. (#141)
x86/x86-64:
* Added NO_AVX flag. Check OS supporting AVX on runtime. (#139)
* Fixed zdot incompatibility ABI issue with GCC 4.7 on
Windows 32-bit. (#140)
MIPS64:
* Fixed the generation of shared library bug.
* Fixed the detection bug on the Loongson 3A server.
====================================================================
Version 0.2.3
20-Aug-2012
common:
* Fixed LAPACK unstable bug about ?laswp. (#130)
* Fixed the shared library bug about unloading the library on
Linux (#132).
* Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2)
Please use gcc and IBM xlf. (#134)
x86/x86-64:
* Supported goto_set_num_threads and openblas_set_num_threads
APIs in Windows. They can set the number of threads on runtime.
====================================================================
Version 0.2.2
6-July-2012
common:
* Fixed exporting DLL functions bug on Windows/MingW
* Support GNU Hurd (Thank Sylvestre Ledru)
* Support kfreebsd kernel (Thank Sylvestre Ledru)
x86/x86-64:
* Support Intel Sandy Bridge 22nm desktop/mobile CPU
SPARC:
* Improve the detection of SPARC (Thank Sylvestre Ledru)
====================================================================
Version 0.2.1
30-Jun-2012
common:
x86/x86-64:
* Fixed the SEGFAULT bug about hyper-threading
* Support AMD Bulldozer by using GotoBLAS2 AMD Barcelona codes
====================================================================
Version 0.2.0
26-Jun-2012
common:
* Removed the limitation (64) of numbers of CPU cores.
Now, it supports 256 cores at max.
* Supported clang compiler.
* Fixed some build bugs on FreeBSD
x86/x86-64:
* Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions.
Please use gcc >= 4.6 or clang >=3.1.
* Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes.
====================================================================
Version 0.1.1
29-Apr-2012
common:
* Upgraded LAPACK to 3.4.1 version. (Thank Zaheer Chothia)
* Supported LAPACKE, a C interface to LAPACK. (Thank Zaheer Chothia)
* Fixed the build bug (MD5 and download) on Mac OSX.
* Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
* Fixed the compatibility issue for compilers without C99 complex number
(e.g. Visual Studio)
x86/x86_64:
* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
* Test alpha=Nan in dscale.
* Fixed a SEGFAULT bug in samax on x86 windows.
==================================================================== ====================================================================
Version 0.1.0 Version 0.1.0
23-Mar-2012 23-Mar-2012

View File

@ -90,6 +90,15 @@
number of threads will consume extra resource. I recommend you to number of threads will consume extra resource. I recommend you to
specify minimum number of threads. specify minimum number of threads.
1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong?
A This may be related to a bug in the Linux kernel 2.6.32. Try applying
the patch segfaults.patch using
patch < segfaults.patch
and see if the crashes persist. Note that this patch will lead to many
compiler warnings.
2. Architecture Specific issue or Implementation 2. Architecture Specific issue or Implementation

View File

@ -1,4 +1,4 @@
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without

118
Makefile
View File

@ -3,7 +3,7 @@ include ./Makefile.system
BLASDIRS = interface driver/level2 driver/level3 driver/others BLASDIRS = interface driver/level2 driver/level3 driver/others
ifndef DYNAMIC_ARCH ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel BLASDIRS += kernel
endif endif
@ -80,6 +80,7 @@ endif
@echo @echo
shared : shared :
ifndef NO_SHARED
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
$(MAKE) -C exports so $(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so -ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ -99,11 +100,10 @@ ifeq ($(OSNAME), Darwin)
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll $(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif
endif endif
tests : tests :
@ -147,7 +147,7 @@ ifeq ($(EXPRECISION), 1)
echo "#define EXPRECISION">> config_last.h echo "#define EXPRECISION">> config_last.h
endif endif
## ##
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1 $(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \ for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@ -165,7 +165,7 @@ prof_blas :
$(MAKE) -C $$d prof || exit 1 ; \ $(MAKE) -C $$d prof || exit 1 ; \
fi; \ fi; \
done done
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonprof || exit 1 $(MAKE) -C kernel commonprof || exit 1
endif endif
@ -184,7 +184,7 @@ hpl :
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \ fi; \
done done
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1 $(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \ for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@ -203,47 +203,73 @@ ifeq ($(NO_LAPACK), 1)
netlib : netlib :
else else
netlib : lapack-3.4.0 patch.for_lapack-3.4.0 lapack-3.4.0/make.inc netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
ifndef NOFORTRAN ifndef NOFORTRAN
-@$(MAKE) -C lapack-3.4.0 lapacklib -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
endif
ifndef NO_LAPACKE
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
endif endif
endif endif
prof_lapack : lapack-3.4.0 lapack-3.4.0/make.inc prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc
-@$(MAKE) -C lapack-3.4.0 lapack_prof -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack-3.4.0/make.inc : $(NETLIB_LAPACK_DIR)/make.inc :
ifndef NOFORTRAN ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > lapack-3.4.0/make.inc -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(FFLAGS)" >> lapack-3.4.0/make.inc -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(FPFLAGS)" >> lapack-3.4.0/make.inc -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.4.0/make.inc -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(FPFLAGS) -O0" >> lapack-3.4.0/make.inc -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.4.0/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> lapack-3.4.0/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> lapack-3.4.0/make.inc ifdef INTERFACE64
-@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.4.0/make.inc -@echo "override CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.4.0/make.inc else
-@echo "SUFFIX = $(SUFFIX)" >> lapack-3.4.0/make.inc -@echo "override CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.4.0/make.inc endif
# -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.4.0/make.inc -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@cat make.inc >> lapack-3.4.0/make.inc -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif endif
lapack-3.4.0 : lapack-3.4.0.tgz lapack-3.4.2 : lapack-3.4.2.tgz
ifndef NOFORTRAN ifndef NOFORTRAN
@if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ ifndef NO_LAPACK
@if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
echo $(TAR) zxf $< ;\ echo $(TAR) zxf $< ;\
$(TAR) zxf $< && (cd lapack-3.4.0; $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
else \ else \
echo " lapack-3.4.0.tgz check sum is wrong (Please use orignal)." ;\ rm -rf $(NETLIB_LAPACK_DIR) ;\
rm -rf lapack-3.4.0 ;\ echo " Cannot download lapack-3.4.2.tgz or the MD5 check sum is wrong (Please use orignal)."; \
exit 1; \
fi fi
endif endif
endif
lapack-3.4.0.tgz : LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz
lapack-3.4.2.tgz :
ifndef NOFORTRAN ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/lapack-3.4.0.tgz #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL)
else
ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL)
else
wget -O $@ $(LAPACK_URL)
endif
endif
endif endif
large.tgz : large.tgz :
@ -256,21 +282,21 @@ ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/timing.tgz -wget http://www.netlib.org/lapack/timing/timing.tgz
endif endif
lapack-timing : lapack-3.4.0 large.tgz timing.tgz lapack-timing : lapack-3.4.2 large.tgz timing.tgz
ifndef NOFORTRAN ifndef NOFORTRAN
(cd lapack-3.4.0; $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd lapack-3.4.0/TIMING; $(TAR) zxf ../../large.tgz ) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
make -C lapack-3.4.0 tmglib make -C $(NETLIB_LAPACK_DIR) tmglib
make -C lapack-3.4.0/TIMING make -C $(NETLIB_LAPACK_DIR)/TIMING
endif endif
lapack-test : lapack-test :
$(MAKE) -C lapack-3.4.0 tmglib $(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
$(MAKE) -C lapack-3.4.0/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc $(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc
@rm -f lapack-3.4.0/TESTING/*.out @rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out
$(MAKE) -j 1 -C lapack-3.4.0/TESTING $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING
$(GREP) failed lapack-3.4.0/TESTING/*.out $(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out
dummy : dummy :
@ -288,10 +314,10 @@ clean ::
#endif #endif
@$(MAKE) -C reference clean @$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d lapack-3.4.0; then \ @if test -d $(NETLIB_LAPACK_DIR); then \
echo deleting lapack-3.4.0; \ echo deleting $(NETLIB_LAPACK_DIR); \
rm -rf lapack-3.4.0 ;\ rm -rf $(NETLIB_LAPACK_DIR) ;\
fi fi
@rm -f *.grd Makefile.conf_last config_last.h @rm -f *.grd Makefile.conf_last config_last.h
@echo Done. @echo Done.

View File

@ -23,7 +23,7 @@ install : lib.grd
@cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR) @echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR)
@echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h
@ -32,8 +32,18 @@ install : lib.grd
@cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h @cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
ifndef NO_CBLAS
@echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR) @echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h @sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR)
@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
#for install static library #for install static library
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
@ -61,11 +71,9 @@ ifeq ($(OSNAME), Darwin)
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
endif endif
@echo Install OK! @echo Install OK!

View File

@ -1,3 +1,5 @@
# This is triggered by Makefile.system and runs before any of the code is built.
export BINARY export BINARY
export USE_OPENMP export USE_OPENMP
@ -15,7 +17,7 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99 EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif endif
all: getarch_2nd all: getarch_2nd cblas_noconst.h
./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF) ./getarch_2nd 1 >> $(TARGET_CONF)
@ -36,4 +38,7 @@ else
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
endif endif
cblas_noconst.h : cblas.h
perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
dummy: dummy:

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.1.0 VERSION = 0.2.6
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -24,10 +24,13 @@ VERSION = 0.1.0
# Fortran compiler. Default is g77. # Fortran compiler. Default is g77.
# FC = gfortran # FC = gfortran
# Even you can specify cross compiler # Even you can specify cross compiler. Meanwhile, please set HOSTCC.
# CC = x86_64-w64-mingw32-gcc # CC = x86_64-w64-mingw32-gcc
# FC = x86_64-w64-mingw32-gfortran # FC = x86_64-w64-mingw32-gfortran
# If you use the cross compiler, please set this host compiler.
# HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# BINARY=64 # BINARY=64
@ -45,12 +48,19 @@ VERSION = 0.1.0
# automatically detected by the the script. # automatically detected by the the script.
# NUM_THREADS = 24 # NUM_THREADS = 24
# If you don't need to generate the shared library, please comment it in.
# NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in. # If you don't need CBLAS interface, please comment it in.
# NO_CBLAS = 1 # NO_CBLAS = 1
# If you don't need LAPACK, please comment it in. # If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1 # NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# NO_LAPACKE = 1
# If you want to use legacy threaded Level 3 implementation. # If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1 # USE_SIMPLE_THREADED_LEVEL3 = 1
@ -67,6 +77,10 @@ VERSION = 0.1.0
# If you want to disable CPU/Memory affinity on Linux. # If you want to disable CPU/Memory affinity on Linux.
# NO_AFFINITY = 1 # NO_AFFINITY = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
# and OS. However, the performance is low.
# NO_AVX = 1
# If you would like to know minute performance report of GotoBLAS. # If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1 # FUNCTION_PROFILE = 1
@ -90,8 +104,8 @@ VERSION = 0.1.0
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading # with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4. # in small matrix sizes. The default value is 50.
# GEMM_MULTITHREAD_THRESHOLD = 4 # GEMM_MULTITHREAD_THRESHOLD = 50
# If you need santy check by comparing reference BLAS. It'll be very # If you need santy check by comparing reference BLAS. It'll be very
# slow (Not implemented yet). # slow (Not implemented yet).
@ -104,19 +118,16 @@ VERSION = 0.1.0
# The installation directory. # The installation directory.
# PREFIX = /opt/OpenBLAS # PREFIX = /opt/OpenBLAS
# Common Optimization Flag; -O2 is enough. # Common Optimization Flag;
# DEBUG = 1 # The default -O2 is enough.
# COMMON_OPT = -O2
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
# -DDEBUG
else
COMMON_OPT += -O2
endif
# Profiling flags # Profiling flags
COMMON_PROF = -pg COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# #
# End of user configuration # End of user configuration
# #

View File

@ -9,8 +9,20 @@ ifndef TOPDIR
TOPDIR = . TOPDIR = .
endif endif
ifndef NETLIB_LAPACK_DIR
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2
endif
# Default C compiler # Default C compiler
# - Only set if not specified on the command line or inherited from the environment.
# - CC is an implicit variable so neither '?=' nor 'ifndef' can be used.
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
CC = gcc CC = gcc
endif
# Default Fortran compiler (FC) is selected by f_check.
ifndef MAKEFILE_RULE ifndef MAKEFILE_RULE
include $(TOPDIR)/Makefile.rule include $(TOPDIR)/Makefile.rule
@ -41,16 +53,24 @@ GETARCH_FLAGS += -DUSE64BITINT
endif endif
ifndef GEMM_MULTITHREAD_THRESHOLD ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4 GEMM_MULTITHREAD_THRESHOLD=50
endif endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
# This operation is expensive, so execution should be once. # This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h # Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
ifndef TARGET_CORE ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf include $(TOPDIR)/Makefile.conf
@ -101,6 +121,15 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
export MACOSX_DEPLOYMENT_TARGET=10.2 export MACOSX_DEPLOYMENT_TARGET=10.2
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), FreeBSD)
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), NetBSD)
MD5SUM = md5 -n
endif endif
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
@ -120,6 +149,26 @@ EXTRALIB += -defaultlib:advapi32
SUFFIX = obj SUFFIX = obj
PSUFFIX = pobj PSUFFIX = pobj
LIBSUFFIX = lib LIBSUFFIX = lib
ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Major version > 4
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(GCCVERSIONGTEQ4), 1)
ifeq ($(GCCMINORVERSIONGTEQ7), 1)
# GCC Version >=4.7
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
endif
endif
endif endif
ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
@ -223,14 +272,20 @@ endif
endif endif
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif
endif endif
ifndef DYNAMIC_CORE ifndef DYNAMIC_CORE
@ -459,11 +514,28 @@ ifdef INTERFACE64
FCOMMON_OPT += -i8 FCOMMON_OPT += -i8
endif endif
endif endif
ifeq ($(ARCH), mips64)
ifndef BINARY64
FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
FCOMMON_OPT += -loongson3
endif
ifeq ($(CORE), LOONGSON3B)
FCOMMON_OPT += -loongson3
endif
else
ifndef BINARY64 ifndef BINARY64
FCOMMON_OPT += -m32 FCOMMON_OPT += -m32
else else
FCOMMON_OPT += -m64 FCOMMON_OPT += -m64
endif endif
endif
ifdef USE_OPENMP ifdef USE_OPENMP
FEXTRALIB += -lstdc++ FEXTRALIB += -lstdc++
@ -472,12 +544,30 @@ endif
endif endif
ifeq ($(C_COMPILER), OPEN64) ifeq ($(C_COMPILER), OPEN64)
ifeq ($(ARCH), mips64)
ifndef BINARY64
CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -loongson3
endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -loongson3
endif
else
ifndef BINARY64 ifndef BINARY64
CCOMMON_OPT += -m32 CCOMMON_OPT += -m32
else else
CCOMMON_OPT += -m64 CCOMMON_OPT += -m64
endif endif
endif endif
endif
ifeq ($(C_COMPILER), SUN) ifeq ($(C_COMPILER), SUN)
CCOMMON_OPT += -w CCOMMON_OPT += -w
@ -533,6 +623,16 @@ endif
ifeq ($(NO_LAPACK), 1) ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK CCOMMON_OPT += -DNO_LAPACK
#Disable LAPACK C interface
NO_LAPACKE = 1
endif
ifeq ($(NO_LAPACKE), 1)
CCOMMON_OPT += -DNO_LAPACKE
endif
ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif endif
ifdef SMP ifdef SMP
@ -651,17 +751,30 @@ PATCH = patch
GREP = grep GREP = grep
endif endif
ifndef MD5SUM
MD5SUM = md5sum MD5SUM = md5sum
endif
AWK = awk AWK = awk
REVISION = -r$(VERSION) REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) ifeq ($(DEBUG), 1)
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) COMMON_OPT += -g
endif
FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) ifndef COMMON_OPT
FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) COMMON_OPT = -O2
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
ifndef SUFFIX ifndef SUFFIX
SUFFIX = o SUFFIX = o
@ -675,7 +788,7 @@ ifndef LIBSUFFIX
LIBSUFFIX = a LIBSUFFIX = a
endif endif
ifndef DYNAMIC_ARCH ifneq ($(DYNAMIC_ARCH), 1)
ifndef SMP ifndef SMP
LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX)
LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX)
@ -694,8 +807,8 @@ endif
endif endif
LIBDLLNAME = $(LIBPREFIX).dll
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll)
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
@ -740,6 +853,7 @@ export HAVE_SSE4_1
export HAVE_SSE4_2 export HAVE_SSE4_2
export HAVE_SSE4A export HAVE_SSE4A
export HAVE_SSE5 export HAVE_SSE5
export HAVE_AVX
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE

View File

@ -22,19 +22,19 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif endif
$(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
$(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
libs :: $(BLASOBJS) $(COMMONOBJS) libs :: $(BLASOBJS) $(COMMONOBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

83
README
View File

@ -1,83 +0,0 @@
OpenBLAS Readme
1.Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn)
2.Installation
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or,
check out codes from git://github.com/xianyi/OpenBLAS.git
1)Normal compile
(a) type "make" to detect the CPU automatically.
or
(b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
2)Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
examples:
On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
3)Debug version
make DEBUG=1
4)Install to the directory (Optional)
e.g.
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
3.Support CPU & OS
Please read GotoBLAS_01Readme.txt
Additional support CPU:
x86_64:
Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
MIPS64:
ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good.
4.Usages
Link with libopenblas.a or -lopenblas for shared library.
4.1 Set the number of threads with environment variables. for example,
export OPENBLAS_NUM_THREADS=4
or
export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
4.2 Set the number of threads with calling functions. for example,
void goto_set_num_threads(int num_threads);
or
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
5.Report Bugs
Please open an issue at https://github.com/xianyi/OpenBLAS/issues
6.To-Do List:
Optimization on ICT Loongson 3A CPU
7.Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
8.ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
9.Known Issues
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32.
* On Loongson 3A, "make test" may fail because of a pthread_create error. The error code is EAGAIN. However, the same test case runs correctly when you start it directly from the shell. I don't think this is a bug in OpenBLAS.
10. Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This is the main branch, reflecting a production-ready state.
* The develop branch. This is the main branch reflecting the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

117
README.md Normal file
View File

@ -0,0 +1,117 @@
# OpenBLAS
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
## Installation
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
### Normal compile
* type "make" to detect the CPU automatically.
or
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Examples:
On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
### Debug version
make DEBUG=1
### Install to the directory (Optional)
Example:
make install PREFIX=your_installation_directory
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
- **FreeBSD**: Supported by the community. We have not tested the library on this OS.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
### Set the number of threads with environment variables.
Examples:
export OPENBLAS_NUM_THREADS=4
or
export GOTO_NUM_THREADS=4
or
export OMP_NUM_THREADS=4
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
### Set the number of threads at runtime.
We provide the functions below to control the number of threads at runtime.
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please open an issue at https://github.com/xianyi/OpenBLAS/issues
## Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
## ChangeLog
Please see Changelog.txt for the differences from the GotoBLAS2 1.13 BSD version.
## Troubleshooting
* Please read the [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should be less than or equal to 256.
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
* On Loongson 3A, "make test" may fail because of a pthread_create error. The error code is EAGAIN. However, the same test case runs correctly when you start it directly from the shell.
## Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This is the main branch, reflecting a production-ready state.
* The develop branch. This is the main branch reflecting the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages

View File

@ -8,8 +8,8 @@ Supported List:
1.X86/X86_64 1.X86/X86_64
a)Intel CPU: a)Intel CPU:
P2 P2
COPPERMINE
KATMAI KATMAI
COPPERMINE
NORTHWOOD NORTHWOOD
PRESCOTT PRESCOTT
BANIAS BANIAS
@ -18,6 +18,7 @@ CORE2
PENRYN PENRYN
DUNNINGTON DUNNINGTON
NEHALEM NEHALEM
SANDYBRIDGE
ATOM ATOM
b)AMD CPU: b)AMD CPU:
@ -27,6 +28,8 @@ OPTERON_SSE3
BARCELONA BARCELONA
SHANGHAI SHANGHAI
ISTANBUL ISTANBUL
BOBCAT
BULLDOZER
c)VIA CPU: c)VIA CPU:
SSE_GENERIC SSE_GENERIC
@ -47,6 +50,7 @@ CELL
3.MIPS64 CPU: 3.MIPS64 CPU:
SICORTEX SICORTEX
LOONGSON3A LOONGSON3A
LOONGSON3B
4.IA64 CPU: 4.IA64 CPU:
ITANIUM2 ITANIUM2

12
c_check
View File

@ -43,14 +43,14 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/);
$compiler = GCC if ($compiler eq ""); $compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/); $os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FreeBSD/); $os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NetBSD/); $os = NetBSD if ($data =~ /OS_NETBSD/);
$os = Darwin if ($data =~ /OS_Darwin/); $os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SunOS/); $os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/); $os = AIX if ($data =~ /OS_AIX/);
$os = osf if ($data =~ /OS_OSF/); $os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/); $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/); $os = Interix if ($data =~ /OS_INTERIX/);
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);
@ -174,6 +174,8 @@ $linker_a = "";
$link =~ s/\-Y\sP\,/\-Y/g; $link =~ s/\-Y\sP\,/\-Y/g;
@flags = split(/[\s\,\n]/, $link); @flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
foreach $flags (@flags) { foreach $flags (@flags) {
if ( if (

450
cblas.h
View File

@ -1,287 +1,293 @@
#ifndef CBLAS_H #ifndef CBLAS_H
#define CBLAS_H #define CBLAS_H
#include <stddef.h>
#include "common.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
/* Assume C declarations for C++ */ /* Assume C declarations for C++ */
#endif /* __cplusplus */ #endif /* __cplusplus */
#include <stddef.h> /*Set the number of threads on runtime.*/
#include "common.h" void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
#define CBLAS_INDEX size_t #define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
float cblas_sasum (blasint n, float *x, blasint incx); float cblas_sasum (const blasint n, const float *x, const blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx); double cblas_dasum (const blasint n, const double *x, const blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx); float cblas_scasum(const blasint n, const float *x, const blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx); double cblas_dzasum(const blasint n, const double *x, const blasint incx);
float cblas_snrm2 (blasint N, float *X, blasint incX); float cblas_snrm2 (const blasint N, const float *X, const blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX); double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX); float cblas_scnrm2(const blasint N, const float *X, const blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX); double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s);
void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s); void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
void cblas_sscal(blasint N, float alpha, float *X, blasint incX); void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX); void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX); void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy);
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda); const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda); const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
float *Y, blasint incY, float *A, blasint lda); const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
double *Y, blasint incY, double *A, blasint lda); const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX); const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX); const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY); const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY); const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...); void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif #endif

View File

@ -68,7 +68,7 @@ extern "C" {
#define SMP #define SMP
#endif #endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define WINDOWS_ABI #define WINDOWS_ABI
#define OS_WINDOWS #define OS_WINDOWS
@ -89,7 +89,7 @@ extern "C" {
#include <sched.h> #include <sched.h>
#endif #endif
#ifdef OS_DARWIN #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
#include <sched.h> #include <sched.h>
#endif #endif
@ -351,7 +351,12 @@ typedef int blasint;
#endif #endif
#define MMAP_ACCESS (PROT_READ | PROT_WRITE) #define MMAP_ACCESS (PROT_READ | PROT_WRITE)
#ifdef __NetBSD__
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON)
#else
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS)
#endif
#include "param.h" #include "param.h"
#include "common_param.h" #include "common_param.h"
@ -374,6 +379,31 @@ typedef int blasint;
#endif #endif
#endif #endif
/* Complex scalar types (openblas_complex_float / _double / _xdouble).
   This whole section is compiled out when ASSEMBLER is defined.
   Exactly one of OPENBLAS_COMPLEX_C99 or OPENBLAS_COMPLEX_STRUCT is defined
   below, so later code can test which representation was selected.
   NOTE(review): xdouble is not declared in this section; it must already be
   defined before this point is reached. */
#ifndef ASSEMBLER
#ifndef NOINCLUDE
/* Inclusion of a standard header file is needed for definition of __STDC_*
predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs
as a side effect of including either <features.h> or <stdc-predef.h>.
Defining NOINCLUDE suppresses this include. */
#include <stdio.h>
#endif /* NOINCLUDE */
/* C99 supports complex floating-point numbers natively, which GCC also offers
as an extension since version 3.0. If neither is available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard).
In the condition below, a macro that is not defined (__STDC_VERSION__,
__GNUC__) evaluates as 0 inside #if, so the comparison is simply false. */
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus)))
/* Native representation: the types are the compiler's _Complex types. */
#define OPENBLAS_COMPLEX_C99
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
typedef xdouble _Complex openblas_complex_xdouble;
#else
/* Fallback representation: a struct holding the real part followed by the
   imaginary part, accessed through the .real and .imag members. */
#define OPENBLAS_COMPLEX_STRUCT
typedef struct { float real, imag; } openblas_complex_float;
typedef struct { double real, imag; } openblas_complex_double;
typedef struct { xdouble real, imag; } openblas_complex_xdouble;
#endif
#endif /* ASSEMBLER */
#ifndef IFLUSH #ifndef IFLUSH
#define IFLUSH #define IFLUSH
#endif #endif
@ -528,7 +558,8 @@ typedef struct {
#include "common_level3.h" #include "common_level3.h"
#include "common_lapack.h" #include "common_lapack.h"
#ifdef CBLAS #ifdef CBLAS
#include "cblas.h" /* This header file is generated from "cblas.h" (see Makefile.prebuild). */
#include "cblas_noconst.h"
#endif #endif
#ifndef ASSEMBLER #ifndef ASSEMBLER

View File

@ -45,6 +45,8 @@ extern "C" {
int BLASFUNC(xerbla)(char *, blasint *info, blasint); int BLASFUNC(xerbla)(char *, blasint *info, blasint);
void openblas_set_num_threads_(int *);
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);
@ -74,19 +76,19 @@ myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *,
myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
#elif defined RETURN_BY_STACK #elif defined RETURN_BY_STACK
void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *);
void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
#else #else
float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *);
float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *);
double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *);
double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *);
xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
#endif #endif
void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *);
@ -640,6 +642,8 @@ int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
/* Lapack routines */
int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *);
int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
@ -675,6 +679,13 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float
int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
@ -689,6 +700,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);

View File

@ -86,7 +86,13 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned
return syscall(SYS_set_mempolicy, mode, addr, flag); return syscall(SYS_set_mempolicy, mode, addr, flag);
} }
static inline int my_gettid(void) { return syscall(SYS_gettid); } static inline int my_gettid(void) {
#ifdef SYS_gettid
return syscall(SYS_gettid);
#else
return getpid();
#endif
}
#endif #endif
#endif #endif

View File

@ -64,4 +64,6 @@ void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *);
double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*);
FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *);
#endif #endif

View File

@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) {
int openmp_nthreads=0; int openmp_nthreads=0;
#endif #endif
if ((blas_cpu_number == 1) if (blas_cpu_number == 1
#ifdef USE_OPENMP #ifdef USE_OPENMP
|| omp_in_parallel() || omp_in_parallel()

View File

@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define PROFCODE #define PROFCODE
#endif #endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define SAVEREGISTERS \ #define SAVEREGISTERS \
subl $32, %esp;\ subl $32, %esp;\
movups %xmm6, 0(%esp);\ movups %xmm6, 0(%esp);\
@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define RESTOREREGISTERS #define RESTOREREGISTERS
#endif #endif
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 16; \ .align 16; \
@ -282,7 +282,7 @@ REALNAME:
#define EPILOGUE .end REALNAME #define EPILOGUE .end REALNAME
#endif #endif
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 16; \ .align 16; \
@ -356,4 +356,11 @@ REALNAME:
#ifndef ALIGN_6 #ifndef ALIGN_6
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
// The two bytes 0xdf, 0xc0 below are the encoding of "ffreep %st(0)".
// The #ifndef guard leaves any earlier definition of ffreep untouched.
// NOTE(review): the trailing # in the macro body presumably starts an assembler
// comment, so the operand written after ffreep at each use site is discarded.
// Confirm this for every assembler that consumes this header.
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif #endif

View File

@ -353,7 +353,7 @@ REALNAME:
#define EPILOGUE .end REALNAME #define EPILOGUE .end REALNAME
#endif #endif
#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \ #define PROLOGUE \
.text; \ .text; \
.align 512; \ .align 512; \
@ -425,6 +425,7 @@ REALNAME:
#define ALIGN_2 .align 2 #define ALIGN_2 .align 2
#define ALIGN_3 .align 3 #define ALIGN_3 .align 3
#define ALIGN_4 .align 4 #define ALIGN_4 .align 4
#define ALIGN_5 .align 5
#define ffreep fstp #define ffreep fstp
#endif #endif
@ -448,4 +449,10 @@ REALNAME:
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
#endif #endif
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
// The two bytes 0xdf, 0xc0 below are the encoding of "ffreep %st(0)".
// The #ifndef guard leaves any earlier definition of ffreep untouched.
// NOTE(review): the trailing # in the macro body presumably starts an assembler
// comment, so the operand written after ffreep at each use site is discarded.
// Confirm this for every assembler that consumes this header.
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif #endif

View File

@ -103,6 +103,9 @@
#define CORE_NEHALEM 17 #define CORE_NEHALEM 17
#define CORE_ATOM 18 #define CORE_ATOM 18
#define CORE_NANO 19 #define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -122,6 +125,8 @@
#define HAVE_MISALIGNSSE (1 << 15) #define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16) #define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17) #define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@ -188,4 +193,7 @@ typedef struct {
#define CPUTYPE_NSGEODE 41 #define CPUTYPE_NSGEODE 41
#define CPUTYPE_VIAC3 42 #define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43 #define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#endif #endif

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -101,12 +101,14 @@ int detect(void){
fclose(infile); fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){ if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A; return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){ }else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B; return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){ }else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r"); infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){ while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){ if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
@ -119,6 +121,24 @@ int detect(void){
}else{ }else{
return CPU_SICORTEX; return CPU_SICORTEX;
} }
}
//Check the "model name" field of /proc/cpuinfo for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
if (infile != NULL){
  //Only read and close the stream when the open succeeded.
  while (fgets(buffer, sizeof(buffer), infile)){
    if (!strncmp("model name", buffer, 10)){
      //The value starts two characters after the ':' separator.
      //Keep p NULL when there is no ':' (or nothing after it) instead of
      //computing NULL + 2, which would defeat the NULL test below.
      p = strchr(buffer, ':');
      if (p != NULL) p = (p[1] != '\0') ? p + 2 : (char *)NULL;
      break;
    }
  }
  fclose(infile);
}
if(p != NULL){
  if (strstr(p, "Loongson-3A")){
    return CPU_LOONGSON3A;
  }else if(strstr(p, "Loongson-3B")){
    return CPU_LOONGSON3B;
  }
}
#endif #endif
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }

View File

@ -40,6 +40,13 @@
#include <string.h> #include <string.h>
#include "cpuid.h" #include "cpuid.h"
//When NO_AVX is defined, the AVX-capable CPU and core identifiers are aliased
//to non-AVX ones: SANDYBRIDGE maps to NEHALEM and BULLDOZER maps to BARCELONA.
#ifdef NO_AVX
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CORE_BULLDOZER CORE_BARCELONA
#endif
#ifndef CPUIDEMU #ifndef CPUIDEMU
#if defined(__APPLE__) && defined(__i386__) #if defined(__APPLE__) && defined(__i386__)
@ -109,6 +116,33 @@ static inline int have_excpuid(void){
return eax & 0xffff; return eax & 0xffff;
} }
#ifndef NO_AVX
//Executes the XGETBV instruction, emitted as the raw bytes 0x0f, 0x01, 0xd0.
//  op  : value loaded into ECX before the instruction runs.
//  eax : receives the EAX result.
//  edx : receives the EDX result.
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
//Reports whether AVX can be used.
//Returns 1 when CPUID leaf 1 has ECX bits 27 and 28 set and XGETBV(0) returns
//EAX with bits 1 and 2 both set; returns 0 otherwise.
//Always returns 0 when built with NO_AVX.
int support_avx(void){
#ifndef NO_AVX
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);

  //Both feature bits are required before xgetbv is issued.
  if ((ecx & (1 << 28)) == 0 || (ecx & (1 << 27)) == 0) return 0;

  xgetbv(0, &eax, &edx);

  //Bits 1 and 2 both set: OS support AVX
  return ((eax & 6) == 6) ? 1 : 0;
#else
  return 0;
#endif
}
int get_vendor(void){ int get_vendor(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
char vendor[13]; char vendor[13];
@ -189,11 +223,17 @@ int get_cputype(int gettype){
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX;
#endif
if (have_excpuid() >= 0x01) { if (have_excpuid() >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx); cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
#ifndef NO_AVX
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
#endif
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
} }
@ -983,10 +1023,33 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 10: case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CPUTYPE_NEHALEM; if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM; //OS doesn't support AVX
case 12: case 12:
//Xeon Processor 5600 (Westmere-EP) //Xeon Processor 5600 (Westmere-EP)
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14:
// Xeon E7540
case 15:
//Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM;
}
break;
case 3:
switch (model) {
case 10:
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
} }
break; break;
} }
@ -1021,6 +1084,13 @@ int get_cpuname(void){
case 1: case 1:
case 10: case 10:
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 5:
return CPUTYPE_BOBCAT;
} }
break; break;
} }
@ -1140,6 +1210,9 @@ static char *cpuname[] = {
"NSGEODE", "NSGEODE",
"VIAC3", "VIAC3",
"NANO", "NANO",
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1186,6 +1259,9 @@ static char *lowercpuname[] = {
"tms3x00", "tms3x00",
"nsgeode", "nsgeode",
"nano", "nano",
"sandybridge",
"bobcat",
"bulldozer",
}; };
static char *corename[] = { static char *corename[] = {
@ -1209,6 +1285,9 @@ static char *corename[] = {
"NEHALEM", "NEHALEM",
"ATOM", "ATOM",
"NANO", "NANO",
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1232,6 +1311,9 @@ static char *corename_lower[] = {
"nehalem", "nehalem",
"atom", "atom",
"nano", "nano",
"sandybridge",
"bobcat",
"bulldozer",
}; };
@ -1315,10 +1397,33 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
case 10: case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CORE_NEHALEM; if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
case 12: case 12:
//Xeon Processor 5600 (Westmere-EP) //Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM; return CORE_NEHALEM;
case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
case 14:
//Xeon E7540
case 15:
//Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM;
}
break;
case 3:
switch (model) {
case 10:
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
} }
break; break;
} }
@ -1334,7 +1439,15 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486; if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON; if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){ if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
}else return CORE_BARCELONA;
} }
} }
@ -1400,6 +1513,9 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_SIZE %d\n", info.size * 1024);
printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
} else {
//fall back for some virtual machines.
printf("#define DTB_DEFAULT_ENTRIES 32\n");
} }
features = get_cputype(GET_FEATURE); features = get_cputype(GET_FEATURE);
@ -1414,8 +1530,10 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n");
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
@ -1479,7 +1597,9 @@ void get_sse(void){
if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n");
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
} }

17
ctest.c
View File

@ -34,20 +34,20 @@ COMPILER_GNU
OS_LINUX OS_LINUX
#endif #endif
#if defined(__FreeBSD__) #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
OS_FreeBSD OS_FREEBSD
#endif #endif
#if defined(__NetBSD__) #if defined(__NetBSD__)
OS_NetBSD OS_NETBSD
#endif #endif
#if defined(__sun) #if defined(__sun)
OS_SunOS OS_SUNOS
#endif #endif
#if defined(__APPLE__) #if defined(__APPLE__)
OS_Darwin OS_DARWIN
#endif #endif
#if defined(_AIX) #if defined(_AIX)
@ -63,13 +63,18 @@ OS_WINNT
#endif #endif
#if defined(__CYGWIN__) #if defined(__CYGWIN__)
OS_CYGWIN OS_CYGWIN_NT
#endif #endif
#if defined(__INTERIX) #if defined(__INTERIX)
OS_INTERIX OS_INTERIX
#endif #endif
#if defined(__gnu_hurd__)
/* Hurd is very similar to GNU/Linux, it should work out of the box */
OS_LINUX
#endif
#if defined(__i386) || defined(_X86) #if defined(__i386) || defined(_X86)
ARCH_X86 ARCH_X86
#endif #endif

View File

@ -5,7 +5,7 @@
TOPDIR = .. TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
CFLAGS += -DADD$(BU) -DCBLAS override CFLAGS += -DADD$(BU) -DCBLAS
LIB = $(TOPDIR)/$(LIBNAME) LIB = $(TOPDIR)/$(LIBNAME)

View File

@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a = (FLOAT *)args -> a; a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b; x = (FLOAT *)args -> b;
y = (FLOAT *)args -> c;
lda = args -> lda; lda = args -> lda;
incx = args -> ldb; incx = args -> ldb;
@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
n_from = 0; n_from = 0;
n_to = n; n_to = n;
//Use y as each thread's n* COMPSIZE elements in sb buffer
y = buffer;
buffer += ((COMPSIZE * n + 1023) & ~1023);
if (range_m) { if (range_m) {
n_from = *(range_m + 0); n_from = *(range_m + 0);
n_to = *(range_m + 1); n_to = *(range_m + 1);
@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a += n_from * lda * COMPSIZE; a += n_from * lda * COMPSIZE;
} }
if (range_n) y += *range_n * COMPSIZE;
if (incx != 1) { if (incx != 1) {
COPY_K(n, x, incx, buffer, 1); COPY_K(n, x, incx, buffer, 1);
@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
if (num_cpu) { if (num_cpu) {
queue[0].sa = NULL; queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL; queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue); exec_blas(num_cpu, queue);
@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#else #else
ONE, ZERO, ONE, ZERO,
#endif #endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
} }
AXPYU_K(n, 0, 0, AXPYU_K(n, 0, 0,

View File

@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg; queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m; queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].range_n = &range[num_cpu];
#if defined(LOONGSON3A) #if 0 //defined(LOONGSON3A)
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else #else
@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
} }
if (num_cpu) { if (num_cpu) {
#if defined(LOONGSON3A) #if 0 //defined(LOONGSON3A)
queue[0].sa = sa; queue[0].sa = sa;
queue[0].sb = sa + GEMM_OFFSET_A1 * 5; queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else #else

View File

@ -1,12 +1,12 @@
TOPDIR = ../.. TOPDIR = ../..
include ../../Makefile.system include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP ifdef SMP
COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
ifndef NO_AFFINITY ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX) COMMONOBJS += init.$(SUFFIX)
endif endif
@ -14,7 +14,7 @@ endif
# COMMONOBJS += info.$(SUFFIX) # COMMONOBJS += info.$(SUFFIX)
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
COMMONOBJS += dynamic.$(SUFFIX) COMMONOBJS += dynamic.$(SUFFIX)
else else
COMMONOBJS += parameter.$(SUFFIX) COMMONOBJS += parameter.$(SUFFIX)
@ -70,7 +70,7 @@ ifndef BLAS_SERVER
BLAS_SERVER = blas_server.c BLAS_SERVER = blas_server.c
endif endif
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
else else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_config.$(SUFFIX) : openblas_get_config.c
$(CC) $(CFLAGS) -c $< -o $(@F)
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
@ -215,7 +218,7 @@ info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
hpl : CFLAGS += -DHPL hpl : override CFLAGS += -DHPL
hpl_p : CFLAGS += -DHPL hpl_p : override CFLAGS += -DHPL
include $(TOPDIR)/Makefile.tail include $(TOPDIR)/Makefile.tail

View File

@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR
@ -435,7 +436,7 @@ static int blas_thread_server(void *arg){
blas_memory_free(buffer); blas_memory_free(buffer);
pthread_exit(NULL); //pthread_exit(NULL);
return 0; return 0;
} }
@ -770,6 +771,19 @@ void goto_set_num_threads(int num_threads) {
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY
if (num_threads == 1) {
if (blas_cpu_number == 1){
//OpenBLAS is already single thread.
return;
}else{
//From multi-threads to single thread
//Restore the original affinity mask
gotoblas_set_affinity(-1);
}
}
#endif
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (num_threads > blas_num_threads) { if (num_threads > blas_num_threads) {
@ -800,6 +814,13 @@ void goto_set_num_threads(int num_threads) {
UNLOCK_COMMAND(&server_lock); UNLOCK_COMMAND(&server_lock);
} }
#ifndef NO_AFFINITY
if(blas_cpu_number == 1 && num_threads > 1){
//Restore the thread 0 affinity.
gotoblas_set_affinity(0);
}
#endif
blas_cpu_number = num_threads; blas_cpu_number = num_threads;
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)

View File

@ -49,8 +49,12 @@
int blas_server_avail = 0; int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER];
void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {
int i=0;
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@ -63,6 +67,18 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)
//set parameters for different number of threads. //set parameters for different number of threads.
blas_set_parameter(); blas_set_parameter();
@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){ int blas_thread_init(void){
int i=0;
blas_get_cpu_number(); blas_get_cpu_number();
blas_server_avail = 1; blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
}
return 0; return 0;
} }
int BLASFUNC(blas_thread_shutdown)(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i=0;
blas_server_avail = 0; blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
return 0; return 0;
} }
@ -177,6 +209,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
static void exec_threads(blas_queue_t *queue){ static void exec_threads(blas_queue_t *queue){
void *buffer, *sa, *sb; void *buffer, *sa, *sb;
int pos=0, release_flag=0;
buffer = NULL; buffer = NULL;
sa = queue -> sa; sa = queue -> sa;
@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];
//fallback
if(buffer==NULL) {
buffer = blas_memory_alloc(2); buffer = blas_memory_alloc(2);
release_flag=1;
}
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
} }
@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){
} }
if (buffer != NULL) blas_memory_free(buffer); if (release_flag) blas_memory_free(buffer);
} }

View File

@ -63,6 +63,8 @@ static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER]; static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER];
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){ if (!(mode & BLAS_COMPLEX)){
@ -179,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
do { do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (action == WAIT_OBJECT_0 + 1) break; if (action == WAIT_OBJECT_0 + 1) break;
@ -251,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} }
} }
queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR
@ -263,6 +266,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
} else { } else {
legacy_exec(routine, queue -> mode, queue -> args, sb); legacy_exec(routine, queue -> mode, queue -> args, sb);
} }
}else{
continue; //if queue == NULL
} }
#ifdef SMP_DEBUG #ifdef SMP_DEBUG
@ -425,7 +430,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
/* Shutdown procedure, but user don't have to call this routine. The */ /* Shutdown procedure, but user don't have to call this routine. The */
/* kernel automatically kill threads. */ /* kernel automatically kill threads. */
int blas_thread_shutdown_(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i; int i;
@ -437,7 +442,7 @@ int blas_thread_shutdown_(void){
SetEvent(pool.killed); SetEvent(pool.killed);
for(i = 0; i < blas_cpu_number - 1; i++){ for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], INFINITE); WaitForSingleObject(blas_threads[i], INFINITE);
} }
@ -448,3 +453,47 @@ int blas_thread_shutdown_(void){
return 0; return 0;
} }
/*
 * Set the number of threads OpenBLAS uses, growing the Win32 worker pool
 * when the request exceeds the current pool size.
 *
 * num_threads: requested thread count.  Values below 1 fall back to the
 *              current blas_cpu_number; values above MAX_CPU_NUMBER are
 *              clamped to MAX_CPU_NUMBER.
 *
 * The pool only grows here: when the request is larger than
 * blas_num_threads, the missing workers are created under server_lock and
 * blas_num_threads is raised.  A smaller request just lowers
 * blas_cpu_number and leaves the existing workers untouched.
 */
void goto_set_num_threads(int num_threads)
{
  long i;

  if (num_threads < 1) num_threads = blas_cpu_number;
  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

  if (num_threads > blas_num_threads) {
    LOCK_COMMAND(&server_lock);

    //increased_threads = 1;
    if (!blas_server_avail){
      /* First use: set up the queue lock, the auto-reset "filled" event
         and the manual-reset "killed" event before any worker starts. */
      InitializeCriticalSection(&pool.lock);
      pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
      pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
      pool.shutdown = 0;
      pool.queue = NULL;
      blas_server_avail = 1;
    }

    /* blas_threads[] holds (thread count - 1) worker handles.  When no
       pool size has been recorded yet (blas_num_threads == 0) the first
       free slot is 0; starting at blas_num_threads - 1 would be -1 and
       write before the start of blas_threads[] / blas_threads_id[]. */
    for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
      blas_threads[i] = CreateThread(NULL, 0,
                                     blas_thread_server, (void *)i,
                                     0, &blas_threads_id[i]);
    }

    blas_num_threads = num_threads;
    UNLOCK_COMMAND(&server_lock);
  }

  blas_cpu_number = num_threads;
}
/*
 * Set the number of threads OpenBLAS uses.
 * num is passed unchanged to goto_set_num_threads(), which does the work.
 */
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}

View File

@ -60,6 +60,16 @@ extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#endif
#define VENDOR_INTEL 1 #define VENDOR_INTEL 1
#define VENDOR_AMD 2 #define VENDOR_AMD 2
@ -68,6 +78,32 @@ extern gotoblas_t gotoblas_BARCELONA;
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#ifndef NO_AVX
/*
 * Execute the XGETBV instruction.
 *   op  : value placed in ECX before the instruction ("c" input constraint).
 *   eax : receives the EAX result ("=a" output constraint).
 *   edx : receives the EDX result ("=d" output constraint).
 * The condition codes are declared clobbered ("cc").
 */
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
//NOTE(review): presumably emitted as raw bytes so that assemblers which do
//not know the xgetbv mnemonic can still build this file -- confirm.
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
/*
 * Report whether AVX kernels may be used on this machine.
 *
 * Returns 1 only when both conditions hold:
 *   - CPUID leaf 1 reports ECX bit 28 (AVX) and ECX bit 27 (OSXSAVE);
 *   - XGETBV with ECX = 0 returns EAX with bits 1 and 2 set, i.e. the OS
 *     has enabled the SSE and AVX register state.
 * Returns 0 otherwise, and always 0 when compiled with NO_AVX.
 */
int support_avx(){
#ifdef NO_AVX
  return 0;
#else
  const int cpu_feature_bits = (1 << 28) | (1 << 27);
  const int os_state_bits = 6;
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);

  /* Without both CPU flags, XGETBV must not be executed at all. */
  if ((ecx & cpu_feature_bits) != cpu_feature_bits) return 0;

  xgetbv(0, &eax, &edx);

  return ((eax & os_state_bits) == os_state_bits) ? 1 : 0; //OS support AVX
#endif
}
static int get_vendor(void){ static int get_vendor(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
char vendor[13]; char vendor[13];
@ -129,7 +165,31 @@ static gotoblas_t *get_coretype(void){
if (model == 5) return &gotoblas_NEHALEM; if (model == 5) return &gotoblas_NEHALEM;
//Intel Xeon Processor 5600 (Westmere-EP) //Intel Xeon Processor 5600 (Westmere-EP)
if (model == 12) return &gotoblas_NEHALEM; //Xeon Processor E7 (Westmere-EX)
//Xeon E7540
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if (model == 10 || model == 13) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 3:
//Intel Sandy Bridge 22nm (Ivy Bridge?)
if (model == 10) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL; return NULL;
} }
case 0xf: case 0xf:
@ -144,6 +204,16 @@ static gotoblas_t *get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) { if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON; else return &gotoblas_OPTERON;
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
} else { } else {
return &gotoblas_BARCELONA; return &gotoblas_BARCELONA;
} }
@ -178,6 +248,9 @@ static char *corename[] = {
"Opteron(SSE3)", "Opteron(SSE3)",
"Barcelona", "Barcelona",
"Nano", "Nano",
"Sandybridge",
"Bobcat",
"Bulldozer",
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -197,6 +270,9 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
return corename[0]; return corename[0];
} }
@ -211,12 +287,21 @@ void gotoblas_dynamic_init(void) {
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else #else
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
if (sizeof(void*) == 8) {
if (gotoblas == &gotoblas_KATMAI ||
gotoblas == &gotoblas_COPPERMINE ||
gotoblas == &gotoblas_NORTHWOOD ||
gotoblas == &gotoblas_BANIAS ||
gotoblas == &gotoblas_ATHLON)
gotoblas = &gotoblas_PRESCOTT;
}
#endif #endif
if (gotoblas && gotoblas -> init) { if (gotoblas && gotoblas -> init) {
gotoblas -> init(); gotoblas -> init();
} else { } else {
fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1); exit(1);
} }

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define MAX_NODES 16 #define MAX_NODES 16
#define MAX_CPUS 256 #define MAX_CPUS 256
#define NCPUBITS (8*sizeof(unsigned long))
#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS)
#define CPUELT(cpu) ((cpu) / NCPUBITS)
#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS))
#define SH_MAGIC 0x510510 #define SH_MAGIC 0x510510
@ -103,10 +108,10 @@ typedef struct {
int num_nodes; int num_nodes;
int num_procs; int num_procs;
int final_num_procs; int final_num_procs;
unsigned long avail; unsigned long avail [MAX_BITMASK_LEN];
int avail_count;
unsigned long cpu_info [MAX_CPUS]; unsigned long cpu_info [MAX_CPUS];
unsigned long node_info [MAX_NODES]; unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN];
int cpu_use[MAX_CPUS]; int cpu_use[MAX_CPUS];
} shm_t; } shm_t;
@ -126,7 +131,8 @@ static shm_t *common = (void *)-1;
static int shmid, pshmid; static int shmid, pshmid;
static void *paddr; static void *paddr;
static unsigned long lprocmask, lnodemask; static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask;
static int lprocmask_count = 0;
static int numprocs = 1; static int numprocs = 1;
static int numnodes = 1; static int numnodes = 1;
@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) {
than sizeof(unsigned long). On 64 bits, the limit than sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32. is 64. On 32 bits, it is 32.
***/ ***/
static inline unsigned long get_cpumap(int node) { static inline void get_cpumap(int node, unsigned long * node_info) {
int infile; int infile;
unsigned long affinity; unsigned long affinity[32];
char name[160]; char name[160];
char cpumap[160]; char cpumap[160];
char *p, *dummy; char *dummy;
int i=0; int i=0;
int count=0;
int k=0;
sprintf(name, CPUMAP_NAME, node); sprintf(name, CPUMAP_NAME, node);
infile = open(name, O_RDONLY); infile = open(name, O_RDONLY);
for(i=0; i<32; i++){
affinity = 0; affinity[i] = 0;
}
if (infile != -1) { if (infile != -1) {
read(infile, cpumap, sizeof(cpumap)); read(infile, cpumap, sizeof(cpumap));
p = cpumap;
while (*p != '\n' && i<160){ for(i=0; i<160; i++){
if(*p != ',') { if(cpumap[i] == '\n')
name[i++]=*p; break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
//Enough data for Hex
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
} }
p++;
} }
p = name;
// while ((*p == '0') || (*p == ',')) p++;
affinity = strtoul(p, &dummy, 16);
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
node_info[i]=affinity[count-i-1];
}
close(infile); close(infile);
} }
return affinity; return ;
} }
static inline unsigned long get_share(int cpu, int level) { static inline void get_share(int cpu, int level, unsigned long * share) {
int infile; int infile;
unsigned long affinity; unsigned long affinity[32];
char cpumap[160];
char name[160]; char name[160];
char *p; char *dummy;
int count=0;
int i=0,k=0;
int bitmask_idx = 0;
sprintf(name, SHARE_NAME, cpu, level); sprintf(name, SHARE_NAME, cpu, level);
infile = open(name, O_RDONLY); infile = open(name, O_RDONLY);
affinity = (1UL << cpu); // Init share
for(i=0; i<MAX_BITMASK_LEN; i++){
share[i]=0;
}
bitmask_idx = CPUELT(cpu);
share[bitmask_idx] = CPUMASK(cpu);
if (infile != -1) { if (infile != -1) {
read(infile, name, sizeof(name)); read(infile, cpumap, sizeof(cpumap));
p = name; for(i=0; i<160; i++){
if(cpumap[i] == '\n')
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
while ((*p == '0') || (*p == ',')) p++; //Enough data
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
}
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
share[i]=affinity[count-i-1];
}
affinity = strtol(p, &p, 16);
close(infile); close(infile);
} }
return affinity; return ;
} }
static int numa_check(void) { static int numa_check(void) {
@ -248,6 +298,7 @@ static int numa_check(void) {
DIR *dp; DIR *dp;
struct dirent *dir; struct dirent *dir;
int node; int node;
int j;
common -> num_nodes = 0; common -> num_nodes = 0;
@ -258,7 +309,9 @@ static int numa_check(void) {
return 0; return 0;
} }
for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; for (node = 0; node < MAX_NODES; node ++) {
for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0;
}
while ((dir = readdir(dp)) != NULL) { while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
@ -266,12 +319,12 @@ static int numa_check(void) {
node = atoi(&dir -> d_name[4]); node = atoi(&dir -> d_name[4]);
if (node > MAX_NODES) { if (node > MAX_NODES) {
fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
exit(1); exit(1);
} }
common -> num_nodes ++; common -> num_nodes ++;
common -> node_info[node] = get_cpumap(node); get_cpumap(node, common->node_info[node]);
} }
} }
@ -284,7 +337,7 @@ static int numa_check(void) {
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
for (node = 0; node < common -> num_nodes; node ++) for (node = 0; node < common -> num_nodes; node ++)
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]);
#endif #endif
return common -> num_nodes; return common -> num_nodes;
@ -296,11 +349,13 @@ static void numa_mapping(void) {
int i, j, h; int i, j, h;
unsigned long work, bit; unsigned long work, bit;
int count = 0; int count = 0;
int bitmask_idx = 0;
for (node = 0; node < common -> num_nodes; node ++) { for (node = 0; node < common -> num_nodes; node ++) {
core = 0; core = 0;
for (cpu = 0; cpu < common -> num_procs; cpu ++) { for (cpu = 0; cpu < common -> num_procs; cpu ++) {
if (common -> node_info[node] & common -> avail & (1UL << cpu)) { bitmask_idx = CPUELT(cpu);
if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) {
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
count ++; count ++;
core ++; core ++;
@ -357,58 +412,92 @@ static void numa_mapping(void) {
static void disable_hyperthread(void) { static void disable_hyperthread(void) {
unsigned long share; unsigned long share[MAX_BITMASK_LEN];
int cpu; int cpu;
int bitmask_idx = 0;
int i=0, count=0;
bitmask_idx = CPUELT(common -> num_procs);
if(common->num_procs > 64){ for(i=0; i< bitmask_idx; i++){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL;
exit(1); }
}else if(common->num_procs == 64){ if(CPUMASK(common -> num_procs) != 1){
common -> avail = 0xFFFFFFFFFFFFFFFFUL; common -> avail[count++] = CPUMASK(common -> num_procs) - 1;
}else }
common -> avail = (1UL << common -> num_procs) - 1; common -> avail_count = count;
/* if(common->num_procs > 64){ */
/* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */
/* exit(1); */
/* }else if(common->num_procs == 64){ */
/* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */
/* }else */
/* common -> avail = (1UL << common -> num_procs) - 1; */
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); fprintf(stderr, "\nAvail CPUs : ");
for(i=0; i<count; i++)
fprintf(stderr, "%04lx ", common -> avail[i]);
fprintf(stderr, ".\n");
#endif #endif
for (cpu = 0; cpu < common -> num_procs; cpu ++) { for (cpu = 0; cpu < common -> num_procs; cpu ++) {
share = (get_share(cpu, 1) & common -> avail); get_share(cpu, 1, share);
if (popcount(share) > 1) { //When the shared cpu are in different element of share & avail array, this may be a bug.
for (i = 0; i < count ; i++){
share[i] &= common->avail[i];
if (popcount(share[i]) > 1) {
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
cpu, share & ~(1UL << cpu)); cpu, share[i] & ~(CPUMASK(cpu)));
#endif #endif
common -> avail &= ~((share & ~(1UL << cpu))); common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
}
} }
} }
} }
static void disable_affinity(void) { static void disable_affinity(void) {
int i=0;
int bitmask_idx=0;
int count=0;
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]);
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
#endif #endif
if(common->final_num_procs > 64){ /* if(common->final_num_procs > 64){ */
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */
exit(1); /* exit(1); */
}else if(common->final_num_procs == 64){ /* }else if(common->final_num_procs == 64){ */
lprocmask = 0xFFFFFFFFFFFFFFFFUL; /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */
}else /* }else */
lprocmask = (1UL << common -> final_num_procs) - 1; /* lprocmask = (1UL << common -> final_num_procs) - 1; */
bitmask_idx = CPUELT(common -> final_num_procs);
for(i=0; i< bitmask_idx; i++){
lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL;
}
if(CPUMASK(common -> final_num_procs) != 1){
lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1;
}
lprocmask_count = count;
#ifndef USE_OPENMP #ifndef USE_OPENMP
lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; for(i=0; i< count; i++){
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
}
#endif #endif
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]);
#endif #endif
} }
@ -498,7 +587,7 @@ static void create_pshmem(void) {
static void local_cpu_map(void) { static void local_cpu_map(void) {
int cpu, id, mapping; int cpu, id, mapping;
int bitmask_idx = 0;
cpu = 0; cpu = 0;
mapping = 0; mapping = 0;
@ -509,7 +598,8 @@ static void local_cpu_map(void) {
if (is_dead(id)) common -> cpu_use[cpu] = 0; if (is_dead(id)) common -> cpu_use[cpu] = 0;
} }
if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { bitmask_idx = CPUELT(cpu);
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
common -> cpu_use[cpu] = pshmid; common -> cpu_use[cpu] = pshmid;
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
@ -595,6 +685,7 @@ void gotoblas_affinity_init(void) {
#ifndef USE_OPENMP #ifndef USE_OPENMP
cpu_set_t cpu_mask; cpu_set_t cpu_mask;
#endif #endif
int i;
if (initialized) return; if (initialized) return;
@ -646,6 +737,11 @@ void gotoblas_affinity_init(void) {
common -> num_procs = get_nprocs(); common -> num_procs = get_nprocs();
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
exit(1);
}
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
numa_check(); numa_check();
@ -654,7 +750,8 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping(); if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = popcount(common -> avail); common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
@ -664,7 +761,8 @@ void gotoblas_affinity_init(void) {
disable_affinity(); disable_affinity();
num_avail = popcount(lprocmask); num_avail = 0;
for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]);
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#if defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h> #include <sys/sysctl.h>
#endif #endif
@ -185,7 +185,7 @@ int get_num_procs(void) {
#endif #endif
#if defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_FREEBSD)
int get_num_procs(void) { int get_num_procs(void) {
@ -206,7 +206,27 @@ int get_num_procs(void) {
#endif #endif
#if defined(OS_DARWIN)
/*
 * Return the value of the "hw.physicalcpu" sysctl.
 *
 * The value is queried on the first call and cached in a function-local
 * static; later calls return the cached value.  While the cached value is
 * still 0 (for example because the query did not fill it in) every call
 * repeats the query.
 */
int get_num_procs(void) {
  static int cached_count = 0;
  size_t out_len = sizeof(int);

  if (cached_count != 0) return cached_count;

  sysctlbyname("hw.physicalcpu", &cached_count, &out_len, NULL, 0);
  return cached_count;
}
#endif
/*
The number of CPU cores OpenBLAS uses for multithreading.
It can be set by openblas_set_num_threads(int num_threads).
*/
int blas_cpu_number = 0; int blas_cpu_number = 0;
/*
The number of threads in the thread pool.
This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
*/
int blas_num_threads = 0; int blas_num_threads = 0;
int goto_get_num_procs (void) { int goto_get_num_procs (void) {
@ -215,7 +235,7 @@ int goto_get_num_procs (void) {
int blas_get_cpu_number(void){ int blas_get_cpu_number(void){
char *p; char *p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
int max_num; int max_num;
#endif #endif
int blas_goto_num = 0; int blas_goto_num = 0;
@ -223,7 +243,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads; if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
max_num = get_num_procs(); max_num = get_num_procs();
#endif #endif
@ -250,7 +270,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER; else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
if (blas_num_threads > max_num) blas_num_threads = max_num; if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif #endif
@ -1128,7 +1148,7 @@ static BLASULONG init_lock = 0UL;
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
void *sa, void *sb, BLASLONG pos) { void *sa, void *sb, BLASLONG pos) {
#ifndef ARCH_POWER #if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
long size; long size;
BLASULONG buffer; BLASULONG buffer;
@ -1289,6 +1309,7 @@ void DESTRUCTOR gotoblas_quit(void) {
moncontrol (1); moncontrol (1);
#endif #endif
blas_shutdown();
} }
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@ -0,0 +1,59 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
/*
 * Build-configuration string: a space-separated list of those option macros
 * (out of the ones tested below) that were defined when this file was
 * compiled.  Adjacent string literals are concatenated by the compiler, so
 * each enabled #ifdef appends its own name followed by one space; when none
 * of the macros is defined the string is empty.
 */
static char* openblas_config_str=""
#ifdef USE64BITINT
"USE64BITINT "
#endif
#ifdef NO_CBLAS
"NO_CBLAS "
#endif
#ifdef NO_LAPACK
"NO_LAPACK "
#endif
#ifdef NO_LAPACKE
"NO_LAPACKE "
#endif
#ifdef DYNAMIC_ARCH
"DYNAMIC_ARCH "
#endif
#ifdef NO_AFFINITY
"NO_AFFINITY "
#endif
;
/*
 * Return a pointer to the static build-configuration string defined above.
 * The caller must not free it.
 * NOTE(review): CNAME is presumably a macro supplying the exported function
 * name from the build system -- confirm; it is not defined in this file.
 */
char* CNAME() {
return openblas_config_str;
}

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -33,13 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#ifdef SMP_SERVER #ifdef SMP_SERVER
#ifdef OS_LINUX
extern void openblas_set_num_threads(int num_threads) ; extern void openblas_set_num_threads(int num_threads) ;
void NAME(int* num_threads){ void openblas_set_num_threads_(int* num_threads){
openblas_set_num_threads(*num_threads); openblas_set_num_threads(*num_threads);
} }
#endif #else
//Single thread
void openblas_set_num_threads(int num_threads) {
}
void openblas_set_num_threads_(int* num_threads){
}
#endif #endif

View File

@ -163,9 +163,9 @@ int get_L2_size(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx); cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@ -384,6 +384,17 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif
#if defined(SANDYBRIDGE)
sgemm_p = 1024;
dgemm_p = 512;
cgemm_p = 512;
zgemm_p = 256;
#ifdef EXPRECISION
qgemm_p = 256;
xgemm_p = 128;
#endif
#endif
#if defined(CORE_PRESCOTT) || defined(GENERIC) #if defined(CORE_PRESCOTT) || defined(GENERIC)
size >>= 6; size >>= 6;
@ -435,7 +446,7 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif
#if defined(CORE_BARCELONA) #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
size >>= 8; size >>= 8;
sgemm_p = 232 * size; sgemm_p = 232 * size;

View File

@ -10,10 +10,23 @@ ifndef NO_CBLAS
NO_CBLAS = 0 NO_CBLAS = 0
endif endif
ifndef NO_LAPACK
NO_LAPACK = 0
endif
ifndef NO_LAPACKE
NO_LAPACKE = 0
endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
EXTRALIB += -lgfortran EXTRALIB += -lgfortran
endif endif
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
EXTRALIB += -lgomp
endif
endif
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
@ -58,15 +71,20 @@ dll : ../$(LIBDLLNAME)
dll2 : libgoto2_shared.dll dll2 : libgoto2_shared.dll
# On Windows, we only generate a DLL without a version suffix. This is because
# applications which link against the dynamic library reference a fixed DLL name
# in their import table. By instead using a stable name it is possible to
# upgrade between library versions, without needing to re-link an application.
# For more details see: https://github.com/xianyi/OpenBLAS/issues/127.
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) ../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME) $(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1) ifeq ($(BINARY32), 1)
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
-lib /machine:i386 /def:libopenblas.def -lib /machine:i386 /def:libopenblas.def
else else
$(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \
--entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB)
-lib /machine:X64 /def:libopenblas.def -lib /machine:X64 /def:libopenblas.def
endif endif
@ -76,13 +94,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
libopenblas.def : gensymbol libopenblas.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
libgoto2_shared.def : gensymbol libgoto2_shared.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
libgoto_hpl.def : gensymbol libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
$(LIBDYNNAME) : ../$(LIBNAME) osx.def $(LIBDYNNAME) : ../$(LIBNAME) osx.def
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
@ -106,14 +124,15 @@ so : ../$(LIBSONAME)
endif endif
ifeq ($(OSNAME), FreeBSD) #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
so : ../$(LIBSONAME) so : ../$(LIBSONAME)
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(EXTRALIB) -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest rm -f linktest
@ -163,23 +182,23 @@ static : ../$(LIBNAME)
rm -f goto.$(SUFFIX) rm -f goto.$(SUFFIX)
linux.def : gensymbol ../Makefile.system ../getarch.c linux.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
osx.def : gensymbol ../Makefile.system ../getarch.c osx.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c aix.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
symbol.S : gensymbol symbol.S : gensymbol
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S
test : linktest.c test : linktest.c
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest rm -f linktest
linktest.c : gensymbol ../Makefile.system ../getarch.c linktest.c : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > linktest.c
clean :: clean ::
@rm -f *.def *.dylib __.SYMDEF* @rm -f *.def *.dylib __.SYMDEF*

File diff suppressed because it is too large Load Diff

15
f_check
View File

@ -32,11 +32,12 @@ if ($compiler eq "") {
"pgf95", "pgf90", "pgf77", "pgf95", "pgf90", "pgf77",
"ifort"); "ifort");
OUTER:
foreach $lists (@lists) { foreach $lists (@lists) {
foreach $path (@path) { foreach $path (@path) {
if (-f $path . "/" . $lists) { if (-x $path . "/" . $lists) {
$compiler = $lists; $compiler = $lists;
break; last OUTER;
} }
} }
} }
@ -210,6 +211,10 @@ if (!$?) {
if ($?) { if ($?) {
$link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
} }
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$binary = "" if ($?); $binary = "" if ($?);
} }
@ -218,6 +223,10 @@ if (!$?) {
if ($?) { if ($?) {
$link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
} }
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$binary = "" if ($?); $binary = "" if ($?);
} }
@ -237,6 +246,8 @@ if ($link ne "") {
$link =~ s/\-rpath\s+/\-rpath\@/g; $link =~ s/\-rpath\s+/\-rpath\@/g;
@flags = split(/[\s\,\n]/, $link); @flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
foreach $flags (@flags) { foreach $flags (@flags) {
if ( if (

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -96,12 +96,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PENRYN */ /* #define FORCE_PENRYN */
/* #define FORCE_DUNNINGTON */ /* #define FORCE_DUNNINGTON */
/* #define FORCE_NEHALEM */ /* #define FORCE_NEHALEM */
/* #define FORCE_SANDYBRIDGE */
/* #define FORCE_ATOM */
/* #define FORCE_ATHLON */ /* #define FORCE_ATHLON */
/* #define FORCE_OPTERON */ /* #define FORCE_OPTERON */
/* #define FORCE_OPTERON_SSE3 */ /* #define FORCE_OPTERON_SSE3 */
/* #define FORCE_BARCELONA */ /* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */ /* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */ /* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_BULLDOZER */
/* #define FORCE_SSE_GENERIC */ /* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */ /* #define FORCE_VIAC3 */
/* #define FORCE_NANO */ /* #define FORCE_NANO */
@ -119,9 +123,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_LOONGSON3A */ /* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */ /* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */ /* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */
/* #define FORCE_SPARC */ /* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */ /* #define FORCE_SPARCV7 */
/* #define FORCE_GENERIC */
#ifdef FORCE_P2 #ifdef FORCE_P2
#define FORCE #define FORCE
@ -137,20 +141,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "P5" #define CORENAME "P5"
#endif #endif
#ifdef FORCE_COPPERMINE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#endif
#ifdef FORCE_KATMAI #ifdef FORCE_KATMAI
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -165,6 +155,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "KATMAI" #define CORENAME "KATMAI"
#endif #endif
/*
 * Manual target override: Coppermine.
 * When FORCE_COPPERMINE is defined, this block marks the target selection as
 * forced (FORCE, FORCE_INTEL) and defines the strings describing the target:
 * ARCHITECTURE/SUBARCHITECTURE, the list of -D options in ARCHCONFIG
 * (cache sizes, line sizes, DTB parameters, HAVE_* feature flags), and the
 * LIBNAME/CORENAME identifiers.  Note that the sub-architecture is reported
 * as "PENTIUM3" while the core name is "COPPERMINE".
 */
#ifdef FORCE_COPPERMINE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#endif
#ifdef FORCE_NORTHWOOD #ifdef FORCE_NORTHWOOD
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -278,6 +282,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "NEHALEM" #define CORENAME "NEHALEM"
#endif #endif
/*
 * Manual target override: Sandy Bridge.
 * When FORCE_SANDYBRIDGE is defined, this block marks the target selection as
 * forced (FORCE, FORCE_INTEL) and defines the strings describing the target:
 * ARCHITECTURE/SUBARCHITECTURE, the list of -D options in ARCHCONFIG
 * (cache sizes, line sizes, DTB parameters, HAVE_* feature flags up to
 * SSE4.2 and AVX), and the LIBNAME/CORENAME identifiers.
 */
#ifdef FORCE_SANDYBRIDGE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -342,13 +360,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DBARCELONA " \ #define ARCHCONFIG "-DBARCELONA " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \
"-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \
"-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU"
#define LIBNAME "barcelona" #define LIBNAME "barcelona"
#define CORENAME "BARCELONA" #define CORENAME "BARCELONA"
#endif #endif
/*
 * Manual target override: Bobcat.
 * When FORCE_BOBCAT is defined, this block marks the target selection as
 * forced (FORCE, FORCE_INTEL) and defines the strings describing the target:
 * ARCHITECTURE/SUBARCHITECTURE, the list of -D options in ARCHCONFIG
 * (cache sizes, line sizes, DTB parameters, HAVE_* feature flags), and the
 * LIBNAME/CORENAME identifiers.
 */
#if defined(FORCE_BOBCAT)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BOBCAT"
#define ARCHCONFIG "-DBOBCAT " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
#define LIBNAME "bobcat"
#define CORENAME "BOBCAT"
#endif
/*
 * Manual target override: Bulldozer.
 * When FORCE_BULLDOZER is defined, this block marks the target selection as
 * forced (FORCE, FORCE_INTEL) and defines the strings describing the target:
 * ARCHITECTURE/SUBARCHITECTURE, the list of -D options in ARCHCONFIG
 * (cache sizes, line sizes, DTB parameters, HAVE_* feature flags including
 * AVX and FMA4), and the LIBNAME/CORENAME identifiers.
 *
 * NOTE(review): L2_SIZE=1024000 is not a power of two (1 MiB would be
 * 1048576) -- confirm the intended value.
 */
#if defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BULLDOZER"
#define ARCHCONFIG "-DBULLDOZER " \
"-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
"-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER"
#endif
#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL

View File

@ -34,7 +34,7 @@ int main(int argc, char **argv) {
#ifdef USE64BITINT #ifdef USE64BITINT
printf("#define USE64BITINT\n"); printf("#define USE64BITINT\n");
#endif #endif
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
} }
return 0; return 0;

View File

@ -318,7 +318,7 @@ CZBLAS3OBJS = \
ifndef NO_CBLAS ifndef NO_CBLAS
CFLAGS += -I. override CFLAGS += -I.
SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS1OBJS += $(CSBLAS1OBJS)
SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS2OBJS += $(CSBLAS2OBJS)
@ -400,7 +400,7 @@ all :: libs
ifdef FUNCTION_PROFILE ifdef FUNCTION_PROFILE
$(BLASOBJS) $(BLASOBJS_P) : functable.h $(BLASOBJS) $(BLASOBJS_P) : functable.h
$(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) $(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F)
functable.h : Makefile functable.h : Makefile
./create $(FUNCALLFILES) > functable.h ./create $(FUNCALLFILES) > functable.h
@ -420,7 +420,7 @@ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS
srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)

View File

@ -6,7 +6,7 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
ifdef TARGET_CORE ifdef TARGET_CORE
CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
BUILD_KERNEL = 1 BUILD_KERNEL = 1
KDIR = KDIR =
TSUFFIX = _$(TARGET_CORE) TSUFFIX = _$(TARGET_CORE)
@ -48,7 +48,7 @@ HPLOBJS = \
COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX)
ifdef DYNAMIC_ARCH ifeq ($(DYNAMIC_ARCH), 1)
SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX)
endif endif

View File

@ -0,0 +1,235 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copy `nrows` consecutive two-FLOAT elements (presumably the real and
 * imaginary parts of a complex value), starting at element index `first`,
 * from each of `width` adjacent source columns into `out`.  The columns are
 * interleaved element by element: column 0, column 1, ..., then the next
 * element of column 0, and so on.  `col_stride` is the distance between two
 * neighbouring columns, counted in FLOATs.
 *
 * Returns the position just past the last value written.
 */
static FLOAT *zncopy4_rows(FLOAT *out, FLOAT *panel, BLASLONG col_stride,
                           BLASLONG width, BLASLONG first, BLASLONG nrows)
{
    BLASLONG r, c;

    for (r = first; r < first + nrows; r++) {
        for (c = 0; c < width; c++) {
            FLOAT *elem = panel + c * col_stride + 2 * r;

            *out++ = elem[0];
            *out++ = elem[1];
        }
    }
    return out;
}

/*
 * Pack one panel of `width` columns.  The rows are consumed in the same
 * chunks as the unrolled form of this routine: row/4 groups of four rows,
 * then two rows when bit 1 of `row` is set, then one row when bit 0 is set.
 */
static void zncopy4_panel(BLASLONG row, BLASLONG width, FLOAT *panel,
                          BLASLONG col_stride, FLOAT *out)
{
    BLASLONG done = 0;   /* elements of each column already copied */
    BLASLONG quad;

    for (quad = 0; quad < row / 4; quad++) {
        out = zncopy4_rows(out, panel, col_stride, width, done, 4);
        done += 4;
    }
    if (row & 2) {
        out = zncopy4_rows(out, panel, col_stride, width, done, 2);
        done += 2;
    }
    if (row & 1) {
        zncopy4_rows(out, panel, col_stride, width, done, 1);
    }
}

/*
 * Pack a `row` x `col` block of two-FLOAT elements from `src` into `dest`.
 *
 * `src` is read column by column; consecutive columns are 2*srcdim FLOATs
 * apart.  Columns are processed in panels of four, followed by an optional
 * panel of two (col & 2) and an optional single column (col & 1).  Inside a
 * panel the output holds, for every row in turn, that row's element from
 * each column of the panel.  Each panel occupies 2*row*width FLOATs of
 * `dest`, and panels are stored back to back.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
    const BLASLONG col_stride = 2 * srcdim;
    BLASLONG panel;

    for (panel = 0; panel < col / 4; panel++) {
        zncopy4_panel(row, 4, src, col_stride, dest);
        src  += 4 * col_stride;
        dest += (row << 3);
    }

    if (col & 2) {
        zncopy4_panel(row, 2, src, col_stride, dest);
        src  += 2 * col_stride;
        dest += (row << 2);
    }

    if (col & 1) {
        zncopy4_panel(row, 1, src, col_stride, dest);
    }

    return 0;
}

View File

@ -0,0 +1,401 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copy `nrows` consecutive two-FLOAT elements (presumably the real and
 * imaginary parts of a complex value), starting at element index `first`,
 * from each of `width` adjacent source columns into `out`.  The columns are
 * interleaved element by element.  `col_stride` is the distance between two
 * neighbouring columns, counted in FLOATs.
 *
 * Returns the position just past the last value written.
 */
static FLOAT *zncopy8_rows(FLOAT *out, FLOAT *panel, BLASLONG col_stride,
                           BLASLONG width, BLASLONG first, BLASLONG nrows)
{
    BLASLONG r, c;

    for (r = first; r < first + nrows; r++) {
        for (c = 0; c < width; c++) {
            FLOAT *elem = panel + c * col_stride + 2 * r;

            *out++ = elem[0];
            *out++ = elem[1];
        }
    }
    return out;
}

/*
 * Pack one panel of `width` columns.  The rows are consumed in the same
 * chunks as the unrolled form of this routine: row/4 groups of four rows,
 * then two rows when bit 1 of `row` is set, then one row when bit 0 is set.
 */
static void zncopy8_panel(BLASLONG row, BLASLONG width, FLOAT *panel,
                          BLASLONG col_stride, FLOAT *out)
{
    BLASLONG done = 0;   /* elements of each column already copied */
    BLASLONG quad;

    for (quad = 0; quad < row / 4; quad++) {
        out = zncopy8_rows(out, panel, col_stride, width, done, 4);
        done += 4;
    }
    if (row & 2) {
        out = zncopy8_rows(out, panel, col_stride, width, done, 2);
        done += 2;
    }
    if (row & 1) {
        zncopy8_rows(out, panel, col_stride, width, done, 1);
    }
}

/*
 * Pack a `row` x `col` block of two-FLOAT elements from `src` into `dest`.
 *
 * `src` is read column by column; consecutive columns are 2*srcdim FLOATs
 * apart.  Columns are processed in panels of eight, followed by optional
 * panels of four (col & 4), two (col & 2) and one (col & 1) columns.  Inside
 * a panel the output holds, for every row in turn, that row's element from
 * each column of the panel.  Each panel occupies 2*row*width FLOATs of
 * `dest`, and panels are stored back to back.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
    const BLASLONG col_stride = 2 * srcdim;
    BLASLONG panel;

    for (panel = 0; panel < col / 8; panel++) {
        zncopy8_panel(row, 8, src, col_stride, dest);
        src  += 8 * col_stride;
        dest += (row << 4);
    }

    if (col & 4) {
        zncopy8_panel(row, 4, src, col_stride, dest);
        src  += 4 * col_stride;
        dest += (row << 3);
    }

    if (col & 2) {
        zncopy8_panel(row, 2, src, col_stride, dest);
        src  += 2 * col_stride;
        dest += (row << 2);
    }

    if (col & 1) {
        zncopy8_panel(row, 1, src, col_stride, dest);
    }

    return 0;
}

View File

@ -0,0 +1,237 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copy `count` FLOATs, starting `offset` FLOATs into each line, from
 * `height` adjacent source lines into `out`, one line after the other.
 * `line_stride` is the distance between two neighbouring lines in FLOATs.
 *
 * Returns the position just past the last value written.
 */
static FLOAT *ztcopy4_lines(FLOAT *out, FLOAT *strip, BLASLONG line_stride,
                            BLASLONG height, BLASLONG offset, BLASLONG count)
{
    BLASLONG l, k;

    for (l = 0; l < height; l++) {
        FLOAT *line = strip + l * line_stride + offset;

        for (k = 0; k < count; k++) {
            *out++ = line[k];
        }
    }
    return out;
}

/*
 * Pack a strip of `height` source lines.
 *
 *   - Every full group of four two-FLOAT elements (8 FLOATs per line) goes
 *     to `wide_out`, which then advances by row<<3 FLOATs for the next group.
 *   - A remaining pair of elements (col & 2) goes to *pair_out.
 *   - A remaining single element (col & 1) goes to *single_out.
 *
 * *pair_out and *single_out are advanced past what was written so that the
 * next strip continues where this one stopped.
 */
static void ztcopy4_strip(BLASLONG height, BLASLONG row, BLASLONG col,
                          FLOAT *strip, BLASLONG line_stride,
                          FLOAT *wide_out, FLOAT **pair_out, FLOAT **single_out)
{
    BLASLONG offset = 0;   /* FLOATs of each line already consumed */
    BLASLONG group;

    for (group = 0; group < col / 4; group++) {
        ztcopy4_lines(wide_out, strip, line_stride, height, offset, 8);
        offset   += 8;
        wide_out += (row << 3);
    }
    if (col & 2) {
        *pair_out = ztcopy4_lines(*pair_out, strip, line_stride, height, offset, 4);
        offset += 4;
    }
    if (col & 1) {
        *single_out = ztcopy4_lines(*single_out, strip, line_stride, height, offset, 2);
    }
}

/*
 * Pack a block of two-FLOAT elements (presumably real/imaginary pairs) from
 * `src` into `dest`.
 *
 * `src` is read as `row` lines that are 2*srcdim FLOATs apart, each holding
 * `col` elements.  Lines are processed in strips of four, followed by an
 * optional strip of two (row & 2) and an optional single line (row & 1).
 *
 * `dest` is written through three cursors:
 *   - the four-element groups start at `dest`; each strip begins 8*height
 *     FLOATs after the previous one, and successive groups within a strip
 *     are row<<3 FLOATs apart;
 *   - leftover pairs start at dest + (col & -4) * 2 * row;
 *   - leftover single elements start at dest + (col & -2) * 2 * row.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
    const BLASLONG line_stride = 2 * srcdim;
    FLOAT *pair_out;
    FLOAT *single_out;
    BLASLONG tail;
    BLASLONG strip;

    tail = col & -4;
    tail = tail * (2 * row);
    pair_out = dest + tail;

    tail = col & -2;
    tail = tail * (2 * row);
    single_out = dest + tail;

    for (strip = 0; strip < row / 4; strip++) {
        ztcopy4_strip(4, row, col, src, line_stride, dest, &pair_out, &single_out);
        src  += 4 * line_stride;
        dest += (4 << 3);
    }

    if (row & 2) {
        ztcopy4_strip(2, row, col, src, line_stride, dest, &pair_out, &single_out);
        src  += 2 * line_stride;
        dest += (2 << 3);
    }

    if (row & 1) {
        ztcopy4_strip(1, row, col, src, line_stride, dest, &pair_out, &single_out);
    }

    return 0;
}

View File

@ -0,0 +1,370 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copy one band of `height` (1, 2 or 4) consecutive source lines into the
 * packed destination.
 *
 *   *src    first FLOAT of the band; advanced past the band on return
 *           (consecutive lines are 2*srcdim FLOATs apart).
 *   *dest   start of this band's slot inside the first 8-column panel;
 *           advanced by height*16 FLOATs on return.
 *   tails   write cursors for the leftover column groups:
 *           tails[0] -> 4 columns, tails[1] -> 2 columns, tails[2] -> 1 column.
 *           Each used cursor is advanced by height * (2 * width) FLOATs.
 *
 * Every column element occupies two consecutive FLOATs (presumably the
 * real/imaginary parts of a complex value -- TODO confirm against callers).
 */
static void tcopy_pack_band(BLASLONG height, BLASLONG row, BLASLONG col,
                            FLOAT **src, BLASLONG srcdim,
                            FLOAT **dest, FLOAT **tails)
{
    FLOAT *line[4];
    FLOAT *wide;
    BLASLONG r, k, blk, t, width;

    /* Line pointers of the band, then step the caller's source past it. */
    line[0] = *src;
    for (r = 1; r < height; r++)
        line[r] = line[r-1] + 2*srcdim;
    *src = line[height-1] + 2*srcdim;

    /* Reserve this band's slot in the first full-width panel. */
    wide  = *dest;
    *dest = wide + (height<<4);

    /* Full panels: 8 columns (16 FLOATs) per line, line after line. */
    for (blk = 0; blk < col/8; blk++)
    {
        for (r = 0; r < height; r++)
        {
            for (k = 0; k < 16; k++)
                wide[r*16 + k] = line[r][k];
            line[r] += 16;
        }
        /* The same band inside the next panel is row*16 FLOATs further on. */
        wide += (row<<4);
    }

    /* Leftover columns: one group of 4, then 2, then 1, as flagged in col. */
    for (t = 0, width = 4; width >= 1; t++, width >>= 1)
    {
        if (col & width)
        {
            FLOAT *out = tails[t];
            for (r = 0; r < height; r++)
            {
                for (k = 0; k < 2*width; k++)
                    out[r*(2*width) + k] = line[r][k];
                line[r] += 2*width;
            }
            tails[t] = out + height*(2*width);
        }
    }
}

/*
 * Pack a row x col block of two-FLOAT elements from src into dest.
 *
 *   row, col  number of source lines / elements per line
 *   src       first element of the block; lines are 2*srcdim FLOATs apart
 *   dest      packed output buffer
 *
 * Output layout, as written by the code below:
 *   - columns are grouped into panels of 8; each panel stores all lines,
 *     16 FLOATs per line, so consecutive panels are row*16 FLOATs apart;
 *   - the remaining (col & 4), (col & 2) and (col & 1) columns are stored
 *     after the full panels, at dest + (col & -8)*2*row,
 *     dest + (col & -4)*2*row and dest + (col & -2)*2*row respectively;
 *   - lines are consumed in bands of 4, then one band of 2, then one of 1.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
{
    FLOAT *tails[3];
    BLASLONG quad;

    tails[0] = dest + (col & -8) * (2*row);   /* 4-column remainder */
    tails[1] = dest + (col & -4) * (2*row);   /* 2-column remainder */
    tails[2] = dest + (col & -2) * (2*row);   /* 1-column remainder */

    for (quad = 0; quad < row/4; quad++)
        tcopy_pack_band(4, row, col, &src, srcdim, &dest, tails);

    if (row & 2)
        tcopy_pack_band(2, row, col, &src, srcdim, &dest, tails);

    if (row & 1)
        tcopy_pack_band(1, row, col, &src, srcdim, &dest, tails);

    return 0;
}

View File

@ -634,10 +634,10 @@ static void init_parameter(void) {
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
#endif #endif
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Katmai, Coppermine, Banias\n"); fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
#endif #endif
TABLE_NAME.sgemm_p = 64 * (l2 >> 7); TABLE_NAME.sgemm_p = 64 * (l2 >> 7);
@ -746,6 +746,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
#ifdef SANDYBRIDGE
/* SANDYBRIDGE target: fill the *gemm_p fields of TABLE_NAME with the
   compile-time *_DEFAULT_P values. */
#ifdef DEBUG
fprintf(stderr, "Sandybridge\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
/* The q/x entries are only set when EXPRECISION is defined. */
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef OPTERON #ifdef OPTERON
#ifdef DEBUG #ifdef DEBUG
@ -778,6 +794,38 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
#ifdef BOBCAT
/* BOBCAT target: fill the *gemm_p fields of TABLE_NAME with the
   compile-time *_DEFAULT_P values. */
#ifdef DEBUG
/* Debug banner; the core name was previously misspelled "Bobcate". */
fprintf(stderr, "Bobcat\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
/* The q/x entries are only set when EXPRECISION is defined. */
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef BULLDOZER
/* BULLDOZER target: fill the *gemm_p fields of TABLE_NAME with the
   compile-time *_DEFAULT_P values. */
#ifdef DEBUG
fprintf(stderr, "Bulldozer\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
/* The q/x entries are only set when EXPRECISION is defined. */
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO #ifdef NANO
#ifdef DEBUG #ifdef DEBUG

59
kernel/x86/KERNEL.BOBCAT Normal file
View File

@ -0,0 +1,59 @@
# Kernel source selection for the x86 BOBCAT target (kernel/x86/KERNEL.BOBCAT).
# The GEMM kernels are the *_barcelona.S files; copy routines come from ../generic.

# SGEMM: 4x4 kernel. The IN/IT copy variables are intentionally left empty here.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 2x4 kernel, so the IN/IT copies use the 2-wide generic routines
# and the ON/OT copies the 4-wide ones.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 2x2 kernel (the source file carries the zgemm_ prefix). IN/IT copies left empty.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 1x2 kernel; IN/IT copies are 1-wide, ON/OT copies 2-wide.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels (SSE for single precision, SSE2 for double precision).
# NOTE(review): every *_RN slot points at the LT source file, and ZTRSMKERNEL_LN
# does as well -- presumably those sources cover both cases; confirm before changing.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# GEMM3M kernels.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -0,0 +1,59 @@
# Kernel source selection for an x86 target whose file name is not shown here
# (NOTE(review): presumably KERNEL.BULLDOZER -- confirm). The values are the same
# as in KERNEL.BOBCAT: *_barcelona.S GEMM kernels plus ../generic copy routines.

# SGEMM: 4x4 kernel. The IN/IT copy variables are intentionally left empty here.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 2x4 kernel, so the IN/IT copies use the 2-wide generic routines
# and the ON/OT copies the 4-wide ones.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 2x2 kernel (the source file carries the zgemm_ prefix). IN/IT copies left empty.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 1x2 kernel; IN/IT copies are 1-wide, ON/OT copies 2-wide.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels (SSE for single precision, SSE2 for double precision).
# NOTE(review): every *_RN slot points at the LT source file, and ZTRSMKERNEL_LN
# does as well -- presumably those sources cover both cases; confirm before changing.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# GEMM3M kernels.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -0,0 +1 @@
# This target defines no kernels of its own: it reuses the PENRYN selection unchanged.
include $(KERNELDIR)/KERNEL.PENRYN

View File

@ -495,7 +495,6 @@
ALIGN_4 ALIGN_4
.L999: .L999:
RESTOREREGISTERS
subl $8, %esp subl $8, %esp
movss %xmm0, (%esp) movss %xmm0, (%esp)

View File

@ -76,6 +76,12 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif
#ifndef PREFETCH #ifndef PREFETCH
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#endif #endif

View File

@ -596,7 +596,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 4 * SIZE(BB), %xmm2 movsd 4 * SIZE(BB), %xmm2
@ -842,7 +842,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1168,7 +1168,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1198,7 +1198,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1347,7 +1347,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -1531,7 +1531,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -1778,7 +1778,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -1793,7 +1793,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -1924,7 +1924,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -2069,7 +2069,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -69,6 +69,12 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef SANDYBRIDGE
#define PREFETCHSIZE (16 * 1 - 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif
#ifndef PREFETCH #ifndef PREFETCH
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#endif #endif
@ -262,7 +268,7 @@
movaps -16 * SIZE(AA), %xmm0 movaps -16 * SIZE(AA), %xmm0
addps %xmm2, %xmm7 addps %xmm2, %xmm7
#ifndef NEHALEM #if !(defined(NEHALEM) || defined(SANDYBRIDGE))
PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
pshufd $0x93, %xmm1, %xmm2 pshufd $0x93, %xmm1, %xmm2

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 4)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)
@ -89,17 +89,22 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 20 + STACKSIZE(%esp) #define A 20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 24 + STACKSIZE(%esp) #define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
#define STACK_X 28 + STACKSIZE(%esp) #define STACK_X 28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 32 + STACKSIZE(%esp) #define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
#define Y 36 + STACKSIZE(%esp) #define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define LDAX 12+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -114,6 +119,7 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -121,7 +127,34 @@
PROFCODE PROFCODE
# Save the incoming Y, A and M arguments in the local slots YY, AA and MMM.
# The code at .L999 advances AA and YY by M*SIZE and jumps back to .L0t,
# so these locals carry the position of the current chunk.
movl Y,J
movl J,YY # backup Y
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # backup M (amount still to be processed)
# Chunk loop head: M is processed in pieces of at most 1 << 21 elements.
.L0t:
xorl J,J
addl $1,J
sall $21,J # J = 1 << 21, the chunk length
subl J,MMM # MMM -= J; the flags of this subtraction drive the jge below
movl J,M # assume a full chunk (movl leaves the flags unchanged)
jge .L00t # MMM >= 0: a full chunk is available
ALIGN_4
movl MMM,%eax
addl J,%eax # eax = MMM + J, the length of the final partial chunk
jle .L999x # nothing left: leave the chunk loop
movl %eax,M # run the last, shorter chunk
.L00t:
# Load the argument slots A and Y with the pointers of the current chunk.
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@ -651,12 +684,22 @@
addss 0 * SIZE(X), %xmm0 addss 0 * SIZE(X), %xmm0
movss %xmm0, (Y1) movss %xmm0, (Y1)
ALIGN_3 ALIGN_3
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)
@ -76,17 +76,22 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE(%esp) #define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE(%esp) #define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -101,6 +106,8 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -108,6 +115,33 @@
PROFCODE PROFCODE
# Save the incoming Y, A and M arguments in the local slots YY, AA and MMM.
# The code at .L999 advances AA and YY by M*SIZE and jumps back to .L0t,
# so these locals carry the position of the current chunk.
movl Y,J
movl J,YY # backup Y
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # backup M (amount still to be processed)
# Chunk loop head: M is processed in pieces of at most 1 << 20 elements.
.L0t:
xorl J,J
addl $1,J
sall $20,J # J = 1 << 20, the chunk length
subl J,MMM # MMM -= J; the flags of this subtraction drive the jge below
movl J,M # assume a full chunk (movl leaves the flags unchanged)
jge .L00t # MMM >= 0: a full chunk is available
ALIGN_4
movl MMM,%eax
addl J,%eax # eax = MMM + J, the length of the final partial chunk
jle .L999x # nothing left: leave the chunk loop
movl %eax,M # run the last, shorter chunk
.L00t:
# Load the argument slots A and Y with the pointers of the current chunk.
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
@ -677,10 +711,22 @@
ALIGN_3 ALIGN_3
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl YY,J
addl %eax,J
movl J,YY
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 4) #define PREFETCHSIZE (16 * 4)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)
@ -89,17 +89,24 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 20 + STACKSIZE(%esp) #define A 20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 24 + STACKSIZE(%esp) #define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
#define STACK_X 28 + STACKSIZE(%esp) #define STACK_X 28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 32 + STACKSIZE(%esp) #define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
#define Y 36 + STACKSIZE(%esp) #define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE(%esp) #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE(%esp) #define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+STACKSIZE(%esp)
#define NN 4+STACKSIZE(%esp)
#define AA 8+STACKSIZE(%esp)
#define LDAX 12+STACKSIZE(%esp)
#define XX 16+STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -114,6 +121,7 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -122,7 +130,42 @@
PROFCODE PROFCODE
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl STACK_X, X movl STACK_X, X
movl X,XX
movl N,J
movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # mov M to MMM
# Chunk loop head: M is split into blocks sized to the 16MB buffer.
# NOTE(review): .L999 advances XX by M*SIZE without involving INCX;
# confirm this is right for non-unit increments of x.
.L0t:
xorl J,J
addl $1,J
sall $22,J # J = 2^22 floats; 2^22*sizeof(float) = buffer size (16MB)
subl $8, J # Do not use the last 8 floats in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J; the flags of this subtraction drive the jge below
movl J,M # assume a full block (movl leaves the flags unchanged)
jge .L00t # MMM >= 0: a full block of J elements is available
ALIGN_4
movl MMM,%eax
addl J,%eax # eax = MMM + J, the length of the final partial block
jle .L999x # nothing left: leave the block loop
movl %eax,M # run the last, shorter block
.L00t:
movl AA,%eax
movl %eax,A # mov AA to A
movl NN,%eax
movl %eax,N # reset N
movl LDAX, LDA # reset LDA
movl XX,X # X = saved x pointer for the current block
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY
@ -198,6 +241,20 @@
jg .L06 jg .L06
ALIGN_4 ALIGN_4
// Pad with zeros so that stale values left in the buffer are never loaded.
movl M, I
movl $8, J
andl $7, I # I = M mod 8
xorps %xmm0, %xmm0 # xmm0 = 0.0
subl I, J # J = 8 - (M mod 8) zero floats to store (8 when M is a multiple of 8)
ALIGN_2
.L07:
movss %xmm0, 0 * SIZE(Y1) # store one zero float at Y1
addl $SIZE, Y1 # step Y1 to the next float
decl J
jg .L07 # repeat until J zeros have been written
ALIGN_4
.L10: .L10:
movl Y, Y1 movl Y, Y1
@ -628,10 +685,22 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl XX,J
addl %eax,J
movl J,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)
@ -76,17 +76,23 @@
#endif #endif
#define STACKSIZE 16 #define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp) #define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE(%esp) #define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE(%esp) #define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE(%esp) #define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp) #define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE(%esp) #define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp) #define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE(%esp) #define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+STACKSIZE(%esp)
#define AA 4+STACKSIZE(%esp)
#define LDAX 8+STACKSIZE(%esp)
#define NN 12+STACKSIZE(%esp)
#define I %eax #define I %eax
#define J %ebx #define J %ebx
@ -101,6 +107,8 @@
PROLOGUE PROLOGUE
subl $ARGS,%esp
pushl %ebp pushl %ebp
pushl %edi pushl %edi
pushl %esi pushl %esi
@ -108,7 +116,40 @@
PROFCODE PROFCODE
movl STACK_LDA, LDA movl STACK_LDA, LDA
movl LDA,LDAX # backup LDA
movl N,J
movl J,NN # backup N
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # mov M to MMM
# Chunk loop head: M is split into blocks sized to the 16MB buffer.
# NOTE(review): X is re-read from STACK_X right after this block and .L999
# advances only AA; confirm whether x must also advance for each block.
.L0t:
xorl J,J
addl $1,J
sall $21,J # J = 2^21 doubles; 2^21*sizeof(double) = buffer size (16MB)
subl $4, J # Do not use the last 4 doubles in the buffer.
# Now, split M by block J
subl J,MMM # MMM=MMM-J; the flags of this subtraction drive the jge below
movl J,M # assume a full block (movl leaves the flags unchanged)
jge .L00t # MMM >= 0: a full block of J elements is available
ALIGN_4
movl MMM,%eax
addl J,%eax # eax = MMM + J, the length of the final partial block
jle .L999x # nothing left: leave the block loop
movl %eax,M # run the last, shorter block
.L00t:
movl AA,%eax
movl %eax,A # mov AA to A
movl NN,%eax
movl %eax,N # reset N
movl LDAX, LDA # reset LDA
movl STACK_X, X movl STACK_X, X
movl STACK_INCX, INCX movl STACK_INCX, INCX
movl STACK_INCY, INCY movl STACK_INCY, INCY
@ -117,6 +158,7 @@
leal (,INCY, SIZE), INCY leal (,INCY, SIZE), INCY
leal (,LDA, SIZE), LDA leal (,LDA, SIZE), LDA
subl $-16 * SIZE, A subl $-16 * SIZE, A
cmpl $0, N cmpl $0, N
@ -560,10 +602,19 @@
ALIGN_4 ALIGN_4
.L999: .L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
jmp .L0t
ALIGN_4
.L999x:
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
popl %ebp popl %ebp
addl $ARGS,%esp
ret ret
EPILOGUE EPILOGUE

View File

@ -269,7 +269,7 @@
sarl $5, I sarl $5, I
jle .L113 jle .L113
#if defined(BARCELONA) #if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1 mulps -32 * SIZE(X), %xmm1

View File

@ -76,6 +76,7 @@
xorps %xmm1, %xmm1 xorps %xmm1, %xmm1
comisd %xmm0, %xmm1 comisd %xmm0, %xmm1
jne .L100 # Alpha != ZERO jne .L100 # Alpha != ZERO
jp .L100 # For Alpha = NaN
/* Alpha == ZERO */ /* Alpha == ZERO */
cmpl $SIZE, INCX cmpl $SIZE, INCX
@ -252,7 +253,7 @@
sarl $4, I sarl $4, I
jle .L113 jle .L113
#if defined(BARCELONA) #if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1 movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1 mulpd -16 * SIZE(X), %xmm1

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1697,7 +1697,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1848,7 +1848,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2429,7 +2429,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2952,7 +2952,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -3148,7 +3148,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3389,7 +3389,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1439,7 +1439,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1855,7 +1855,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2562,7 +2562,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2957,7 +2957,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -3280,7 +3280,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3515,7 +3515,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -1036,7 +1036,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2

View File

@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4) #define PREFETCHSIZE (8 * 21 + 4)
#endif #endif

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -439,7 +439,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -758,7 +758,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -993,7 +993,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -1324,7 +1324,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2031,7 +2031,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2

View File

@ -1541,6 +1541,16 @@
popl %ebx popl %ebx
popl %esi popl %esi
popl %edi popl %edi
/*remove the hidden return value address from the stack.*/ #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#ifdef MS_ABI
/* For MingW GCC >= 4.7. It is compatible with MSVC ABI. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */
ret
#else
/* remove the hidden return value address from the stack. For MingW GCC < 4.7 */
ret $0x4 ret $0x4
#endif
#else
/*remove the hidden return value address from the stack on Linux.*/
ret $0x4
#endif
EPILOGUE EPILOGUE

View File

@ -74,7 +74,7 @@
#define BB %ecx #define BB %ecx
#define LDC %ebp #define LDC %ebp
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define movsd movlps #define movsd movlps
#endif #endif
@ -625,7 +625,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 4 * SIZE(BB), %xmm2 movsd 4 * SIZE(BB), %xmm2
@ -870,7 +870,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1173,7 +1173,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1203,7 +1203,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1359,7 +1359,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -1536,7 +1536,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -1794,7 +1794,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -1809,7 +1809,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -1936,7 +1936,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -2069,7 +2069,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHSIZE (8 * 1 - 4)
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0

View File

@ -64,7 +64,7 @@
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHSIZE (16 * 1 + 8)
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0 #define PREFETCHB prefetcht0

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2) #define PREFETCHSIZE (16 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)
@ -71,7 +71,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5) #define PREFETCHSIZE (16 * 5)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)
@ -58,7 +58,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5) #define PREFETCHSIZE (8 * 5)

View File

@ -58,7 +58,7 @@
#define PREFETCHSIZE (16 * 2) #define PREFETCHSIZE (16 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 7) #define PREFETCHSIZE (16 * 7)
@ -71,7 +71,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5) #define PREFETCHSIZE (16 * 5)

View File

@ -45,7 +45,7 @@
#define PREFETCHSIZE (8 * 2) #define PREFETCHSIZE (8 * 2)
#endif #endif
#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0
#define PREFETCHSIZE (8 * 7) #define PREFETCHSIZE (8 * 7)
@ -58,7 +58,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#ifdef BARCELONA #if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta #define PREFETCH prefetchnta
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5) #define PREFETCHSIZE (8 * 5)

View File

@ -55,7 +55,7 @@
#define XX %edi #define XX %edi
#define FLAG %ebp #define FLAG %ebp
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@ -697,7 +697,7 @@
cmpl $2 * SIZE, INCX cmpl $2 * SIZE, INCX
jne .L120 jne .L120
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm0, %xmm6)
PSHUFD2($0, %xmm1, %xmm1) PSHUFD2($0, %xmm1, %xmm1)

View File

@ -57,7 +57,7 @@
#include "l1param.h" #include "l1param.h"
#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD #define USE_PSHUFD
#else #else
#define USE_PSHUFD_HALF #define USE_PSHUFD_HALF
@ -860,7 +860,7 @@
cmpl $2 * SIZE, INCX cmpl $2 * SIZE, INCX
jne .L220 jne .L220
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
#ifdef HAVE_SSE3 #ifdef HAVE_SSE3
movddup %xmm0, %xmm6 movddup %xmm0, %xmm6

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -63,7 +63,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -61,7 +61,7 @@
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif
#ifdef NEHALEM #if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH prefetcht1 #define PREFETCH prefetcht1
#define PREFETCHSIZE 84 #define PREFETCHSIZE 84
#endif #endif

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -0,0 +1,62 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

View File

@ -0,0 +1,62 @@
# Kernel source selection list.
# Each variable names the source file used for one routine; the *OBJ variables
# name the object file built from the matching copy routine.
# Prefixes follow the BLAS convention: S/D = single/double precision real,
# C/Z = single/double precision complex.
# In this list DGEMMKERNEL is the only entry that names a bulldozer-specific
# file; every other entry points at a barcelona, opteron, sse/sse2, generic or
# untagged source.
# NOTE(review): the variables are presumably consumed by an including Makefile
# that is not visible here -- confirm before renaming or removing any of them.
#
# Z GEMV kernels (N and T variants).
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
# S GEMM: kernel file tagged 8x4. The IN/IT copies use the generic C routines
# tagged 8, the ON/OT copies use the opteron assembly routines tagged 4.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# D GEMM: kernel file tagged 4x4 (bulldozer-specific source).
# The IN/IT copy sources and their OBJ variables are deliberately left empty;
# only the ON/OT copies are listed.
# NOTE(review): presumably the IN/IT copies are not needed when both tile
# dimensions are equal -- confirm against the including Makefile.
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# C GEMM: kernel file tagged 4x2. The IN/IT copies use the generic C routines
# tagged 4, the ON/OT copies use the assembly routines tagged 2.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Z GEMM: kernel file tagged 2x2. As with D GEMM, the IN/IT entries are empty
# and only the ON/OT copies are listed.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels, four variants (LN, LT, RN, RT) per precision.
# In every precision the RN variable points at the LT source file; there is no
# RN-specific file in this list.
# NOTE(review): the pattern is the same for all four precisions, so it looks
# intentional -- confirm before "fixing" it.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
# GEMM3M kernels for the two complex precisions (barcelona sources).
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

View File

@ -0,0 +1,84 @@
# Kernel source selection list.
# Each variable names the source file used for one routine; the *OBJ variables
# name the object file built from the matching copy routine.
# Prefixes follow the BLAS convention: S/D = single/double precision real,
# C/Z = single/double precision complex.
# Summary of what is selected below:
#   - GEMM kernels for all four precisions come from *_sandy.S files;
#   - all GEMM copy routines are C sources under ../generic/;
#   - all TRSM kernels are the generic C sources (the nehalem assembly
#     alternatives are kept only as commented-out lines);
#   - the GEMM3M kernels are nehalem assembly sources.
# NOTE(review): the variables are presumably consumed by an including Makefile
# that is not visible here -- confirm before renaming or removing any of them.
#
# S GEMM: kernel file tagged 8x8. The IN/IT copy entries are left empty and
# only the ON/OT copies (generic, tagged 8) are listed.
# NOTE(review): presumably the IN/IT copies are not needed when both tile
# dimensions are equal -- confirm against the including Makefile.
SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# D GEMM: kernel file tagged 4x8. The IN/IT copies use the generic routines
# tagged 8, the ON/OT copies the generic routines tagged 4. The commented-out
# ON/OT lines are the assembly alternatives to the generic C copies.
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
#DGEMMONCOPY = gemm_ncopy_4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# C GEMM: kernel file tagged 4x8, with sandy-tagged generic copy routines
# (IN/IT tagged 8, ON/OT tagged 4). The commented-out line is the nehalem
# kernel alternative.
#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Z GEMM: kernel file tagged 4x4. The IN/IT entries are left empty and only
# the ON/OT copies (generic, tagged 4) are listed. The commented-out line is
# the nehalem kernel alternative.
#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Disabled TRSM settings: nehalem assembly kernels, superseded by the generic
# C kernels assigned further down.
# NOTE(review): their tile tags (4x8, 2x8, 2x4, 1x4) differ from the tags on
# the sandy GEMM kernels above, which is presumably why they are disabled --
# confirm before re-enabling any of them.
#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
# Active TRSM settings: the generic C kernels, one source per variant
# (LN, LT, RN, RT). The same four files are used for all four precisions.
# NOTE(review): real and complex precisions share these sources, so the
# precision is presumably chosen by compile-time defines -- confirm in the
# including Makefile.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# GEMM3M kernels for the two complex precisions (nehalem assembly sources).
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S

View File

@ -69,7 +69,7 @@
#endif #endif
movaps %xmm0, ALPHA movaps %xmm0, ALPHA
#else #else
movaps %xmm3, ALPHA
movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
@ -79,6 +79,10 @@
SAVEREGISTERS SAVEREGISTERS
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
shufps $0, ALPHA, ALPHA shufps $0, ALPHA, ALPHA
leaq (, INCX, SIZE), INCX leaq (, INCX, SIZE), INCX

View File

@ -69,7 +69,6 @@
#endif #endif
movaps %xmm0, ALPHA movaps %xmm0, ALPHA
#else #else
movaps %xmm3, ALPHA
movq 40(%rsp), X movq 40(%rsp), X
movq 48(%rsp), INCX movq 48(%rsp), INCX
@ -79,6 +78,10 @@
SAVEREGISTERS SAVEREGISTERS
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
unpcklpd ALPHA, ALPHA unpcklpd ALPHA, ALPHA
leaq (, INCX, SIZE), INCX leaq (, INCX, SIZE), INCX

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More