Compare commits

..

131 Commits

Author SHA1 Message Date
Zhang Xianyi
034ffa93fa Provide iaxpy and cblas_iaxpy for integer vectors. make INTEGER_PRECISION=1 2015-07-01 03:11:27 +08:00
Zhang Xianyi
3f1b57668e Fix blas lock bug on AArch64. 2015-06-26 11:54:41 +08:00
Zhang Xianyi
d8f18d32c3 Merge pull request #595 from tanderson92/fixTests
Fix test execution when USE_OPENMP=0
2015-06-22 21:54:51 -05:00
wernsaar
bdb5c842fc Merge pull request #596 from wernsaar/develop
optimizations for haswell
2015-06-13 16:44:48 +02:00
Werner Saar
e7c969e164 added optimized dtrmm_kernel for haswell 2015-06-13 16:16:29 +02:00
Werner Saar
9bd962f655 modified haswell parameter dgemm_unroll_n 2015-06-13 10:28:27 +02:00
Thomas Anderson
4f5691e5c0 Fix test execution when USE_OPENMP=0
The standard way to disable OpenMP support is to set USE_OPENMP=0,
as indicated by other checks to see if USE_OPENMP equals 1. The
problem is obviously then that `ifdef USE_OPENMP` is very much not
what we want to test for. This causes tests to fail when no OpenMP
library is installed.
2015-06-12 23:52:07 -07:00
Zhang Xianyi
29293160a4 Fix #593. Change MACOSX_DEPLOYMENT_TARGET to 10.6. 2015-06-08 10:53:50 -05:00
wernsaar
3e33afef2e Merge pull request #592 from wernsaar/develop
added benchmark scripts
2015-06-08 14:22:02 +02:00
Werner Saar
8614057ea9 added benchmark scripts for numpy, octave and R 2015-06-08 14:06:38 +02:00
Werner Saar
7f375f9e8f updated geev benchmark 2015-06-08 12:58:38 +02:00
wernsaar
69c5169e7d Merge pull request #589 from wernsaar/develop
small modification of gemm.c
2015-06-03 12:14:09 +02:00
Werner Saar
e19948baa1 small modification of gemm.c 2015-06-03 09:11:51 +02:00
wernsaar
a2eaf234fc Merge pull request #587 from wernsaar/develop
added gesv benchmark
2015-06-02 15:29:49 +02:00
Werner Saar
6a13a94e71 added gesv benchmark 2015-06-02 13:35:49 +02:00
wernsaar
eff43d3289 Merge pull request #585 from wernsaar/develop
bugfix for benchmark Makefile on MAC
2015-05-31 15:01:54 +02:00
Werner Saar
9c4817d07b bugfix for Makefile on mac 2015-05-31 14:16:51 +02:00
wernsaar
319f3a0451 Merge pull request #584 from wernsaar/develop
bugfixes, to build benchmarks with mingw on Windows OS
2015-05-29 13:27:20 +02:00
Werner Saar
02c7766f68 bugfixes, to build benchmarks with mingw on Windows OS 2015-05-29 12:56:22 +02:00
wernsaar
f38cb67ca8 Merge pull request #581 from wernsaar/develop
bugfix for arm locking
2015-05-23 12:58:15 +02:00
Werner Saar
eea2e30b74 bugfix for arm locking 2015-05-23 11:40:40 +02:00
Werner Saar
19b8fd2aed smp lock bugfix 2015-05-23 10:58:38 +02:00
wernsaar
0cc5212741 Merge pull request #580 from wernsaar/develop
added blas level1 swap  benchmark
2015-05-23 09:46:39 +02:00
Werner Saar
c47c8e8cf5 added blas level1 swap benchmark 2015-05-21 08:51:42 +02:00
Zhang Xianyi
a11555c715 Support Android NDK armeabi-v7a-hard ABI. (-mfloat-abi=hard)
e.g.
make HOSTCC=gcc CC=arm-linux-androideabi-gcc NO_LAPACK=1 TARGET=ARMV7

In Android NDK, it uses armeabi-v7a-hard ABI.
TARGET_CFLAGS += -mhard-float -D_NDK_MATH_NO_SOFTFP=1
TARGET_LDFLAGS += -Wl,--no-warn-mismatch -lm_hard
For more information, please check hard-float example at
android_ndk/tests/device/hard-float/jni/.
2015-05-20 21:57:27 -05:00
wernsaar
897d03518e Merge pull request #578 from wernsaar/develop
added blas level1 copy benchmark
2015-05-20 11:56:02 +02:00
Werner Saar
23fbc5728e added blas level1 copy benchmark 2015-05-20 11:05:00 +02:00
Zhang Xianyi
6d40fa587f Fix f_check bug. 2015-05-19 12:04:45 -05:00
wernsaar
22dcd79959 Merge pull request #577 from wernsaar/develop
Bugfix for armv6 memory barrier
2015-05-19 10:59:24 +02:00
Werner Saar
ea4df0aad3 Ref #574: Bugfix for armv6 memory barrier 2015-05-19 10:43:12 +02:00
Zhang Xianyi
e127fb8fd8 1) Refs #575. Remove g77 from compiler list.
2) If OpenBLAS cannot find Fortran compiler, it will only build BLAS
(without LAPACK).
2015-05-19 00:01:04 -05:00
wernsaar
7fb718a7d8 Merge pull request #572 from wernsaar/develop
added optimized cscal and zscal functions for steamroller
2015-05-18 13:47:38 +02:00
Werner Saar
24f58c8bb1 added optimized cscal and zscal kernels for steamroller 2015-05-18 12:40:07 +02:00
Werner Saar
95b1faf667 added optimized cscal and zscal kernels for steamroller and piledriver 2015-05-18 10:50:57 +02:00
Werner Saar
2d9e406050 added optimized cscal kernel for sandybridge 2015-05-18 08:46:06 +02:00
Werner Saar
59083e3ce1 added optimized cscal kernel for bulldozer 2015-05-18 07:33:52 +02:00
wernsaar
685be40339 Merge pull request #571 from wernsaar/develop
added optimized cscal and zscal functions
2015-05-17 14:09:14 +02:00
Werner Saar
31c9e399e9 added optimized cscal kernel for haswell 2015-05-17 13:44:09 +02:00
Werner Saar
7de6bb9889 added optimized zscal kernel for bulldozer 2015-05-17 11:45:19 +02:00
Werner Saar
d63034303b added optimized zscal kernel for haswell 2015-05-16 16:41:45 +02:00
Zhang Xianyi
51ff17d46e Add AMD Excavator target. 2015-05-13 16:16:30 -05:00
wernsaar
905534942a Merge pull request #568 from wernsaar/develop
added optimized dscal kernel
2015-05-13 13:48:08 +02:00
Werner Saar
18e90ee2e3 bugfix: added static to functions 2015-05-13 13:31:26 +02:00
Werner Saar
e00cccc41e added optimized dscal kernel for piledriver 2015-05-13 13:05:35 +02:00
Werner Saar
73f09bf64f optimized dscal kernel for increment != 1 2015-05-13 12:14:39 +02:00
Werner Saar
02e772c7e4 added optimized dscal kernel for haswell 2015-05-12 17:19:58 +02:00
Werner Saar
7aee913991 added optimized dscal kernel for sandybridge 2015-05-12 16:27:43 +02:00
Werner Saar
e50a933037 added optimized dscal kernel for bulldozer 2015-05-12 12:28:44 +02:00
Zhang Xianyi
5f9011d6ef Merge pull request #566 from powderluv/develop
Fix build with ALLOC_SHM=0 (Android NDK)
2015-05-11 20:59:12 -05:00
powderluv
ebb9eba987 Fix build with ALLOC_SHM=0 (Android NDK)
Refactor such that you can build with ALLOC_SHM=0. HugeTLB
implicitly depends on ALLOC_SHM=1. This patch allows
building for Android NDK r10d.
2015-05-10 00:10:26 -07:00
Zhang Xianyi
8e5a1083bb Refs #532. Improve gemv parallelism for the small m and large n case.
Split the matrix and do a reduction.
2015-05-08 05:33:17 +08:00
Zhang Xianyi
6743beb748 Refs #565. Fix the bug of generate FEXTRALIB. 2015-05-07 13:06:53 +08:00
Zhang Xianyi
bcabf72c08 Refs #565. Merge branch 'andreasnoack-anj/bench' into develop 2015-05-07 12:52:14 +08:00
Andreas Noack
cda29f183b Add vecLib benchmarks 2015-05-06 21:52:34 -04:00
wernsaar
e52d36450a Merge pull request #564 from wernsaar/develop
Use only 1 thread in trsm if m or n < 2*GEMM_MULTITHREAD_THRESHOLD
2015-05-06 11:10:31 +02:00
Werner Saar
f8f2e261fe use only 1 thread if m or n < 2*GEMM_MULTITHREAD_THRESHOLD 2015-05-06 10:41:53 +02:00
Werner Saar
be3c843700 added loops to trsm.c 2015-05-06 09:21:19 +02:00
wernsaar
e6f57db846 Merge pull request #563 from wernsaar/develop
Bugfix for gemm3m tests
2015-05-05 12:13:35 +02:00
Werner Saar
9bfd267d51 bugfix for gemm3m tests 2015-05-05 11:58:59 +02:00
Werner Saar
924bc5372e removed gemm3m functions from normal checks 2015-05-05 11:39:43 +02:00
wernsaar
2b83a69650 Merge pull request #561 from wernsaar/develop
updated dgemv_n sgemv_n kernels
2015-05-04 11:11:13 +02:00
Werner Saar
133c11a156 updated dgemv_n kernel for nehalem 2015-04-30 14:38:06 +02:00
Werner Saar
30f52d53df optimized dgemv_n kernel for haswell 2015-04-30 12:11:39 +02:00
Zhang Xianyi
a124637329 Merge pull request #560 from sebastien-villemot/develop
Fix detection of ARM architectures in c_check.
2015-04-29 11:36:47 -05:00
Sébastien Villemot
642aaba2e0 Fix detection of ARM architectures in c_check.
This is necessary to avoid the false detection of a cross-compiling environment.
2015-04-29 18:14:21 +02:00
wernsaar
4c616173e4 Merge pull request #558 from wernsaar/develop
optimizations for sandybridge
2015-04-28 17:30:16 +02:00
Werner Saar
5e83d80725 optimized dger kernel for sandybridge 2015-04-28 16:58:11 +02:00
Werner Saar
b2e1797dc6 added optimized sger kernel for sandybridge 2015-04-28 15:33:38 +02:00
Werner Saar
e216f686cb optimized saxpy and daxpy for sandybridge 2015-04-28 10:18:32 +02:00
Zhang Xianyi
e42652f772 Merge pull request #554 from wernsaar/develop
added benchmarks for zgeru and cgeru
2015-04-25 08:11:36 -05:00
Werner Saar
e77db2af31 add benchmarks for zgeru and cgeru 2015-04-25 14:53:07 +02:00
Zhang Xianyi
37b00841ac Merge pull request #552 from jeromerobert/develop
gemv: Ensure stack buffer is large enough to handle memory alignment
2015-04-24 14:12:12 -05:00
Werner Saar
fc0e0391f3 bugfixes: replaced int with BLASLONG 2015-04-24 14:30:44 +02:00
wernsaar
da0f27b9ac Merge pull request #553 from wernsaar/develop
optimized some blas level1 kernels for increments != 1
2015-04-24 13:57:48 +02:00
Werner Saar
c22068c406 optimized sdot.c for increments != 1 2015-04-24 13:13:20 +02:00
Werner Saar
dee100d0e4 optimized saxpy.c for increments != 1 2015-04-24 11:52:59 +02:00
Werner Saar
0273966abb optimized daxpy kernel for increments != 1 2015-04-24 11:39:17 +02:00
Werner Saar
3a67daa954 optimized ddot.c for increments != 1 2015-04-24 10:56:55 +02:00
Jerome Robert
ab567d8443 gemv: Ensure stack buffer is large enough to handle memory alignment
Ref #478
2015-04-24 10:12:49 +02:00
wernsaar
3c09cea4b2 Merge pull request #550 from wernsaar/develop
added optimized ssymv kernels for haswell and sandybridge
2015-04-23 13:27:38 +02:00
Werner Saar
b4f2153dcd added optimized ssymv kernels for sandybridge 2015-04-23 12:19:24 +02:00
Werner Saar
1c4b0eeae3 added optimized ssymv kernels for haswell 2015-04-23 10:23:13 +02:00
wernsaar
406d9d64e9 Merge pull request #549 from wernsaar/develop
added optimized dsymv kernels for haswell and sandybridge
2015-04-22 12:36:13 +02:00
Werner Saar
1bec9abb9a added optimized dsymv kernels for sandybridge 2015-04-22 12:09:43 +02:00
Werner Saar
3814bf60d3 added optimized dsymv kernels for haswell 2015-04-22 10:42:50 +02:00
Zhang Xianyi
847e19c04e Refs #478,#482, Enable stack alloc for s/dgemv_t.(revert 9798491) 2015-04-20 23:22:40 -05:00
Werner Saar
46c7b4d5c8 added asum benchmark 2015-04-19 11:24:07 +02:00
Werner Saar
8e05d291b5 added scal benchmark 2015-04-18 08:41:41 +02:00
wernsaar
9da555e5f7 Merge pull request #546 from wernsaar/develop
added optimized zaxpy-kernels
2015-04-16 11:36:51 +02:00
Werner Saar
6d0db0151f added optimized zaxpy-kernels 2015-04-16 11:19:37 +02:00
Zhang Xianyi
37b9033c90 Merge pull request #543 from jeromerobert/develop
Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t
2015-04-15 11:18:14 -05:00
wernsaar
59e7a518c6 Merge pull request #544 from wernsaar/develop
Optimized  caxpy-kernels
2015-04-15 17:04:02 +02:00
Werner Saar
13889515b3 added optimized caxpy-kernel for sandybridge 2015-04-15 16:29:25 +02:00
Werner Saar
248c9340c3 added optimized caxpy-kernel for haswell 2015-04-15 15:16:31 +02:00
Werner Saar
e9f33b4ca7 added optimized caxpy-kernel for steamroller 2015-04-15 13:49:23 +02:00
Werner Saar
f5d847122a updated caxpy_microk_bulldozer-2.c and caxpy.c 2015-04-15 11:59:38 +02:00
Jerome Robert
a4c96eca67 Fix a buffer overflow with MAX_STACK_ALLOC size in dgemv_t
Refs #478, #482, 9798481, fd9fd42
2015-04-15 11:46:48 +02:00
wernsaar
fb02cb0a41 Merge pull request #540 from wernsaar/develop
Optimized dot- and axpy-kernels
2015-04-14 15:53:09 +02:00
Werner Saar
baa0363ea2 add optimized ddot-kernel for piledriver 2015-04-14 15:09:13 +02:00
Werner Saar
34ba66606a add optimized daxpy-kernel for piledriver 2015-04-14 14:23:29 +02:00
Werner Saar
f615dc7603 added optimized saxpy kernel for steamroller 2015-04-14 09:09:39 +02:00
Werner Saar
331c417637 optimized saxpy for piledriver 2015-04-14 08:34:11 +02:00
Zhang Xianyi
6c3a0b5d46 Enable MAX_STACK_ALLOC by default. 2015-04-13 23:23:40 -05:00
Zhang Xianyi
fd9fd42936 Refs #478, #482. Fixed bug on previous commit. 2015-04-13 23:22:27 -05:00
Zhang Xianyi
9798481979 Refs #478, #482. Fix segfault bug for gemv_t with MAX_ALLOC_STACK flag.
For gemv_t, directly use malloc to create the buffer.
2015-04-13 19:45:27 -05:00
Werner Saar
d7a17ad85d optimized sdot-kernel for piledriver 2015-04-13 13:19:21 +02:00
Werner Saar
d35f6c63c2 add optimized daxpy-kernel for steamroller 2015-04-13 12:22:43 +02:00
Werner Saar
166d76e864 added optimized sdot-kernel for steamroller 2015-04-11 08:48:18 +02:00
Werner Saar
f9f127d838 added optimized ddot kernel for steamroller 2015-04-10 16:18:03 +02:00
wernsaar
62231ab337 Merge pull request #538 from wernsaar/develop
Added optimized cdot- and zdot-kernels
2015-04-10 16:03:37 +02:00
Werner Saar
3119def9a7 updated cdot and zdot 2015-04-10 11:10:31 +02:00
Werner Saar
33b332372a add optimized cdot- and zdot-kernel for sandybridge 2015-04-10 09:37:26 +02:00
Werner Saar
fd838c75bc add optimized cdot- and zdot-kernel for haswell 2015-04-09 15:13:52 +02:00
Werner Saar
b57a60dac8 updated cdot and zdot for piledriver 2015-04-09 10:33:46 +02:00
Werner Saar
5c51163972 added optimized cdot- and zdot-kernel for steamroller 2015-04-09 09:45:23 +02:00
Werner Saar
9299d8cfd6 added optimized cdot- and zdot-kernels for bulldozer 2015-04-08 16:29:55 +02:00
Zhang Xianyi
0a3d3b945d Refs #535. Fix the wrong vector instruction in sgemm sandy bridge kernel. 2015-04-08 03:55:49 +08:00
Zhang Xianyi
4f680a7d61 Merge pull request #534 from wernsaar/develop
Refs #533. added optimized saxpy- and daxpy-kernel for haswell and sandybridge
2015-04-07 12:48:11 -05:00
Werner Saar
ba926e807c added cdot- and zdot benchmark 2015-04-07 11:56:06 +02:00
Werner Saar
60c6dec6e6 updated some lines for bulldozer 2015-04-06 18:47:16 +02:00
Werner Saar
47898cca35 added optimized saxpy- and daxpy-kernel for sandybridge 2015-04-06 16:05:16 +02:00
Werner Saar
53bb924287 added optimized saxpy- and daxpy-kernel for haswell 2015-04-06 12:33:16 +02:00
Zhang Xianyi
1e80b8b0d3 Merge pull request #531 from wernsaar/develop
added optimized sdot- and ddot-kernels for Haswell and Sandybridge
2015-04-05 16:42:39 -05:00
Werner Saar
a901b065d3 added optimized ddot-kernel for sandybridge 2015-04-05 20:19:38 +02:00
Werner Saar
3937e2a0a0 add optimized sdot-kernel for sandybridge 2015-04-05 19:47:05 +02:00
Werner Saar
9707d608d5 removed double definition line 2015-04-05 18:35:34 +02:00
Werner Saar
701b9d7556 added optimized sdot- and ddot-kernel for HASWELL 2015-04-05 17:57:53 +02:00
Zhang Xianyi
8977b3f235 Refs #529. Support Intel Broadwell by Haswell kernels. 2015-04-02 11:08:03 -05:00
Zhang Xianyi
f6426395ea Merge pull request #527 from xantares/patch-1
fix mingw install
2015-03-30 10:16:11 -05:00
xantares
0ac787eefe fix mingw install 2015-03-30 09:30:55 +02:00
Zhang Xianyi
e5b96e55a7 Fix build bug for ARM64. 2015-03-24 15:27:17 -05:00
175 changed files with 28130 additions and 1474 deletions

View File

@@ -20,6 +20,8 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
@@ -131,7 +133,7 @@ ifeq ($(CORE), UNKOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
$(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.)
endif
ifeq ($(NO_STATIC), 1)
ifeq ($(NO_SHARED), 1)
@@ -231,7 +233,7 @@ ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc

View File

@@ -1,13 +1,23 @@
# ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6

View File

@@ -86,8 +86,8 @@ ifeq ($(OSNAME), Darwin)
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)

View File

@@ -114,6 +114,9 @@ NO_AFFINITY = 1
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
# QUAD_PRECISION = 1
# Support for integer matrix and vector (e.g. iaxpy)
# INTEGER_PRECISION = 1
# Threads are still working for a while after finishing BLAS operation
# to reduce thread activate/deactivate overhead. You can determine
# time out to improve performance. This number should be from 4 to 30
@@ -162,7 +165,7 @@ COMMON_PROF = -pg
# Improve GEMV and GER for small matrices by stack allocation.
# For details, https://github.com/xianyi/OpenBLAS/pull/482
#
# MAX_STACK_ALLOC=2048
MAX_STACK_ALLOC=2048
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoid conflicts with other BLAS libraries, especially when using

View File

@@ -23,6 +23,7 @@ CC = gcc
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif
@@ -64,6 +65,9 @@ endif
ifeq ($(TARGET), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@@ -91,6 +95,9 @@ endif
ifeq ($(TARGET_CORE), STEAMROLLER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@@ -195,12 +202,18 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap
OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# If Fortran compiler detection failed, build BLAS only.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
endif
#
# OS dependent settings
#
ifeq ($(OSNAME), Darwin)
export MACOSX_DEPLOYMENT_TARGET=10.2
export MACOSX_DEPLOYMENT_TARGET=10.6
MD5SUM = md5 -r
endif
@@ -296,6 +309,10 @@ CCOMMON_OPT += -DQUAD_PRECISION
NO_EXPRECISION = 1
endif
ifdef INTEGER_PRECISION
CCOMMON_OPT += -DINTEGER_PRECISION
endif
ifneq ($(ARCH), x86)
ifneq ($(ARCH), x86_64)
NO_EXPRECISION = 1
@@ -408,7 +425,7 @@ endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL
@@ -578,7 +595,7 @@ else
FCOMMON_OPT += -m32
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
endif
@@ -590,14 +607,14 @@ ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
ifeq ($(F_COMPILER), FUJITSU)
CCOMMON_OPT += -DF_INTERFACE_FUJITSU
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -615,7 +632,7 @@ endif
else
FCOMMON_OPT += -q32
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
@@ -633,7 +650,7 @@ FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
endif
@@ -662,7 +679,7 @@ FCOMMON_OPT += -mabi=n32
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
endif
@@ -699,7 +716,7 @@ FCOMMON_OPT += -m64
endif
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FEXTRALIB += -lstdc++
FCOMMON_OPT += -mp
endif
@@ -747,14 +764,14 @@ FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -xopenmp=parallel
endif
endif
ifeq ($(F_COMPILER), COMPAQ)
CCOMMON_OPT += -DF_INTERFACE_COMPAQ
ifdef USE_OPENMP
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif

View File

@@ -4,6 +4,7 @@ QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
IBLASOBJS_P = $(IBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))
@@ -22,12 +23,18 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif
ifdef INTEGER_PRECISION
BLASOBJS += $(IBLASOBJS)
BLASOBJS_P += $(IBLASOBJS_P)
endif
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(IBLASOBJS) $(IBLASOBJS_P) : override CFLAGS += -DINTEGER -UCOMPLEX
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
@@ -35,6 +42,7 @@ $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(IBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
libs :: $(BLASOBJS) $(COMMONOBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

View File

@@ -33,6 +33,7 @@ BOBCAT
BULLDOZER
PILEDRIVER
STEAMROLLER
EXCAVATOR
c)VIA CPU:
SSE_GENERIC

9
benchmark/Make_exe.sh Executable file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
# Rename every benchmark binary in the current directory
# (*.goto, *.acml, *.mkl, *.atlas) to "<name with '.' replaced by '_'>.exe",
# e.g. "sgemm.goto" becomes "sgemm_goto.exe".
for f in *.goto *.acml *.mkl *.atlas
do
	# A glob that matches nothing is left as the literal pattern;
	# the -f test skips it.
	if [ -f "$f" ]; then
		# Quote the name and use parameter expansion instead of an
		# unquoted `echo | tr` pipeline so that names containing
		# whitespace or glob characters are renamed correctly.
		# "--" keeps a name starting with '-' from being parsed as an option.
		mv -- "$f" "${f//./_}.exe"
	fi
done

File diff suppressed because it is too large Load Diff

196
benchmark/asum.c Normal file
View File

@@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ASUM
#ifdef COMPLEX
#ifdef DOUBLE
#define ASUM BLASFUNC(dzasum)
#else
#define ASUM BLASFUNC(scasum)
#endif
#else
#ifdef DOUBLE
#define ASUM BLASFUNC(dasum)
#else
#define ASUM BLASFUNC(sasum)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for Windows builds.
 *
 * tv - receives the current time as seconds / microseconds relative to the
 *      Unix epoch; when NULL nothing is written.
 * tz - ignored.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres;

  (void)tz;                       /* time zone information is not provided */

  if (tv == NULL) return 0;

  GetSystemTimeAsFileTime(&ft);

  /* Join the two 32-bit halves of the file time into one 64-bit value. */
  tmpres   = ft.dwHighDateTime;
  tmpres <<= 32;
  tmpres  |= ft.dwLowDateTime;

  /* converting file time to unix epoch */
  tmpres /= 10;                   /* convert into microseconds */
  tmpres -= DELTA_EPOCH_IN_MICROSECS;

  tv->tv_sec  = (long)(tmpres / 1000000UL);
  tv->tv_usec = (long)(tmpres % 1000000UL);

  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate a buffer from a private System V shared-memory segment created
 * with the SHM_HUGETLB flag.
 *
 * size - requested number of bytes; the segment size passed to shmget() is
 *        (size + HUGE_PAGESIZE) masked down to a multiple of HUGE_PAGESIZE.
 *
 * Returns the attached address. Prints a message and exits with status 1
 * when either shmget() or shmat() fails.
 */
static void *huge_malloc(BLASLONG size){

  void *mapped;
  int segment_id;

  segment_id = shmget(IPC_PRIVATE,
                      (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                      SHM_HUGETLB | IPC_CREAT |0600);

  if (segment_id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment_id, NULL, SHM_RND);

  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark the segment for removal right after attaching it. */
  shmctl(segment_id, IPC_RMID, 0);

  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the ASUM routine selected by the COMPLEX / DOUBLE
 * macros.
 *
 * Command line: [from [to [step]]]
 *   Vector lengths to benchmark (defaults 1, 200, 1). "to" is raised to
 *   "from" when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - number of timed calls per length (default 1; values
 *                    below 1 are treated as 1)
 *   OPENBLAS_INCX  - increment passed to ASUM (default 1)
 *
 * For every length the mean wall-clock time of one call is converted to
 * MFlops and written to stderr.
 *
 * Returns 0 on success; exits with status 1 when step is not positive or
 * the vector cannot be allocated.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  FLOAT result;
  blasint m, i, nelem;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to = 200;
  int step = 1;

  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* With step <= 0 the size loop below would never get past "to". */
  if (step <= 0) {
    fprintf(stderr, "Step must be greater than 0\n");
    exit(1);
  }

  /* The accumulated time is divided by loops, so it must be at least 1. */
  if (loops < 1) loops = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* The vector is filled with rand(), so seed that generator; srandom()
     seeds random(), which is a separate generator on some C libraries. */
  srand((unsigned int)getpid());
#endif

  fprintf(stderr, " SIZE Flops\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;

    /* Number of FLOAT elements touched for this length and increment. */
    nelem = m * COMPSIZE * abs(inc_x);

    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {

      /* Fresh random data in [-0.5, 0.5] for every timed call. */
      for(i = 0; i < nelem; i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);

      result = ASUM (&m, x, &inc_x);

      gettimeofday( &stop, (struct timezone *)0);

      (void)result;   /* only the elapsed time is reported */

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

      timeg += time1;

    }

    timeg /= loops;

#ifdef COMPLEX
    fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
#else
    fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
#endif

  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -71,8 +71,14 @@ double fabs(double);
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
@@ -99,6 +105,7 @@ int gettimeofday(struct timeval *tv, void *tz){
#endif
static __inline double getmflops(int ratio, int m, double secs){
double mm = (double)m;

201
benchmark/copy.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef COPY
#ifdef COMPLEX
#ifdef DOUBLE
#define COPY BLASFUNC(zcopy)
#else
#define COPY BLASFUNC(ccopy)
#endif
#else
#ifdef DOUBLE
#define COPY BLASFUNC(dcopy)
#else
#define COPY BLASFUNC(scopy)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * Minimal gettimeofday() replacement for Windows builds.
 *
 * tv - receives the current time as seconds / microseconds relative to the
 *      Unix epoch; when NULL nothing is written.
 * tz - ignored.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres;

  (void)tz;                       /* time zone information is not provided */

  if (tv == NULL) return 0;

  GetSystemTimeAsFileTime(&ft);

  /* Join the two 32-bit halves of the file time into one 64-bit value. */
  tmpres   = ft.dwHighDateTime;
  tmpres <<= 32;
  tmpres  |= ft.dwLowDateTime;

  /* converting file time to unix epoch */
  tmpres /= 10;                   /* convert into microseconds */
  tmpres -= DELTA_EPOCH_IN_MICROSECS;

  tv->tv_sec  = (long)(tmpres / 1000000UL);
  tv->tv_usec = (long)(tmpres % 1000000UL);

  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate a buffer from a private System V shared-memory segment created
 * with the SHM_HUGETLB flag.
 *
 * size - requested number of bytes; the segment size passed to shmget() is
 *        (size + HUGE_PAGESIZE) masked down to a multiple of HUGE_PAGESIZE.
 *
 * Returns the attached address. Prints a message and exits with status 1
 * when either shmget() or shmat() fails.
 */
static void *huge_malloc(BLASLONG size){

  void *mapped;
  int segment_id;

  segment_id = shmget(IPC_PRIVATE,
                      (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                      SHM_HUGETLB | IPC_CREAT |0600);

  if (segment_id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment_id, NULL, SHM_RND);

  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark the segment for removal right after attaching it. */
  shmctl(segment_id, IPC_RMID, 0);

  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Benchmark driver for the COPY routine selected by the COMPLEX / DOUBLE
 * macros.
 *
 * Command line: [from [to [step]]]
 *   Vector lengths to benchmark (defaults 1, 200, 1). "to" is raised to
 *   "from" when it is smaller.
 * Environment:
 *   OPENBLAS_LOOPS - number of timed calls per length (default 1; values
 *                    below 1 are treated as 1)
 *   OPENBLAS_INCX  - increment of the source vector (default 1)
 *   OPENBLAS_INCY  - increment of the destination vector (default 1)
 *
 * For every length the mean wall-clock time of one call is converted to a
 * throughput figure and written to stderr.
 *
 * Returns 0 on success; exits with status 1 when step is not positive or a
 * vector cannot be allocated.
 */
int main(int argc, char *argv[]){

  FLOAT *x, *y;
  blasint m, i, nelem_x, nelem_y;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to = 200;
  int step = 1;

  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* With step <= 0 the size loop below would never get past "to". */
  if (step <= 0) {
    fprintf(stderr, "Step must be greater than 0\n");
    exit(1);
  }

  /* The accumulated time is divided by loops, so it must be at least 1. */
  if (loops < 1) loops = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  /* The vectors are filled with rand(), so seed that generator; srandom()
     seeds random(), which is a separate generator on some C libraries. */
  srand((unsigned int)getpid());
#endif

  fprintf(stderr, " SIZE Flops\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;

    /* Number of FLOAT elements touched in each vector for this length. */
    nelem_x = m * COMPSIZE * abs(inc_x);
    nelem_y = m * COMPSIZE * abs(inc_y);

    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {

      /* Fresh random data in [-0.5, 0.5] for every timed call. */
      for(i = 0; i < nelem_x; i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      for(i = 0; i < nelem_y; i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);

      COPY (&m, x, &inc_x, y, &inc_y );

      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

      timeg += time1;

    }

    timeg /= loops;

    /* Bytes of source data per call divided by the mean time, in millions. */
    fprintf(stderr,
            " %10.2f MBytes\n",
            COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);

  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -144,6 +144,7 @@ int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
FLOAT wkopt[4];
char job='V';
char jobr='N';
char *p;
blasint m, i, j, info,lwork;
@@ -202,9 +203,9 @@ int main(int argc, char *argv[]){
lwork = -1;
m=to;
#ifndef COMPLEX
GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
#else
GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
#endif
lwork = (blasint)wkopt[0];
@@ -226,16 +227,16 @@ int main(int argc, char *argv[]){
lwork = -1;
#ifndef COMPLEX
GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, wkopt, &lwork, &info);
#else
GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, wkopt, &lwork,rwork, &info);
#endif
lwork = (blasint)wkopt[0];
#ifndef COMPLEX
GEEV (&job, &job, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, wi, vl, &m, vr, &m, work, &lwork, &info);
#else
GEEV (&job, &job, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
#endif
gettimeofday( &stop, (struct timezone *)0);

View File

@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
FLOAT beta [] = {0.0, 0.0};
char trans='N';
blasint m, n, i, j;
int loops = 1;
@@ -168,12 +168,21 @@ int main(int argc, char *argv[]){
has_param_n=1;
}
#ifdef linux
srandom(getpid());
#endif
for(j = 0; j < m; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
fprintf(stderr, " SIZE Flops\n");
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step)
{
@@ -188,34 +197,23 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
gettimeofday( &start, (struct timezone *)0);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg /= loops;
timeg = time1/loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
}

View File

@@ -35,12 +35,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#undef GER
#ifdef COMPLEX
#ifdef DOUBLE
#define GER BLASFUNC(zgeru)
#else
#define GER BLASFUNC(cgeru)
#endif
#else
#ifdef DOUBLE
#define GER BLASFUNC(dger)
#else
#define GER BLASFUNC(sger)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)

218
benchmark/gesv.c Normal file
View File

@@ -0,0 +1,218 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double);
#undef GESV
#undef GETRS
#ifndef COMPLEX
#ifdef XDOUBLE
#define GESV BLASFUNC(qgesv)
#elif defined(DOUBLE)
#define GESV BLASFUNC(dgesv)
#else
#define GESV BLASFUNC(sgesv)
#endif
#else
#ifdef XDOUBLE
#define GESV BLASFUNC(xgesv)
#elif defined(DOUBLE)
#define GESV BLASFUNC(zgesv)
#else
#define GESV BLASFUNC(cgesv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * gettimeofday() replacement used by the Windows build of this benchmark.
 *
 * tv : receives seconds / microseconds derived from the system FILETIME
 *      after subtracting DELTA_EPOCH_IN_MICROSECS; may be NULL, in which
 *      case nothing is written.
 * tz : accepted for signature compatibility and ignored.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres;

  (void)tz;	/* time zone information is never produced */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Join the two 32-bit halves into one 64-bit tick count. */
      tmpres   = ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres  |= ft.dwLowDateTime;

      /*converting file time to unix epoch*/
      tmpres /= 10;  /*convert into microseconds*/
      tmpres -= DELTA_EPOCH_IN_MICROSECS;

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * malloc() substitute that requests a SHM_HUGETLB shared-memory segment
 * and attaches it.  Prints a message and exits the process if either
 * shmget() or shmat() fails.
 *
 * NOTE(review): the size is computed as (size + HUGE_PAGESIZE) rounded
 * down to a HUGE_PAGESIZE multiple, so a request that is already an exact
 * multiple receives one additional page.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  void *mapped;
  int segment = shmget(IPC_PRIVATE,
		       (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
		       SHM_HUGETLB | IPC_CREAT |0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark the segment for removal right after attaching it. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * GESV benchmark driver.
 *
 * Command line : [from [to [step]]]   matrix orders to sweep (default 1 200 1)
 *
 * For every order m a random m x m matrix `a` is generated, the first
 * column of `b` is set to the sum of the columns of `a` (all other columns
 * of `b` are zero), and one GESV call with m right-hand sides is timed.
 * MFlops and elapsed seconds are written to stderr.
 */
int main(int argc, char *argv[]){

  FLOAT *a, *b;
  blasint *ipiv;
  blasint m, i, j, info;

  int from =   1;
  int to   = 200;
  int step =   1;

  struct timeval start, stop;
  double time1;

  argc--;argv++;

  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}

  /* A step below 1 would never move m past `to`. */
  if (step < 1) step = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);

  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time\n");

  for(m = from; m <= to; m += step){

    fprintf(stderr, " %dx%d : ", (int)m, (int)m);

    for(j = 0; j < m; j++){
      for(i = 0; i < m * COMPSIZE; i++){
	a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
    }

    for(j = 0; j < m; j++){
      for(i = 0; i < m * COMPSIZE; i++){
	b[i + j * m * COMPSIZE] = 0.0;
      }
    }

    /* Accumulate every column of a into the first column of b. */
    for (j = 0; j < m; ++j) {
      for (i = 0; i < m * COMPSIZE; ++i) {
	b[i] += a[i + j * m * COMPSIZE];
      }
    }

    gettimeofday( &start, (struct timezone *)0);

    GESV (&m, &m, a, &m, ipiv, b, &m, &info);

    gettimeofday( &stop, (struct timezone *)0);

    /* A non-zero info means the solve did not complete; the timing would
       be meaningless, so stop instead of reporting it. */
    if (info) {
      fprintf(stderr, "GESV failed .. info = %d\n", (int)info);
      exit(1);
    }

    time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

    fprintf(stderr,
	    "%10.2f MFlops %10.6f s\n",
	    COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);

  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -52,6 +52,11 @@ C)
awk '/MFlops/ { print $3,int($9) }'|tail --lines=+2
;;
B)
# Copy Benchmark
awk '/MBytes/ { print $1,int($3) }'|tail --lines=+2
;;
*)
awk '/MFlops/ { print $1,int($3) }'|tail --lines=+2

View File

@@ -88,6 +88,10 @@ double fabs(double);
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;

202
benchmark/scal.c Normal file
View File

@@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SCAL
#ifdef COMPLEX
#ifdef DOUBLE
#define SCAL BLASFUNC(zscal)
#else
#define SCAL BLASFUNC(cscal)
#endif
#else
#ifdef DOUBLE
#define SCAL BLASFUNC(dscal)
#else
#define SCAL BLASFUNC(sscal)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/*
 * gettimeofday() replacement used by the Windows build of this benchmark.
 *
 * tv : receives seconds / microseconds derived from the system FILETIME
 *      after subtracting DELTA_EPOCH_IN_MICROSECS; may be NULL, in which
 *      case nothing is written.
 * tz : accepted for signature compatibility and ignored.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres;

  (void)tz;	/* time zone information is never produced */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Join the two 32-bit halves into one 64-bit tick count. */
      tmpres   = ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres  |= ft.dwLowDateTime;

      /*converting file time to unix epoch*/
      tmpres /= 10;  /*convert into microseconds*/
      tmpres -= DELTA_EPOCH_IN_MICROSECS;

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * malloc() substitute that requests a SHM_HUGETLB shared-memory segment
 * and attaches it.  Prints a message and exits the process if either
 * shmget() or shmat() fails.
 *
 * NOTE(review): the size is computed as (size + HUGE_PAGESIZE) rounded
 * down to a HUGE_PAGESIZE multiple, so a request that is already an exact
 * multiple receives one additional page.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  void *mapped;
  int segment = shmget(IPC_PRIVATE,
		       (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
		       SHM_HUGETLB | IPC_CREAT |0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark the segment for removal right after attaching it. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * SCAL benchmark driver.
 *
 * Command line : [from [to [step]]]   vector lengths to sweep (default 1 200 1)
 * Environment  : OPENBLAS_LOOPS  repetitions averaged per length (default 1)
 *                OPENBLAS_INCX   stride passed for x             (default 1)
 *
 * For every length m the vector x is refilled with random values, SCAL is
 * timed with gettimeofday(), and the mean rate is written to stderr
 * (6 flops per element when COMPLEX is defined, 1 otherwise).
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  FLOAT alpha[2] = { 2.0, 2.0 };
  blasint m, i;
  blasint inc_x=1,inc_y=1;	/* inc_y only appears in the banner below */
  int loops = 1;
  int l;
  char *p;

  int from =   1;
  int to   = 200;
  int step =   1;

  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;

  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);

  /* A step below 1 would never move m past `to`, and a loop count below 1
     would make the average below divide by zero. */
  if (step  < 1) step  = 1;
  if (loops < 1) loops = 1;

  /* blasint is not necessarily int, so cast before handing it to %d. */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

  /* SCAL only touches x, so no second vector is allocated or filled. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;

    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {

      /* Fresh operand for every repetition; only the SCAL call is timed. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);

      SCAL (&m, alpha, x, &inc_x);

      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

      timeg += time1;

    }

    timeg /= loops;

#ifdef COMPLEX
    fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
#else
    fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
#endif

  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_cgemm(N,l):
    """Time `l` products of two random N x N complex matrices and print the rate.

    Both operands are built from float32 real and imaginary parts.  The
    script's flop model is 8*N^3 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j
    B = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 8*N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_cgemm(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_cgemv(N,l):
    """Time `l` products of a random N x N complex matrix with a length-N vector.

    Both operands are built from float32 real and imaginary parts.  The
    script's flop model is 8*N^2 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float32') + randn(N,N).astype('float32') * 1j
    B = randn(N).astype('float32') + randn(N).astype('float32') * 1j

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 8*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_cgemv(size,LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
from scipy.linalg.blas import daxpy
def run_daxpy(N,l):
    """Time `l` calls of scipy's daxpy on two random float64 vectors of length N.

    Each call uses a=2.0 and its result replaces y for the next call.  The
    script's flop model is 2*N per call; one line with the size, MFlops
    and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    x = randn(N).astype('float64')
    y = randn(N).astype('float64')

    start = timer()
    for _ in range(l):
        y = daxpy(x,y, a=2.0 )
    timediff = timer() - start

    mflops = ( 2*N ) *l / timediff
    mflops *= 1e-6

    size = "%d" % (N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_daxpy(size,LOOPS)

56
benchmark/scripts/NUMPY/ddot.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_ddot(N,l):
    """Time `l` dot products of two random float64 vectors of length N.

    The script's flop model is 2*N per dot product; one line with the
    size, MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N).astype('float64')
    B = randn(N).astype('float64')

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 2*N ) *l / timediff
    mflops *= 1e-6

    size = "%d" % (N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_ddot(size,LOOPS)

55
benchmark/scripts/NUMPY/deig.py Executable file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_deig(N,l):
    """Time `l` eigen-decompositions of a random N x N float64 matrix.

    The script's flop model is 26.33*N^3 per call to numpy.linalg.eig; one
    line with the size, MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float64')

    start = timer()
    for _ in range(l):
        la,v = numpy.linalg.eig(A)
    timediff = timer() - start

    mflops = ( 26.33 *N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_deig(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dgemm(N,l):
    """Time `l` products of two random N x N float64 matrices and print the rate.

    The script's flop model is 2*N^3 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float64')
    B = randn(N,N).astype('float64')

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 2*N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_dgemm(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dgemv(N,l):
    """Time `l` products of a random N x N float64 matrix with a length-N vector.

    The script's flop model is 2*N^2 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float64')
    B = randn(N).astype('float64')

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 2*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_dgemv(size,LOOPS)

View File

@@ -0,0 +1,58 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
from scipy.linalg.lapack import dgesv
def run_dgesv(N,l):
    """Time `l` calls of scipy's dgesv on random N x N float64 `a` and `b`.

    The script's flop model is 2/3*N^3 + 2*N^3 per call; one line with the
    size, MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    a = randn(N,N).astype('float64')
    b = randn(N,N).astype('float64')

    start = timer()
    for _ in range(l):
        # The two trailing positional 1s are passed through unchanged.
        dgesv(a,b,1,1)
    timediff = timer() - start

    mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_dgesv(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_dsolve(N,l):
    """Time `l` calls of numpy.linalg.solve on random N x N float64 A and B.

    The script's flop model is 2/3*N^3 + 2*N^3 per call; one line with the
    size, MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float64')
    B = randn(N,N).astype('float64')

    start = timer()
    for _ in range(l):
        ref = numpy.linalg.solve(A,B)
    timediff = timer() - start

    mflops = ( 2.0/3.0 *N*N*N + 2.0*N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_dsolve(size,LOOPS)

56
benchmark/scripts/NUMPY/sdot.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sdot(N,l):
    """Time `l` dot products of two random float32 vectors of length N.

    The script's flop model is 2*N per dot product; one line with the
    size, MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N).astype('float32')
    B = randn(N).astype('float32')

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 2*N ) *l / timediff
    mflops *= 1e-6

    size = "%d" % (N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_sdot(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sgemm(N,l):
    """Time `l` products of two random N x N float32 matrices and print the rate.

    The script's flop model is 2*N^3 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float32')
    B = randn(N,N).astype('float32')

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 2*N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_sgemm(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_sgemv(N,l):
    """Time `l` products of a random N x N float32 matrix with a length-N vector.

    The script's flop model is 2*N^2 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float32')
    B = randn(N).astype('float32')

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 2*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_sgemv(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_zgemm(N,l):
    """Time `l` products of two random N x N complex matrices and print the rate.

    Both operands are built from float64 real and imaginary parts.  The
    script's flop model is 8*N^3 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j
    B = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 8*N*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_zgemm(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/python
import os
import sys
import time
import numpy
from numpy.random import randn
def run_zgemv(N,l):
    """Time `l` products of a random N x N complex matrix with a length-N vector.

    Both operands are built from float64 real and imaginary parts.  The
    script's flop model is 8*N^2 per product; one line with the size,
    MFlops and total elapsed seconds is printed.
    """
    # perf_counter is monotonic; fall back to time.time where it is missing.
    timer = getattr(time, "perf_counter", time.time)

    A = randn(N,N).astype('float64') + randn(N,N).astype('float64') * 1j
    B = randn(N).astype('float64') + randn(N).astype('float64') * 1j

    start = timer()
    for _ in range(l):
        ref = numpy.dot(A,B)
    timediff = timer() - start

    mflops = ( 8*N*N) *l / timediff
    mflops *= 1e-6

    size = "%dx%d" % (N,N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size,mflops,timediff))


def bench_sizes(first, last, step):
    """Return the sizes first, first+step, ... that do not exceed `last`."""
    return range(first, last + 1, step)


if __name__ == "__main__":
    N=128
    NMAX=2048
    NINC=128
    LOOPS=1

    # Optional positional arguments, in order: N NMAX NINC LOOPS.
    given = [int(arg) for arg in sys.argv[1:5]]
    N, NMAX, NINC, LOOPS = given + [N, NMAX, NINC, LOOPS][len(given):]

    # A non-empty OPENBLAS_LOOPS takes precedence over the command line.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    for size in bench_sizes(N, NMAX, NINC):
        run_zgemv(size,LOOPS)

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Matrix-matrix product benchmark on single-precision complex n x n operands.
# Positional arguments (all optional): from to step loops.
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto   = 2048;
nstep = 128;
loops = 1;

cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto

  A = single(rand(n,n)) + single(rand(n,n)) * 1i;
  B = single(rand(n,n)) + single(rand(n,n)) * 1i;

  # Only the repeated products sit between the two clock() readings.
  t0 = clock();
  done = 0;
  while done < loops
    C = A * B;
    done += 1;
  endwhile
  timeg = etime(clock(), t0);

  mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );

  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n += nstep;

endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Matrix-vector product benchmark: single-precision complex n x n matrix
# times an n x 1 vector.
# Positional arguments (all optional): from to step loops.
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto   = 2048;
nstep = 128;
loops = 1;

cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto

  A = single(rand(n,n)) + single(rand(n,n)) * 1i;
  B = single(rand(n,1)) + single(rand(n,1)) * 1i;

  # Only the repeated products sit between the two clock() readings.
  t0 = clock();
  done = 0;
  while done < loops
    C = A * B;
    done += 1;
  endwhile
  timeg = etime(clock(), t0);

  mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 );

  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n += nstep;

endwhile

56
benchmark/scripts/OCTAVE/deig.m Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Eigen-decomposition benchmark: eig() on a random double n x n matrix.
# Positional arguments (all optional): from to step loops.
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto   = 2048;
nstep = 128;
loops = 1;

cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto

  A = double(rand(n,n));

  # Only the repeated eig() calls sit between the two clock() readings.
  t0 = clock();
  done = 0;
  while done < loops
    [V,lambda] = eig(A);
    done += 1;
  endwhile
  timeg = etime(clock(), t0);

  mflops = ( 26.33 *n*n*n ) *loops / ( timeg * 1.0e6 );

  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg );

  n += nstep;

endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Matrix-matrix product benchmark on double n x n operands.
# Positional arguments (all optional): from to step loops.
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto   = 2048;
nstep = 128;
loops = 1;

cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto

  A = double(rand(n,n));
  B = double(rand(n,n));

  # Only the repeated products sit between the two clock() readings.
  t0 = clock();
  done = 0;
  while done < loops
    C = A * B;
    done += 1;
  endwhile
  timeg = etime(clock(), t0);

  mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );

  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n += nstep;

endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Matrix-vector product benchmark: times C = A * B for a random n-by-n double
# matrix A and n-by-1 double vector B, printing MFlops and elapsed seconds.
#
# Positional arguments: [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto = 2048;
nstep = 128;
loops = 1;

# Positional arguments, in order: nfrom, nto, nstep, loops.
cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto
  A = double(rand(n,n));
  B = double(rand(n,1));

  # Timed region: `loops` products of the same operands.
  t0 = clock();
  iter = 0;
  while iter < loops
    C = A * B;
    iter = iter + 1;
  endwhile
  timeg = etime(clock(), t0);

  # 2*n^2 flops per product.
  mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 );
  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n = n + nstep;
endwhile

View File

@@ -0,0 +1,59 @@
#!/usr/bin/octave --silent
# Linear-solve benchmark: times linsolve(A,B) for random n-by-n double
# matrices A and B, printing MFlops and elapsed seconds for each size.
#
# Positional arguments: [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto = 2048;
nstep = 128;
loops = 1;

# Positional arguments, in order: nfrom, nto, nstep, loops.
cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto
  A = double(rand(n,n));
  B = double(rand(n,n));

  # Timed region: `loops` solves with the same operands.
  t0 = clock();
  iter = 0;
  while iter < loops
    x = linsolve(A,B);
    #x = A / B;
    iter = iter + 1;
  endwhile
  timeg = etime(clock(), t0);

  # Optional residual check, left disabled:
  #r = norm(A*x - B)/norm(B)

  # Flop estimate per solve: 2/3*n^3 + 2*n^3.
  mflops = ( 2.0/3.0 *n*n*n + 2.0*n*n*n ) *loops / ( timeg * 1.0e6 );
  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg );

  n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Matrix-matrix product benchmark: times C = A * B for random n-by-n single
# matrices and prints MFlops and elapsed seconds for each size.
#
# Positional arguments: [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto = 2048;
nstep = 128;
loops = 1;

# Positional arguments, in order: nfrom, nto, nstep, loops.
cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto
  A = single(rand(n,n));
  B = single(rand(n,n));

  # Timed region: `loops` products of the same operands.
  t0 = clock();
  iter = 0;
  while iter < loops
    C = A * B;
    iter = iter + 1;
  endwhile
  timeg = etime(clock(), t0);

  # 2*n^3 flops per product.
  mflops = ( 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Matrix-vector product benchmark: times C = A * B for a random n-by-n single
# matrix A and n-by-1 single vector B, printing MFlops and elapsed seconds.
#
# Positional arguments: [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto = 2048;
nstep = 128;
loops = 1;

# Positional arguments, in order: nfrom, nto, nstep, loops.
cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto
  A = single(rand(n,n));
  B = single(rand(n,1));

  # Timed region: `loops` products of the same operands.
  t0 = clock();
  iter = 0;
  while iter < loops
    C = A * B;
    iter = iter + 1;
  endwhile
  timeg = etime(clock(), t0);

  # 2*n^2 flops per product.
  mflops = ( 2.0*n*n *loops ) / ( timeg * 1.0e6 );
  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Complex matrix-matrix product benchmark: times C = A * B for random n-by-n
# double complex matrices and prints MFlops and elapsed seconds for each size.
#
# Positional arguments: [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto = 2048;
nstep = 128;
loops = 1;

# Positional arguments, in order: nfrom, nto, nstep, loops.
cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto
  # Real and imaginary parts are drawn independently.
  A = double(rand(n,n)) + double(rand(n,n)) * 1i;
  B = double(rand(n,n)) + double(rand(n,n)) * 1i;

  # Timed region: `loops` products of the same operands.
  t0 = clock();
  iter = 0;
  while iter < loops
    C = A * B;
    iter = iter + 1;
  endwhile
  timeg = etime(clock(), t0);

  # Complex arithmetic is counted as 4x the real 2*n^3 estimate.
  mflops = ( 4.0 * 2.0*n*n*n *loops ) / ( timeg * 1.0e6 );
  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n = n + nstep;
endwhile

View File

@@ -0,0 +1,56 @@
#!/usr/bin/octave --silent
# Complex matrix-vector product benchmark: times C = A * B for a random n-by-n
# double complex matrix A and n-by-1 double complex vector B, printing MFlops
# and elapsed seconds for each size.
#
# Positional arguments: [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

nfrom = 128;
nto = 2048;
nstep = 128;
loops = 1;

# Positional arguments, in order: nfrom, nto, nstep, loops.
cli = argv();
for k = 1:nargin
  if k == 1
    nfrom = str2num(cli{k});
  elseif k == 2
    nto = str2num(cli{k});
  elseif k == 3
    nstep = str2num(cli{k});
  elseif k == 4
    loops = str2num(cli{k});
  endif
endfor

env_loops = getenv("OPENBLAS_LOOPS");
if env_loops
  loops = str2num(env_loops);
endif

printf("From %d To %d Step=%d Loops=%d\n",nfrom, nto, nstep, loops);
printf(" SIZE FLOPS TIME\n");

n = nfrom;
while n <= nto
  # Real and imaginary parts are drawn independently.
  A = double(rand(n,n)) + double(rand(n,n)) * 1i;
  B = double(rand(n,1)) + double(rand(n,1)) * 1i;

  # Timed region: `loops` products of the same operands.
  t0 = clock();
  iter = 0;
  while iter < loops
    C = A * B;
    iter = iter + 1;
  endwhile
  timeg = etime(clock(), t0);

  # Complex arithmetic is counted as 4x the real 2*n^2 estimate.
  mflops = ( 4.0 * 2.0*n*n *loops ) / ( timeg * 1.0e6 );
  label = sprintf("%dx%d : ", n,n);
  printf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg);

  n = n + nstep;
endwhile

62
benchmark/scripts/R/deig.R Executable file
View File

@@ -0,0 +1,62 @@
#!/usr/bin/Rscript
# Eigen-decomposition benchmark: times eigen() on random n-by-n matrices and
# prints MFlops and elapsed seconds for each size.
#
# Usage: deig.R [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

argv <- commandArgs(trailingOnly = TRUE)

nfrom = 128
nto = 2048
nstep = 128
loops = 1

# Positional arguments, in order: nfrom, nto, nstep, loops.
if ( length(argv) > 0 ) {
  for ( z in 1:length(argv) ) {
    if ( z == 1 ) {
      nfrom <- as.numeric(argv[z])
    } else if ( z==2 ) {
      nto <- as.numeric(argv[z])
    } else if ( z==3 ) {
      nstep <- as.numeric(argv[z])
    } else if ( z==4 ) {
      loops <- as.numeric(argv[z])
    }
  }
}

p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
  loops <- as.numeric(p)
}

cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))

n = nfrom
while ( n <= nto ) {
  A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)

  # Timed region: `loops` decompositions of the same matrix (elapsed time).
  l = 1
  start <- proc.time()[3]
  while ( l <= loops ) {
    ev <- eigen(A)
    l = l + 1
  }
  end <- proc.time()[3]
  timeg = end - start

  # 26.33*n^3 flops per decomposition: the same estimate the Octave deig.m
  # benchmark uses, so results from both scripts are directly comparable.
  mflops = (26.33 *n*n*n ) * loops / ( timeg * 1.0e6 )

  st = sprintf("%.0fx%.0f :",n , n)
  cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))

  n = n + nstep
}

63
benchmark/scripts/R/dgemm.R Executable file
View File

@@ -0,0 +1,63 @@
#!/usr/bin/Rscript
# Matrix-matrix product benchmark: times A %*% B for random n-by-n matrices
# and prints MFlops and elapsed seconds for each size.
#
# Usage: dgemm.R [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

cli <- commandArgs(trailingOnly = TRUE)

nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1

# Positional arguments, in order: nfrom, nto, nstep, loops.
for (idx in seq_along(cli)) {
  if (idx == 1) {
    nfrom <- as.numeric(cli[idx])
  } else if (idx == 2) {
    nto <- as.numeric(cli[idx])
  } else if (idx == 3) {
    nstep <- as.numeric(cli[idx])
  } else if (idx == 4) {
    loops <- as.numeric(cli[idx])
  }
}

env_loops <- Sys.getenv("OPENBLAS_LOOPS")
if (env_loops != "") {
  loops <- as.numeric(env_loops)
}

cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))

n <- nfrom
while (n <= nto) {
  A <- matrix(runif(n*n), nrow = n, ncol = n, byrow = TRUE)
  B <- matrix(runif(n*n), nrow = n, ncol = n, byrow = TRUE)

  # Timed region: `loops` products of the same operands (elapsed time).
  iter <- 1
  t_begin <- proc.time()[3]
  while (iter <= loops) {
    C <- A %*% B
    iter <- iter + 1
  }
  t_end <- proc.time()[3]
  timeg <- t_end - t_begin

  # 2*n^3 flops per product.
  mflops <- ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )

  label <- sprintf("%.0fx%.0f :",n , n)
  cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg))

  n <- n + nstep
}

63
benchmark/scripts/R/dsolve.R Executable file
View File

@@ -0,0 +1,63 @@
#!/usr/bin/Rscript
# Linear-solve benchmark: times solve(A,B) for random n-by-n matrices A and B
# and prints MFlops and elapsed seconds for each size.
#
# Usage: dsolve.R [nfrom [nto [nstep [loops]]]]
# A non-empty OPENBLAS_LOOPS environment variable replaces the loop count.

cli <- commandArgs(trailingOnly = TRUE)

nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1

# Positional arguments, in order: nfrom, nto, nstep, loops.
for (idx in seq_along(cli)) {
  if (idx == 1) {
    nfrom <- as.numeric(cli[idx])
  } else if (idx == 2) {
    nto <- as.numeric(cli[idx])
  } else if (idx == 3) {
    nstep <- as.numeric(cli[idx])
  } else if (idx == 4) {
    loops <- as.numeric(cli[idx])
  }
}

env_loops <- Sys.getenv("OPENBLAS_LOOPS")
if (env_loops != "") {
  loops <- as.numeric(env_loops)
}

cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))

n <- nfrom
while (n <= nto) {
  A <- matrix(runif(n*n), nrow = n, ncol = n, byrow = TRUE)
  B <- matrix(runif(n*n), nrow = n, ncol = n, byrow = TRUE)

  # Timed region: `loops` solves with the same operands (elapsed time).
  # The solution itself is discarded.
  iter <- 1
  t_begin <- proc.time()[3]
  while (iter <= loops) {
    solve(A,B)
    iter <- iter + 1
  }
  t_end <- proc.time()[3]
  timeg <- t_end - t_begin

  # Flop estimate per solve: 2/3*n^3 + 2*n^3.
  mflops <- (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )

  label <- sprintf("%.0fx%.0f :",n , n)
  cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", label, mflops, timeg))

  n <- n + nstep
}

201
benchmark/swap.c Normal file
View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SWAP
#ifdef COMPLEX
#ifdef DOUBLE
#define SWAP BLASFUNC(zswap)
#else
#define SWAP BLASFUNC(cswap)
#endif
#else
#ifdef DOUBLE
#define SWAP BLASFUNC(dswap)
#else
#define SWAP BLASFUNC(sswap)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate memory from a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested size is padded by HUGE_PAGESIZE and masked
 * with ~(HUGE_PAGESIZE - 1) before being handed to shmget().
 *
 * On shmget()/shmat() failure a message is printed and the process exits
 * with status 1.  IPC_RMID is issued on the segment right after attaching;
 * only the attached address is returned.
 *
 * NOTE(review): the enclosing #if ends in "&& 0", so this function is
 * currently compiled out.
 */
static void *huge_malloc(BLASLONG size){
  int segment;
  void *mapped;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * SWAP benchmark driver.
 *
 * Arguments:   [from [to [step]]]  - range of vector lengths to sweep.
 * Environment: OPENBLAS_LOOPS      - timed repetitions per length,
 *              OPENBLAS_INCX/INCY  - strides passed to SWAP.
 *
 * For every length m the two vectors are refilled with random values, one
 * SWAP call is timed, and the time is averaged over `loops` repetitions.
 * The printed figure is COMPSIZE * sizeof(FLOAT) * m / average time * 1e-6,
 * labelled MBytes.  All output goes to stderr.
 */
int main(int argc, char *argv[]){
  FLOAT *x, *y;
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* atoi()/atol() yield 0 for unparsable text.  A loop count below 1 would
     divide by zero when averaging, and a step below 1 would never finish
     the sweep, so both fall back to 1. */
  if (loops < 1) loops = 1;
  if (step < 1) step = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

  /* Buffers are sized once for the largest length and stride. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  /* Column header matches the unit printed for each row. */
  fprintf(stderr, " SIZE MBytes\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {
      /* Refill outside the timed region so only SWAP is measured. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);
      SWAP (&m, x, &inc_x, y, &inc_y );
      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;
    }

    timeg /= loops;
    fprintf(stderr,
            " %10.2f MBytes\n",
            COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -130,11 +130,21 @@ int main(int argc, char *argv[]){
char trans='N';
char diag ='U';
int l;
int loops = 1;
double timeg;
if ((p = getenv("OPENBLAS_SIDE"))) side=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
p = getenv("OPENBLAS_LOOPS");
if ( p != NULL )
loops = atoi(p);
blasint m, i, j;
int from = 1;
@@ -150,7 +160,7 @@ int main(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c\n", from, to, step,side,uplo,trans,diag);
fprintf(stderr, "From : %3d To : %3d Step = %3d Side = %c Uplo = %c Trans = %c Diag = %c Loops = %d\n", from, to, step,side,uplo,trans,diag,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
@@ -171,28 +181,35 @@ int main(int argc, char *argv[]){
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
timeg=0.0;
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
fprintf(stderr, " %6d : ", (int)m);
gettimeofday( &start, (struct timezone *)0);
for (l=0; l<loops; l++)
{
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
gettimeofday( &stop, (struct timezone *)0);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
gettimeofday( &start, (struct timezone *)0);
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
time1 = timeg/loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
}

196
benchmark/zdot-intel.c Normal file
View File

@@ -0,0 +1,196 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#define RETURN_BY_STACK 1
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate memory from a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested size is padded by HUGE_PAGESIZE and masked
 * with ~(HUGE_PAGESIZE - 1) before being handed to shmget().
 *
 * On shmget()/shmat() failure a message is printed and the process exits
 * with status 1.  IPC_RMID is issued on the segment right after attaching;
 * only the attached address is returned.
 *
 * NOTE(review): the enclosing #if ends in "&& 0", so this function is
 * currently compiled out.
 */
static void *huge_malloc(BLASLONG size){
  int segment;
  void *mapped;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Complex dot-product benchmark driver; the result is returned through the
 * first argument of DOT (RETURN_BY_STACK calling form).
 *
 * Arguments:   [from [to [step]]]  - range of vector lengths to sweep.
 * Environment: OPENBLAS_LOOPS      - timed repetitions per length,
 *              OPENBLAS_INCX/INCY  - strides passed to DOT.
 *
 * For every length m the two vectors are refilled with random values, one
 * DOT call is timed, and the time is averaged over `loops` repetitions.
 * The rate is computed as COMPSIZE * COMPSIZE * 2 * m flops per call and
 * printed in MFlops on stderr.
 */
int main(int argc, char *argv[]){
  FLOAT *x, *y;
  FLOAT _Complex result;
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* atoi()/atol() yield 0 for unparsable text.  A loop count below 1 would
     divide by zero when averaging, and a step below 1 would never finish
     the sweep, so both fall back to 1. */
  if (loops < 1) loops = 1;
  if (step < 1) step = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

  /* Buffers are sized once for the largest length and stride. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {
      /* Refill outside the timed region so only DOT is measured. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);
      DOT (&result, &m, x, &inc_x, y, &inc_y );
      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;
    }

    timeg /= loops;
    fprintf(stderr,
            " %10.2f MFlops\n",
            COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

195
benchmark/zdot.c Normal file
View File

@@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
/*
 * Allocate memory from a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested size is padded by HUGE_PAGESIZE and masked
 * with ~(HUGE_PAGESIZE - 1) before being handed to shmget().
 *
 * On shmget()/shmat() failure a message is printed and the process exits
 * with status 1.  IPC_RMID is issued on the segment right after attaching;
 * only the attached address is returned.
 *
 * NOTE(review): the enclosing #if ends in "&& 0", so this function is
 * currently compiled out.
 */
static void *huge_malloc(BLASLONG size){
  int segment;
  void *mapped;

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);
  return mapped;
}
#define malloc huge_malloc
#endif
/*
 * Complex dot-product benchmark driver; DOT returns its result by value.
 *
 * Arguments:   [from [to [step]]]  - range of vector lengths to sweep.
 * Environment: OPENBLAS_LOOPS      - timed repetitions per length,
 *              OPENBLAS_INCX/INCY  - strides passed to DOT.
 *
 * For every length m the two vectors are refilled with random values, one
 * DOT call is timed, and the time is averaged over `loops` repetitions.
 * The rate is computed as COMPSIZE * COMPSIZE * 2 * m flops per call and
 * printed in MFlops on stderr.
 */
int main(int argc, char *argv[]){
  FLOAT *x, *y;
  FLOAT _Complex result;
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  int loops = 1;
  int l;
  char *p;
  int from = 1;
  int to = 200;
  int step = 1;
  struct timeval start, stop;
  double time1,timeg;

  argc--;argv++;
  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

  /* atoi()/atol() yield 0 for unparsable text.  A loop count below 1 would
     divide by zero when averaging, and a step below 1 would never finish
     the sweep, so both fall back to 1. */
  if (loops < 1) loops = 1;
  if (step < 1) step = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

  /* Buffers are sized once for the largest length and stride. */
  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }
  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops\n");

  for(m = from; m <= to; m += step)
  {
    timeg=0;
    fprintf(stderr, " %6d : ", (int)m);

    for (l=0; l<loops; l++)
    {
      /* Refill outside the timed region so only DOT is measured. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }
      for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      gettimeofday( &start, (struct timezone *)0);
      result = DOT (&m, x, &inc_x, y, &inc_y );
      gettimeofday( &stop, (struct timezone *)0);

      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;
    }

    timeg /= loops;
    fprintf(stderr,
            " %10.2f MFlops\n",
            COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
  }

  return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -4,6 +4,8 @@
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$binary = $ENV{"BINARY"};
@@ -55,6 +57,7 @@ $os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);

View File

@@ -93,6 +93,10 @@ extern "C" {
#include <sched.h>
#endif
#ifdef OS_ANDROID
#define NO_SYSV_IPC
#endif
#ifdef OS_WINDOWS
#ifdef ATOM
#define GOTO_ATOM ATOM
@@ -106,7 +110,9 @@ extern "C" {
#endif
#else
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
#endif
#include <sys/time.h>
#include <unistd.h>
#include <math.h>
@@ -270,6 +276,11 @@ typedef int blasint;
#define SIZE 8
#define BASE_SHIFT 3
#define ZBASE_SHIFT 4
#elif defined(INTEGER) //extend for integer matrix
#define FLOAT int
#define SIZE 4
#define BASE_SHIFT 2
#define ZBASE_SHIFT 3
#else
#define FLOAT float
#define SIZE 4
@@ -499,6 +510,8 @@ void blas_set_parameter(void);
int blas_get_cpu_number(void);
void *blas_memory_alloc (int);
void blas_memory_free (void *);
void *blas_memory_alloc_nolock (int); //use malloc without blas_lock
void blas_memory_free_nolock (void *);
int get_num_procs (void);

View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
Copyright (c) 2011-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -30,50 +30,21 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_ARM
#define COMMON_ARM
#if defined(ARMV5) || defined(ARMV6)
#define MB
#define WMB
#else
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#endif
#define INLINE inline
#define RETURN_BY_COMPLEX
@@ -88,9 +59,12 @@ static void __inline blas_lock(volatile BLASULONG *address){
while (*address) {YIELDING;};
__asm__ __volatile__(
"1: \n\t"
"ldrex r2, [%1] \n\t"
"mov r2, #0 \n\t"
"strex r3, r2, [%1] \n\t"
"cmp r3, #0 \n\t"
"bne 1b \n\t"
"mov %0 , r3 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)

View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
Copyright (c) 2011-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -30,49 +30,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_ARM64
#define COMMON_ARM64
#define MB
#define WMB
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#define INLINE inline
@@ -81,26 +44,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
static void __inline blas_lock(volatile BLASULONG *address){
/*
int register ret;
int register tmp;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"ldrex r2, [%1] \n\t"
"mov r2, #0 \n\t"
"strex r3, r2, [%1] \n\t"
"mov %0 , r3 \n\t"
: "=r"(ret), "=r"(address)
"1: \n\t"
"ldaxr %2, [%1] \n\t"
"mov %2, #0 \n\t"
"stlxr %w0, %2, [%1] \n\t"
"cbnz %w0, 1b \n\t"
"mov %0 , #0 \n\t"
: "=r"(ret), "=r"(address), "=r"(tmp)
: "1"(address)
: "memory", "r2" , "r3"
: "memory", "%w0"
//, "%r2" , "%r3"
);
} while (ret);
*/
}
@@ -166,3 +133,4 @@ REALNAME:
#endif
#endif

9
common_i.h Normal file
View File

@@ -0,0 +1,9 @@
/*
 * common_i.h - kernel name mapping for the integer-precision routines.
 *
 * For builds without DYNAMIC_ARCH the generic kernel macro IAXPYU_K is
 * mapped directly onto the iaxpy_k symbol.  No mapping is provided for
 * DYNAMIC_ARCH builds, so including this header in such a build stops
 * compilation with an explicit diagnostic instead of a bare "#error".
 */
#ifndef COMMON_I_H
#define COMMON_I_H

#ifndef DYNAMIC_ARCH

#define IAXPYU_K iaxpy_k

#else

#error "common_i.h: integer-precision kernels (IAXPYU_K) are not supported when DYNAMIC_ARCH is defined"

#endif

#endif

View File

@@ -93,6 +93,7 @@ openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdo
void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
void BLASFUNC(iaxpy) (blasint *, int *, int *, blasint *, int *, blasint *);
void BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *);

View File

@@ -60,6 +60,8 @@ int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int iaxpy_k (BLASLONG, BLASLONG, BLASLONG, int,
int *, BLASLONG, int *, BLASLONG, int *, BLASLONG);
int caxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int zaxpy_k (BLASLONG, BLASLONG, BLASLONG, double, double,

View File

@@ -47,6 +47,10 @@
#include "common_z.h"
#include "common_x.h"
#ifdef INTEGER_PRECISION
#include "common_i.h"
#endif
#ifndef COMPLEX
#ifdef XDOUBLE
@@ -635,6 +639,9 @@
#define OMATCOPY_K_CT DOMATCOPY_K_CT
#define OMATCOPY_K_RT DOMATCOPY_K_RT
#define GEADD_K DGEADD_K
#elif defined(INTEGER)
#define AXPYU_K IAXPYU_K
#else
#define AMAX_K SAMAX_K

View File

@@ -65,6 +65,7 @@ extern int blas_omp_linked;
#define BLAS_XDOUBLE 0x0002U
#define BLAS_REAL 0x0000U
#define BLAS_COMPLEX 0x0004U
#define BLAS_INTEGER 0x0008U
#define BLAS_TRANSA 0x0030U /* 2bit */
#define BLAS_TRANSA_N 0x0000U

View File

@@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define MMXSTORE movd
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimizations for Barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimizations for Barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@@ -109,6 +109,7 @@
#define CORE_PILEDRIVER 23
#define CORE_HASWELL 24
#define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -203,5 +204,6 @@ typedef struct {
#define CPUTYPE_PILEDRIVER 47
#define CPUTYPE_HASWELL 48
#define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50
#endif

View File

@@ -1098,6 +1098,16 @@ int get_cpuname(void){
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 13:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
@@ -1112,11 +1122,36 @@ int get_cpuname(void){
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 7:
case 15:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
break;
case 5:
switch (model) {
case 6:
//Broadwell
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 0x7:
@@ -1163,11 +1198,20 @@ int get_cpuname(void){
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 0:
if(support_avx())
return CPUTYPE_STEAMROLLER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
switch(exmodel){
case 3:
if(support_avx())
return CPUTYPE_STEAMROLLER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 6:
if(support_avx())
return CPUTYPE_EXCAVATOR;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
}
break;
}
break;
case 5:
@@ -1297,6 +1341,7 @@ static char *cpuname[] = {
"PILEDRIVER",
"HASWELL",
"STEAMROLLER",
"EXCAVATOR",
};
static char *lowercpuname[] = {
@@ -1349,6 +1394,7 @@ static char *lowercpuname[] = {
"piledriver",
"haswell",
"steamroller",
"excavator",
};
static char *corename[] = {
@@ -1378,6 +1424,7 @@ static char *corename[] = {
"PILEDRIVER",
"HASWELL",
"STEAMROLLER",
"EXCAVATOR",
};
static char *corename_lower[] = {
@@ -1407,6 +1454,7 @@ static char *corename_lower[] = {
"piledriver",
"haswell",
"steamroller",
"excavator",
};
@@ -1525,6 +1573,16 @@ int get_coretype(void){
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 13:
//broadwell
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
@@ -1539,11 +1597,36 @@ int get_coretype(void){
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 7:
case 15:
//broadwell
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
break;
case 5:
switch (model) {
case 6:
//broadwell
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
break;
}
break;
@@ -1574,10 +1657,20 @@ int get_coretype(void){
return CORE_BARCELONA; //OS don't support AVX.
case 0:
if(support_avx())
return CORE_STEAMROLLER;
else
return CORE_BARCELONA; //OS don't support AVX.
switch(exmodel){
case 3:
if(support_avx())
return CORE_STEAMROLLER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 6:
if(support_avx())
return CORE_EXCAVATOR;
else
return CORE_BARCELONA; //OS don't support AVX.
}
break;
}

View File

@@ -44,6 +44,10 @@ COMPILER_DEC
COMPILER_GNU
#endif
#if defined(__ANDROID__)
OS_ANDROID
#endif
#if defined(__linux__)
OS_LINUX
#endif

View File

@@ -27,12 +27,18 @@ ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o
ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o
ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o
ztestl1o = c_zblas1.o
ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o
ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
@@ -115,8 +121,8 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3_3m: $(ctestl3o) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
# Double complex
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
@@ -127,8 +133,8 @@ xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3_3m: $(ztestl3o) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
include $(TOPDIR)/Makefile.tail

View File

@@ -46,235 +46,7 @@ void F77_c3chke(char * rout) {
}
if (strncmp( sf,"cblas_cgemm3m" ,13)==0) {
cblas_rout = "cblas_cgemm3" ;
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
} else if (strncmp( sf,"cblas_cgemm" ,11)==0) {
if (strncmp( sf,"cblas_cgemm" ,11)==0) {
cblas_rout = "cblas_cgemm" ;

1936
ctest/c_c3chke_3m.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -567,81 +567,3 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
/*
 * Test wrapper around cblas_cgemm3m.
 *
 * order selects how the library is called:
 *   TEST_COL_MJR  - the caller's arrays are passed straight through with
 *                   CblasColMajor.
 *   TEST_ROW_MJR  - a, b and c are copied into temporary buffers with the
 *                   two indices swapped (a[j*lda+i] lands at A[i*LD+j]),
 *                   the routine is called with CblasRowMajor, and the
 *                   result is copied back into c with the same mapping.
 *   anything else - the call is forwarded with UNDEFINED as the order.
 *
 * Each temporary uses a leading dimension of (column count + 1); the extra
 * column is never written here.  malloc() results are not checked.
 */
void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
     int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
     CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
     CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmB, *rmC;
   int row, col;
   int rowsA, colsA, rowsB, colsB;
   int strideA, strideB, strideC;
   enum CBLAS_TRANSPOSE opA, opB;

   get_transpose_type(transpa, &opA);
   get_transpose_type(transpb, &opB);

   if (*order == TEST_COL_MJR) {
      cblas_cgemm3m( CblasColMajor, opA, opB, *m, *n, *k, alpha, a, *lda,
                     b, *ldb, beta, c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_cgemm3m( UNDEFINED, opA, opB, *m, *n, *k, alpha, a, *lda,
                     b, *ldb, beta, c, *ldc );
      return;
   }

   /* A is copied as m x k when not transposed, k x m otherwise. */
   if (opA == CblasNoTrans) {
      rowsA = *m;
      colsA = *k;
   } else {
      rowsA = *k;
      colsA = *m;
   }
   strideA = colsA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(rowsA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < rowsA; row++)
      for (col = 0; col < colsA; col++)
         rmA[row * strideA + col] = a[col * (*lda) + row];

   /* B is copied as k x n when not transposed, n x k otherwise. */
   if (opB == CblasNoTrans) {
      rowsB = *k;
      colsB = *n;
   } else {
      rowsB = *n;
      colsB = *k;
   }
   strideB = colsB + 1;
   rmB = (CBLAS_TEST_COMPLEX *)malloc(rowsB * strideB * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < rowsB; row++)
      for (col = 0; col < colsB; col++)
         rmB[row * strideB + col] = b[col * (*ldb) + row];

   /* C is always m x n. */
   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_cgemm3m( CblasRowMajor, opA, opB, *m, *n, *k, alpha, rmA, strideA,
                  rmB, strideB, beta, rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmB);
   free(rmC);
}

647
ctest/c_cblas3_3m.c Normal file
View File

@@ -0,0 +1,647 @@
/*
* Written by D.P. Manley, Digital Equipment Corporation.
* Prefixed "C_" to BLAS routines and their declarations.
*
* Modified by T. H. Do, 4/15/98, SGI/CRAY Research.
*/
#include <stdlib.h>
#include "common.h"
#include "cblas_test.h"
#define TEST_COL_MJR 0
#define TEST_ROW_MJR 1
#define UNDEFINED -1
/*
 * Test wrapper around cblas_cgemm.
 *
 * order selects how the library is called:
 *   TEST_COL_MJR  - the caller's arrays are passed straight through with
 *                   CblasColMajor.
 *   TEST_ROW_MJR  - a, b and c are copied into temporary buffers with the
 *                   two indices swapped (a[j*lda+i] lands at A[i*LD+j]),
 *                   cblas_cgemm is called with CblasRowMajor, and the
 *                   result is copied back into c with the same mapping.
 *   anything else - the call is forwarded with UNDEFINED as the order.
 *
 * Each temporary uses a leading dimension of (column count + 1); the extra
 * column is never written here.  malloc() results are not checked.
 */
void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n,
     int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
     CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
     CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmB, *rmC;
   int row, col;
   int rowsA, colsA, rowsB, colsB;
   int strideA, strideB, strideC;
   enum CBLAS_TRANSPOSE opA, opB;

   get_transpose_type(transpa, &opA);
   get_transpose_type(transpb, &opB);

   if (*order == TEST_COL_MJR) {
      cblas_cgemm( CblasColMajor, opA, opB, *m, *n, *k, alpha, a, *lda,
                   b, *ldb, beta, c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_cgemm( UNDEFINED, opA, opB, *m, *n, *k, alpha, a, *lda,
                   b, *ldb, beta, c, *ldc );
      return;
   }

   /* A is copied as m x k when not transposed, k x m otherwise. */
   if (opA == CblasNoTrans) {
      rowsA = *m;
      colsA = *k;
   } else {
      rowsA = *k;
      colsA = *m;
   }
   strideA = colsA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(rowsA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < rowsA; row++)
      for (col = 0; col < colsA; col++)
         rmA[row * strideA + col] = a[col * (*lda) + row];

   /* B is copied as k x n when not transposed, n x k otherwise. */
   if (opB == CblasNoTrans) {
      rowsB = *k;
      colsB = *n;
   } else {
      rowsB = *n;
      colsB = *k;
   }
   strideB = colsB + 1;
   rmB = (CBLAS_TEST_COMPLEX *)malloc(rowsB * strideB * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < rowsB; row++)
      for (col = 0; col < colsB; col++)
         rmB[row * strideB + col] = b[col * (*ldb) + row];

   /* C is always m x n. */
   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_cgemm( CblasRowMajor, opA, opB, *m, *n, *k, alpha, rmA, strideA,
                rmB, strideB, beta, rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmB);
   free(rmC);
}
/*
 * Test wrapper around cblas_chemm.
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a, b and c into temporaries with the two indices
 * swapped (x[j*ld+i] lands at X[i*LD+j]), calls cblas_chemm with
 * CblasRowMajor and copies the result back into c.  Any other order value
 * is forwarded as UNDEFINED.
 *
 * The square operand a is copied as m x m when side is CblasLeft and as
 * n x n otherwise; b and c are m x n.  Temporaries use a leading dimension
 * of (column count + 1).  malloc() results are not checked.
 */
void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n,
        CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
	CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
        CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmB, *rmC;
   int row, col, dimA;
   int strideA, strideB, strideC;
   enum CBLAS_UPLO tri;
   enum CBLAS_SIDE which;

   get_uplo_type(uplow, &tri);
   get_side_type(rtlf, &which);

   if (*order == TEST_COL_MJR) {
      cblas_chemm( CblasColMajor, which, tri, *m, *n, alpha, a, *lda, b, *ldb,
                   beta, c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_chemm( UNDEFINED, which, tri, *m, *n, alpha, a, *lda, b, *ldb,
                   beta, c, *ldc );
      return;
   }

   /* The order of the square matrix follows the side it multiplies from. */
   dimA = (which == CblasLeft) ? *m : *n;
   strideA = dimA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(dimA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < dimA; row++)
      for (col = 0; col < dimA; col++)
         rmA[row * strideA + col] = a[col * (*lda) + row];

   strideB = *n + 1;
   rmB = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideB * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         rmB[row * strideB + col] = b[col * (*ldb) + row];

   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_chemm( CblasRowMajor, which, tri, *m, *n, alpha, rmA, strideA,
                rmB, strideB, beta, rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmB);
   free(rmC);
}
/*
 * Test wrapper around cblas_csymm.
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a, b and c into temporaries with the two indices
 * swapped (x[j*ld+i] lands at X[i*LD+j]), calls cblas_csymm with
 * CblasRowMajor and copies the result back into c.  Any other order value
 * is forwarded as UNDEFINED.
 *
 * The square operand a is copied as m x m when side is CblasLeft and as
 * n x n otherwise; b and c are m x n.  Temporaries use a leading dimension
 * of (column count + 1).  malloc() results are not checked.
 */
void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n,
          CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
	  CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
          CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmB, *rmC;
   int row, col, dimA;
   int strideA, strideB, strideC;
   enum CBLAS_UPLO tri;
   enum CBLAS_SIDE which;

   get_uplo_type(uplow, &tri);
   get_side_type(rtlf, &which);

   if (*order == TEST_COL_MJR) {
      cblas_csymm( CblasColMajor, which, tri, *m, *n, alpha, a, *lda, b, *ldb,
                   beta, c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_csymm( UNDEFINED, which, tri, *m, *n, alpha, a, *lda, b, *ldb,
                   beta, c, *ldc );
      return;
   }

   /* The order of the square matrix follows the side it multiplies from. */
   dimA = (which == CblasLeft) ? *m : *n;
   strideA = dimA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(dimA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < dimA; row++) {
      for (col = 0; col < dimA; col++) {
         rmA[row * strideA + col] = a[col * (*lda) + row];
      }
   }

   strideB = *n + 1;
   rmB = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideB * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++) {
      for (col = 0; col < *n; col++) {
         rmB[row * strideB + col] = b[col * (*ldb) + row];
      }
   }

   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++) {
      for (col = 0; col < *n; col++) {
         rmC[row * strideC + col] = c[col * (*ldc) + row];
      }
   }

   cblas_csymm( CblasRowMajor, which, tri, *m, *n, alpha, rmA, strideA,
                rmB, strideB, beta, rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *m; row++) {
      for (col = 0; col < *n; col++) {
         c[col * (*ldc) + row] = rmC[row * strideC + col];
      }
   }

   free(rmA);
   free(rmB);
   free(rmC);
}
/*
 * Test wrapper around cblas_cherk.  alpha and beta are passed to the
 * library by value (dereferenced here).
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a and c into temporaries with the two indices swapped
 * (x[j*ld+i] lands at X[i*LD+j]), calls cblas_cherk with CblasRowMajor and
 * copies the result back into c.  Any other order value is forwarded as
 * UNDEFINED.
 *
 * a is copied as n x k when trans is CblasNoTrans and k x n otherwise; c is
 * n x n.  Temporaries use a leading dimension of (column count + 1).
 * malloc() results are not checked.
 */
void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k,
     float *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
     float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmC;
   int row, col;
   int rowsA, colsA;
   int strideA, strideC;
   enum CBLAS_UPLO tri;
   enum CBLAS_TRANSPOSE op;

   get_uplo_type(uplow, &tri);
   get_transpose_type(transp, &op);

   if (*order == TEST_COL_MJR) {
      cblas_cherk(CblasColMajor, tri, op, *n, *k, *alpha, a, *lda, *beta,
                  c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_cherk(UNDEFINED, tri, op, *n, *k, *alpha, a, *lda, *beta,
                  c, *ldc );
      return;
   }

   if (op == CblasNoTrans) {
      rowsA = *n;
      colsA = *k;
   } else {
      rowsA = *k;
      colsA = *n;
   }
   strideA = colsA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(rowsA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < rowsA; row++)
      for (col = 0; col < colsA; col++)
         rmA[row * strideA + col] = a[col * (*lda) + row];

   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*n) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_cherk(CblasRowMajor, tri, op, *n, *k, *alpha, rmA, strideA, *beta,
               rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmC);
}
/*
 * Test wrapper around cblas_csyrk.
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a and c into temporaries with the two indices swapped
 * (x[j*ld+i] lands at X[i*LD+j]), calls cblas_csyrk with CblasRowMajor and
 * copies the result back into c.  Any other order value is forwarded as
 * UNDEFINED.
 *
 * a is copied as n x k when trans is CblasNoTrans and k x n otherwise; c is
 * n x n.  Temporaries use a leading dimension of (column count + 1).
 * malloc() results are not checked.
 */
void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k,
     CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
     CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmC;
   int row, col;
   int rowsA, colsA;
   int strideA, strideC;
   enum CBLAS_UPLO tri;
   enum CBLAS_TRANSPOSE op;

   get_uplo_type(uplow, &tri);
   get_transpose_type(transp, &op);

   if (*order == TEST_COL_MJR) {
      cblas_csyrk(CblasColMajor, tri, op, *n, *k, alpha, a, *lda, beta,
                  c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_csyrk(UNDEFINED, tri, op, *n, *k, alpha, a, *lda, beta,
                  c, *ldc );
      return;
   }

   if (op == CblasNoTrans) {
      rowsA = *n;
      colsA = *k;
   } else {
      rowsA = *k;
      colsA = *n;
   }
   strideA = colsA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(rowsA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < rowsA; row++)
      for (col = 0; col < colsA; col++)
         rmA[row * strideA + col] = a[col * (*lda) + row];

   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*n) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_csyrk(CblasRowMajor, tri, op, *n, *k, alpha, rmA, strideA, beta,
               rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmC);
}
/*
 * Test wrapper around cblas_cher2k.  beta is passed to the library by
 * value (dereferenced here); alpha is passed as a pointer.
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a, b and c into temporaries with the two indices
 * swapped (x[j*ld+i] lands at X[i*LD+j]), calls cblas_cher2k with
 * CblasRowMajor and copies the result back into c.  Any other order value
 * is forwarded as UNDEFINED.
 *
 * a and b share one shape: n x k when trans is CblasNoTrans, k x n
 * otherwise; c is n x n.  Temporaries use a leading dimension of
 * (column count + 1).  malloc() results are not checked.
 */
void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k,
        CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
	CBLAS_TEST_COMPLEX *b, int *ldb, float *beta,
        CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmB, *rmC;
   int row, col;
   int rowsAB, colsAB;
   int strideA, strideB, strideC;
   enum CBLAS_UPLO tri;
   enum CBLAS_TRANSPOSE op;

   get_uplo_type(uplow, &tri);
   get_transpose_type(transp, &op);

   if (*order == TEST_COL_MJR) {
      cblas_cher2k(CblasColMajor, tri, op, *n, *k, alpha, a, *lda,
                   b, *ldb, *beta, c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_cher2k(UNDEFINED, tri, op, *n, *k, alpha, a, *lda,
                   b, *ldb, *beta, c, *ldc );
      return;
   }

   if (op == CblasNoTrans) {
      rowsAB = *n;
      colsAB = *k;
   } else {
      rowsAB = *k;
      colsAB = *n;
   }
   strideA = colsAB + 1;
   strideB = colsAB + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(rowsAB * strideA * sizeof(CBLAS_TEST_COMPLEX));
   rmB = (CBLAS_TEST_COMPLEX *)malloc(rowsAB * strideB * sizeof(CBLAS_TEST_COMPLEX));
   /* Both operands are filled in the same pass. */
   for (row = 0; row < rowsAB; row++)
      for (col = 0; col < colsAB; col++) {
         rmA[row * strideA + col] = a[col * (*lda) + row];
         rmB[row * strideB + col] = b[col * (*ldb) + row];
      }

   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*n) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_cher2k(CblasRowMajor, tri, op, *n, *k, alpha, rmA, strideA,
                rmB, strideB, *beta, rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmB);
   free(rmC);
}
/*
 * Test wrapper around cblas_csyr2k.
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a, b and c into temporaries with the two indices
 * swapped (x[j*ld+i] lands at X[i*LD+j]), calls cblas_csyr2k with
 * CblasRowMajor and copies the result back into c.  Any other order value
 * is forwarded as UNDEFINED.
 *
 * a and b share one shape: n x k when trans is CblasNoTrans, k x n
 * otherwise; c is n x n.  Temporaries use a leading dimension of
 * (column count + 1).  malloc() results are not checked.
 */
void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k,
         CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
	 CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
         CBLAS_TEST_COMPLEX *c, int *ldc ) {
   CBLAS_TEST_COMPLEX *rmA, *rmB, *rmC;
   int row, col;
   int rowsAB, colsAB;
   int strideA, strideB, strideC;
   enum CBLAS_UPLO tri;
   enum CBLAS_TRANSPOSE op;

   get_uplo_type(uplow, &tri);
   get_transpose_type(transp, &op);

   if (*order == TEST_COL_MJR) {
      cblas_csyr2k(CblasColMajor, tri, op, *n, *k, alpha, a, *lda,
                   b, *ldb, beta, c, *ldc );
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_csyr2k(UNDEFINED, tri, op, *n, *k, alpha, a, *lda,
                   b, *ldb, beta, c, *ldc );
      return;
   }

   if (op == CblasNoTrans) {
      rowsAB = *n;
      colsAB = *k;
   } else {
      rowsAB = *k;
      colsAB = *n;
   }
   strideA = colsAB + 1;
   strideB = colsAB + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(rowsAB * strideA * sizeof(CBLAS_TEST_COMPLEX));
   rmB = (CBLAS_TEST_COMPLEX *)malloc(rowsAB * strideB * sizeof(CBLAS_TEST_COMPLEX));
   /* Both operands are filled in the same pass. */
   for (row = 0; row < rowsAB; row++)
      for (col = 0; col < colsAB; col++) {
         rmA[row * strideA + col] = a[col * (*lda) + row];
         rmB[row * strideB + col] = b[col * (*ldb) + row];
      }

   strideC = *n + 1;
   rmC = (CBLAS_TEST_COMPLEX *)malloc((*n) * strideC * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         rmC[row * strideC + col] = c[col * (*ldc) + row];

   cblas_csyr2k(CblasRowMajor, tri, op, *n, *k, alpha, rmA, strideA,
                rmB, strideB, beta, rmC, strideC );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *n; row++)
      for (col = 0; col < *n; col++)
         c[col * (*ldc) + row] = rmC[row * strideC + col];

   free(rmA);
   free(rmB);
   free(rmC);
}
/*
 * Test wrapper around cblas_ctrmm.  The result is written into b.
 *
 * TEST_COL_MJR forwards the caller's arrays with CblasColMajor.
 * TEST_ROW_MJR copies a and b into temporaries with the two indices swapped
 * (x[j*ld+i] lands at X[i*LD+j]), calls cblas_ctrmm with CblasRowMajor and
 * copies the updated B back into b.  Any other order value is forwarded as
 * UNDEFINED.
 *
 * The square operand a is copied as m x m when side is CblasLeft and as
 * n x n otherwise; b is m x n.  Temporaries use a leading dimension of
 * (column count + 1).  malloc() results are not checked.
 */
void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
       int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a,
       int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) {
   CBLAS_TEST_COMPLEX *rmA, *rmB;
   int row, col, dimA;
   int strideA, strideB;
   enum CBLAS_SIDE which;
   enum CBLAS_DIAG unit;
   enum CBLAS_UPLO tri;
   enum CBLAS_TRANSPOSE op;

   get_uplo_type(uplow, &tri);
   get_transpose_type(transp, &op);
   get_diag_type(diagn, &unit);
   get_side_type(rtlf, &which);

   if (*order == TEST_COL_MJR) {
      cblas_ctrmm(CblasColMajor, which, tri, op, unit, *m, *n, alpha,
                  a, *lda, b, *ldb);
      return;
   }
   if (*order != TEST_ROW_MJR) {
      cblas_ctrmm(UNDEFINED, which, tri, op, unit, *m, *n, alpha,
                  a, *lda, b, *ldb);
      return;
   }

   /* The order of the triangular matrix follows the side it multiplies from. */
   dimA = (which == CblasLeft) ? *m : *n;
   strideA = dimA + 1;
   rmA = (CBLAS_TEST_COMPLEX *)malloc(dimA * strideA * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < dimA; row++)
      for (col = 0; col < dimA; col++)
         rmA[row * strideA + col] = a[col * (*lda) + row];

   strideB = *n + 1;
   rmB = (CBLAS_TEST_COMPLEX *)malloc((*m) * strideB * sizeof(CBLAS_TEST_COMPLEX));
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         rmB[row * strideB + col] = b[col * (*ldb) + row];

   cblas_ctrmm(CblasRowMajor, which, tri, op, unit, *m, *n, alpha,
               rmA, strideA, rmB, strideB );

   /* Hand the result back in the caller's layout. */
   for (row = 0; row < *m; row++)
      for (col = 0; col < *n; col++)
         b[col * (*ldb) + row] = rmB[row * strideB + col];

   free(rmA);
   free(rmB);
}
/*
 * F77_ctrsm: shim through which the test driver calls cblas_ctrsm.
 *
 * order == TEST_ROW_MJR : a and b are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_ctrsm is called with
 *                         CblasRowMajor and the result is copied back to b.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * rtlf, uplow, transp and diagn are decoded by the get_*_type helpers.
 */
void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
       int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a,
       int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) {
   enum CBLAS_SIDE side;
   enum CBLAS_DIAG diag;
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_COMPLEX *triRM, *rhsRM;   /* transposed copies of a and b */
   int r, q, triDim, triLd, rhsLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);
   get_diag_type(diagn,&diag);
   get_side_type(rtlf,&side);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
                     a, *lda, b, *ldb);
      else
         cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
                     a, *lda, b, *ldb);
      return;
   }

   /* An m x m block of a is copied for CblasLeft, n x n otherwise. */
   triDim = (side == CblasLeft) ? *m : *n;
   triLd  = triDim + 1;
   triRM  = (CBLAS_TEST_COMPLEX *)malloc(triDim*triLd*sizeof(CBLAS_TEST_COMPLEX));
   for (r = 0; r < triDim; r++) {
      CBLAS_TEST_COMPLEX *dst = &triRM[r*triLd];
      for (q = 0; q < triDim; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   rhsLd = *n + 1;
   rhsRM = (CBLAS_TEST_COMPLEX *)malloc((*m)*rhsLd*sizeof(CBLAS_TEST_COMPLEX));
   for (r = 0; r < *m; r++) {
      CBLAS_TEST_COMPLEX *dst = &rhsRM[r*rhsLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
               triRM, triLd, rhsRM, rhsLd);

   /* Transpose the result back into the caller's b. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_COMPLEX *out = &b[q*(*ldb)];
      for (r = 0; r < *m; r++) {
         out[r].real = rhsRM[r*rhsLd+q].real;
         out[r].imag = rhsRM[r*rhsLd+q].imag;
      }
   }

   free(triRM);
   free(rhsRM);
}
/*
 * F77_cgemm3m: shim through which the test driver calls cblas_cgemm3m.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_cgemm3m is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * transpa and transpb are decoded by get_transpose_type.
 */
void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
     int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
     CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
     CBLAS_TEST_COMPLEX *c, int *ldc ) {
   enum CBLAS_TRANSPOSE transa, transb;
   CBLAS_TEST_COMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, aRows, aCols, bRows, bCols, aLd, bLd, cLd;

   get_transpose_type(transpa, &transa);
   get_transpose_type(transpb, &transb);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha,
                        a, *lda, b, *ldb, beta, c, *ldc );
      else
         cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha,
                        a, *lda, b, *ldb, beta, c, *ldc );
      return;
   }

   /* The copy of a is m x k for CblasNoTrans, k x m otherwise. */
   if (transa == CblasNoTrans) { aRows = *m; aCols = *k; }
   else                        { aRows = *k; aCols = *m; }
   aLd = aCols + 1;
   aRM = (CBLAS_TEST_COMPLEX *)malloc(aRows*aLd*sizeof(CBLAS_TEST_COMPLEX));
   for (r = 0; r < aRows; r++) {
      CBLAS_TEST_COMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aCols; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   /* The copy of b is k x n for CblasNoTrans, n x k otherwise. */
   if (transb == CblasNoTrans) { bRows = *k; bCols = *n; }
   else                        { bRows = *n; bCols = *k; }
   bLd = bCols + 1;
   bRM = (CBLAS_TEST_COMPLEX *)malloc(bRows*bLd*sizeof(CBLAS_TEST_COMPLEX));
   for (r = 0; r < bRows; r++) {
      CBLAS_TEST_COMPLEX *dst = &bRM[r*bLd];
      for (q = 0; q < bCols; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_COMPLEX *)malloc((*m)*cLd*sizeof(CBLAS_TEST_COMPLEX));
   for (q = 0; q < *n; q++)
      for (r = 0; r < *m; r++) {
         cRM[r*cLd+q].real = c[q*(*ldc)+r].real;
         cRM[r*cLd+q].imag = c[q*(*ldc)+r].imag;
      }

   cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha,
                  aRM, aLd, bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_COMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *m; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}

View File

@@ -49,237 +49,7 @@ void F77_z3chke(char * rout) {
if (strncmp( sf,"cblas_zgemm3m" ,13)==0) {
cblas_rout = "cblas_zgemm3" ;
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 1;
cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 2; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 3; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = FALSE;
cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 4; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 5; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 6; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 2, BETA, C, 2 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 9; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 2, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 11; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
ALPHA, A, 1, B, 2, BETA, C, 1 );
chkxer();
cblas_info = 14; RowMajorStrg = TRUE;
cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
ALPHA, A, 1, B, 1, BETA, C, 1 );
chkxer();
} else if (strncmp( sf,"cblas_zgemm" ,11)==0) {
if (strncmp( sf,"cblas_zgemm" ,11)==0) {
cblas_rout = "cblas_zgemm" ;
cblas_info = 1;

1940
ctest/c_z3chke_3m.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -564,80 +564,3 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
}
/*
 * F77_zgemm3m: shim through which the test driver calls cblas_zgemm3m.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zgemm3m is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * transpa and transpb are decoded by get_transpose_type.
 */
void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
     int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
     CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
     CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_TRANSPOSE transa, transb;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, aRows, aCols, bRows, bCols, aLd, bLd, cLd;

   get_transpose_type(transpa, &transa);
   get_transpose_type(transpb, &transb);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha,
                        a, *lda, b, *ldb, beta, c, *ldc );
      else
         cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha,
                        a, *lda, b, *ldb, beta, c, *ldc );
      return;
   }

   /* The copy of a is m x k for CblasNoTrans, k x m otherwise. */
   if (transa == CblasNoTrans) { aRows = *m; aCols = *k; }
   else                        { aRows = *k; aCols = *m; }
   aLd = aCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(aRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aCols; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   /* The copy of b is k x n for CblasNoTrans, n x k otherwise. */
   if (transb == CblasNoTrans) { bRows = *k; bCols = *n; }
   else                        { bRows = *n; bCols = *k; }
   bLd = bCols + 1;
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc(bRows*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < bRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &bRM[r*bLd];
      for (q = 0; q < bCols; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (q = 0; q < *n; q++)
      for (r = 0; r < *m; r++) {
         cRM[r*cLd+q].real = c[q*(*ldc)+r].real;
         cRM[r*cLd+q].imag = c[q*(*ldc)+r].imag;
      }

   cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha,
                  aRM, aLd, bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *m; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}

643
ctest/c_zblas3_3m.c Normal file
View File

@@ -0,0 +1,643 @@
/*
* Written by D.P. Manley, Digital Equipment Corporation.
* Prefixed "C_" to BLAS routines and their declarations.
*
* Modified by T. H. Do, 4/15/98, SGI/CRAY Research.
*/
#include <stdlib.h>
#include "common.h"
#include "cblas_test.h"
#define TEST_COL_MJR 0
#define TEST_ROW_MJR 1
#define UNDEFINED -1
/*
 * F77_zgemm: shim through which the test driver calls cblas_zgemm.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zgemm is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * transpa and transpb are decoded by get_transpose_type.
 */
void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
     int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
     CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
     CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_TRANSPOSE transa, transb;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, aRows, aCols, bRows, bCols, aLd, bLd, cLd;

   get_transpose_type(transpa, &transa);
   get_transpose_type(transpb, &transb);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha,
                      a, *lda, b, *ldb, beta, c, *ldc );
      else
         cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha,
                      a, *lda, b, *ldb, beta, c, *ldc );
      return;
   }

   /* The copy of a is m x k for CblasNoTrans, k x m otherwise. */
   if (transa == CblasNoTrans) { aRows = *m; aCols = *k; }
   else                        { aRows = *k; aCols = *m; }
   aLd = aCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(aRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aCols; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   /* The copy of b is k x n for CblasNoTrans, n x k otherwise. */
   if (transb == CblasNoTrans) { bRows = *k; bCols = *n; }
   else                        { bRows = *n; bCols = *k; }
   bLd = bCols + 1;
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc(bRows*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < bRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &bRM[r*bLd];
      for (q = 0; q < bCols; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (q = 0; q < *n; q++)
      for (r = 0; r < *m; r++) {
         cRM[r*cLd+q].real = c[q*(*ldc)+r].real;
         cRM[r*cLd+q].imag = c[q*(*ldc)+r].imag;
      }

   cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha,
                aRM, aLd, bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *m; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}
/*
 * F77_zhemm: shim through which the test driver calls cblas_zhemm.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zhemm is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * rtlf and uplow are decoded by get_side_type / get_uplo_type.
 */
void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
        CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
        CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
        CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_UPLO uplo;
   enum CBLAS_SIDE side;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, aDim, aLd, bLd, cLd;

   get_uplo_type(uplow,&uplo);
   get_side_type(rtlf,&side);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda,
                      b, *ldb, beta, c, *ldc );
      else
         cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda,
                      b, *ldb, beta, c, *ldc );
      return;
   }

   /* An m x m block of a is copied for CblasLeft, n x n otherwise. */
   aDim = (side == CblasLeft) ? *m : *n;
   aLd  = aDim + 1;
   aRM  = (CBLAS_TEST_ZOMPLEX *)malloc(aDim*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aDim; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aDim; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   bLd = *n + 1;
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *m; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &bRM[r*bLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (q = 0; q < *n; q++)
      for (r = 0; r < *m; r++) {
         cRM[r*cLd+q].real = c[q*(*ldc)+r].real;
         cRM[r*cLd+q].imag = c[q*(*ldc)+r].imag;
      }

   cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, aRM, aLd,
                bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *m; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}
/*
 * F77_zsymm: shim through which the test driver calls cblas_zsymm.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zsymm is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * rtlf and uplow are decoded by get_side_type / get_uplo_type.
 * Elements are moved with whole-struct assignment.
 */
void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
        CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
        CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
        CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_UPLO uplo;
   enum CBLAS_SIDE side;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, aDim, aLd, bLd, cLd;

   get_uplo_type(uplow,&uplo);
   get_side_type(rtlf,&side);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda,
                      b, *ldb, beta, c, *ldc );
      else
         cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda,
                      b, *ldb, beta, c, *ldc );
      return;
   }

   /* An m x m block of a is copied for CblasLeft, n x n otherwise. */
   aDim = (side == CblasLeft) ? *m : *n;
   aLd  = aDim + 1;
   aRM  = (CBLAS_TEST_ZOMPLEX *)malloc(aDim*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aDim; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aDim; q++)
         dst[q] = a[q*(*lda)+r];
   }

   bLd = *n + 1;
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *m; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &bRM[r*bLd];
      for (q = 0; q < *n; q++)
         dst[q] = b[q*(*ldb)+r];
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (q = 0; q < *n; q++)
      for (r = 0; r < *m; r++)
         cRM[r*cLd+q] = c[q*(*ldc)+r];

   cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, aRM, aLd,
                bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *m; r++)
         out[r] = cRM[r*cLd+q];
   }

   free(aRM);
   free(bRM);
   free(cRM);
}
/*
 * F77_zherk: shim through which the test driver calls cblas_zherk.
 *
 * order == TEST_ROW_MJR : a and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zherk is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * alpha and beta are real scalars and are passed to cblas_zherk by value.
 * uplow and transp are decoded by get_uplo_type / get_transpose_type.
 */
void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
        double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
        double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_ZOMPLEX *aRM, *cRM;   /* transposed copies of a and c */
   int r, q, aRows, aCols, aLd, cLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda,
                     *beta, c, *ldc );
      else
         cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda,
                     *beta, c, *ldc );
      return;
   }

   /* The copy of a is n x k for CblasNoTrans, k x n otherwise. */
   if (trans == CblasNoTrans) { aRows = *n; aCols = *k; }
   else                       { aRows = *k; aCols = *n; }
   aLd = aCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(aRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aCols; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *n; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &cRM[r*cLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = c[q*(*ldc)+r].real;
         dst[q].imag = c[q*(*ldc)+r].imag;
      }
   }

   cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, aRM, aLd, *beta,
               cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *n; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(cRM);
}
/*
 * F77_zsyrk: shim through which the test driver calls cblas_zsyrk.
 *
 * order == TEST_ROW_MJR : a and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zsyrk is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * uplow and transp are decoded by get_uplo_type / get_transpose_type.
 */
void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
        CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
        CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_ZOMPLEX *aRM, *cRM;   /* transposed copies of a and c */
   int r, q, aRows, aCols, aLd, cLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
                     beta, c, *ldc );
      else
         cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
                     beta, c, *ldc );
      return;
   }

   /* The copy of a is n x k for CblasNoTrans, k x n otherwise. */
   if (trans == CblasNoTrans) { aRows = *n; aCols = *k; }
   else                       { aRows = *k; aCols = *n; }
   aLd = aCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(aRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aCols; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *n; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &cRM[r*cLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = c[q*(*ldc)+r].real;
         dst[q].imag = c[q*(*ldc)+r].imag;
      }
   }

   cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, aRM, aLd, beta,
               cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *n; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(cRM);
}
/*
 * F77_zher2k: shim through which the test driver calls cblas_zher2k.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zher2k is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * beta is a real scalar and is passed to cblas_zher2k by value.
 * uplow and transp are decoded by get_uplo_type / get_transpose_type.
 */
void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
        CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
        CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta,
        CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, abRows, abCols, aLd, bLd, cLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
                      b, *ldb, *beta, c, *ldc );
      else
         cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
                      b, *ldb, *beta, c, *ldc );
      return;
   }

   /* a and b share a shape: n x k for CblasNoTrans, k x n otherwise. */
   if (trans == CblasNoTrans) { abRows = *n; abCols = *k; }
   else                       { abRows = *k; abCols = *n; }
   aLd = abCols + 1;
   bLd = abCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(abRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc(abRows*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < abRows; r++) {
      CBLAS_TEST_ZOMPLEX *dstA = &aRM[r*aLd];
      CBLAS_TEST_ZOMPLEX *dstB = &bRM[r*bLd];
      for (q = 0; q < abCols; q++) {
         dstA[q].real = a[q*(*lda)+r].real;
         dstA[q].imag = a[q*(*lda)+r].imag;
         dstB[q].real = b[q*(*ldb)+r].real;
         dstB[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *n; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &cRM[r*cLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = c[q*(*ldc)+r].real;
         dst[q].imag = c[q*(*ldc)+r].imag;
      }
   }

   cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, aRM, aLd,
                bRM, bLd, *beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *n; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}
/*
 * F77_zsyr2k: shim through which the test driver calls cblas_zsyr2k.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zsyr2k is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * uplow and transp are decoded by get_uplo_type / get_transpose_type.
 */
void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
        CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
        CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
        CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, abRows, abCols, aLd, bLd, cLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda,
                      b, *ldb, beta, c, *ldc );
      else
         cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda,
                      b, *ldb, beta, c, *ldc );
      return;
   }

   /* a and b share a shape: n x k for CblasNoTrans, k x n otherwise. */
   if (trans == CblasNoTrans) { abRows = *n; abCols = *k; }
   else                       { abRows = *k; abCols = *n; }
   aLd = abCols + 1;
   bLd = abCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(abRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc(abRows*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < abRows; r++) {
      CBLAS_TEST_ZOMPLEX *dstA = &aRM[r*aLd];
      CBLAS_TEST_ZOMPLEX *dstB = &bRM[r*bLd];
      for (q = 0; q < abCols; q++) {
         dstA[q].real = a[q*(*lda)+r].real;
         dstA[q].imag = a[q*(*lda)+r].imag;
         dstB[q].real = b[q*(*ldb)+r].real;
         dstB[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *n; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &cRM[r*cLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = c[q*(*ldc)+r].real;
         dst[q].imag = c[q*(*ldc)+r].imag;
      }
   }

   cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, aRM, aLd,
                bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *n; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}
/*
 * F77_ztrmm: shim through which the test driver calls cblas_ztrmm.
 *
 * order == TEST_ROW_MJR : a and b are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_ztrmm is called with
 *                         CblasRowMajor and the result is copied back to b.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * rtlf, uplow, transp and diagn are decoded by the get_*_type helpers.
 */
void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
       int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a,
       int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) {
   enum CBLAS_SIDE side;
   enum CBLAS_DIAG diag;
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_ZOMPLEX *triRM, *rhsRM;   /* transposed copies of a and b */
   int r, q, triDim, triLd, rhsLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);
   get_diag_type(diagn,&diag);
   get_side_type(rtlf,&side);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
                     a, *lda, b, *ldb);
      else
         cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
                     a, *lda, b, *ldb);
      return;
   }

   /* An m x m block of a is copied for CblasLeft, n x n otherwise. */
   triDim = (side == CblasLeft) ? *m : *n;
   triLd  = triDim + 1;
   triRM  = (CBLAS_TEST_ZOMPLEX *)malloc(triDim*triLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < triDim; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &triRM[r*triLd];
      for (q = 0; q < triDim; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   rhsLd = *n + 1;
   rhsRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*rhsLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *m; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &rhsRM[r*rhsLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
               triRM, triLd, rhsRM, rhsLd);

   /* Transpose the result back into the caller's b. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &b[q*(*ldb)];
      for (r = 0; r < *m; r++) {
         out[r].real = rhsRM[r*rhsLd+q].real;
         out[r].imag = rhsRM[r*rhsLd+q].imag;
      }
   }

   free(triRM);
   free(rhsRM);
}
/*
 * F77_ztrsm: shim through which the test driver calls cblas_ztrsm.
 *
 * order == TEST_ROW_MJR : a and b are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_ztrsm is called with
 *                         CblasRowMajor and the result is copied back to b.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * rtlf, uplow, transp and diagn are decoded by the get_*_type helpers.
 */
void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
       int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a,
       int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) {
   enum CBLAS_SIDE side;
   enum CBLAS_DIAG diag;
   enum CBLAS_UPLO uplo;
   enum CBLAS_TRANSPOSE trans;
   CBLAS_TEST_ZOMPLEX *triRM, *rhsRM;   /* transposed copies of a and b */
   int r, q, triDim, triLd, rhsLd;

   get_uplo_type(uplow,&uplo);
   get_transpose_type(transp,&trans);
   get_diag_type(diagn,&diag);
   get_side_type(rtlf,&side);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha,
                     a, *lda, b, *ldb);
      else
         cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
                     a, *lda, b, *ldb);
      return;
   }

   /* An m x m block of a is copied for CblasLeft, n x n otherwise. */
   triDim = (side == CblasLeft) ? *m : *n;
   triLd  = triDim + 1;
   triRM  = (CBLAS_TEST_ZOMPLEX *)malloc(triDim*triLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < triDim; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &triRM[r*triLd];
      for (q = 0; q < triDim; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   rhsLd = *n + 1;
   rhsRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*rhsLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < *m; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &rhsRM[r*rhsLd];
      for (q = 0; q < *n; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha,
               triRM, triLd, rhsRM, rhsLd);

   /* Transpose the result back into the caller's b. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &b[q*(*ldb)];
      for (r = 0; r < *m; r++) {
         out[r].real = rhsRM[r*rhsLd+q].real;
         out[r].imag = rhsRM[r*rhsLd+q].imag;
      }
   }

   free(triRM);
   free(rhsRM);
}
/*
 * F77_zgemm3m: shim through which the test driver calls cblas_zgemm3m.
 *
 * order == TEST_ROW_MJR : a, b and c are copied, transposed, into freshly
 *                         allocated buffers whose leading dimension is the
 *                         column count plus one; cblas_zgemm3m is called with
 *                         CblasRowMajor and the result is copied back to c.
 * order == TEST_COL_MJR : the caller's arrays are passed with CblasColMajor.
 * any other value       : the caller's arrays are passed with UNDEFINED as
 *                         the order argument.
 *
 * transpa and transpb are decoded by get_transpose_type.
 */
void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
     int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
     CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
     CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
   enum CBLAS_TRANSPOSE transa, transb;
   CBLAS_TEST_ZOMPLEX *aRM, *bRM, *cRM;   /* transposed copies of a, b, c */
   int r, q, aRows, aCols, bRows, bCols, aLd, bLd, cLd;

   get_transpose_type(transpa, &transa);
   get_transpose_type(transpb, &transb);

   /* Anything but row-major goes straight through on the caller's arrays. */
   if (*order != TEST_ROW_MJR) {
      if (*order == TEST_COL_MJR)
         cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha,
                        a, *lda, b, *ldb, beta, c, *ldc );
      else
         cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha,
                        a, *lda, b, *ldb, beta, c, *ldc );
      return;
   }

   /* The copy of a is m x k for CblasNoTrans, k x m otherwise. */
   if (transa == CblasNoTrans) { aRows = *m; aCols = *k; }
   else                        { aRows = *k; aCols = *m; }
   aLd = aCols + 1;
   aRM = (CBLAS_TEST_ZOMPLEX *)malloc(aRows*aLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < aRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &aRM[r*aLd];
      for (q = 0; q < aCols; q++) {
         dst[q].real = a[q*(*lda)+r].real;
         dst[q].imag = a[q*(*lda)+r].imag;
      }
   }

   /* The copy of b is k x n for CblasNoTrans, n x k otherwise. */
   if (transb == CblasNoTrans) { bRows = *k; bCols = *n; }
   else                        { bRows = *n; bCols = *k; }
   bLd = bCols + 1;
   bRM = (CBLAS_TEST_ZOMPLEX *)malloc(bRows*bLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (r = 0; r < bRows; r++) {
      CBLAS_TEST_ZOMPLEX *dst = &bRM[r*bLd];
      for (q = 0; q < bCols; q++) {
         dst[q].real = b[q*(*ldb)+r].real;
         dst[q].imag = b[q*(*ldb)+r].imag;
      }
   }

   cLd = *n + 1;
   cRM = (CBLAS_TEST_ZOMPLEX *)malloc((*m)*cLd*sizeof(CBLAS_TEST_ZOMPLEX));
   for (q = 0; q < *n; q++)
      for (r = 0; r < *m; r++) {
         cRM[r*cLd+q].real = c[q*(*ldc)+r].real;
         cRM[r*cLd+q].imag = c[q*(*ldc)+r].imag;
      }

   cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha,
                  aRM, aLd, bRM, bLd, beta, cRM, cLd );

   /* Transpose the result back into the caller's c. */
   for (q = 0; q < *n; q++) {
      CBLAS_TEST_ZOMPLEX *out = &c[q*(*ldc)];
      for (r = 0; r < *m; r++) {
         out[r].real = cRM[r*cLd+q].real;
         out[r].imag = cRM[r*cLd+q].imag;
      }
   }

   free(aRM);
   free(bRM);
   free(cRM);
}

View File

@@ -62,6 +62,11 @@
#endif
#endif
#ifndef TRANSA
#define Y_DUMMY_NUM 1024
static FLOAT y_dummy[Y_DUMMY_NUM];
#endif
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
FLOAT *a, *x, *y;
@@ -99,10 +104,15 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a += n_from * lda * COMPSIZE;
#ifdef TRANSA
y += n_from * incy * COMPSIZE;
#else
//for split matrix row (n) direction and vector x of gemv_n
x += n_from * incx * COMPSIZE;
//store partial result for every thread
y += (m_to - m_from) * 1 * COMPSIZE * pos;
#endif
}
// fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d\n", m_from, m_to, n_from, n_to);
//fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d POS=%d\n", m_from, m_to, n_from, n_to, pos);
GEMV(m_to - m_from, n_to - n_from, 0,
*((FLOAT *)args -> alpha + 0),
@@ -126,6 +136,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
BLASLONG width, i, num_cpu;
#ifndef TRANSA
int split_x=0;
#endif
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
@@ -198,6 +212,58 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
i -= width;
}
#ifndef TRANSA
//try to split matrix on row direction and x.
//Then, reduction.
if (num_cpu < nthreads) {
//too small to split or bigger than the y_dummy buffer.
double MN = (double) m * (double) n;
if ( MN <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD))
|| m*COMPSIZE*nthreads > Y_DUMMY_NUM)
goto Outer;
num_cpu = 0;
range[0] = 0;
memset(y_dummy, 0, sizeof(FLOAT) * m * COMPSIZE * nthreads);
args.ldc = 1;
args.c = (void *)y_dummy;
//split on row (n) and x
i=n;
split_x=1;
while (i > 0){
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
if (width < 4) width = 4;
if (i < width) width = i;
range[num_cpu + 1] = range[num_cpu] + width;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = gemv_kernel;
queue[num_cpu].args = &args;
queue[num_cpu].position = num_cpu;
queue[num_cpu].range_m = NULL;
queue[num_cpu].range_n = &range[num_cpu];
queue[num_cpu].sa = NULL;
queue[num_cpu].sb = NULL;
queue[num_cpu].next = &queue[num_cpu + 1];
num_cpu ++;
i -= width;
}
}
Outer:
#endif
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer;
@@ -206,5 +272,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
exec_blas(num_cpu, queue);
}
#ifndef TRANSA
if(split_x==1){
//reduction
for(i=0; i<num_cpu; i++){
int j;
for(j=0; j<m; j++){
y[j*incy*COMPSIZE] +=y_dummy[i*m*COMPSIZE + j*COMPSIZE];
#ifdef COMPLEX
y[j*incy*COMPSIZE+1] +=y_dummy[i*m*COMPSIZE + j*COMPSIZE+1];
#endif
}
}
}
#endif
return 0;
}

View File

@@ -189,6 +189,20 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else
#endif
#ifdef INTEGER_PRECISION
if (mode & BLAS_INTEGER){
/* REAL / Integer */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, int,
int *, BLASLONG, int *, BLASLONG,
int *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((int *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else
#endif
if (mode & BLAS_DOUBLE){
/* REAL / Double */

View File

@@ -67,6 +67,7 @@ extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#else
@@ -79,6 +80,7 @@ extern gotoblas_t gotoblas_HASWELL;
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#endif
@@ -221,6 +223,15 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 13) {
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 4:
//Intel Haswell
@@ -232,6 +243,26 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 7 || model == 15) {
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 5:
//Intel Broadwell
if (model == 6) {
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:
@@ -278,12 +309,22 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0){
//AMD STEAMROLLER
if(support_avx())
return &gotoblas_STEAMROLLER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
if (exmodel == 3) {
//AMD STEAMROLLER
if(support_avx())
return &gotoblas_STEAMROLLER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if (exmodel == 6) {
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}
@@ -328,6 +369,7 @@ static char *corename[] = {
"Piledriver",
"Haswell",
"Steamroller",
"Excavator",
};
char *gotoblas_corename(void) {
@@ -353,6 +395,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
return corename[0];
}
@@ -383,7 +426,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found)
{
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);
case 19: return (&gotoblas_PILEDRIVER);

View File

@@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef OS_WINDOWS
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
#endif
#include <sys/ipc.h>
#endif
@@ -169,6 +171,14 @@ int get_num_procs(void) {
#endif
#endif
#ifdef OS_ANDROID
/*
 * Return the number of processors currently online, as reported by
 * sysconf(_SC_NPROCESSORS_ONLN).  The value is queried once and cached in a
 * function-local static.
 *
 * sysconf() returns -1 on error; the original code cached and returned that
 * value, which callers then used as an upper bound for the thread count.
 * A non-positive result is now mapped to 1 so the function always returns
 * at least one processor.
 */
int get_num_procs(void) {
  static int nums = 0;
  if (nums <= 0) {
    long online = sysconf(_SC_NPROCESSORS_ONLN);
    nums = (online > 0) ? (int)online : 1;
  }
  return nums;
}
#endif
#ifdef OS_WINDOWS
int get_num_procs(void) {
@@ -266,7 +276,7 @@ void openblas_fork_handler()
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !defined(OS_WINDOWS) && defined(SMP_SERVER)
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
@@ -276,7 +286,7 @@ void openblas_fork_handler()
int blas_get_cpu_number(void){
env_var_t p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@@ -284,7 +294,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
@@ -308,7 +318,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -709,8 +719,6 @@ static void *alloc_shm(void *address){
return map_address;
}
#endif
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
static void alloc_hugetlb_free(struct release_t *release){
@@ -817,6 +825,8 @@ static void *alloc_hugetlb(void *address){
}
#endif
#endif
#ifdef ALLOC_HUGETLBFILE
static int hugetlb_pid = 0;
@@ -917,12 +927,13 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
alloc_devicedirver,
#endif
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
alloc_hugetlb,
#endif
/* Hugetlb implicitly assumes ALLOC_SHM */
#ifdef ALLOC_SHM
alloc_shm,
#endif
#if ((defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))
alloc_hugetlb,
#endif
#ifdef ALLOC_MMAP
alloc_mmap,
#endif
@@ -1062,7 +1073,7 @@ void *blas_memory_alloc(int procpos){
}
#endif
#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif
@@ -1161,6 +1172,16 @@ void blas_memory_free(void *free_area){
return;
}
/*
 * Allocate one work buffer of BUFFER_SIZE + FIXED_PAGESIZE bytes straight
 * from malloc().  The argument is ignored.  Returns whatever malloc()
 * returns, i.e. NULL when the allocation fails.
 */
void *blas_memory_alloc_nolock(int unused) {
  return malloc(BUFFER_SIZE + FIXED_PAGESIZE);
}
/*
 * Release a buffer with free().  Presumably the counterpart of
 * blas_memory_alloc_nolock(), so map_address should be a pointer obtained
 * from malloc() -- confirm against callers.
 */
void blas_memory_free_nolock(void * map_address) {
free(map_address);
}
void blas_shutdown(void){
int pos;

View File

@@ -100,8 +100,8 @@ else
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
endif
ifeq ($(NOFORTRAN), 2)
#only build cblas without Fortran
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)

15
f_check
View File

@@ -3,11 +3,11 @@
#
# 1. Not specified
# 1.1 Automatically detect, then check compiler
# 1.2 If no fortran compiler is detected, g77 is default with NOFORTRAN definition
# 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition
# 2. Specified
# 2.1 If path is correct, check compiler
# 2.2 If path is not correct, but still valid compiler name, force setting
# 2.2.2 Path is not correct, invalid compiler name, then g77 is default with NOFORTRAN definition
# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition
#
$makefile = shift(@ARGV);
@@ -25,7 +25,7 @@ $compiler = "" if $compiler eq "f77";
if ($compiler eq "") {
@lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
@lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95",
"xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
@@ -38,6 +38,7 @@ OUTER:
foreach $path (@path) {
if (-x $path . "/" . $lists) {
$compiler = $lists;
$compiler_bin = $lists;
last OUTER;
}
}
@@ -48,8 +49,8 @@ OUTER:
if ($compiler eq "") {
$nofortran = 1;
$compiler = "g77";
$vendor = G77;
$compiler = "gfortran";
$vendor = GFORTRAN;
$bu = "_";
} else {
@@ -196,8 +197,8 @@ if ($compiler eq "") {
if ($vendor eq "") {
$nofortran = 1;
$compiler = "g77";
$vendor = G77;
$compiler = "gfortran";
$vendor = GFORTRAN;
$bu = "_";
$openmp = "";
}

View File

@@ -448,6 +448,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "STEAMROLLER"
#endif
/*
 * Forced-target description for EXCAVATOR: when FORCE_EXCAVATOR is defined,
 * fix the architecture/sub-architecture strings, the -D flags placed in
 * ARCHCONFIG (cache geometry, TLB parameters, HAVE_* instruction-set
 * feature flags), the library name and the core name.
 * NOTE(review): the cache sizes and FORCE_INTEL appear to mirror the other
 * forced x86 targets in this file -- confirm they are right for this core.
 */
#if defined (FORCE_EXCAVATOR)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "EXCAVATOR"
#define ARCHCONFIG "-DEXCAVATOR " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
#define LIBNAME "excavator"
#define CORENAME "EXCAVATOR"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE

View File

@@ -253,6 +253,15 @@ XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX)
endif
endif
ifdef INTEGER_PRECISION
IBLAS1OBJS = \
iaxpy.$(SUFFIX)
IBLAS2OBJS =
IBLAS3OBJS =
endif
endif
HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \
@@ -343,6 +352,9 @@ CZBLAS3OBJS = \
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
cblas_zgeadd.$(SUFFIX)
CIBLAS1OBJS = \
cblas_iaxpy.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1)
@@ -372,6 +384,10 @@ ZBLAS1OBJS += $(CZBLAS1OBJS)
ZBLAS2OBJS += $(CZBLAS2OBJS)
ZBLAS3OBJS += $(CZBLAS3OBJS)
IBLAS1OBJS += $(CIBLAS1OBJS)
IBLAS2OBJS += $(CIBLAS2OBJS)
IBLAS3OBJS += $(CIBLAS3OBJS)
endif
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
@@ -380,6 +396,7 @@ QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS)
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
IBLASOBJS = $(IBLAS1OBJS) $(IBLAS2OBJS) $(IBLAS3OBJS)
#SLAPACKOBJS = \
# sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
@@ -458,6 +475,10 @@ ifdef QUAD_PRECISION
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
endif
ifdef INTEGER_PRECISION
FUNCOBJS += $(IBLASOBJS)
endif
FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=)
include $(TOPDIR)/Makefile.tail
@@ -476,17 +497,18 @@ endif
clean ::
@rm -f functable.h
level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS)
level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(IBLAS1OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(IBLAS2OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS)
level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(IBLAS3OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) \
$(CIBLASOBJS) $(CIBLASOBJS_P) : override CFLAGS += -DCBLAS
srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c
$(CC) $(CFLAGS) -c $< -o $(@F)
@@ -725,6 +747,9 @@ saxpy.$(SUFFIX) saxpy.$(PSUFFIX) : axpy.c
daxpy.$(SUFFIX) daxpy.$(PSUFFIX) : axpy.c
$(CC) $(CFLAGS) -c $< -o $(@F)
iaxpy.$(SUFFIX) iaxpy.$(PSUFFIX) : axpy.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qaxpy.$(SUFFIX) qaxpy.$(PSUFFIX) : axpy.c
$(CC) $(CFLAGS) -c $< -o $(@F)
@@ -1437,6 +1462,9 @@ cblas_saxpy.$(SUFFIX) cblas_saxpy.$(PSUFFIX) : axpy.c
cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_iaxpy.$(SUFFIX) cblas_iaxpy.$(PSUFFIX) : axpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

View File

@@ -103,6 +103,8 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
#elif defined(INTEGER)
mode = BLAS_INTEGER | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
#endif

View File

@@ -38,6 +38,7 @@
#include <stdio.h>
#include "common.h"
#include "l1param.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
@@ -189,7 +190,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
//printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta);
if ((m==0) || (n==0)) return;
lenx = n;
@@ -211,15 +212,24 @@ void CNAME(enum CBLAS_ORDER order,
#ifdef MAX_STACK_ALLOC
// make it volatile because some gemv implementation (ex: dgemv_n.S)
// do not restore all register
volatile int stack_alloc_size = m + n;
volatile int stack_alloc_size = 0;
//for gemv_n and gemv_t, try to allocate on stack
stack_alloc_size = m + n;
#ifdef ALIGNED_ACCESS
stack_alloc_size += 3;
#endif
if(stack_alloc_size < 128)
//dgemv_n.S require a 128 bytes buffer
stack_alloc_size = 128;
//dgemv_n.S require a 128 bytes buffer
stack_alloc_size = 128;
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
stack_alloc_size = 0;
stack_alloc_size = 0;
FLOAT stack_buffer[stack_alloc_size];
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
// printf("stack_alloc_size=%d\n", stack_alloc_size);
#else
//Original OpenBLAS/GotoBLAS codes.
buffer = (FLOAT *)blas_memory_alloc(1);
#endif
@@ -251,10 +261,13 @@ void CNAME(enum CBLAS_ORDER order,
#endif
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size)
#endif
if(!stack_alloc_size){
blas_memory_free(buffer);
}
#else
blas_memory_free(buffer);
#endif
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
IDEBUG_END;

View File

@@ -362,6 +362,12 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (side << BLAS_RSIDE_SHIFT);
args.nthreads = num_cpu_avail(3);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
if (args.nthreads == 1) {
#endif

View File

@@ -210,6 +210,10 @@ ifndef XAXPYKERNEL
XAXPYKERNEL = zaxpy.S
endif
ifndef IAXPYKERNEL
IAXPYKERNEL = ../generic/iaxpy.c
endif
### COPY ###
ifndef SCOPYKERNEL
@@ -471,6 +475,9 @@ QBLASOBJS += \
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
IBLASOBJS += \
iaxpy_k$(TSUFFIX).$(SUFFIX)
CBLASOBJS += \
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
@@ -645,6 +652,9 @@ $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)iaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)iaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DINTEGER $< -o $@
$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@

View File

@@ -32,6 +32,10 @@ ifeq ($(TARGET), GENERIC)
USE_TRMM = 1
endif
ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
SKERNELOBJS += \

View File

@@ -0,0 +1 @@
include $(KERNELDIR)/KERNEL.ARMV8

52
kernel/generic/iaxpy.c Normal file
View File

@@ -0,0 +1,52 @@
/***************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Generic integer AXPY kernel: y[i*inc_y] += da * x[i*inc_x] for i in [0, n).
 *
 * n            : number of elements to process; nothing is done when n <= 0.
 * da           : integer scale factor; nothing is done when it is zero.
 * x, inc_x     : source vector and its element stride.
 * y, inc_y     : destination vector (updated in place) and its stride.
 * dummy0, dummy1, dummy, dummy2 : not read by this function.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, int da, int *x, BLASLONG inc_x, int *y, BLASLONG inc_y, int *dummy, BLASLONG dummy2)
{
	BLASLONG idx;

	/* Early exits: negative length or a zero multiplier leave y untouched. */
	if ( n < 0 || da == 0 ) return 0;

	for ( idx = 0; idx < n; idx++ )
		y[idx * inc_y] += da * x[idx * inc_x];

	return 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,9 +1,14 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
@@ -26,11 +31,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
@@ -40,6 +45,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
@@ -49,6 +55,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =

View File

@@ -0,0 +1,92 @@
# Kernel source selection for one x86_64 target (newly added file; its name
# is not visible here).  Each variable names the source file used for one
# kernel; most entries reuse bulldozer/piledriver/barcelona or generic code.

# Level-1: AXPY and DOT kernels
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
# Level-2: SYMV (upper/lower) and GEMV (n/t) kernels
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S
# Single-precision GEMM kernel, its copy routines and their object names
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-precision GEMM
DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Single-precision complex GEMM
CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-precision complex GEMM (the IN/IT copy entries are left empty)
ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# GEMM3M kernels
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
# TRSM kernels (LN/LT/RN/RT); only DTRSM LT/RN use bulldozer assembly here
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@@ -1,3 +1,7 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
@@ -10,6 +14,22 @@ ZGEMVTKERNEL = zgemv_t_4.c
CGEMVNKERNEL = cgemv_n_4.c
CGEMVTKERNEL = cgemv_t_4.c
SSYMV_L_KERNEL = ssymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
DSYMV_U_KERNEL = dsymv_U.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
@@ -20,16 +40,18 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x4_haswell.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c
DGEMMKERNEL = dgemm_kernel_4x8_haswell.S
DGEMMINCOPY = ../generic/gemm_ncopy_4.c
DGEMMITCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
@@ -40,6 +62,7 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c

View File

@@ -1,3 +1,13 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
@@ -7,7 +17,11 @@ ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DCOPYKERNEL = dcopy_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S

View File

@@ -1,8 +1,28 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
SGERKERNEL = sger.c
DGERKERNEL = dger.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_4.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
SSYMV_L_KERNEL = ssymv_L.c
SSYMV_U_KERNEL = ssymv_U.c
DSYMV_L_KERNEL = dsymv_L.c
DSYMV_U_KERNEL = dsymv_U.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c

View File

@@ -1,9 +1,17 @@
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSYMV_U_KERNEL = dsymv_U.c
DSYMV_L_KERNEL = dsymv_L.c

View File

@@ -29,8 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#if defined(PILEDRIVER) || defined(STEAMROLLER)
#include "caxpy_microk_steamroller-2.c"
#elif defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c"
#elif defined(HASWELL)
#include "caxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "caxpy_microk_sandy-2.c"
#endif
@@ -78,15 +84,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -8;
BLASLONG n1 = n & -32;
if ( n1 )
{
da[0] = da_r;
da[1] = da_i;
caxpy_kernel_8(n1, x, y , &da );
caxpy_kernel_8(n1, x, y , da );
ix = 2 * n1;
}
i = n1;
while(i < n)
{

View File

@@ -31,89 +31,87 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __att
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
#if !defined(CONJ)
FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 };
#else
FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 };
#endif
BLASLONG register i = 0;
if ( n < 640 )
{
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
#if !defined(CONJ)
"vmulps (%5), %%xmm1 , %%xmm1 \n\t"
#else
"vmulps (%5), %%xmm0 , %%xmm0 \n\t"
#endif
".align 16 \n\t"
"1: \n\t"
"prefetcht0 768(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
".align 2 \n\t"
"vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x
"vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x
"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x
"prefetcht0 768(%3,%0,4) \n\t"
#if !defined(CONJ)
"vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t"
"vmovups 64(%2,%0,4), %%xmm12 \n\t" // 2 complex values from x
"vmovups 80(%2,%0,4), %%xmm13 \n\t" // 2 complex values from x
"vmovups 96(%2,%0,4), %%xmm14 \n\t" // 2 complex values from x
"vmovups 112(%2,%0,4), %%xmm15 \n\t" // 2 complex values from x
"vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t"
"vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t"
"vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t"
"vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t"
"vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t"
"vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t"
"vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
"vmulps %%xmm1, %%xmm10, %%xmm10 \n\t"
"vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t"
"vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t"
"vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t"
"vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t"
"vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t"
".align 2 \n\t"
"vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t"
"vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t"
"vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t"
#else
"vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t"
"vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t"
"vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t"
"vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t"
"vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i
"vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i
"vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i
"vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i
"vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i
"vpermilps $0xb1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
"vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
"vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
"vfmaddps 64(%3,%0,4), %%xmm0 , %%xmm12, %%xmm12 \n\t"
"vfmaddps 80(%3,%0,4), %%xmm0 , %%xmm13, %%xmm13 \n\t"
"vfmaddps 96(%3,%0,4), %%xmm0 , %%xmm14, %%xmm14 \n\t"
"vfmaddps 112(%3,%0,4), %%xmm0 , %%xmm15, %%xmm15 \n\t"
"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
"vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
"vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
"vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
"vfmaddps %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t"
"vfmaddps %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t"
"vfmaddps %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t"
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
"vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
"vmovups %%xmm5 , (%3,%0,4) \n\t"
".align 2 \n\t"
"vmovups %%xmm7 , 16(%3,%0,4) \n\t"
"vmovups %%xmm9 , 32(%3,%0,4) \n\t"
"vmovups %%xmm11, 48(%3,%0,4) \n\t"
"vmovups %%xmm12, 64(%3,%0,4) \n\t"
"vmovups %%xmm13, 80(%3,%0,4) \n\t"
"vmovups %%xmm14, 96(%3,%0,4) \n\t"
"vmovups %%xmm15,112(%3,%0,4) \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
"vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t"
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
"vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t"
"vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t"
"vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t"
"vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t"
#endif
"vmovups %%xmm12, (%3,%0,4) \n\t"
"vmovups %%xmm13, 16(%3,%0,4) \n\t"
"vmovups %%xmm14, 32(%3,%0,4) \n\t"
"vmovups %%xmm15, 48(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $8 , %1 \n\t"
"addq $32, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
@@ -121,7 +119,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
@@ -129,7 +128,73 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
return;
}
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha
"vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
#if !defined(CONJ)
"vmulps (%5), %%xmm1 , %%xmm1 \n\t"
#else
"vmulps (%5), %%xmm0 , %%xmm0 \n\t"
#endif
".align 16 \n\t"
"1: \n\t"
"prefetcht0 512(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
".align 2 \n\t"
"vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x
"vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x
"vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x
"vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
"vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
"prefetcht0 512(%3,%0,4) \n\t"
"vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm5 \n\t"
".align 2 \n\t"
"vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm7 \n\t"
"vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm9 \n\t"
"vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm11 \n\t"
"vfmaddps %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t"
"vfmaddps %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t"
"vfmaddps %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t"
"vfmaddps %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t"
"vmovups %%xmm5 , (%3,%0,4) \n\t"
".align 2 \n\t"
"vmovups %%xmm7 , 16(%3,%0,4) \n\t"
"vmovups %%xmm9 , 32(%3,%0,4) \n\t"
"vmovups %%xmm11, 48(%3,%0,4) \n\t"
"addq $16, %0 \n\t"
"subq $8, %1 \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t"
:
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"memory"
);
}

View File

@@ -0,0 +1,132 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1

static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

/*
 * Complex single-precision AXPY micro-kernel (256-bit AVX + FMA3 variant).
 *
 * For every complex element k (stored as interleaved real/imag 32-bit floats):
 *     !CONJ :  y[k] += alpha * x[k]
 *      CONJ :  y[k] += alpha * conj(x[k])
 *
 * n      number of complex elements. The loop consumes 32 complex elements
 *        per iteration and only terminates when the counter reaches exactly
 *        zero, so n must be a positive multiple of 32.
 * x      source vector, unit stride, 2*n floats (read with unaligned loads).
 * y      destination vector, unit stride, 2*n floats, updated in place.
 * alpha  alpha[0] = real part, alpha[1] = imaginary part.
 *
 * Register roles inside the asm:
 *     %0 (i)  running float index into x and y (advanced by 64 per iteration)
 *     %1 (n)  remaining complex elements      (reduced by 32 per iteration)
 *     ymm0    alpha_r broadcast (sign-adjusted by mvec when CONJ is defined)
 *     ymm1    alpha_i broadcast (sign-adjusted by mvec when CONJ is not defined)
 *
 * i and n are written by the asm, therefore they are declared as read-write
 * ("+r") operands. Declaring them as plain inputs while modifying them is not
 * permitted by the GCC extended-asm contract.
 */
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	/* Per-lane sign pattern folded into one half of alpha so that the loop
	   itself needs only multiply-adds (no addsub / explicit negation). */
#if !defined(CONJ)
	FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 };
#else
	FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 };
#endif

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper					\n\t"
	"vbroadcastss		 (%4), %%ymm0		\n\t"	// real part of alpha
	"vbroadcastss		4(%4), %%ymm1		\n\t"	// imag part of alpha
#if !defined(CONJ)
	"vmulps			(%5), %%ymm1 , %%ymm1	\n\t"	// ymm1 = ( -a_i, +a_i, ... )
#else
	"vmulps			(%5), %%ymm0 , %%ymm0	\n\t"	// ymm0 = ( +a_r, -a_r, ... )
#endif

	".align 16					\n\t"
	"1:						\n\t"

	"vmovups		   (%2,%0,4), %%ymm5	\n\t"	// 4 complex values from x
	".align 2					\n\t"
	"vmovups		 32(%2,%0,4), %%ymm7	\n\t"	// 4 complex values from x
	"vmovups		 64(%2,%0,4), %%ymm9	\n\t"	// 4 complex values from x
	"vmovups		 96(%2,%0,4), %%ymm11	\n\t"	// 4 complex values from x

	"vmovups		128(%2,%0,4), %%ymm12	\n\t"	// 4 complex values from x
	"vmovups		160(%2,%0,4), %%ymm13	\n\t"	// 4 complex values from x
	"vmovups		192(%2,%0,4), %%ymm14	\n\t"	// 4 complex values from x
	"vmovups		224(%2,%0,4), %%ymm15	\n\t"	// 4 complex values from x

	"vpermilps	$0xb1 , %%ymm5 , %%ymm4		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm7 , %%ymm6		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm9 , %%ymm8		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm11, %%ymm10	\n\t"	// exchange real and imag part

	"vfmadd213ps	   (%3,%0,4), %%ymm0 , %%ymm5	\n\t"	// x * ymm0 + y
	".align 2					\n\t"
	"vfmadd213ps	 32(%3,%0,4), %%ymm0 , %%ymm7	\n\t"
	"vfmadd213ps	 64(%3,%0,4), %%ymm0 , %%ymm9	\n\t"
	"vfmadd213ps	 96(%3,%0,4), %%ymm0 , %%ymm11	\n\t"

	"vfmadd231ps	%%ymm1 , %%ymm4 , %%ymm5	\n\t"	// += swapped(x) * ymm1
	"vfmadd231ps	%%ymm1 , %%ymm6 , %%ymm7	\n\t"
	"vfmadd231ps	%%ymm1 , %%ymm8 , %%ymm9	\n\t"
	"vfmadd231ps	%%ymm1 , %%ymm10, %%ymm11	\n\t"

	"vpermilps	$0xb1 , %%ymm12, %%ymm4		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm13, %%ymm6		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm14, %%ymm8		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm15, %%ymm10	\n\t"	// exchange real and imag part

	"vfmadd213ps	128(%3,%0,4), %%ymm0 , %%ymm12	\n\t"	// x * ymm0 + y
	"vfmadd213ps	160(%3,%0,4), %%ymm0 , %%ymm13	\n\t"
	"vfmadd213ps	192(%3,%0,4), %%ymm0 , %%ymm14	\n\t"
	"vfmadd213ps	224(%3,%0,4), %%ymm0 , %%ymm15	\n\t"

	"vfmadd231ps	%%ymm1 , %%ymm4 , %%ymm12	\n\t"	// += swapped(x) * ymm1
	"vfmadd231ps	%%ymm1 , %%ymm6 , %%ymm13	\n\t"
	"vfmadd231ps	%%ymm1 , %%ymm8 , %%ymm14	\n\t"
	"vfmadd231ps	%%ymm1 , %%ymm10, %%ymm15	\n\t"

	"vmovups	%%ymm5 ,    (%3,%0,4)		\n\t"	// write results back to y
	".align 2					\n\t"
	"vmovups	%%ymm7 ,  32(%3,%0,4)		\n\t"
	"vmovups	%%ymm9 ,  64(%3,%0,4)		\n\t"
	"vmovups	%%ymm11,  96(%3,%0,4)		\n\t"
	"vmovups	%%ymm12, 128(%3,%0,4)		\n\t"
	"vmovups	%%ymm13, 160(%3,%0,4)		\n\t"
	"vmovups	%%ymm14, 192(%3,%0,4)		\n\t"
	"vmovups	%%ymm15, 224(%3,%0,4)		\n\t"

	"addq		$64, %0				\n\t"	// 64 floats = 32 complex elements
	"subq		$32, %1				\n\t"	// sets ZF for the loop test
	"jnz		1b				\n\t"
	"vzeroupper					\n\t"

	:
	  "+r" (i),	// 0	modified by the asm
	  "+r" (n)	// 1	modified by the asm
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha),	// 4
	  "r" (mvec)	// 5
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@@ -0,0 +1,116 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1

static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

/*
 * Complex single-precision AXPY micro-kernel (256-bit AVX variant using
 * separate multiply and add instructions, no FMA).
 *
 * For every complex element k (stored as interleaved real/imag 32-bit floats):
 *     !CONJ :  y[k] += alpha * x[k]
 *      CONJ :  y[k] += alpha * conj(x[k])
 *
 * n      number of complex elements. The loop consumes 16 complex elements
 *        per iteration and only terminates when the counter reaches exactly
 *        zero, so n must be a positive multiple of 16.
 * x      source vector, unit stride, 2*n floats (read with unaligned loads).
 * y      destination vector, unit stride, 2*n floats, updated in place.
 * alpha  alpha[0] = real part, alpha[1] = imaginary part.
 *
 * Register roles inside the asm:
 *     %0 (i)  running float index into x and y (advanced by 32 per iteration)
 *     %1 (n)  remaining complex elements      (reduced by 16 per iteration)
 *     ymm0    alpha_r broadcast (sign-adjusted by mvec when CONJ is defined)
 *     ymm1    alpha_i broadcast (sign-adjusted by mvec when CONJ is not defined)
 *
 * i and n are written by the asm, therefore they are declared as read-write
 * ("+r") operands. Declaring them as plain inputs while modifying them is not
 * permitted by the GCC extended-asm contract.
 */
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	/* Per-lane sign pattern folded into one half of alpha so that the loop
	   itself needs only plain multiplies and adds. */
#if !defined(CONJ)
	FLOAT mvec[8] = { -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0 };
#else
	FLOAT mvec[8] = { 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0 };
#endif

	BLASLONG register i = 0;

	__asm__ __volatile__
	(
	"vzeroupper					\n\t"
	"vbroadcastss		 (%4), %%ymm0		\n\t"	// real part of alpha
	"vbroadcastss		4(%4), %%ymm1		\n\t"	// imag part of alpha
#if !defined(CONJ)
	"vmulps			(%5), %%ymm1 , %%ymm1	\n\t"	// ymm1 = ( -a_i, +a_i, ... )
#else
	"vmulps			(%5), %%ymm0 , %%ymm0	\n\t"	// ymm0 = ( +a_r, -a_r, ... )
#endif

	".align 16					\n\t"
	"1:						\n\t"

	"vmovups		   (%2,%0,4), %%ymm5	\n\t"	// 4 complex values from x
	".align 2					\n\t"
	"vmovups		 32(%2,%0,4), %%ymm7	\n\t"	// 4 complex values from x
	"vmovups		 64(%2,%0,4), %%ymm9	\n\t"	// 4 complex values from x
	"vmovups		 96(%2,%0,4), %%ymm11	\n\t"	// 4 complex values from x

	"vpermilps	$0xb1 , %%ymm5 , %%ymm4		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm7 , %%ymm6		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm9 , %%ymm8		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%ymm11, %%ymm10	\n\t"	// exchange real and imag part

	"vmulps		%%ymm5 , %%ymm0 , %%ymm5	\n\t"	// x * ymm0
	"vmulps		%%ymm7 , %%ymm0 , %%ymm7	\n\t"
	"vmulps		%%ymm9 , %%ymm0 , %%ymm9	\n\t"
	"vmulps		%%ymm11, %%ymm0 , %%ymm11	\n\t"

	"vaddps		   (%3,%0,4), %%ymm5 , %%ymm5	\n\t"	// + y
	"vaddps		 32(%3,%0,4), %%ymm7 , %%ymm7	\n\t"
	"vaddps		 64(%3,%0,4), %%ymm9 , %%ymm9	\n\t"
	"vaddps		 96(%3,%0,4), %%ymm11, %%ymm11	\n\t"

	"vmulps		%%ymm4 , %%ymm1 , %%ymm4	\n\t"	// swapped(x) * ymm1
	"vmulps		%%ymm6 , %%ymm1 , %%ymm6	\n\t"
	"vmulps		%%ymm8 , %%ymm1 , %%ymm8	\n\t"
	"vmulps		%%ymm10, %%ymm1 , %%ymm10	\n\t"

	"vaddps		%%ymm4 , %%ymm5 , %%ymm5	\n\t"	// combine both partial products
	"vaddps		%%ymm6 , %%ymm7 , %%ymm7	\n\t"
	"vaddps		%%ymm8 , %%ymm9 , %%ymm9	\n\t"
	"vaddps		%%ymm10, %%ymm11, %%ymm11	\n\t"

	"vmovups	%%ymm5 ,    (%3,%0,4)		\n\t"	// write results back to y
	".align 2					\n\t"
	"vmovups	%%ymm7 ,  32(%3,%0,4)		\n\t"
	"vmovups	%%ymm9 ,  64(%3,%0,4)		\n\t"
	"vmovups	%%ymm11,  96(%3,%0,4)		\n\t"

	"addq		$32, %0				\n\t"	// 32 floats = 16 complex elements
	"subq		$16, %1				\n\t"	// sets ZF for the loop test
	"jnz		1b				\n\t"
	"vzeroupper					\n\t"

	:
	  "+r" (i),	// 0	modified by the asm
	  "+r" (n)	// 1	modified by the asm
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha),	// 4
	  "r" (mvec)	// 5
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
	  "memory"
	);
}

View File

@@ -0,0 +1,200 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1

static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));

/*
 * Complex single-precision AXPY micro-kernel (128-bit FMA3 variant).
 *
 * For every complex element k (stored as interleaved real/imag 32-bit floats):
 *     !CONJ :  y[k] += alpha * x[k]
 *      CONJ :  y[k] += alpha * conj(x[k])
 *
 * Two loop shapes are used, selected by n:
 *     n <= 2048 : 16 complex elements per iteration, no software prefetch
 *     n >  2048 :  8 complex elements per iteration, with prefetcht0 on x and y
 * Each loop only terminates when its counter reaches exactly zero, so n must
 * be a positive multiple of 16 on the first path and of 8 on the second.
 * NOTE(review): the caller is presumably expected to round n down to a
 * suitable multiple before calling -- confirm against the call site.
 *
 * n      number of complex elements (see constraints above).
 * x      source vector, unit stride, 2*n floats (read with unaligned loads).
 * y      destination vector, unit stride, 2*n floats, updated in place.
 * alpha  alpha[0] = real part, alpha[1] = imaginary part.
 *
 * Register roles inside both asm statements:
 *     %0 (i)  running float index into x and y
 *     %1 (n)  remaining complex elements
 *     xmm0    alpha_r broadcast (sign-adjusted by mvec when CONJ is defined)
 *     xmm1    alpha_i broadcast (sign-adjusted by mvec when CONJ is not defined)
 *
 * i and n are written by the asm, therefore they are declared as read-write
 * ("+r") operands. Declaring them as plain inputs while modifying them is not
 * permitted by the GCC extended-asm contract.
 */
static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
	/* Per-lane sign pattern folded into one half of alpha so that the loops
	   themselves need only multiply-adds. */
#if !defined(CONJ)
	FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 };
#else
	FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 };
#endif

	BLASLONG register i = 0;

	if ( n <= 2048 )
	{
		/* Shorter vectors: wider unroll (8 xmm registers of x per pass). */
		__asm__ __volatile__
		(
		"vzeroupper					\n\t"
		"vbroadcastss		 (%4), %%xmm0		\n\t"	// real part of alpha
		"vbroadcastss		4(%4), %%xmm1		\n\t"	// imag part of alpha
#if !defined(CONJ)
		"vmulps			(%5), %%xmm1 , %%xmm1	\n\t"	// xmm1 = ( -a_i, +a_i, ... )
#else
		"vmulps			(%5), %%xmm0 , %%xmm0	\n\t"	// xmm0 = ( +a_r, -a_r, ... )
#endif

		".align 16					\n\t"
		"1:						\n\t"

		"vmovups		   (%2,%0,4), %%xmm5	\n\t"	// 2 complex values from x
		".align 2					\n\t"
		"vmovups		 16(%2,%0,4), %%xmm7	\n\t"	// 2 complex values from x
		"vmovups		 32(%2,%0,4), %%xmm9	\n\t"	// 2 complex values from x
		"vmovups		 48(%2,%0,4), %%xmm11	\n\t"	// 2 complex values from x

		"vmovups		 64(%2,%0,4), %%xmm12	\n\t"	// 2 complex values from x
		"vmovups		 80(%2,%0,4), %%xmm13	\n\t"	// 2 complex values from x
		"vmovups		 96(%2,%0,4), %%xmm14	\n\t"	// 2 complex values from x
		"vmovups		112(%2,%0,4), %%xmm15	\n\t"	// 2 complex values from x

		"vpermilps	$0xb1 , %%xmm5 , %%xmm4		\n\t"	// exchange real and imag part
		"vpermilps	$0xb1 , %%xmm7 , %%xmm6		\n\t"	// exchange real and imag part
		"vpermilps	$0xb1 , %%xmm9 , %%xmm8		\n\t"	// exchange real and imag part
		"vpermilps	$0xb1 , %%xmm11, %%xmm10	\n\t"	// exchange real and imag part

		"vfmadd213ps	   (%3,%0,4), %%xmm0 , %%xmm5	\n\t"	// x * xmm0 + y
		".align 2					\n\t"
		"vfmadd213ps	 16(%3,%0,4), %%xmm0 , %%xmm7	\n\t"
		"vfmadd213ps	 32(%3,%0,4), %%xmm0 , %%xmm9	\n\t"
		"vfmadd213ps	 48(%3,%0,4), %%xmm0 , %%xmm11	\n\t"

		"vfmadd231ps	%%xmm1 , %%xmm4 , %%xmm5	\n\t"	// += swapped(x) * xmm1
		"vfmadd231ps	%%xmm1 , %%xmm6 , %%xmm7	\n\t"
		"vfmadd231ps	%%xmm1 , %%xmm8 , %%xmm9	\n\t"
		"vfmadd231ps	%%xmm1 , %%xmm10, %%xmm11	\n\t"

		"vpermilps	$0xb1 , %%xmm12, %%xmm4		\n\t"	// exchange real and imag part
		"vpermilps	$0xb1 , %%xmm13, %%xmm6		\n\t"	// exchange real and imag part
		"vpermilps	$0xb1 , %%xmm14, %%xmm8		\n\t"	// exchange real and imag part
		"vpermilps	$0xb1 , %%xmm15, %%xmm10	\n\t"	// exchange real and imag part

		"vfmadd213ps	 64(%3,%0,4), %%xmm0 , %%xmm12	\n\t"	// x * xmm0 + y
		"vfmadd213ps	 80(%3,%0,4), %%xmm0 , %%xmm13	\n\t"
		"vfmadd213ps	 96(%3,%0,4), %%xmm0 , %%xmm14	\n\t"
		"vfmadd213ps	112(%3,%0,4), %%xmm0 , %%xmm15	\n\t"

		"vfmadd231ps	%%xmm1 , %%xmm4 , %%xmm12	\n\t"	// += swapped(x) * xmm1
		"vfmadd231ps	%%xmm1 , %%xmm6 , %%xmm13	\n\t"
		"vfmadd231ps	%%xmm1 , %%xmm8 , %%xmm14	\n\t"
		"vfmadd231ps	%%xmm1 , %%xmm10, %%xmm15	\n\t"

		"vmovups	%%xmm5 ,    (%3,%0,4)		\n\t"	// write results back to y
		".align 2					\n\t"
		"vmovups	%%xmm7 ,  16(%3,%0,4)		\n\t"
		"vmovups	%%xmm9 ,  32(%3,%0,4)		\n\t"
		"vmovups	%%xmm11,  48(%3,%0,4)		\n\t"
		"vmovups	%%xmm12,  64(%3,%0,4)		\n\t"
		"vmovups	%%xmm13,  80(%3,%0,4)		\n\t"
		"vmovups	%%xmm14,  96(%3,%0,4)		\n\t"
		"vmovups	%%xmm15, 112(%3,%0,4)		\n\t"

		"addq		$32, %0				\n\t"	// 32 floats = 16 complex elements
		"subq		$16, %1				\n\t"	// sets ZF for the loop test
		"jnz		1b				\n\t"
		"vzeroupper					\n\t"

		:
		  "+r" (i),	// 0	modified by the asm
		  "+r" (n)	// 1	modified by the asm
		:
		  "r" (x),	// 2
		  "r" (y),	// 3
		  "r" (alpha),	// 4
		  "r" (mvec)	// 5
		: "cc",
		  "%xmm0", "%xmm1",
		  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
		  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
		  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
		  "memory"
		);
		return;
	}

	/* Longer vectors: narrower unroll with software prefetch of x and y. */
	__asm__ __volatile__
	(
	"vzeroupper					\n\t"
	"vbroadcastss		 (%4), %%xmm0		\n\t"	// real part of alpha
	"vbroadcastss		4(%4), %%xmm1		\n\t"	// imag part of alpha
#if !defined(CONJ)
	"vmulps			(%5), %%xmm1 , %%xmm1	\n\t"	// xmm1 = ( -a_i, +a_i, ... )
#else
	"vmulps			(%5), %%xmm0 , %%xmm0	\n\t"	// xmm0 = ( +a_r, -a_r, ... )
#endif

	".align 16					\n\t"
	"1:						\n\t"

	"prefetcht0		512(%2,%0,4)		\n\t"	// x, 512 bytes ahead
	"vmovups		   (%2,%0,4), %%xmm5	\n\t"	// 2 complex values from x
	".align 2					\n\t"
	"vmovups		 16(%2,%0,4), %%xmm7	\n\t"	// 2 complex values from x
	"vmovups		 32(%2,%0,4), %%xmm9	\n\t"	// 2 complex values from x
	"vmovups		 48(%2,%0,4), %%xmm11	\n\t"	// 2 complex values from x

	"vpermilps	$0xb1 , %%xmm5 , %%xmm4		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%xmm7 , %%xmm6		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%xmm9 , %%xmm8		\n\t"	// exchange real and imag part
	"vpermilps	$0xb1 , %%xmm11, %%xmm10	\n\t"	// exchange real and imag part

	"prefetcht0		512(%3,%0,4)		\n\t"	// y, 512 bytes ahead
	"vfmadd213ps	   (%3,%0,4), %%xmm0 , %%xmm5	\n\t"	// x * xmm0 + y
	".align 2					\n\t"
	"vfmadd213ps	 16(%3,%0,4), %%xmm0 , %%xmm7	\n\t"
	"vfmadd213ps	 32(%3,%0,4), %%xmm0 , %%xmm9	\n\t"
	"vfmadd213ps	 48(%3,%0,4), %%xmm0 , %%xmm11	\n\t"

	"vfmadd231ps	%%xmm1 , %%xmm4 , %%xmm5	\n\t"	// += swapped(x) * xmm1
	"vfmadd231ps	%%xmm1 , %%xmm6 , %%xmm7	\n\t"
	"vfmadd231ps	%%xmm1 , %%xmm8 , %%xmm9	\n\t"
	"vfmadd231ps	%%xmm1 , %%xmm10, %%xmm11	\n\t"

	"vmovups	%%xmm5 ,    (%3,%0,4)		\n\t"	// write results back to y
	".align 2					\n\t"
	"vmovups	%%xmm7 ,  16(%3,%0,4)		\n\t"
	"vmovups	%%xmm9 ,  32(%3,%0,4)		\n\t"
	"vmovups	%%xmm11,  48(%3,%0,4)		\n\t"

	"addq		$16, %0				\n\t"	// 16 floats = 8 complex elements
	"subq		$8, %1				\n\t"	// sets ZF for the loop test
	"jnz		1b				\n\t"
	"vzeroupper					\n\t"

	:
	  "+r" (i),	// 0	modified by the asm
	  "+r" (n)	// 1	modified by the asm
	:
	  "r" (x),	// 2
	  "r" (y),	// 3
	  "r" (alpha),	// 4
	  "r" (mvec)	// 5
	: "cc",
	  "%xmm0", "%xmm1",
	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
	  "memory"
	);
}

Some files were not shown because too many files have changed in this diff Show More