Compare commits

..

1 Commits

Author SHA1 Message Date
Zhang Xianyi
92058a75e2 For gemm multi-threading, simply split M.
e.g.
layer 1: A (1600k, 576), B(576, 64)

B is very small. We split M.
2015-11-25 05:14:56 +08:00
219 changed files with 1194 additions and 48574 deletions

View File

@@ -24,12 +24,7 @@ before_install:
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script:
- set -e
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
# whitelist
branches:

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 17)
set(OpenBLAS_PATCH_VERSION 16.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
@@ -54,6 +54,10 @@ if (NOT DYNAMIC_ARCH)
list(APPEND BLASDIRS kernel)
endif ()
if (DEFINED UTEST_CHECK)
set(SANITY_CHECK 1)
endif ()
if (DEFINED SANITY_CHECK)
list(APPEND BLASDIRS reference)
endif ()
@@ -106,10 +110,6 @@ if (${NO_STATIC} AND ${NO_SHARED})
message(FATAL_ERROR "Neither static nor shared are enabled.")
endif ()
#Set default output directory
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
@@ -139,17 +139,6 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
endforeach()
enable_testing()
add_subdirectory(utest)
if(NOT MSVC)
#only build shared library for MSVC
@@ -163,6 +152,7 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
endif()
#build test and ctest
enable_testing()
add_subdirectory(test)
if(NOT NO_CBLAS)
add_subdirectory(ctest)

View File

@@ -121,17 +121,6 @@ In chronological order:
* [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
ARMv8 support.
* Jerome Robert <jeromerobert@gmx.com>
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
* [2015-12-28] Allow to force the number of parallel make job
* [2015-12-28] Fix detection of AMD E2-3200
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what is expected
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
* Dan Kortschak
* [2015-01-07] Added test for drotmg bug #484.
@@ -141,11 +130,5 @@ In chronological order:
* Martin Koehler <https://github.com/grisuthedragon/>
* [2015-09-07] Improved imatcopy
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
* [2015-11-20] lapack-test fixes for Cortex-A57
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

View File

@@ -1,63 +1,4 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.17
20-Mar-2016
common:
* Enable BUILD_LAPACK_DEPRECATED=1 by default.
====================================================================
Version 0.2.16
15-Mar-2016
common:
* Avoid potential getenv segfault. (#716)
* Import LAPACK svn bugfix #142-#147,#150-#155
x86/x86_64:
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
* Fix bug with scipy linalg test.
ARM:
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
POWER:
* Optimize D and Z BLAS3 functions for Power8.
====================================================================
Version 0.2.16.rc1
23-Feb-2016
common:
* Upgrade LAPACK to 3.6.0 version.
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
LAPACK deprecated functions.
* Add MAKE_NB_JOBS option in Makefile.
Force number of make jobs. This is particularly
useful when using distcc. (#735. Thanks, Jerome Robert.)
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
* Improve small zger, zgemv, ztrmv using stack allocation (#727. Thanks, Jerome Robert)
* Let openblas_get_num_threads return the number of active threads.
(#760. Thanks, Jerome Robert)
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
x86/x86_64:
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
* Detect Intel Avoton.
* Detect AMD Trinity, Richland, E2-3200.
* Fix gemv performance bug on Mac OSX Intel Haswell.
* Fix some bugs with CMake and Visual Studio
ARM:
* Support and optimize Cortex-A57 AArch64.
(#686. Thanks, Ashwin Sekhar TK)
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
* Update ARMV6 kernels.
POWER:
* Fix detection of POWER architecture
(#684. Thanks, Sebastien Villemot)
====================================================================
Version 0.2.15
27-Oct-2015

View File

@@ -7,6 +7,10 @@ ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel
endif
ifdef UTEST_CHECK
SANITY_CHECK = 1
endif
ifdef SANITY_CHECK
BLASDIRS += reference
endif
@@ -81,22 +85,22 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), Linux)
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@@ -113,8 +117,10 @@ ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
ifdef UTEST_CHECK
$(MAKE) -C utest all
endif
endif
ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
@@ -257,9 +263,6 @@ endif
else
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif

View File

@@ -11,8 +11,8 @@ endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
@@ -29,3 +29,5 @@ ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
endif

View File

@@ -29,7 +29,7 @@ install : lib.grd
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@@ -48,10 +48,10 @@ endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
endif
#for install static library
@@ -64,7 +64,7 @@ endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), Linux)
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.17
VERSION = 0.2.16.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -79,11 +79,8 @@ VERSION = 0.2.17
# If you don't need LAPACKE (C Interface to LAPACK), please uncomment the following line.
# NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
@@ -111,10 +108,6 @@ NO_AFFINITY = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
# Force number of make jobs. The default is the number of logical CPU of the host.
# This is particularly useful when using distcc
# MAKE_NB_JOBS = 2
# If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1
@@ -145,6 +138,10 @@ NO_AFFINITY = 1
# slow (Not implemented yet).
# SANITY_CHECK = 1
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1
# The installation directory.
# PREFIX = /opt/OpenBLAS
@@ -162,11 +159,10 @@ COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
# Set maximum stack allocation.
# The default value is 2048. 0 disables stack allocation and may reduce GER and GEMV
# performance. For details, https://github.com/xianyi/OpenBLAS/pull/482
# Improve GEMV and GER for small matrices by stack allocation.
# For details, https://github.com/xianyi/OpenBLAS/pull/482
#
# MAX_STACK_ALLOC = 0
MAX_STACK_ALLOC=2048
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoid conflicts with other BLAS libraries, especially when using

View File

@@ -139,10 +139,6 @@ NO_PARALLEL_MAKE=0
endif
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
ifdef MAKE_NB_JOBS
GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS)
endif
ifeq ($(HOSTCC), loongcc)
GETARCH_FLAGS += -static
endif
@@ -296,14 +292,12 @@ endif
ifneq ($(OSNAME), WINNT)
ifneq ($(OSNAME), CYGWIN_NT)
ifneq ($(OSNAME), Interix)
ifneq ($(OSNAME), Android)
ifdef SMP
EXTRALIB += -lpthread
endif
endif
endif
endif
endif
# ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix))
@@ -330,8 +324,7 @@ ifdef SANITY_CHECK
CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU)
endif
MAX_STACK_ALLOC ?= 2048
ifneq ($(MAX_STACK_ALLOC), 0)
ifdef MAX_STACK_ALLOC
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
@@ -381,7 +374,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -395,7 +388,7 @@ endif
ifeq ($(USE_OPENMP), 1)
#check
#check
ifeq ($(USE_THREAD), 0)
$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.)
endif
@@ -959,18 +952,17 @@ ifeq ($(OSNAME), SunOS)
TAR = gtar
PATCH = gpatch
GREP = ggrep
AWK = nawk
else
TAR = tar
PATCH = patch
GREP = grep
AWK = awk
endif
ifndef MD5SUM
MD5SUM = md5sum
endif
AWK = awk
REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
@@ -988,8 +980,12 @@ COMMON_OPT = -O2
endif
ifndef FCOMMON_OPT
ifeq ($(OSNAME), WINNT)
FCOMMON_OPT = -O0
else
FCOMMON_OPT = -O2 -frecursive
endif
endif
@@ -1187,3 +1183,4 @@ SUNPATH = /opt/sunstudio12.1
else
SUNPATH = /opt/SUNWspro
endif

View File

@@ -75,11 +75,10 @@ Please read GotoBLAS_01Readme.txt
#### ARM64:
- **ARMV8**: Experimental
- **ARM Cortex-A57**: Experimental
### Support OS:
- **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are beginners on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.

199
USAGE.md
View File

@@ -1,199 +0,0 @@
# Notes on OpenBLAS usage
## Usage
#### Program is Terminated. Because you tried to allocate too many memory regions
In OpenBLAS, we manage a pool of memory buffers and allocate the number of
buffers as follows.
```
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
```
This error indicates that the program exceeded the number of buffers.
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`.
#### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
following ways:
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
* Call `openblas_set_num_threads(1)` in the application at runtime.
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
If the application is parallelized by OpenMP, please use OpenBLAS built with
`USE_OPENMP=1`
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
The environment variable which controls the kernel selection is
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
returns the used target.
#### How could I disable OpenBLAS threading affinity at runtime?
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
variable to disable threading affinity at runtime. For example, before
running the program,
```
export OPENBLAS_MAIN_FREE=1
```
Alternatively, you can disable the affinity feature by enabling `NO_AFFINITY=1`
in `Makefile.rule`.
## Linking with the library
* Link with shared library
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
If the library is multithreaded, please add `-lpthread`. If the library
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
* Link with static library
`gcc -o test test.c /your/path/libopenblas.a`
You can download `test.c` from https://gist.github.com/xianyi/5780018
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
default), custom programs statically linked against `libopenblas.a` should also
link with the pthread library e.g.:
```
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
```
Failing to add the `-lpthread` flag will cause errors such as:
```
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
...
```
## Code examples
#### Call CBLAS interface
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
```
#include <cblas.h>
#include <stdio.h>
/* Multiply two small column-major matrices with cblas_dgemm and print the
   nine entries of the result C. */
int main(void)
{
  int i=0;
  double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
  double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
  double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};

  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);

  for(i=0; i<9; i++)
    printf("%lf ", C[i]);
  printf("\n");

  /* main must return int in standard C; 0 reports success. */
  return 0;
}
```
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
#### Call BLAS Fortran interface
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
```
#include "stdio.h"
#include "stdlib.h"
#include "sys/time.h"
#include "time.h"
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
/* Time one dgemm_ call on m x k and k x n matrices given on the command line
   (arguments: m n k) and append the timing to timeDGEMM.txt.
   Returns 0 on success and 1 on bad input, allocation failure, or when the
   result file cannot be opened. */
int main(int argc, char* argv[])
{
  int i;
  printf("test!\n");
  if(argc<4){
    printf("Input Error\n");
    return 1;
  }

  int m = atoi(argv[1]);
  int n = atoi(argv[2]);
  int k = atoi(argv[3]);
  /* atoi yields 0 for non-numeric text; reject that and negative sizes
     before they reach malloc or dgemm_. */
  if(m<=0 || n<=0 || k<=0){
    printf("Input Error\n");
    return 1;
  }
  int sizeofa = m * k;
  int sizeofb = k * n;
  int sizeofc = m * n;
  char ta = 'N';
  char tb = 'N';
  double alpha = 1.2;
  double beta = 0.001;

  struct timeval start,finish;
  double duration;

  double* A = (double*)malloc(sizeof(double) * sizeofa);
  double* B = (double*)malloc(sizeof(double) * sizeofb);
  double* C = (double*)malloc(sizeof(double) * sizeofc);
  if(A == NULL || B == NULL || C == NULL){
    printf("Out of memory\n");
    free(A);
    free(B);
    free(C);
    return 1;
  }

  srand((unsigned)time(NULL));

  /* Deterministic fill; the random variant is kept as a trailing comment. */
  for (i=0; i<sizeofa; i++)
    A[i] = i%3+1;//(rand()%100)/10.0;

  for (i=0; i<sizeofb; i++)
    B[i] = i%3+1;//(rand()%100)/10.0;

  for (i=0; i<sizeofc; i++)
    C[i] = i%3+1;//(rand()%100)/10.0;

  printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
  gettimeofday(&start, NULL);
  dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
  gettimeofday(&finish, NULL);

  duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
  /* 2*m*n*k floating-point operations, scaled to millions per second. */
  double mflops = 2.0 * m *n*k;
  mflops = mflops/duration*1.0e-6;

  FILE *fp;
  fp = fopen("timeDGEMM.txt", "a");
  if(fp == NULL){
    perror("timeDGEMM.txt");
    free(A);
    free(B);
    free(C);
    return 1;
  }
  fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, mflops);
  fclose(fp);

  free(A);
  free(B);
  free(C);
  return 0;
}
```
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
` ./time_dgemm <m> <n> <k> `
## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should be less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A, `make test` may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case works when you run it from the shell.
## BLAS reference manual
If you want to understand every BLAS function and definition, please read
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
or [netlib.org](http://netlib.org/blas/)
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
## How to reference OpenBLAS.
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.

View File

@@ -39,6 +39,4 @@ before_build:
- cmake -G "Visual Studio 12 Win64" .
test_script:
- echo Running Test
- cd c:\projects\OpenBLAS\utest
- openblas_utest
- echo Build OK!

View File

@@ -166,8 +166,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling
ssymm.goto dsymm.goto csymm.goto zsymm.goto
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -2133,8 +2132,6 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env python
import os
import sys
import time
import numpy
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas
def run_dsyrk(N, l):
    """Time `l` calls of BLAS dsyrk on a random N x N matrix and print the rate.

    N -- matrix dimension (the input and the result are both N x N, float64,
         Fortran order).
    l -- number of dsyrk calls to time.

    Prints one line with the size, the MFlops figure (N*N*N operations per
    call) and the elapsed seconds. Returns None.
    """
    A = randn(N, N).astype('float64', order='F')
    C = zeros((N, N), dtype='float64', order='F')

    start = time.time()
    for _ in range(0, l):
        blas.dsyrk(1.0, A, c=C, overwrite_c=True)
    timediff = time.time() - start

    # A run shorter than the clock resolution measures 0.0 seconds; report an
    # infinite rate instead of raising ZeroDivisionError.
    if timediff > 0:
        mflops = (N * N * N) * l / timediff
        mflops *= 1e-6
    else:
        mflops = float('inf')

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
    # Defaults, in command-line order: first size, last size, step, loops.
    settings = [128, 2048, 128, 1]

    # Up to four positional arguments override the defaults; extras are ignored.
    for position, text in enumerate(sys.argv[1:5]):
        settings[position] = int(text)
    N, NMAX, NINC, LOOPS = settings

    # A non-empty OPENBLAS_LOOPS environment variable overrides the loop count.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    dimension = N
    while dimension < NMAX + NINC:
        run_dsyrk(dimension, LOOPS)
        dimension += NINC

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env python
import os
import sys
import time
import numpy
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas
def run_ssyrk(N, l):
    """Time `l` calls of BLAS ssyrk on a random N x N matrix and print the rate.

    N -- matrix dimension (the input and the result are both N x N, float32,
         Fortran order).
    l -- number of ssyrk calls to time.

    Prints one line with the size, the MFlops figure (N*N*N operations per
    call) and the elapsed seconds. Returns None.
    """
    A = randn(N, N).astype('float32', order='F')
    C = zeros((N, N), dtype='float32', order='F')

    start = time.time()
    for _ in range(0, l):
        blas.ssyrk(1.0, A, c=C, overwrite_c=True)
    timediff = time.time() - start

    # A run shorter than the clock resolution measures 0.0 seconds; report an
    # infinite rate instead of raising ZeroDivisionError.
    if timediff > 0:
        mflops = (N * N * N) * l / timediff
        mflops *= 1e-6
    else:
        mflops = float('inf')

    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))
if __name__ == "__main__":
    # Defaults, in command-line order: first size, last size, step, loops.
    settings = [128, 2048, 128, 1]

    # Up to four positional arguments override the defaults; extras are ignored.
    for position, text in enumerate(sys.argv[1:5]):
        settings[position] = int(text)
    N, NMAX, NINC, LOOPS = settings

    # A non-empty OPENBLAS_LOOPS environment variable overrides the loop count.
    p = os.environ.get('OPENBLAS_LOOPS')
    if p:
        LOOPS = int(p)

    print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS))
    print("\tSIZE\t\t\tFlops\t\t\t\t\tTime")

    dimension = N
    while dimension < NMAX + NINC:
        run_ssyrk(dimension, LOOPS)
        dimension += NINC

View File

@@ -1,196 +0,0 @@
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <omp.h>
#ifndef _WIN32
#include <pthread.h>
#endif
// Smallest and largest matrix dimension benchmarked, and the number of
// geometric steps between them (main derives its size growth factor from these).
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10
// Loop budget for a 1x1 matrix; each benchmark divides it by the matrix size.
// Lower it if the test is too slow on your computer.
#define NLOOP 2e7
// Parameters describing one benchmark run.
typedef struct {
// Dimension of the square test matrix.
int matrix_size;
// Loop budget; the benchmarks divide it by matrix_size to get a call count.
int n_loop;
// Benchmark driver; the callers in this file pass it a BenchParam *.
void (* bench_func)();
// BLAS routine under test. Declared without a prototype; the benchmarks
// call it with different argument lists.
void (* blas_func)();
// Builds the input buffer for a given element count.
void * (* create_matrix)(int size);
} BenchParam;
// Allocate a single-precision real buffer of `size` elements and set
// element i to 1e3 * i / size.
// Returns the malloc'ed buffer; the caller releases it with free().
// Exits the program if a non-empty buffer cannot be allocated.
void * s_create_matrix(int size) {
  // Size the buffer from the element type (the buffer holds floats).
  float * values = malloc((size_t)size * sizeof *values);
  int idx;

  if (values == NULL && size > 0) {
    fprintf(stderr, "s_create_matrix: out of memory (%d elements)\n", size);
    exit(EXIT_FAILURE);
  }
  for (idx = 0; idx < size; idx++)
    values[idx] = 1e3 * idx / size;
  return values;
}
// Allocate a single-precision buffer of 2 * size floats (two values per
// element) and set value i to 1e3 * i / size.
// Returns the malloc'ed buffer; the caller releases it with free().
// Exits the program if a non-empty buffer cannot be allocated.
void * c_create_matrix(int size) {
  // Size the buffer from the element type (the buffer holds floats).
  float * values = malloc((size_t)size * 2 * sizeof *values);
  int idx;

  if (values == NULL && size > 0) {
    fprintf(stderr, "c_create_matrix: out of memory (%d elements)\n", size);
    exit(EXIT_FAILURE);
  }
  for (idx = 0; idx < 2 * size; idx++)
    values[idx] = 1e3 * idx / size;
  return values;
}
// Allocate a double-precision buffer of 2 * size doubles (two values per
// element) and set value i to 1e3 * i / size.
// Returns the malloc'ed buffer; the caller releases it with free().
// Exits the program if a non-empty buffer cannot be allocated.
void * z_create_matrix(int size) {
  double * values = malloc((size_t)size * 2 * sizeof *values);
  int idx;

  if (values == NULL && size > 0) {
    fprintf(stderr, "z_create_matrix: out of memory (%d elements)\n", size);
    exit(EXIT_FAILURE);
  }
  for (idx = 0; idx < 2 * size; idx++)
    values[idx] = 1e3 * idx / size;
  return values;
}
// Allocate a double-precision real buffer of `size` elements and set
// element i to 1e3 * i / size.
// Returns the malloc'ed buffer; the caller releases it with free().
// Exits the program if a non-empty buffer cannot be allocated.
void * d_create_matrix(int size) {
  double * values = malloc((size_t)size * sizeof *values);
  int idx;

  if (values == NULL && size > 0) {
    fprintf(stderr, "d_create_matrix: out of memory (%d elements)\n", size);
    exit(EXIT_FAILURE);
  }
  for (idx = 0; idx < size; idx++)
    values[idx] = 1e3 * idx / size;
  return values;
}
// Benchmark a triangular matrix-vector routine: builds a dim x dim matrix and
// a vector of length dim, then calls param->blas_func n_loop / dim times with
// the flags "U", "N", "N" and unit vector increment.
void trmv_bench(BenchParam * param)
{
  int dim = param->matrix_size;
  int inc = 1;
  int remaining;
  const int calls = param->n_loop / dim;
  void * matrix = param->create_matrix(dim * dim);
  void * vector = param->create_matrix(dim);

  for (remaining = calls; remaining > 0; remaining--)
    param->blas_func("U", "N", "N", &dim, matrix, &dim, vector, &inc);

  free(vector);
  free(matrix);
}
// Benchmark a general matrix-vector routine: builds a dim x dim matrix and a
// vector of length dim, then calls param->blas_func n_loop / dim times. The
// same vector serves as input and output, and the same scalar (1.01) is
// passed for both scaling factors.
void gemv_bench(BenchParam * param)
{
  int dim = param->matrix_size;
  int inc = 1;
  int remaining;
  double scale = 1.01;
  const int calls = param->n_loop / dim;
  void * matrix = param->create_matrix(dim * dim);
  void * vector = param->create_matrix(dim);

  for (remaining = calls; remaining > 0; remaining--)
    param->blas_func("N", &dim, &dim, &scale, matrix, &dim, vector, &inc, &scale, vector, &inc);

  free(vector);
  free(matrix);
}
// Benchmark a rank-1 update routine: builds a dim x dim matrix and a vector of
// length dim, then calls param->blas_func n_loop / dim times, passing the same
// vector for both vector arguments and 1.01 as the scalar.
void ger_bench(BenchParam * param) {
  int dim = param->matrix_size;
  int inc = 1;
  int remaining;
  double scale = 1.01;
  const int calls = param->n_loop / dim;
  void * matrix = param->create_matrix(dim * dim);
  void * vector = param->create_matrix(dim);

  for (remaining = calls; remaining > 0; remaining--)
    param->blas_func(&dim, &dim, &scale, vector, &inc, vector, &inc, matrix, &dim);

  free(vector);
  free(matrix);
}
#ifndef _WIN32
void * pthread_func_wrapper(void * param) {
((BenchParam *)param)->bench_func(param);
pthread_exit(NULL);
}
#endif
// Number of benchmark entries in TESTS.
#define NB_TESTS 5
// Benchmark table with four slots per entry, in this order: benchmark driver,
// BLAS routine, matrix constructor, display name. main() reads entry k from
// slots 4*k .. 4*k+3.
void * TESTS[4 * NB_TESTS] = {
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
gemv_bench, dgemv_, d_create_matrix, "dgemv",
gemv_bench, zgemv_, z_create_matrix, "zgemv",
ger_bench, dger_, d_create_matrix, "dger",
ger_bench, zgerc_, z_create_matrix, "zgerc",
};
// Seconds elapsed on the monotonic clock between `tick` and now.
inline static double delta_time(struct timespec tick) {
  struct timespec now;
  double whole_seconds;
  double nanoseconds;

  clock_gettime(CLOCK_MONOTONIC, &now);
  whole_seconds = now.tv_sec - tick.tv_sec;
  nanoseconds = now.tv_nsec - tick.tv_nsec;
  return whole_seconds + nanoseconds / 1e9;
}
// Run the benchmark concurrently on nb_threads POSIX threads, each with
// n_loop / nb_threads of the loop budget, and return the elapsed time in
// seconds on the monotonic clock.
// Returns 0 on Windows (the threaded variant is compiled out there) and when
// nb_threads is less than 1. Exits the program if a thread cannot be created.
double pthread_bench(BenchParam * param, int nb_threads)
{
#ifdef _WIN32
  return 0;
#else
  // A count below 1 would give a zero-length array and a division by zero.
  if (nb_threads < 1)
    return 0;

  BenchParam threaded_param = *param;
  pthread_t threads[nb_threads];
  int t, rc;
  struct timespec tick;

  // Every thread reads the same copy, so the total work stays about n_loop.
  threaded_param.n_loop /= nb_threads;
  clock_gettime(CLOCK_MONOTONIC, &tick);
  for(t=0; t<nb_threads; t++){
    rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
    if (rc){
      printf("ERROR; return code from pthread_create() is %d\n", rc);
      exit(-1);
    }
  }
  for(t=0; t<nb_threads; t++){
    pthread_join(threads[t], NULL);
  }
  return delta_time(tick);
#endif
}
// Run the benchmark once on the calling thread and return the elapsed time in
// seconds.
double seq_bench(BenchParam * param) {
  struct timespec started;
  double elapsed;

  clock_gettime(CLOCK_MONOTONIC, &started);
  param->bench_func(param);
  elapsed = delta_time(started);
  return elapsed;
}
// Run the benchmark inside an OpenMP parallel loop with one iteration per
// available thread, each using n_loop / thread-count of the loop budget, and
// return the elapsed time in seconds.
double omp_bench(BenchParam * param) {
  struct timespec started;
  BenchParam per_thread = *param;
  const int team = omp_get_max_threads();
  int worker;

  per_thread.n_loop /= team;
  clock_gettime(CLOCK_MONOTONIC, &started);
#pragma omp parallel for
  for (worker = 0; worker < team; worker++) {
    param->bench_func(&per_thread);
  }
  return delta_time(started);
}
// For every entry of TESTS, time the routine sequentially, under OpenMP and
// under pthreads for matrix sizes growing geometrically from MIN_SIZE to
// MAX_SIZE, and print one result line per size.
int main(int argc, char * argv[]) {
  const double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
  BenchParam param;
  int test_id;
  double size;

  printf ("Running on %d threads\n", omp_get_max_threads());
  for (test_id = 0; test_id < NB_TESTS; test_id++) {
    // Four consecutive slots describe one benchmark.
    void ** entry = &TESTS[test_id * 4];

    param.bench_func = entry[0];
    param.blas_func = entry[1];
    param.create_matrix = entry[2];
    printf("\nBenchmark of %s\n", (char*)entry[3]);
    param.n_loop = NLOOP;

    for (size = MIN_SIZE; size <= MAX_SIZE; size *= inc_factor) {
      double seq_time, omp_time, pthread_time;

      // Round the fractional size to the nearest integer dimension.
      param.matrix_size = (int)(size + 0.5);
      seq_time = seq_bench(&param);
      omp_time = omp_bench(&param);
      pthread_time = pthread_bench(&param, omp_get_max_threads());
      printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
             "pthread %gs, speedup %g\n",
             param.matrix_size, seq_time,
             omp_time, seq_time / omp_time,
             pthread_time, seq_time / pthread_time);
    }
  }
  return 0;
}

View File

@@ -14,12 +14,12 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
if (NOT NO_EXPRECISION)
if (${F_COMPILER} MATCHES "GFORTRAN")
# N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
set(EXPRECISION 1)
set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION")
set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double")
@@ -28,35 +28,35 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64")
endif ()
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -wd981")
endif ()
if (USE_OPENMP)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB")
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
if (${CMAKE_C_COMPILER} STREQUAL "Clang")
message(WARNING "Clang doesn't support OpenMP yet.")
set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel")
if (${CMAKE_C_COMPILER} STREQUAL "Intel")
set(CCOMMON_OPT "${CCOMMON_OPT} -openmp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
set(CEXTRALIB "${CEXTRALIB} -lstdc++")
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
set(CCOMMON_OPT "${CCOMMON_OPT} -mp")
endif ()
endif ()
@@ -87,7 +87,7 @@ if (${ARCH} STREQUAL "ia64")
set(BINARY_DEFINED 1)
if (${F_COMPILER} MATCHES "GFORTRAN")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
if (${CMAKE_C_COMPILER} STREQUAL "GNU")
# EXPRECISION = 1
# CCOMMON_OPT += -DEXPRECISION
endif ()

View File

@@ -48,18 +48,18 @@ set(SLASRC
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
sgetc2.f sgetri.f
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
sggglm.f sgghrd.f sgglse.f sggqrf.f
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sggrqf.f sggsvd.f sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
slansy.f slantb.f slantp.f slantr.f slanv2.f
slapll.f slapmt.f
@@ -69,7 +69,7 @@ set(SLASRC
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f
slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
@@ -97,7 +97,7 @@ set(SLASRC
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
stptrs.f
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
strtrs.f stzrqf.f stzrzf.f sstemr.f
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f
@@ -114,14 +114,14 @@ set(CLASRC
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f
cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f
cgerq2.f cgerqf.f cgesc2.f cgesdd.f cgesvd.f
cgesvx.f cgetc2.f cgetri.f
cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f
cgghrd.f cgglse.f cggqrf.f cggrqf.f
DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f
cggsvd.f cggsvp.f
cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f
chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f
checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f
@@ -138,7 +138,7 @@ set(CLASRC
claed0.f claed7.f claed8.f
claein.f claesy.f claev2.f clags2.f clagtm.f
clahef.f clahef_rook.f clahqr.f
DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f
clanhb.f clanhe.f
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
@@ -149,7 +149,7 @@ set(CLASRC
clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f
cposv.f cposvx.f cpstrf.f cpstf2.f
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
@@ -166,7 +166,7 @@ set(CLASRC
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
ctprfs.f ctptri.f
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
@@ -186,18 +186,18 @@ set(DLASRC
dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
dgetc2.f dgetri.f
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
dggglm.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
dlapll.f dlapmt.f
@@ -207,7 +207,7 @@ set(DLASRC
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f
dlargv.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
@@ -235,7 +235,7 @@ set(DLASRC
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
dtptrs.f
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f
@@ -251,14 +251,14 @@ set(ZLASRC
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
zgetri.f
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
zgghrd.f zgglse.f zggqrf.f zggrqf.f
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
zggsvd.f zggsvp.f
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
@@ -275,7 +275,7 @@ set(ZLASRC
zlaed0.f zlaed7.f zlaed8.f
zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f
zlahef.f zlahef_rook.f zlahqr.f
DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f
zlangt.f zlanhb.f
zlanhe.f
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
@@ -288,7 +288,7 @@ set(ZLASRC
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f zlasyf.f zlasyf_rook.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
@@ -306,7 +306,7 @@ set(ZLASRC
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
ztprfs.f ztptri.f
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f

View File

@@ -2038,59 +2038,6 @@ set(MATGEN
lapacke_zlagsy_work.c
)
# Utils_SRC: bare file names of the LAPACKE utility sources -- the per-matrix-type
# *_nancheck.c and *_trans.c helpers plus lapacke_lsame.c, lapacke_xerbla.c and the
# lapacke_make_complex_{float,double}.c helpers.
# Entries carry no directory component; whoever consumes the list must prepend
# the directory the files live in.
set(Utils_SRC
lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c
lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c
lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c
lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c
lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c
lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c
lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c
lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c
lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c
lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c
lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c
lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c
lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c
lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c
lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c
lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c
lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c
lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c
lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c
lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c
lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c
lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c
lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c
lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c
lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c
lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c
lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c
lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c
lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c
lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c
lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c
lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c
lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c
lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c
lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c
lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c
lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c
lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c
lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c
lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c
lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c
lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c
lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c
lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c
lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c
lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c
lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c
lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c
lapacke_dpb_nancheck.c lapacke_sst_nancheck.c
lapacke_dpb_trans.c lapacke_ssy_nancheck.c
)
set(LAPACKE_REL_SRC "")
if (BUILD_SINGLE)
list(APPEND LAPACKE_REL_SRC ${SSRC})
@@ -2111,14 +2058,10 @@ endif ()
# add lapack-netlib folder to the sources
set(LAPACKE_SOURCES "")
foreach (LAE_FILE ${LAPACKE_REL_SRC})
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}")
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}")
endforeach ()
foreach (Utils_FILE ${Utils_SRC})
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}")
endforeach ()
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include")
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
include_directories(${lapacke_include_dir})
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")

View File

@@ -93,7 +93,7 @@ extern "C" {
#include <sched.h>
#endif
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD)
#include <sched.h>
#endif
@@ -332,13 +332,12 @@ typedef int blasint;
#endif
#endif
/*
#ifdef PILEDRIVER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
/*
#ifdef STEAMROLLER
@@ -412,7 +411,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#ifndef ASSEMBLER
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p))
#else
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
@@ -728,7 +727,6 @@ typedef struct {
#endif
#ifndef ASSEMBLER
#include "common_stackalloc.h"
#if 0
#include "symcopy.h"
#endif

View File

@@ -1194,6 +1194,8 @@ extern gotoblas_t *gotoblas;
#define XGEMM_DEFAULT_UNROLL_N 2
#endif
#define GEMM_THREAD gemm_thread_m
#ifndef GEMM_THREAD
#define GEMM_THREAD gemm_thread_n
#endif

View File

@@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -258,13 +258,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
#if defined(POWER8)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
#endif
#
#ifndef L1_PREFETCH
#define L1_PREFETCH dcbt
#endif
@@ -797,8 +790,6 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

View File

@@ -1,73 +0,0 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/*
 * Stack canary support for the STACK_ALLOC / STACK_FREE macros.
 *
 * STACK_ALLOC_PROTECT is defined unconditionally on the next line, so the
 * protected variants below are always the ones selected; remove that define
 * to make both helper macros expand to nothing.
 */
#define STACK_ALLOC_PROTECT
#ifdef STACK_ALLOC_PROTECT
// Try to detect stack smashing: a sentinel local is declared by
// STACK_ALLOC_PROTECT_SET and re-validated by STACK_ALLOC_PROTECT_CHECK.
// The check is an assert(), so it compiles to nothing when NDEBUG is defined.
#include <assert.h>
// Declares the sentinel `stack_check` (volatile, so the later read is not
// optimised away) holding the magic value 0x7fc01234.
#define STACK_ALLOC_PROTECT_SET volatile int stack_check = 0x7fc01234;
// Aborts via assert() if the sentinel no longer holds the magic value.
#define STACK_ALLOC_PROTECT_CHECK assert(stack_check == 0x7fc01234);
#else
// Protection disabled: both helpers expand to nothing.
#define STACK_ALLOC_PROTECT_SET
#define STACK_ALLOC_PROTECT_CHECK
#endif
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
/*
 * STACK_ALLOC(SIZE, TYPE, BUFFER)
 *
 * Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC.
 * Stack allocation is much faster than blas_memory_alloc or malloc, particularly
 * when OpenBLAS is used from a multi-threaded application.
 *
 *   SIZE   - number of TYPE elements requested.
 *   TYPE   - element type of the buffer.
 *   BUFFER - lvalue of type TYPE* that receives the buffer address.
 *
 * SIZE must be carefully chosen to be:
 * - as small as possible to maximize the number of stack allocation
 * - large enough to support all architectures and kernel
 * Choosing a too small SIZE will lead to stack smashing.
 *
 * If SIZE * sizeof(TYPE) would exceed MAX_STACK_ALLOC bytes, the request falls
 * back to blas_memory_alloc(1) and stack_alloc_size is set to 0 so that
 * STACK_FREE knows it has to release the buffer.
 *
 * The macro expands to declarations (stack_alloc_size, the optional canary and
 * stack_buffer) followed by statements, so it must be used at block scope, at
 * most once per scope, and in the same scope as the matching STACK_FREE.
 */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
/* A variable-length array must have a length > 0: declare one element */ \
/* when the heap fallback is taken (stack_alloc_size == 0). */ \
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] __attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
/*
 * Original OpenBLAS/GotoBLAS behaviour: SIZE is ignored and the buffer always
 * comes from blas_memory_alloc(1).
 */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1)
#endif
/*
 * STACK_FREE(BUFFER)
 *
 * Counterpart of STACK_ALLOC. When stack allocation is enabled it first runs
 * STACK_ALLOC_PROTECT_CHECK, then calls blas_memory_free(BUFFER) only if
 * stack_alloc_size is 0, i.e. only when STACK_ALLOC took the
 * blas_memory_alloc() fallback. It reads the `stack_alloc_size` local declared
 * by STACK_ALLOC, so both macros must be used in the same scope.
 *
 * Note: the enabled variant expands to an unbraced `if` statement preceded by
 * the canary check, so it is not safe as the sole body of an if/else.
 */
#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
#define STACK_FREE(BUFFER) \
STACK_ALLOC_PROTECT_CHECK \
if(!stack_alloc_size) \
blas_memory_free(BUFFER);
#else
/* Stack allocation disabled: the buffer always came from blas_memory_alloc(). */
#define STACK_FREE(BUFFER) blas_memory_free(BUFFER)
#endif

View File

@@ -41,10 +41,6 @@
#ifndef ASSEMBLER
#ifdef C_MSVC
#include <intrin.h>
#endif
#define MB
#define WMB
@@ -174,13 +170,12 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x;
#if defined(_MSC_VER) && !defined(__clang__)
result = x/y;
return result;
#else
y = blas_quick_divide_table[y];
#if defined(_MSC_VER) && !defined(__clang__)
(void*)result;
return x*y;
#else
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
return result;

View File

@@ -396,7 +396,7 @@ REALNAME:
#define PROFCODE
#define EPILOGUE .end
#define EPILOGUE .end REALNAME
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)

View File

@@ -115,9 +115,6 @@ int detect(void)
if (strstr(p, "0xc0f")) {
return CPU_CORTEXA15;
}
if (strstr(p, "0xd07")) {
return CPU_ARMV7; //ARMV8 on 32-bit
}
}
@@ -161,27 +158,6 @@ int detect(void)
}
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if ((!strncmp("CPU architecture", buffer, 16)))
{
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL) {
if (strstr(p, "8")) {
return CPU_ARMV7; //ARMV8 on 32-bit
}
}
#endif
return CPU_UNKNOWN;

View File

@@ -191,8 +191,6 @@ void get_cpuconfig(void)
printf("#define L2_SIZE 2097152\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

View File

@@ -55,7 +55,6 @@
#define CPUTYPE_POWER6 5
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8
char *cpuname[] = {
"UNKNOWN",
@@ -66,7 +65,6 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
};
char *lowercpuname[] = {
@@ -78,7 +76,6 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
"power8"
};
char *corename[] = {
@@ -90,7 +87,6 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
};
int detect(void){
@@ -119,7 +115,7 @@ int detect(void){
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;

View File

@@ -1172,9 +1172,6 @@ int get_cpuname(void){
#endif
else
return CPUTYPE_NEHALEM;
case 13:
// Avoton
return CPUTYPE_NEHALEM;
}
break;
case 5:
@@ -1232,7 +1229,6 @@ int get_cpuname(void){
case 2:
return CPUTYPE_OPTERON;
case 1:
case 3:
case 10:
return CPUTYPE_BARCELONA;
case 6:
@@ -1678,9 +1674,6 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 13:
// Avoton
return CORE_NEHALEM;
}
break;
case 5:

View File

@@ -1365,9 +1365,8 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
IF( TRACE )
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@@ -1365,9 +1365,8 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
IF( TRACE )
$ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@@ -1335,9 +1335,8 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
IF( TRACE )
$ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@@ -1339,9 +1339,8 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
IF( TRACE )
$ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@@ -1350,7 +1350,7 @@
*
* Call the subroutine.
*
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
IF( FULL )THEN
IF( TRACE )
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
@@ -1376,7 +1376,7 @@
CALL CZTPMV( IORDER, UPLO, TRANS, DIAG,
$ N, AA, XX, INCX )
END IF
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
IF( FULL )THEN
IF( TRACE )
$ WRITE( NTRA, FMT = 9993 )NC, SNAME,
@@ -1465,7 +1465,7 @@
END IF
*
IF( .NOT.NULL )THEN
IF( SNAME( 10: 11 ).EQ.'mv' )THEN
IF( SNAME( 4: 5 ).EQ.'mv' )THEN
*
* Check the result.
*
@@ -1473,7 +1473,7 @@
$ INCX, ZERO, Z, INCX, XT, G,
$ XX, EPS, ERR, FATAL, NOUT,
$ .TRUE. )
ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN
ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN
*
* Compute approximation to original vector.
*
@@ -1611,7 +1611,7 @@
* .. Common blocks ..
COMMON /INFOC/INFOT, NOUTC, OK
* .. Executable Statements ..
CONJ = SNAME( 11: 11 ).EQ.'c'
CONJ = SNAME( 5: 5 ).EQ.'c'
* Define the number of arguments.
NARGS = 9
*

View File

@@ -1366,9 +1366,8 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
IF( TRACE )
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@@ -1366,9 +1366,8 @@
*
150 CONTINUE
WRITE( NOUT, FMT = 9996 )SNAME
IF( TRACE )
$ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
$ M, N, ALPHA, LDA, LDB)
*
160 CONTINUE
RETURN

View File

@@ -1,7 +1,7 @@
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
T LOGICAL FLAG, T TO STOP ON FAILURES.
F LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO

View File

@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif
x = buffer;
buffer += ((COMPSIZE * args -> m + 3) & ~3);
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
}
#ifndef TRANS
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
queue[num_cpu - 1].next = NULL;

View File

@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1);
}

View File

@@ -48,7 +48,8 @@ foreach (float_type ${FLOAT_TYPES})
# TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
# Need to set CONJ for trmm and trsm
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type})
GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type})
@@ -71,10 +72,6 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type})
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
#herk
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type})
GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type})
#hemm
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type})
GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type})
@@ -99,17 +96,6 @@ foreach (float_type ${FLOAT_TYPES})
endif()
endif ()
endforeach ()
# gemm3m: when USE_GEMM3M is set, generate one gemm3m.c object per entry of
# GEMM_DEFINES for the current ${float_type}. Each object is compiled with the
# entry as its define and named gemm3m_<entry in lower case>.
if (USE_GEMM3M)
  foreach (GEMM_DEFINE ${GEMM_DEFINES})
    # Quote the expansion so an empty element cannot shift the arguments of string().
    string(TOLOWER "${GEMM_DEFINE}" GEMM_DEFINE_LC)
    GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
    # The THREADED_LEVEL3 variant is only generated for SMP builds that do not
    # use the simple threaded level-3 scheme.
    if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
      GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
    endif ()
  endforeach ()
endif ()
endif ()
endforeach ()

View File

@@ -335,9 +335,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

View File

@@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
START_RPCC();

View File

@@ -33,7 +33,6 @@ set(COMMON_SOURCES
xerbla.c
openblas_set_num_threads.c
openblas_error_handle.c
openblas_env.c
openblas_get_num_procs.c
openblas_get_num_threads.c
)

View File

@@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@@ -118,9 +118,6 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_env.$(SUFFIX) : openblas_env.c
$(CC) $(CFLAGS) -c $< -o $(@F)
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)

View File

@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>
@@ -92,8 +92,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
extern unsigned int openblas_thread_timeout();
#ifdef SMP_SERVER
#undef MONITOR
@@ -526,7 +524,6 @@ static int blas_monitor(void *arg){
int blas_thread_init(void){
BLASLONG i;
int ret;
int thread_timeout_env;
#ifdef NEED_STACKATTR
pthread_attr_t attr;
#endif
@@ -543,12 +540,22 @@ int blas_thread_init(void){
if (!blas_server_avail){
thread_timeout_env=openblas_thread_timeout();
if (thread_timeout_env>0) {
if (thread_timeout_env < 4) thread_timeout_env = 4;
if (thread_timeout_env > 30) thread_timeout_env = 30;
thread_timeout = (1 << thread_timeout_env);
}
env_var_t p;
if (readenv(p,"THREAD_TIMEOUT")) {
thread_timeout = atoi(p);
if (thread_timeout < 4) thread_timeout = 4;
if (thread_timeout > 30) thread_timeout = 30;
thread_timeout = (1 << thread_timeout);
}else{
if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
thread_timeout = atoi(p);
if (thread_timeout < 4) thread_timeout = 4;
if (thread_timeout > 30) thread_timeout = 30;
thread_timeout = (1 << thread_timeout);
}
}
for(i = 0; i < blas_num_threads - 1; i++){
@@ -569,12 +576,10 @@ int blas_thread_init(void){
struct rlimit rlim;
const char *msg = strerror(ret);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
#ifdef RLIMIT_NPROC
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
"%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max));
}
#endif
if(0 != raise(SIGINT)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n");
exit(EXIT_FAILURE);

View File

@@ -261,11 +261,6 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Avoton
if (model == 13) {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
return NULL;
case 5:
//Intel Broadwell
@@ -391,7 +386,7 @@ static char *corename[] = {
"Nehalem",
"Athlon",
"Opteron",
"Opteron_SSE3",
"Opteron(SSE3)",
"Barcelona",
"Nano",
"Sandybridge",

View File

@@ -144,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(_MSC_VER) && !defined(__clang__)
#define CONSTRUCTOR __cdecl
#define DESTRUCTOR __cdecl
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#elif defined(OS_DARWIN) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
@@ -169,7 +169,7 @@ void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS)
#ifdef OS_LINUX
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
@@ -294,11 +294,8 @@ void openblas_fork_handler()
#endif
}
extern int openblas_num_threads_env();
extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
env_var_t p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
@@ -313,18 +310,18 @@ int blas_get_cpu_number(void){
blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
blas_goto_num=openblas_goto_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
if (blas_goto_num < 0) blas_goto_num = 0;
}
#endif
blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
@@ -360,9 +357,7 @@ int openblas_get_num_threads(void) {
#ifndef SMP
return 1;
#else
// init blas_cpu_number if needed
blas_get_cpu_number();
return blas_cpu_number;
return blas_get_cpu_number();
#endif
}
@@ -1343,7 +1338,6 @@ static void gotoblas_memory_init(void) {
/* Initialization for all function; this function should be called before main */
static int gotoblas_initialized = 0;
extern void openblas_read_env();
void CONSTRUCTOR gotoblas_init(void) {
@@ -1353,8 +1347,6 @@ void CONSTRUCTOR gotoblas_init(void) {
openblas_fork_handler();
#endif
openblas_read_env();
#ifdef PROFILE
moncontrol (0);
#endif
@@ -1371,8 +1363,7 @@ void CONSTRUCTOR gotoblas_init(void) {
gotoblas_memory_init();
#endif
//#if defined(OS_LINUX)
#if 0
#if defined(OS_LINUX)
struct rlimit curlimit;
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
{

View File

@@ -1,84 +0,0 @@
/***************************************************************************
Copyright (c) 2011-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/* Cached copies of the environment settings. Every value starts at 0 and is
   overwritten by openblas_read_env(). */
static int          openblas_env_verbose              = 0;
static unsigned int openblas_env_thread_timeout       = 0;
static int          openblas_env_block_factor         = 0;
static int          openblas_env_openblas_num_threads = 0;
static int          openblas_env_goto_num_threads     = 0;
static int          openblas_env_omp_num_threads      = 0;

/* Cached value of OPENBLAS_VERBOSE. */
int openblas_verbose()
{
  return openblas_env_verbose;
}

/* Cached value of OPENBLAS_THREAD_TIMEOUT. */
unsigned int openblas_thread_timeout()
{
  return openblas_env_thread_timeout;
}

/* Cached value of OPENBLAS_BLOCK_FACTOR. */
int openblas_block_factor()
{
  return openblas_env_block_factor;
}

/* Cached value of OPENBLAS_NUM_THREADS. */
int openblas_num_threads_env()
{
  return openblas_env_openblas_num_threads;
}

/* Cached value of GOTO_NUM_THREADS. */
int openblas_goto_num_threads_env()
{
  return openblas_env_goto_num_threads;
}

/* Cached value of OMP_NUM_THREADS. */
int openblas_omp_num_threads_env()
{
  return openblas_env_omp_num_threads;
}
/* Read the OpenBLAS-related environment variables and store them in the
   file-level cache variables. For each variable: a value that readenv() does
   not report as present is stored as 0, and a negative parsed value is
   clamped to 0. */
void openblas_read_env() {
  env_var_t p;
  int value;

  /* OPENBLAS_VERBOSE */
  value = readenv(p,"OPENBLAS_VERBOSE") ? atoi(p) : 0;
  openblas_env_verbose = (value < 0) ? 0 : value;

  /* OPENBLAS_BLOCK_FACTOR */
  value = readenv(p,"OPENBLAS_BLOCK_FACTOR") ? atoi(p) : 0;
  openblas_env_block_factor = (value < 0) ? 0 : value;

  /* OPENBLAS_THREAD_TIMEOUT (stored as unsigned after clamping) */
  value = readenv(p,"OPENBLAS_THREAD_TIMEOUT") ? atoi(p) : 0;
  if (value < 0) value = 0;
  openblas_env_thread_timeout = (unsigned int)value;

  /* OPENBLAS_NUM_THREADS */
  value = readenv(p,"OPENBLAS_NUM_THREADS") ? atoi(p) : 0;
  openblas_env_openblas_num_threads = (value < 0) ? 0 : value;

  /* GOTO_NUM_THREADS */
  value = readenv(p,"GOTO_NUM_THREADS") ? atoi(p) : 0;
  openblas_env_goto_num_threads = (value < 0) ? 0 : value;

  /* OMP_NUM_THREADS */
  value = readenv(p,"OMP_NUM_THREADS") ? atoi(p) : 0;
  openblas_env_omp_num_threads = (value < 0) ? 0 : value;
}

View File

@@ -33,7 +33,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
extern int openblas_verbose();
/* Return the verbosity level taken from the OPENBLAS_VERBOSE environment
   variable. Yields 0 when readenv() does not report the variable as present,
   and never returns a negative value. */
int openblas_verbose() {
  env_var_t p;
  int level = readenv(p,"OPENBLAS_VERBOSE") ? atoi(p) : 0;

  return (level < 0) ? 0 : level;
}
void openblas_warning(int verbose, const char * msg) {
int current_verbose;

View File

@@ -40,7 +40,6 @@
#include <string.h>
#include "common.h"
extern int openblas_block_factor();
int get_L2_size(void);
#define DEFAULT_GEMM_P 128
@@ -250,6 +249,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
env_var_t p;
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
int size = 16;
@@ -468,8 +468,9 @@ void blas_set_parameter(void){
#endif
#endif
factor=openblas_block_factor();
if (factor>0) {
if (readenv(p,"GOTO_BLOCK_FACTOR")) {
factor = atoi(p);
if (factor < 10) factor = 10;
if (factor > 200) factor = 200;

View File

@@ -26,10 +26,6 @@ ifndef ONLY_CBLAS
ONLY_CBLAS = 0
endif
ifndef BUILD_LAPACK_DEPRECATED
BUILD_LAPACK_DEPRECATED = 0
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(F_COMPILER), GFORTRAN)
ifndef ONLY_CBLAS
@@ -96,17 +92,17 @@ dll : ../$(LIBDLLNAME)
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
libopenblas.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed
$(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def
$(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed
$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
@@ -118,7 +114,7 @@ endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), Linux)
so : ../$(LIBSONAME)
@@ -209,26 +205,26 @@ static : ../$(LIBNAME)
rm -f goto.$(SUFFIX)
osx.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
objcopy.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
objconv.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
test : linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest
linktest.c : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c
clean ::
@rm -f *.def *.dylib __.SYMDEF* *.renamed
@rm -f *.def *.dylib __.SYMDEF*
include ../Makefile.tail

View File

@@ -548,6 +548,7 @@
slatmt,
sorm22,
spotrf2,
xerbla,
zgejsv,
zgesvdx,
zgesvj,
@@ -590,13 +591,6 @@
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
);
@lapack_deprecated_objs = (
cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp,
cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd,
cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm,
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
);
@lapackeobjs = (
# LAPACK C interface routines.
#
@@ -2991,11 +2985,6 @@ if ($ARGV[8] == 1) {
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
};
if ($ARGV[11] == 1){
#BUILD_LAPACK_DEPRECATED=1
@underscore_objs =(@underscore_objs, @lapack_deprecated_objs);
}
} else {
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
}

View File

@@ -1,7 +1,5 @@
#!/usr/bin/perl
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
#
# 1. Not specified
# 1.1 Automatically detect, then check compiler
@@ -274,9 +272,8 @@ if ($link ne "") {
}
if ($flags =~ /^\-Y/) {
next if ($hostos eq 'SunOS');
$linker_L .= "-Wl,". $flags . " ";
}
}
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;

View File

@@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(linux) || defined(__sun__)
#ifdef linux
#include <sys/sysinfo.h>
#include <unistd.h>
#endif
@@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER5"
#endif
#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER6"
@@ -565,20 +565,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER6"
#endif
#if defined(FORCE_POWER8)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER8"
#define SUBDIRNAME "power"
#define ARCHCONFIG "-DPOWER8 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "power8"
#define CORENAME "POWER8"
#endif
#ifdef FORCE_PPCG4
#define FORCE
#define ARCHITECTURE "POWER"
@@ -920,7 +906,7 @@ static int get_num_cores(void) {
size_t len;
#endif
#if defined(linux) || defined(__sun__)
#ifdef linux
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
@@ -1012,9 +998,7 @@ int main(int argc, char *argv[]){
#endif
#endif
#ifdef MAKE_NB_JOBS
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
#elif NO_PARALLEL_MAKE==1
#if NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS

View File

@@ -79,9 +79,11 @@ void NAME(char *TRANS, blasint *M, blasint *N,
FLOAT alpha = *ALPHA;
FLOAT beta = *BETA;
FLOAT *buffer;
int buffer_size;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
@@ -132,10 +134,13 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint lenx, leny;
int trans, buffer_size;
int trans;
blasint info, t;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
@@ -210,20 +215,43 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0) x -= (lenx - 1) * incx;
if (incy < 0) y -= (leny - 1) * incy;
buffer_size = m + n + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT) ;
#ifdef MAX_STACK_ALLOC
// make it volatile because some gemv implementation (ex: dgemv_n.S)
// do not restore all register
volatile int stack_alloc_size = 0;
//for gemv_n and gemv_t, try to allocate on stack
stack_alloc_size = m + n;
#ifdef ALIGNED_ACCESS
stack_alloc_size += 3;
#endif
if(stack_alloc_size < 128)
//dgemv_n.S require a 128 bytes buffer
stack_alloc_size = 128;
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
stack_alloc_size = 0;
FLOAT stack_buffer[stack_alloc_size];
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
// printf("stack_alloc_size=%d\n", stack_alloc_size);
#else
//Original OpenBLAS/GotoBLAS codes.
buffer = (FLOAT *)blas_memory_alloc(1);
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
#ifdef SMP
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
nthreads_max = num_cpu_avail(2);
nthreads_avail = nthreads_max;
MNK = (double) m * (double) n;
if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
nthreads_max = 1;
if ( nthreads_max > nthreads_avail )
nthreads = nthreads_avail;
else
nthreads = num_cpu_avail(2);
nthreads = nthreads_max;
if (nthreads == 1) {
#endif
@@ -238,7 +266,14 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
STACK_FREE(buffer);
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size){
blas_memory_free(buffer);
}
#else
blas_memory_free(buffer);
#endif
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
IDEBUG_END;

View File

@@ -171,14 +171,19 @@ void CNAME(enum CBLAS_ORDER order,
if (incy < 0) y -= (n - 1) * incy;
if (incx < 0) x -= (m - 1) * incx;
STACK_ALLOC(m, FLOAT, buffer);
#ifdef MAX_STACK_ALLOC
volatile int stack_alloc_size = m;
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT))
stack_alloc_size = 0;
FLOAT stack_buffer[stack_alloc_size];
buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1);
#else
buffer = (FLOAT *)blas_memory_alloc(1);
#endif
#ifdef SMPTEST
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD)
nthreads = num_cpu_avail(2);
else
nthreads = 1;
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
@@ -193,7 +198,11 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
STACK_FREE(buffer);
#ifdef MAX_STACK_ALLOC
if(!stack_alloc_size)
#endif
blas_memory_free(buffer);
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
IDEBUG_END;

View File

@@ -95,7 +95,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
s = db / r;
z = ONE;
if (ada > adb) z = s;
if ((ada <= adb) && (c != ZERO)) z = ONE / c;
if ((ada < adb) && (c != ZERO)) z = ONE / c;
*C = c;
*S = s;

View File

@@ -77,13 +77,12 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){
if (incy < 0) y -= (n - 1) * incy;
#ifdef SMP
nthreads = num_cpu_avail(1);
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT))
nthreads = 1;
else
nthreads = num_cpu_avail(1);
if (incx == 0 || incy == 0)
nthreads = 1;
if (nthreads == 1) {
#endif

View File

@@ -77,9 +77,11 @@ void NAME(char *TRANS, blasint *M, blasint *N,
blasint incy = *INCY;
FLOAT *buffer;
int buffer_size;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
@@ -142,10 +144,13 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint lenx, leny;
int trans, buffer_size;
int trans;
blasint info, t;
#ifdef SMP
int nthreads;
int nthreads_max;
int nthreads_avail;
double MNK;
#endif
int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG,
@@ -231,26 +236,22 @@ void CNAME(enum CBLAS_ORDER order,
if (incx < 0) x -= (lenx - 1) * incx * 2;
if (incy < 0) y -= (leny - 1) * incy * 2;
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT) ;
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
// cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746)
if(trans && stack_alloc_size)
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
#endif
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
nthreads_max = num_cpu_avail(2);
nthreads_avail = nthreads_max;
MNK = (double) m * (double) n;
if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) ))
nthreads_max = 1;
if ( nthreads_max > nthreads_avail )
nthreads = nthreads_avail;
else
nthreads = num_cpu_avail(2);
nthreads = nthreads_max;
if (nthreads == 1) {
#endif
@@ -266,7 +267,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
STACK_FREE(buffer);
blas_memory_free(buffer);
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);

View File

@@ -210,14 +210,10 @@ void CNAME(enum CBLAS_ORDER order,
if (incy < 0) y -= (n - 1) * incy * 2;
if (incx < 0) x -= (m - 1) * incx * 2;
STACK_ALLOC(2 * m, FLOAT, buffer);
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMPTEST
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
nthreads = num_cpu_avail(2);
else
nthreads = 1;
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
@@ -249,7 +245,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
STACK_FREE(buffer);
blas_memory_free(buffer);
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);

View File

@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint info;
int uplo;
int unit;
int trans, buffer_size;
int trans;
FLOAT *buffer;
#ifdef SMP
int nthreads;
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
int trans, uplo, unit, buffer_size;
int trans, uplo, unit;
blasint info;
FLOAT *buffer;
#ifdef SMP
@@ -227,28 +227,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (incx < 0 ) x -= (n - 1) * incx * 2;
#ifdef SMP
// Calibrated on a Xeon E5-2630
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
nthreads = num_cpu_avail(2);
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
nthreads = 2;
} else
nthreads = 1;
if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}
else
#endif
{
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
if(incx != 1)
buffer_size += n * 2;
}
STACK_ALLOC(buffer_size, FLOAT, buffer);
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
@@ -262,7 +245,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif
STACK_FREE(buffer);
blas_memory_free(buffer);
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);

View File

@@ -227,28 +227,6 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
#gemm3m
if (USE_GEMM3M)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM3MKERNEL}" "NN" "gemm3m_kernel" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_oncopyb" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_oncopyr" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_oncopyi" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_otcopyb" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_otcopyr" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_otcopyi" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_incopyb" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_incopyr" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_incopyi" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_itcopyb" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_itcopyr" false "" "" false ${float_type})
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_itcopyi" false "" "" false ${float_type})
endif()
else () #For real
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})

View File

@@ -36,11 +36,6 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
ifeq ($(CORE), POWER8)
USE_TRMM = 1
endif
SKERNELOBJS += \

View File

@@ -1,4 +1,26 @@
SGEMVNKERNEL = ../arm/gemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
CGEMVNKERNEL = ../arm/zgemv_n.c
CGEMVTKERNEL = ../arm/zgemv_t.c
DGEMVNKERNEL = ../arm/gemv_n.c
DGEMVTKERNEL = ../arm/gemv_t.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
#ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
#ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
#ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
#ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
#STRMMKERNEL = ../generic/trmmkernel_2x2.c
#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
#SGEMMONCOPY = ../generic/gemm_ncopy_2.c
#SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
###############################################################################
@@ -74,19 +96,19 @@ DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
# BAD SGEMVNKERNEL = gemv_n_vfp.S
# BAD DGEMVNKERNEL = gemv_n_vfp.S
# CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
# BAD SGEMVTKERNEL = gemv_t_vfp.S
# BAD DGEMVTKERNEL = gemv_t_vfp.S
# CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
#CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
@@ -109,9 +131,9 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_2x2_vfp.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
#CGEMMKERNEL = cgemm_kernel_2x2_vfp.S
#CGEMMONCOPY = cgemm_ncopy_2_vfp.S
#CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

View File

@@ -1,3 +1,8 @@
SGEMVNKERNEL = ../arm/gemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
CGEMVNKERNEL = ../arm/zgemv_n.c
CGEMVTKERNEL = ../arm/zgemv_t.c
#################################################################################
SAMAXKERNEL = iamax_vfp.S
@@ -72,14 +77,14 @@ DSCALKERNEL = scal.c
CSCALKERNEL = zscal.c
ZSCALKERNEL = zscal.c
SGEMVNKERNEL = gemv_n_vfpv3.S
DGEMVNKERNEL = gemv_n_vfpv3.S
CGEMVNKERNEL = cgemv_n_vfp.S
# BAD SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
#CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
# BAD SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
#CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
@@ -87,15 +92,24 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
SGEMMOTCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = dgemm_ncopy_4_vfp.S
DGEMMOTCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

View File

@@ -367,12 +367,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
movs r12, #0 // clear floating point register
vmov s0, r12
vmov s1, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s1
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0
vsub.f64 d1 , d1 , d1
#else
vsub.f32 s0 , s0 , s0
vsub.f32 s1 , s1 , s1
#endif
cmp N, #0

View File

@@ -185,15 +185,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
movs r4, #0 // clear floating point register
vmov s0, r4
vmov s1, s0
vmov s2, s0
vmov s3, s0
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
vsub.f32 s0 , s0 , s0
vsub.f32 s1 , s1 , s1
vsub.f32 s2 , s2 , s2
vsub.f32 s3 , s3 , s3
cmp N, #0
ble cdot_kernel_L999

View File

@@ -57,10 +57,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, # -240]
#define FP_ZERO_1 [fp, # -236]
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
@@ -142,7 +138,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -344,7 +340,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s12, s8
vmov.f32 s13, s8
@@ -518,7 +514,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -685,7 +681,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
.endm
@@ -826,10 +822,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 4 * 2
str r3, LDC

View File

@@ -73,10 +73,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, # -240]
#define FP_ZERO_1 [fp, # -236]
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
@@ -151,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -372,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
@@ -554,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -734,7 +730,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s24, s16
vmov.f32 s25, s16
@@ -883,10 +879,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 4 * 2
str r3, LDC

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define I r12
#define FP_ZERO [fp, #-228]
#define FP_ZERO_0 [fp, #-228]
#define FP_ZERO_1 [fp, #-224]
#define ALPHA_I [fp, #-236]
#define ALPHA_R [fp, #-244]
@@ -121,7 +117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F4
pld [ YO, #Y_PRE ]
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -224,7 +220,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
.endm
@@ -271,7 +267,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S4
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -388,7 +384,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
.endm
@@ -452,10 +448,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstm r12, { s8 - s15 } // store floating point registers
#endif
movs r12, #0
str r12, FP_ZERO
str r12, FP_ZERO_1
cmp OLD_M, #0
ble cgemvn_kernel_L999

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define I r12
#define FP_ZERO [fp, #-228]
#define FP_ZERO_0 [fp, #-228]
#define FP_ZERO_1 [fp, #-224]
#define N [fp, #-252 ]
#define A [fp, #-256 ]
@@ -120,10 +116,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F2
flds s12, FP_ZERO
vmov.f32 s13, s12
vmov.f32 s14, s12
vmov.f32 s15, s12
vsub.f32 s12, s12, s12
vsub.f32 s13, s13, s13
vsub.f32 s14, s14, s14
vsub.f32 s15, s15, s15
.endm
@@ -176,8 +172,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
flds s12, FP_ZERO
vmov.f32 s13, s12
vsub.f32 s12, s12, s12
vsub.f32 s13, s13, s13
.endm
@@ -219,10 +215,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S2
flds s12, FP_ZERO
vmov.f32 s13, s12
vmov.f32 s14, s12
vmov.f32 s15, s12
vsub.f32 s12, s12, s12
vsub.f32 s13, s13, s13
vsub.f32 s14, s14, s14
vsub.f32 s15, s15, s15
.endm
@@ -285,8 +281,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
flds s12, FP_ZERO
vmov.f32 s13, s12
vsub.f32 s12, s12, s12
vsub.f32 s13, s13, s13
.endm
@@ -349,10 +345,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstm r12, { s8 - s15 } // store floating point registers
#endif
movs r12, #0
str r12, FP_ZERO
str r12, FP_ZERO_1
cmp M, #0
ble cgemvt_kernel_L999

View File

@@ -59,11 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define FP_ZERO [fp, #-232]
#define FP_ZERO_0 [fp, #-232]
#define FP_ZERO_1 [fp, #-228]
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
@@ -141,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -306,10 +301,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R
flds s1, ALPHA_I
flds s4, FP_ZERO
vmov.f32 s5, s4
vmov.f32 s6, s4
vmov.f32 s7, s4
vsub.f32 s4, s4, s4
vsub.f32 s5, s5, s5
vsub.f32 s6, s6, s6
vsub.f32 s7, s7, s7
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
@@ -323,10 +318,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstmias CO1, { s4 - s7 }
flds s4, FP_ZERO
vmov.f32 s5, s4
vmov.f32 s6, s4
vmov.f32 s7, s4
vsub.f32 s4, s4, s4
vsub.f32 s5, s5, s5
vsub.f32 s6, s6, s6
vsub.f32 s7, s7, s7
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
@@ -348,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s12, s8
vmov.f32 s13, s8
@@ -495,8 +490,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R
flds s1, ALPHA_I
flds s4, FP_ZERO
vmov.f32 s5, s4
vsub.f32 s4, s4, s4
vsub.f32 s5, s5, s5
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
@@ -505,8 +500,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstmias CO1, { s4 - s5 }
flds s4, FP_ZERO
vmov.f32 s5, s4
vsub.f32 s4, s4, s4
vsub.f32 s5, s5, s5
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
@@ -524,7 +519,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -668,10 +663,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R
flds s1, ALPHA_I
flds s4, FP_ZERO
vmov.f32 s5, s4
vmov.f32 s6, s4
vmov.f32 s7, s4
vsub.f32 s4, s4, s4
vsub.f32 s5, s5, s5
vsub.f32 s6, s6, s6
vsub.f32 s7, s7, s7
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
@@ -694,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
.endm
@@ -800,8 +795,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
flds s0, ALPHA_R
flds s1, ALPHA_I
flds s4, FP_ZERO
vmov.f32 s5, s4
vsub.f32 s4, s4, s4
vsub.f32 s5, s5, s5
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
@@ -836,10 +831,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 4 * 2
str r3, LDC

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define FP_ZERO [fp, #-236]
#define FP_ZERO_0 [fp, #-236]
#define FP_ZERO_1 [fp, #-232]
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
@@ -138,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s16 , FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -355,7 +351,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
flds s16 , FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
@@ -533,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
flds s16 , FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -710,7 +706,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
flds s16 , FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s24, s16
vmov.f32 s25, s16
@@ -856,10 +852,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 4 * 2
str r3, LDC

View File

@@ -31,8 +31,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* CTEST : OK
* TEST : OK
*
* 2016/01/23 Saar
* Bugfix for Refs #750 and #740
**************************************************************************************/
#define ASSEMBLER
@@ -154,12 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
movs r4, #0 // clear floating point register
vmov s0, r4
vmov s1, r4
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s1
vsub.f64 d0 , d0 , d0
vsub.f64 d1 , d1 , d1
cmp N, #0
ble ddot_kernel_L999

View File

@@ -56,13 +56,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, # -240]
#define FP_ZERO_1 [fp, # -236]
#define ALPHA [fp, #-280]
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
@@ -90,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
fldd d8, FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
@@ -178,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
fldd d8, FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9, d8
vmov.f64 d12, d8
vmov.f64 d13, d8
@@ -238,7 +233,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
fldd d8, FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d12, d8
.endm
@@ -288,7 +283,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
fldd d8, FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
@@ -343,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
fldd d8, FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9 , d8
.endm
@@ -385,7 +380,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
fldd d8, FP_ZERO
vsub.f64 d8 , d8 , d8
.endm
@@ -438,10 +433,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 8
str r3, LDC

View File

@@ -73,10 +73,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, # -240]
#define FP_ZERO_1 [fp, # -236]
#define ALPHA [fp, #-280]
#define B [fp, #4 ]
@@ -106,7 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x4
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
@@ -380,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x4
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
@@ -474,7 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x4
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d20, d16
vmov.f64 d24, d16
vmov.f64 d28, d16
@@ -537,7 +533,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
@@ -621,7 +617,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
@@ -682,7 +678,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d20, d16
.endm
@@ -727,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
@@ -786,7 +782,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
.endm
@@ -830,7 +826,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
.endm
@@ -884,10 +880,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
str OLD_A, A
vstr OLD_ALPHA, ALPHA
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-232]
#define FP_ZERO_0 [fp, #-232]
#define FP_ZERO_1 [fp, #-228]
#define ALPHA [fp, #-276 ]
#define B [fp, #4 ]
@@ -94,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
@@ -169,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9, d8
vmov.f64 d12, d8
vmov.f64 d13, d8
@@ -224,7 +220,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d12, d8
.endm
@@ -272,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
@@ -322,7 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9 , d8
.endm
@@ -361,7 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
.endm
@@ -413,10 +409,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 8
str r3, LDC

View File

@@ -59,11 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-236]
#define FP_ZERO_0 [fp, #-236]
#define FP_ZERO_1 [fp, #-232]
#define ALPHA [fp, #-276 ]
#define B [fp, #4 ]
@@ -94,7 +89,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x4
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
@@ -391,7 +386,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x4
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
@@ -473,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x4
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d20, d16
vmov.f64 d24, d16
vmov.f64 d28, d16
@@ -532,7 +527,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
@@ -606,7 +601,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d20, d16
vmov.f64 d21, d16
@@ -661,7 +656,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d20, d16
.endm
@@ -704,7 +699,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
vmov.f64 d18, d16
vmov.f64 d19, d16
@@ -758,7 +753,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
vmov.f64 d17, d16
.endm
@@ -799,7 +794,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
fldd d16, FP_ZERO
vsub.f64 d16 , d16 , d16
.endm
@@ -855,10 +850,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 8
str r3, LDC

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define I r12
#define FP_ZERO [fp, #-228]
#define FP_ZERO_0 [fp, #-228]
#define FP_ZERO_1 [fp, #-224]
#define M [fp, #-252 ]
#define A [fp, #-256 ]
@@ -83,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ YO , #Y_PRE ]
pld [ YO , #Y_PRE+32 ]
fldd d8 , FP_ZERO
vsub.f64 d8 , d8 , d8
vmov.f64 d9 , d8
vmov.f64 d10 , d8
vmov.f64 d11 , d8
@@ -162,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
fldd d12 , FP_ZERO
vsub.f64 d12 , d12 , d12
.endm
@@ -189,7 +185,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S4
fldd d12 , FP_ZERO
vsub.f64 d12 , d12 , d12
vmov.f64 d13 , d12
vmov.f64 d14 , d12
vmov.f64 d15 , d12
@@ -249,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
fldd d12 , FP_ZERO
vsub.f64 d12 , d12 , d12
.endm
@@ -283,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ YO , #Y_PRE ]
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
vmov.f32 s10 , s8
vmov.f32 s11 , s8
@@ -361,7 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
flds s12 , FP_ZERO
vsub.f32 s12 , s12 , s12
.endm
@@ -388,7 +384,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S4
flds s12 , FP_ZERO
vsub.f32 s12 , s12 , s12
vmov.f32 s13 , s12
vmov.f32 s14 , s12
vmov.f32 s15 , s12
@@ -449,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
flds s12 , FP_ZERO
vsub.f32 s12 , s12 , s12
.endm
@@ -498,10 +494,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstm r12, { s8 - s15 } // store floating point registers
#endif
movs r12, #0
str r12, FP_ZERO
str r12, FP_ZERO_1
cmp OLD_M, #0
ble gemvn_kernel_L999

View File

@@ -62,10 +62,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define M [fp, #-252 ]
#define A [fp, #-256 ]
#define FP_ZERO [fp, #-228]
#define FP_ZERO_0 [fp, #-228]
#define FP_ZERO_1 [fp, #-224]
#define X_PRE 64
#define Y_PRE 0
@@ -83,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ YO , #Y_PRE ]
pld [ YO , #Y_PRE+32 ]
fldd d24 , FP_ZERO
vsub.f64 d24 , d24 , d24
vmov.f64 d25 , d24
vmov.f64 d26 , d24
vmov.f64 d27 , d24
@@ -151,7 +147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
fldd d24 , FP_ZERO
vsub.f64 d24 , d24 , d24
.endm
@@ -179,7 +175,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S8
fldd d24 , FP_ZERO
vsub.f64 d24 , d24 , d24
vmov.f64 d25 , d24
vmov.f64 d26 , d24
vmov.f64 d27 , d24
@@ -273,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
fldd d24 , FP_ZERO
vsub.f64 d24 , d24 , d24
.endm
@@ -306,7 +302,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pld [ YO , #Y_PRE ]
flds s24 , FP_ZERO
vsub.f32 s24 , s24 , s24
vmov.f32 s25 , s24
vmov.f32 s26 , s24
vmov.f32 s27 , s24
@@ -372,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
flds s24 , FP_ZERO
vsub.f32 s24 , s24 , s24
.endm
@@ -400,7 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S8
flds s24 , FP_ZERO
vsub.f32 s24 , s24 , s24
vmov.f32 s25 , s24
vmov.f32 s26 , s24
vmov.f32 s27 , s24
@@ -493,7 +489,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
flds s24 , FP_ZERO
vsub.f32 s24 , s24 , s24
.endm
@@ -542,10 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstm r12, { s8 - s31 } // store floating point registers
#endif
movs r12, #0
str r12, FP_ZERO
str r12, FP_ZERO_1
cmp OLD_M, #0
ble gemvn_kernel_L999

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define I r12
#define FP_ZERO [fp, #-228]
#define FP_ZERO_0 [fp, #-228]
#define FP_ZERO_1 [fp, #-224]
#define N [fp, #-252 ]
#define A [fp, #-256 ]
@@ -79,8 +75,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F2
fldd d2, FP_ZERO
vmov.f64 d3 , d2
vsub.f64 d2 , d2 , d2
vsub.f64 d3 , d3 , d3
.endm
@@ -127,8 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
fldd d2, FP_ZERO
vmov.f64 d3 , d2
vsub.f64 d2 , d2 , d2
.endm
@@ -165,8 +160,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S2
fldd d2, FP_ZERO
vmov.f64 d3 , d2
vsub.f64 d2 , d2 , d2
vsub.f64 d3 , d3 , d3
.endm
@@ -229,8 +224,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
fldd d2, FP_ZERO
vmov.f64 d3 , d2
vsub.f64 d2 , d2 , d2
.endm
@@ -282,9 +276,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F2
flds s2 , FP_ZERO
vmov.f32 s3 , s2
vsub.f32 s2 , s2 , s2
vsub.f32 s3 , s3 , s3
.endm
@@ -328,7 +321,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_F1
flds s2 , FP_ZERO
vsub.f32 s2 , s2 , s2
.endm
@@ -363,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S2
flds s2 , FP_ZERO
vmov.f32 s3 , s2
vsub.f32 s2 , s2 , s2
vsub.f32 s3 , s3 , s3
.endm
@@ -425,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT_S1
flds s2 , FP_ZERO
vsub.f32 s2 , s2 , s2
.endm
@@ -495,10 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstm r12, { s8 - s15 } // store floating point registers
#endif
movs r12, #0
str r12, FP_ZERO
str r12, FP_ZERO_1
cmp M, #0
ble gemvt_kernel_L999

View File

@@ -341,12 +341,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4}
movs r12, #0 // clear floating point register
vmov s0, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0
#else
vsub.f32 s0 , s0 , s0
#endif
mov INDEX, #0
cmp N, #0

View File

@@ -409,20 +409,12 @@ KERNEL_S1_END_\@:
#if defined(DOUBLE)
znrm2_zero:
.word 0x00000000
.word 0x00000000
znrm2_one:
.word 0x00000000
.word 0x3ff00000
#else
cnrm2_zero:
.word 0x00000000
cnrm2_one:
.word 0x3f800000
@@ -432,20 +424,12 @@ cnrm2_one:
#if defined(DOUBLE)
dnrm2_zero:
.word 0x00000000
.word 0x00000000
dnrm2_one:
.word 0x00000000
.word 0x3ff00000
#else
snrm2_zero:
.word 0x00000000
snrm2_one:
.word 0x3f800000
@@ -462,12 +446,12 @@ nrm2_begin:
#if defined(COMPLEX)
#if defined(DOUBLE)
vldr.64 d0 , znrm2_zero
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , znrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vldr.32 s0 , cnrm2_zero
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , cnrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
@@ -476,12 +460,12 @@ nrm2_begin:
#else
#if defined(DOUBLE)
vldr.64 d0 , dnrm2_zero
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , dnrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vldr.32 s0 , snrm2_zero
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , snrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0

View File

@@ -405,15 +405,12 @@ KERNEL_S1_END_\@:
.align 5
#if defined(DOUBLE)
movs r12 , #0
vmov.f32 s0 , r12 // scale=0.0
vcvt.f64.f32 d0, s0
vsub.f64 d0 , d0 , d0 // scale=0.0
vmov.f64 d1 , #1.0 // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
movs r12 , #0
vmov.f32 s0 , r12 // scale=0.0
vsub.f32 s0 , s0 , s0 // scale=0.0
vmov.f32 s1 , #1.0 // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0

View File

@@ -31,8 +31,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* CTEST : OK (no test for dsdot)
* TEST : OK (no test for dsdot)
*
* 2016/01/23 Saar
* Bugfix for Refs #750 and #740
**************************************************************************************/
#define ASSEMBLER
@@ -242,12 +240,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
movs r4, #0 // clear floating point register
vmov s0, r4
vmov s1, r4
#if defined(DSDOT)
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s1
vsub.f64 d0 , d0 , d0
vsub.f64 d1 , d1 , d1
#else
vsub.f32 s0 , s0 , s0
vsub.f32 s1 , s1 , s1
#endif
cmp N, #0

View File

@@ -56,10 +56,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, # -240]
#define FP_ZERO_1 [fp, # -236]
#define ALPHA [fp, #-280]
#define B [fp, #4 ]
@@ -89,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
flds s8, FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -165,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s8, FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
@@ -225,7 +221,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
flds s8, FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s12, s8
.endm
@@ -275,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
flds s8, FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -330,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
flds s8, FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
.endm
@@ -372,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
flds s8, FP_ZERO
vsub.f32 s8 , s8 , s8
.endm
@@ -425,10 +421,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC

View File

@@ -73,11 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, #-240]
#define FP_ZERO_1 [fp, #-236]
#define ALPHA [fp, #-280]
#define ALPHA [fp, #-280]
#define B [fp, #4 ]
#define C [fp, #8 ]
@@ -106,7 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x4
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -353,7 +349,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x4
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
@@ -447,7 +443,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x4
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s20, s16
vmov.f32 s24, s16
vmov.f32 s28, s16
@@ -510,7 +506,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -594,7 +590,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
@@ -655,7 +651,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x2
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s20, s16
.endm
@@ -700,7 +696,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -759,7 +755,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x1
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
.endm
@@ -803,7 +799,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x1
flds s16, FP_ZERO
vsub.f32 s16 , s16 , s16
.endm
@@ -860,10 +856,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC

View File

@@ -59,10 +59,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define FP_ZERO [fp, #-232]
#define FP_ZERO_0 [fp, #-232]
#define FP_ZERO_1 [fp, #-228]
#define ALPHA [fp, #-276 ]
#define B [fp, #4 ]
@@ -94,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -160,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
@@ -215,7 +211,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// INIT1x2: set s8 and s12 to zero.
// flds loads a single-precision value from the FP_ZERO stack slot
// ([fp, #-232]; the function prologue stores integer 0 there, i.e. the bit
// pattern of +0.0f); vsub then computes s8 - s8, and vmov copies the result
// into s12.
// NOTE(review): s8/s12 are presumably the accumulators of the 1x2 tile
// (inferred from the macro name) -- confirm against the KERNEL/SAVE macros.
// NOTE(review): this text was taken from a diff view; the flds and vsub
// lines may be the old/new sides of one changed line rather than
// consecutive instructions -- confirm against the actual file.
.macro INIT1x2
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s12, s8
.endm
@@ -263,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
@@ -313,7 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// INIT2x1: set s8 and s9 to zero.
// flds loads a single-precision value from the FP_ZERO stack slot
// ([fp, #-232]; the function prologue stores integer 0 there, i.e. the bit
// pattern of +0.0f); vsub then computes s8 - s8, and vmov copies the result
// into s9.
// NOTE(review): s8/s9 are presumably the accumulators of the 2x1 tile
// (inferred from the macro name) -- confirm against the KERNEL/SAVE macros.
// NOTE(review): this text was taken from a diff view; the flds and vsub
// lines may be the old/new sides of one changed line rather than
// consecutive instructions -- confirm against the actual file.
.macro INIT2x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
vmov.f32 s9 , s8
.endm
@@ -352,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// INIT1x1: set s8 to zero.
// flds loads a single-precision value from the FP_ZERO stack slot
// ([fp, #-232]; the function prologue stores integer 0 there, i.e. the bit
// pattern of +0.0f); vsub then computes s8 - s8 and writes it back to s8.
// NOTE(review): s8 is presumably the single accumulator of the 1x1 tile
// (inferred from the macro name) -- confirm against the KERNEL/SAVE macros.
// NOTE(review): this text was taken from a diff view; the flds and vsub
// lines may be the old/new sides of one changed line rather than
// consecutive instructions -- confirm against the actual file.
.macro INIT1x1
flds s8 , FP_ZERO
vsub.f32 s8 , s8 , s8
.endm
@@ -404,10 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC

View File

@@ -58,10 +58,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Frame-pointer-relative operands. Each name expands to an fp-based
// addressing expression, so it can be used directly as a load/store operand.
// NOTE(review): K, A, ALPHA and B presumably hold the corresponding GEMM
// arguments (inferred from the names) -- confirm against the prologue.
#define K [fp, #-264 ]
#define A [fp, #-268 ]
// FP_ZERO and FP_ZERO_0 name the same slot (fp-240); FP_ZERO_1 is the word
// 4 bytes above it (fp-236). The prologue stores integer 0 to FP_ZERO and
// FP_ZERO_1, and the INIT* macros read FP_ZERO with flds.
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, # -240]
#define FP_ZERO_1 [fp, # -236]
#define ALPHA [fp, #-280]
// B is the only slot here with a positive offset from fp.
#define B [fp, #4 ]
@@ -92,7 +88,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x4
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -326,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x4
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
@@ -409,7 +405,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT1x4
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s20, s16
vmov.f32 s24, s16
vmov.f32 s28, s16
@@ -468,7 +464,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x2
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -542,7 +538,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT2x2
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s20, s16
vmov.f32 s21, s16
@@ -597,7 +593,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// INIT1x2: set s16 and s20 to zero.
// flds loads a single-precision value from the FP_ZERO stack slot
// ([fp, #-240]; the function prologue stores integer 0 there, i.e. the bit
// pattern of +0.0f); vsub then computes s16 - s16, and vmov copies the
// result into s20.
// NOTE(review): s16/s20 are presumably the accumulators of the 1x2 tile
// (inferred from the macro name) -- confirm against the KERNEL/SAVE macros.
// NOTE(review): this text was taken from a diff view; the flds and vsub
// lines may be the old/new sides of one changed line rather than
// consecutive instructions -- confirm against the actual file.
.macro INIT1x2
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s20, s16
.endm
@@ -640,7 +636,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro INIT4x1
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
vmov.f32 s18, s16
vmov.f32 s19, s16
@@ -694,7 +690,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// INIT2x1: set s16 and s17 to zero.
// flds loads a single-precision value from the FP_ZERO stack slot
// ([fp, #-240]; the function prologue stores integer 0 there, i.e. the bit
// pattern of +0.0f); vsub then computes s16 - s16, and vmov copies the
// result into s17.
// NOTE(review): s16/s17 are presumably the accumulators of the 2x1 tile
// (inferred from the macro name) -- confirm against the KERNEL/SAVE macros.
// NOTE(review): this text was taken from a diff view; the flds and vsub
// lines may be the old/new sides of one changed line rather than
// consecutive instructions -- confirm against the actual file.
.macro INIT2x1
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
vmov.f32 s17, s16
.endm
@@ -735,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// INIT1x1: set s16 to zero.
// flds loads a single-precision value from the FP_ZERO stack slot
// ([fp, #-240]; the function prologue stores integer 0 there, i.e. the bit
// pattern of +0.0f); vsub then computes s16 - s16 and writes it back to s16.
// NOTE(review): s16 is presumably the single accumulator of the 1x1 tile
// (inferred from the macro name) -- confirm against the KERNEL/SAVE macros.
// NOTE(review): this text was taken from a diff view; the flds and vsub
// lines may be the old/new sides of one changed line rather than
// consecutive instructions -- confirm against the actual file.
.macro INIT1x1
flds S16, FP_ZERO
vsub.f32 s16 , s16 , s16
.endm
@@ -791,10 +787,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers
movs r4, #0
str r4, FP_ZERO
str r4, FP_ZERO_1
ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC

View File

@@ -187,16 +187,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
movs r4, #0 // clear floating point register
vmov s0, r4
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s0
vcvt.f64.f32 d2, s0
vcvt.f64.f32 d3, s0
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
vsub.f64 d0 , d0 , d0
vsub.f64 d1 , d1 , d1
vsub.f64 d2 , d2 , d2
vsub.f64 d3 , d3 , d3
cmp N, #0
ble zdot_kernel_L999

Some files were not shown because too many files have changed in this diff Show More