Compare commits
68 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a71e8c82f6 | ||
|
|
1619b2f3c8 | ||
|
|
4f3153395a | ||
|
|
308e6195b7 | ||
|
|
fced5744fb | ||
|
|
8c0fb1258d | ||
|
|
aae581d004 | ||
|
|
e17303933a | ||
|
|
f9226275f4 | ||
|
|
cf8c7e28b3 | ||
|
|
5ac02f6dc7 | ||
|
|
7aa1ad4923 | ||
|
|
d5e1255ca7 | ||
|
|
587455868e | ||
|
|
323c237e7b | ||
|
|
faa5e2e5e3 | ||
|
|
551fdf53e8 | ||
|
|
fdf291be30 | ||
|
|
68eb4fa329 | ||
|
|
05196a8497 | ||
|
|
db9b611b12 | ||
|
|
2e6333f74e | ||
|
|
c99cc41cbd | ||
|
|
711ecb8bd5 | ||
|
|
10c2ebdfc5 | ||
|
|
26b3b3a3e6 | ||
|
|
acdff55a6a | ||
|
|
7d6b68eb4a | ||
|
|
0bbca5e803 | ||
|
|
cd5241d0cf | ||
|
|
8d652f11e7 | ||
|
|
6c86570e1f | ||
|
|
53ba1a77c8 | ||
|
|
d23c7c713c | ||
|
|
8c43d7fa5f | ||
|
|
8f758eeff9 | ||
|
|
8577be2a95 | ||
|
|
1edf30b790 | ||
|
|
4fc8c937d4 | ||
|
|
efa4f5c936 | ||
|
|
17d655fa64 | ||
|
|
f68141cf1d | ||
|
|
aa90518201 | ||
|
|
6b85dbb6dc | ||
|
|
a0debd4293 | ||
|
|
937493bfeb | ||
|
|
74b0672223 | ||
|
|
6e7be06e07 | ||
|
|
a04d0555ba | ||
|
|
3761c30ba4 | ||
|
|
38593cd3a3 | ||
|
|
e3b7781c2b | ||
|
|
5e6965ea47 | ||
|
|
5cc0301fc3 | ||
|
|
19a6dedfd6 | ||
|
|
0e2b92e216 | ||
|
|
d06b92906a | ||
|
|
8e98478ff3 | ||
|
|
fb8968fb83 | ||
|
|
dae6b82a71 | ||
|
|
d73244b825 | ||
|
|
233c6b959f | ||
|
|
16ec5323c9 | ||
|
|
0ad02ef2d6 | ||
|
|
73397faf68 | ||
|
|
5fc2203d8a | ||
|
|
78dcf5c3d5 | ||
|
|
32f793195f |
@@ -25,6 +25,7 @@ before_install:
|
||||
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
|
||||
|
||||
script:
|
||||
- set -e
|
||||
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
|
||||
project(OpenBLAS)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 2)
|
||||
set(OpenBLAS_PATCH_VERSION 16.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 17)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
enable_language(ASM)
|
||||
@@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH)
|
||||
list(APPEND BLASDIRS kernel)
|
||||
endif ()
|
||||
|
||||
if (DEFINED UTEST_CHECK)
|
||||
set(SANITY_CHECK 1)
|
||||
endif ()
|
||||
|
||||
if (DEFINED SANITY_CHECK)
|
||||
list(APPEND BLASDIRS reference)
|
||||
endif ()
|
||||
@@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED})
|
||||
message(FATAL_ERROR "Neither static nor shared are enabled.")
|
||||
endif ()
|
||||
|
||||
#Set default output directory
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
|
||||
|
||||
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
@@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET
|
||||
|
||||
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
|
||||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(utest)
|
||||
|
||||
if(NOT MSVC)
|
||||
#only build shared library for MSVC
|
||||
@@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
|
||||
endif()
|
||||
|
||||
#build test and ctest
|
||||
enable_testing()
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
|
||||
@@ -124,7 +124,13 @@ In chronological order:
|
||||
* Jerome Robert <jeromerobert@gmx.com>
|
||||
* [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478)
|
||||
* [2015-12-23] `stack_check` in `gemv.c` (bug #722)
|
||||
* [2015-12-28] Allow to force the number of parallel make job
|
||||
* [2015-12-28] Fix detection of AMD E2-3200 detection
|
||||
* [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what expected
|
||||
* [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731)
|
||||
* [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742)
|
||||
* [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760)
|
||||
* [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727)
|
||||
|
||||
* Dan Kortschak
|
||||
* [2015-01-07] Added test for drotmg bug #484.
|
||||
@@ -135,5 +141,11 @@ In chronological order:
|
||||
* Martin Koehler <https://github.com/grisuthedragon/>
|
||||
* [2015-09-07] Improved imatcopy
|
||||
|
||||
* Ashwin Sekhar T K <https://github.com/ashwinyes/>
|
||||
* [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
|
||||
* [2015-11-20] lapack-test fixes for Cortex-A57
|
||||
* [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
|
||||
* [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
|
||||
|
||||
* [Your name or handle] <[email or website]>
|
||||
* [Date] [Brief summary of your changes]
|
||||
|
||||
@@ -1,4 +1,63 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.2.17
|
||||
20-Mar-2016
|
||||
common:
|
||||
* Enable BUILD_LAPACK_DEPRECATED=1 by default.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16
|
||||
15-Mar-2016
|
||||
common:
|
||||
* Avoid potential getenv segfault. (#716)
|
||||
* Import LAPACK svn bugfix #142-#147,#150-#155
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller
|
||||
* Fix bug with scipy linalg test.
|
||||
|
||||
ARM:
|
||||
* Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
|
||||
|
||||
POWER:
|
||||
* Optimize D and Z BLAS3 functions for Power8.
|
||||
|
||||
====================================================================
|
||||
Version 0.2.16.rc1
|
||||
23-Feb-2016
|
||||
common:
|
||||
* Upgrade LAPACK to 3.6.0 version.
|
||||
Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build
|
||||
LAPACK deprecated functions.
|
||||
* Add MAKE_NB_JOBS option in Makefile.
|
||||
Force number of make jobs.This is particularly
|
||||
useful when using distcc. (#735. Thanks, Jerome Robert.)
|
||||
* Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor).
|
||||
* Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert)
|
||||
* Improve small zger, zgemv, ztrmv using stack alloction (#727. Thanks, Jerome Robert)
|
||||
* Let openblas_get_num_threads return the number of active threads.
|
||||
(#760. Thanks, Jerome Robert)
|
||||
* Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen)
|
||||
* Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey)
|
||||
* Update scipy benchmark script. (#745. Thanks, John Kirkham)
|
||||
|
||||
x86/x86_64:
|
||||
* Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller.
|
||||
* Detect Intel Avoton.
|
||||
* Detect AMD Trinity, Richland, E2-3200.
|
||||
* Fix gemv performance bug on Mac OSX Intel Haswell.
|
||||
* Fix some bugs with CMake and Visual Studio
|
||||
|
||||
ARM:
|
||||
* Support and optimize Cortex-A57 AArch64.
|
||||
(#686. Thanks, Ashwin Sekhar TK)
|
||||
* Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere)
|
||||
* Update ARMV6 kernels.
|
||||
|
||||
POWER:
|
||||
* Fix detection of POWER architecture
|
||||
(#684. Thanks, Sebastien Villemot)
|
||||
|
||||
====================================================================
|
||||
Version 0.2.15
|
||||
27-Oct-2015
|
||||
|
||||
15
Makefile
15
Makefile
@@ -83,20 +83,20 @@ shared :
|
||||
ifndef NO_SHARED
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
@@ -113,10 +113,8 @@ ifndef CROSS
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
ifdef UTEST_CHECK
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
endif
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
endif
|
||||
@@ -259,6 +257,9 @@ endif
|
||||
else
|
||||
-@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
|
||||
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.2.16.dev
|
||||
VERSION = 0.2.17
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -79,6 +79,9 @@ VERSION = 0.2.16.dev
|
||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
||||
# NO_LAPACKE = 1
|
||||
|
||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# If you want to use legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
||||
@@ -142,10 +145,6 @@ NO_AFFINITY = 1
|
||||
# slow (Not implemented yet).
|
||||
# SANITY_CHECK = 1
|
||||
|
||||
# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable
|
||||
# SANITY_CHECK to compare the result with reference BLAS.
|
||||
# UTEST_CHECK = 1
|
||||
|
||||
# The installation directory.
|
||||
# PREFIX = /opt/OpenBLAS
|
||||
|
||||
|
||||
@@ -75,10 +75,11 @@ Please read GotoBLAS_01Readme.txt
|
||||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
### Support OS:
|
||||
- **GNU/Linux**
|
||||
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
|
||||
|
||||
199
USAGE.md
Normal file
199
USAGE.md
Normal file
@@ -0,0 +1,199 @@
|
||||
# Notes on OpenBLAS usage
|
||||
## Usage
|
||||
|
||||
#### Program is Terminated. Because you tried to allocate too many memory regions
|
||||
|
||||
In OpenBLAS, we mange a pool of memory buffers and allocate the number of
|
||||
buffers as the following.
|
||||
```
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
```
|
||||
This error indicates that the program exceeded the number of buffers.
|
||||
|
||||
Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
multi-threading. Thus, you must set OpenBLAS to use single thread in any of the
|
||||
following ways:
|
||||
|
||||
* `export OPENBLAS_NUM_THREADS=1` in the environment variables.
|
||||
* Call `openblas_set_num_threads(1)` in the application on runtime.
|
||||
* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0`
|
||||
|
||||
If the application is parallelized by OpenMP, please use OpenBLAS built with
|
||||
`USE_OPENMP=1`
|
||||
|
||||
#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
|
||||
|
||||
The environment variable which control the kernel selection is
|
||||
`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export
|
||||
OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()`
|
||||
returns the used target.
|
||||
|
||||
#### How could I disable OpenBLAS threading affinity on runtime?
|
||||
|
||||
You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
|
||||
variable to disable threading affinity on runtime. For example, before the
|
||||
running,
|
||||
```
|
||||
export OPENBLAS_MAIN_FREE=1
|
||||
```
|
||||
|
||||
Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1`
|
||||
in `Makefile.rule`.
|
||||
|
||||
## Linking with the library
|
||||
|
||||
* Link with shared library
|
||||
|
||||
`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
|
||||
|
||||
If the library is multithreaded, please add `-lpthread`. If the library
|
||||
contains LAPACK functions, please add `-lgfortran` or other Fortran libs.
|
||||
|
||||
* Link with static library
|
||||
|
||||
`gcc -o test test.c /your/path/libopenblas.a`
|
||||
|
||||
You can download `test.c` from https://gist.github.com/xianyi/5780018
|
||||
|
||||
On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
|
||||
default), custom programs statically linked against `libopenblas.a` should also
|
||||
link with the pthread library e.g.:
|
||||
|
||||
```
|
||||
gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
|
||||
```
|
||||
|
||||
Failing to add the `-lpthread` flag will cause errors such as:
|
||||
|
||||
```
|
||||
/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
|
||||
memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
|
||||
memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
|
||||
...
|
||||
```
|
||||
|
||||
## Code examples
|
||||
|
||||
#### Call CBLAS interface
|
||||
This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656
|
||||
```
|
||||
#include <cblas.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void main()
|
||||
{
|
||||
int i=0;
|
||||
double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
|
||||
double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);
|
||||
|
||||
for(i=0; i<9; i++)
|
||||
printf("%lf ", C[i]);
|
||||
printf("\n");
|
||||
}
|
||||
```
|
||||
`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
|
||||
|
||||
#### Call BLAS Fortran interface
|
||||
|
||||
This example shows calling dgemm Fortran interface in C. https://gist.github.com/xianyi/5780018
|
||||
|
||||
```
|
||||
#include "stdio.h"
|
||||
#include "stdlib.h"
|
||||
#include "sys/time.h"
|
||||
#include "time.h"
|
||||
|
||||
extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
int i;
|
||||
printf("test!\n");
|
||||
if(argc<4){
|
||||
printf("Input Error\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
int sizeofa = m * k;
|
||||
int sizeofb = k * n;
|
||||
int sizeofc = m * n;
|
||||
char ta = 'N';
|
||||
char tb = 'N';
|
||||
double alpha = 1.2;
|
||||
double beta = 0.001;
|
||||
|
||||
struct timeval start,finish;
|
||||
double duration;
|
||||
|
||||
double* A = (double*)malloc(sizeof(double) * sizeofa);
|
||||
double* B = (double*)malloc(sizeof(double) * sizeofb);
|
||||
double* C = (double*)malloc(sizeof(double) * sizeofc);
|
||||
|
||||
srand((unsigned)time(NULL));
|
||||
|
||||
for (i=0; i<sizeofa; i++)
|
||||
A[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofb; i++)
|
||||
B[i] = i%3+1;//(rand()%100)/10.0;
|
||||
|
||||
for (i=0; i<sizeofc; i++)
|
||||
C[i] = i%3+1;//(rand()%100)/10.0;
|
||||
//#if 0
|
||||
printf("m=%d,n=%d,k=%d,alpha=%lf,beta=%lf,sizeofc=%d\n",m,n,k,alpha,beta,sizeofc);
|
||||
gettimeofday(&start, NULL);
|
||||
dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
|
||||
gettimeofday(&finish, NULL);
|
||||
|
||||
duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
|
||||
double gflops = 2.0 * m *n*k;
|
||||
gflops = gflops/duration*1.0e-6;
|
||||
|
||||
FILE *fp;
|
||||
fp = fopen("timeDGEMM.txt", "a");
|
||||
fprintf(fp, "%dx%dx%d\t%lf s\t%lf MFLOPS\n", m, n, k, duration, gflops);
|
||||
fclose(fp);
|
||||
|
||||
free(A);
|
||||
free(B);
|
||||
free(C);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
` gcc -o time_dgemm time_dgemm.c /your/path/libopenblas.a`
|
||||
|
||||
` ./time_dgemm <m> <n> <k> `
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
||||
|
||||
## BLAS reference manual
|
||||
If you want to understand every BLAS function and definition, please read
|
||||
[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
|
||||
or [netlib.org](http://netlib.org/blas/)
|
||||
|
||||
Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions)
|
||||
|
||||
## How to reference OpenBLAS.
|
||||
|
||||
You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
|
||||
|
||||
Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
|
||||
|
||||
@@ -39,4 +39,6 @@ before_build:
|
||||
- cmake -G "Visual Studio 12 Win64" .
|
||||
|
||||
test_script:
|
||||
- echo Build OK!
|
||||
- echo Running Test
|
||||
- cd c:\projects\OpenBLAS\utest
|
||||
- openblas_utest
|
||||
|
||||
@@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
smallscaling
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
@@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c
|
||||
zgemm3m.$(SUFFIX) : gemm3m.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
smallscaling: smallscaling.c ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||
|
||||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
||||
|
||||
196
benchmark/smallscaling.c
Normal file
196
benchmark/smallscaling.c
Normal file
@@ -0,0 +1,196 @@
|
||||
// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <cblas.h>
|
||||
#include <omp.h>
|
||||
#define MIN_SIZE 5
|
||||
#define MAX_SIZE 60
|
||||
#define NB_SIZE 10
|
||||
|
||||
// number of loop for a 1x1 matrix. Lower it if the test is
|
||||
// too slow on you computer.
|
||||
#define NLOOP 2e7
|
||||
|
||||
typedef struct {
|
||||
int matrix_size;
|
||||
int n_loop;
|
||||
void (* bench_func)();
|
||||
void (* blas_func)();
|
||||
void * (* create_matrix)(int size);
|
||||
} BenchParam;
|
||||
|
||||
void * s_create_matrix(int size) {
|
||||
float * r = malloc(size * sizeof(double));
|
||||
int i;
|
||||
for(i = 0; i < size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void * c_create_matrix(int size) {
|
||||
float * r = malloc(size * 2 * sizeof(double));
|
||||
int i;
|
||||
for(i = 0; i < 2 * size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void * z_create_matrix(int size) {
|
||||
double * r = malloc(size * 2 * sizeof(double));
|
||||
int i;
|
||||
for(i = 0; i < 2 * size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void * d_create_matrix(int size) {
|
||||
double * r = malloc(size * sizeof(double));
|
||||
int i;
|
||||
for(i = 0; i < size; i++)
|
||||
r[i] = 1e3 * i / size;
|
||||
return r;
|
||||
}
|
||||
|
||||
void trmv_bench(BenchParam * param)
|
||||
{
|
||||
int i, n;
|
||||
int size = param->matrix_size;
|
||||
n = param->n_loop / size;
|
||||
int one = 1;
|
||||
void * A = param->create_matrix(size * size);
|
||||
void * y = param->create_matrix(size);
|
||||
for(i = 0; i < n; i++) {
|
||||
param->blas_func("U", "N", "N", &size, A, &size, y, &one);
|
||||
}
|
||||
free(A);
|
||||
free(y);
|
||||
}
|
||||
|
||||
void gemv_bench(BenchParam * param)
|
||||
{
|
||||
int i, n;
|
||||
int size = param->matrix_size;
|
||||
n = param->n_loop / size;
|
||||
double v = 1.01;
|
||||
int one = 1;
|
||||
void * A = param->create_matrix(size * size);
|
||||
void * y = param->create_matrix(size);
|
||||
for(i = 0; i < n; i++) {
|
||||
param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one);
|
||||
}
|
||||
free(A);
|
||||
free(y);
|
||||
}
|
||||
|
||||
void ger_bench(BenchParam * param) {
|
||||
int i, n;
|
||||
int size = param->matrix_size;
|
||||
n = param->n_loop / size;
|
||||
double v = 1.01;
|
||||
int one = 1;
|
||||
void * A = param->create_matrix(size * size);
|
||||
void * y = param->create_matrix(size);
|
||||
for(i = 0; i < n; i++) {
|
||||
param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size);
|
||||
}
|
||||
free(A);
|
||||
free(y);
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
void * pthread_func_wrapper(void * param) {
|
||||
((BenchParam *)param)->bench_func(param);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
#define NB_TESTS 5
|
||||
void * TESTS[4 * NB_TESTS] = {
|
||||
trmv_bench, ztrmv_, z_create_matrix, "ztrmv",
|
||||
gemv_bench, dgemv_, d_create_matrix, "dgemv",
|
||||
gemv_bench, zgemv_, z_create_matrix, "zgemv",
|
||||
ger_bench, dger_, d_create_matrix, "dger",
|
||||
ger_bench, zgerc_, z_create_matrix, "zgerc",
|
||||
};
|
||||
|
||||
inline static double delta_time(struct timespec tick) {
|
||||
struct timespec tock;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tock);
|
||||
return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9;
|
||||
}
|
||||
|
||||
double pthread_bench(BenchParam * param, int nb_threads)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return 0;
|
||||
#else
|
||||
BenchParam threaded_param = *param;
|
||||
pthread_t threads[nb_threads];
|
||||
int t, rc;
|
||||
struct timespec tick;
|
||||
threaded_param.n_loop /= nb_threads;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tick);
|
||||
for(t=0; t<nb_threads; t++){
|
||||
rc = pthread_create(&threads[t], NULL, pthread_func_wrapper, &threaded_param);
|
||||
if (rc){
|
||||
printf("ERROR; return code from pthread_create() is %d\n", rc);
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
for(t=0; t<nb_threads; t++){
|
||||
pthread_join(threads[t], NULL);
|
||||
}
|
||||
return delta_time(tick);
|
||||
#endif
|
||||
}
|
||||
|
||||
double seq_bench(BenchParam * param) {
|
||||
struct timespec tick;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tick);
|
||||
param->bench_func(param);
|
||||
return delta_time(tick);
|
||||
}
|
||||
|
||||
double omp_bench(BenchParam * param) {
|
||||
BenchParam threaded_param = *param;
|
||||
struct timespec tick;
|
||||
int t;
|
||||
int nb_threads = omp_get_max_threads();
|
||||
threaded_param.n_loop /= nb_threads;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tick);
|
||||
#pragma omp parallel for
|
||||
for(t = 0; t < nb_threads; t ++){
|
||||
param->bench_func(&threaded_param);
|
||||
}
|
||||
return delta_time(tick);
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE);
|
||||
BenchParam param;
|
||||
int test_id;
|
||||
printf ("Running on %d threads\n", omp_get_max_threads());
|
||||
for(test_id = 0; test_id < NB_TESTS; test_id ++) {
|
||||
double size = MIN_SIZE;
|
||||
param.bench_func = TESTS[test_id * 4];
|
||||
param.blas_func = TESTS[test_id * 4 + 1];
|
||||
param.create_matrix = TESTS[test_id * 4 + 2];
|
||||
printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]);
|
||||
param.n_loop = NLOOP;
|
||||
while(size <= MAX_SIZE) {
|
||||
param.matrix_size = (int)(size + 0.5);
|
||||
double seq_time = seq_bench(¶m);
|
||||
double omp_time = omp_bench(¶m);
|
||||
double pthread_time = pthread_bench(¶m, omp_get_max_threads());
|
||||
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, "
|
||||
"pthread %gs, speedup %g\n",
|
||||
param.matrix_size, seq_time,
|
||||
omp_time, seq_time / omp_time,
|
||||
pthread_time, seq_time / pthread_time);
|
||||
size *= inc_factor;
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
@@ -2038,6 +2038,59 @@ set(MATGEN
|
||||
lapacke_zlagsy_work.c
|
||||
)
|
||||
|
||||
set(Utils_SRC
|
||||
lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c
|
||||
lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c
|
||||
lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c
|
||||
lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c
|
||||
lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c
|
||||
lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c
|
||||
lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c
|
||||
lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c
|
||||
lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c
|
||||
lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c
|
||||
lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c
|
||||
lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c
|
||||
lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c
|
||||
lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c
|
||||
lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c
|
||||
lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c
|
||||
lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c
|
||||
lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c
|
||||
lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c
|
||||
lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c
|
||||
lapacke_cpo_nancheck.c lapacke_dtr_nancheck.c lapacke_zhe_trans.c
|
||||
lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c
|
||||
lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c
|
||||
lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c
|
||||
lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c
|
||||
lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c
|
||||
lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c
|
||||
lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c
|
||||
lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c
|
||||
lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c
|
||||
lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c
|
||||
lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c
|
||||
lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c
|
||||
lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c
|
||||
lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c
|
||||
lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c
|
||||
lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c
|
||||
lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c
|
||||
lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c
|
||||
lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c
|
||||
lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c
|
||||
lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c
|
||||
lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c
|
||||
lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c
|
||||
lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c
|
||||
lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c
|
||||
lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c
|
||||
lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c
|
||||
lapacke_dpb_nancheck.c lapacke_sst_nancheck.c
|
||||
lapacke_dpb_trans.c lapacke_ssy_nancheck.c
|
||||
)
|
||||
|
||||
set(LAPACKE_REL_SRC "")
|
||||
if (BUILD_SINGLE)
|
||||
list(APPEND LAPACKE_REL_SRC ${SSRC})
|
||||
@@ -2061,6 +2114,10 @@ foreach (LAE_FILE ${LAPACKE_REL_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}")
|
||||
endforeach ()
|
||||
|
||||
foreach (Utils_FILE ${Utils_SRC})
|
||||
list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}")
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h")
|
||||
include_directories(${lapacke_include_dir})
|
||||
|
||||
3
common.h
3
common.h
@@ -332,12 +332,13 @@ typedef int blasint;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
|
||||
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
|
||||
stack_alloc_size = 0; \
|
||||
STACK_ALLOC_PROTECT_SET \
|
||||
TYPE stack_buffer[stack_alloc_size]; \
|
||||
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
|
||||
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||
#else
|
||||
//Original OpenBLAS/GotoBLAS codes.
|
||||
|
||||
@@ -396,7 +396,7 @@ REALNAME:
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#define EPILOGUE .end REALNAME
|
||||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
|
||||
@@ -191,6 +191,8 @@ void get_cpuconfig(void)
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
|
||||
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
|
||||
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
|
||||
F LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
|
||||
@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||
#endif
|
||||
|
||||
x = buffer;
|
||||
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
|
||||
buffer += ((COMPSIZE * args -> m + 3) & ~3);
|
||||
}
|
||||
|
||||
#ifndef TRANS
|
||||
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||
|
||||
if (num_cpu) {
|
||||
queue[0].sa = NULL;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
|
||||
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;
|
||||
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
||||
@@ -99,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
# for gemm3m
|
||||
if(USE_GEMM3M)
|
||||
foreach (GEMM_DEFINE ${GEMM_DEFINES})
|
||||
string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type})
|
||||
endif ()
|
||||
endforeach ()
|
||||
endif()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ set(COMMON_SOURCES
|
||||
xerbla.c
|
||||
openblas_set_num_threads.c
|
||||
openblas_error_handle.c
|
||||
openblas_env.c
|
||||
openblas_get_num_procs.c
|
||||
openblas_get_num_threads.c
|
||||
)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
TOPDIR = ../..
|
||||
include ../../Makefile.system
|
||||
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
|
||||
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
|
||||
|
||||
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
|
||||
|
||||
@@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
|
||||
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
openblas_env.$(SUFFIX) : openblas_env.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
||||
@@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
#endif
|
||||
|
||||
extern unsigned int openblas_thread_timeout();
|
||||
|
||||
#ifdef SMP_SERVER
|
||||
|
||||
#undef MONITOR
|
||||
@@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
|
||||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
int thread_timeout_env;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
@@ -540,22 +543,12 @@ int blas_thread_init(void){
|
||||
|
||||
if (!blas_server_avail){
|
||||
|
||||
env_var_t p;
|
||||
|
||||
if (readenv(p,"THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}else{
|
||||
if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
|
||||
thread_timeout = atoi(p);
|
||||
if (thread_timeout < 4) thread_timeout = 4;
|
||||
if (thread_timeout > 30) thread_timeout = 30;
|
||||
thread_timeout = (1 << thread_timeout);
|
||||
}
|
||||
}
|
||||
|
||||
thread_timeout_env=openblas_thread_timeout();
|
||||
if (thread_timeout_env>0) {
|
||||
if (thread_timeout_env < 4) thread_timeout_env = 4;
|
||||
if (thread_timeout_env > 30) thread_timeout_env = 30;
|
||||
thread_timeout = (1 << thread_timeout_env);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
|
||||
|
||||
@@ -391,7 +391,7 @@ static char *corename[] = {
|
||||
"Nehalem",
|
||||
"Athlon",
|
||||
"Opteron",
|
||||
"Opteron(SSE3)",
|
||||
"Opteron_SSE3",
|
||||
"Barcelona",
|
||||
"Nano",
|
||||
"Sandybridge",
|
||||
|
||||
@@ -294,8 +294,11 @@ void openblas_fork_handler()
|
||||
#endif
|
||||
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
env_var_t p;
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
@@ -310,18 +313,18 @@ int blas_get_cpu_number(void){
|
||||
|
||||
blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
blas_omp_num = 0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
@@ -1340,6 +1343,7 @@ static void gotoblas_memory_init(void) {
|
||||
/* Initialization for all function; this function should be called before main */
|
||||
|
||||
static int gotoblas_initialized = 0;
|
||||
extern void openblas_read_env();
|
||||
|
||||
void CONSTRUCTOR gotoblas_init(void) {
|
||||
|
||||
@@ -1349,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||
openblas_fork_handler();
|
||||
#endif
|
||||
|
||||
openblas_read_env();
|
||||
|
||||
#ifdef PROFILE
|
||||
moncontrol (0);
|
||||
#endif
|
||||
@@ -1365,7 +1371,8 @@ void CONSTRUCTOR gotoblas_init(void) {
|
||||
gotoblas_memory_init();
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
//#if defined(OS_LINUX)
|
||||
#if 0
|
||||
struct rlimit curlimit;
|
||||
if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
|
||||
{
|
||||
|
||||
84
driver/others/openblas_env.c
Normal file
84
driver/others/openblas_env.c
Normal file
@@ -0,0 +1,84 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
static int openblas_env_verbose=0;
|
||||
static unsigned int openblas_env_thread_timeout=0;
|
||||
static int openblas_env_block_factor=0;
|
||||
static int openblas_env_openblas_num_threads=0;
|
||||
static int openblas_env_goto_num_threads=0;
|
||||
static int openblas_env_omp_num_threads=0;
|
||||
|
||||
int openblas_verbose() { return openblas_env_verbose;}
|
||||
unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
|
||||
int openblas_block_factor() { return openblas_env_block_factor;}
|
||||
int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
|
||||
int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
|
||||
int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
|
||||
|
||||
void openblas_read_env() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_verbose=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_block_factor=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_thread_timeout=(unsigned int)ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_openblas_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_goto_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_omp_num_threads=ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int openblas_verbose() {
|
||||
int ret=0;
|
||||
env_var_t p;
|
||||
if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
return ret;
|
||||
}
|
||||
extern int openblas_verbose();
|
||||
|
||||
void openblas_warning(int verbose, const char * msg) {
|
||||
int current_verbose;
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include <string.h>
|
||||
#include "common.h"
|
||||
|
||||
extern int openblas_block_factor();
|
||||
int get_L2_size(void);
|
||||
|
||||
#define DEFAULT_GEMM_P 128
|
||||
@@ -249,7 +250,6 @@ int get_L2_size(void){
|
||||
|
||||
void blas_set_parameter(void){
|
||||
|
||||
env_var_t p;
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
|
||||
int size = 16;
|
||||
@@ -468,9 +468,8 @@ void blas_set_parameter(void){
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
if (readenv(p,"GOTO_BLOCK_FACTOR")) {
|
||||
factor = atoi(p);
|
||||
factor=openblas_block_factor();
|
||||
if (factor>0) {
|
||||
if (factor < 10) factor = 10;
|
||||
if (factor > 200) factor = 200;
|
||||
|
||||
|
||||
@@ -26,6 +26,10 @@ ifndef ONLY_CBLAS
|
||||
ONLY_CBLAS = 0
|
||||
endif
|
||||
|
||||
ifndef BUILD_LAPACK_DEPRECATED
|
||||
BUILD_LAPACK_DEPRECATED = 0
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifndef ONLY_CBLAS
|
||||
@@ -92,10 +96,10 @@ dll : ../$(LIBDLLNAME)
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
libopenblas.def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
@@ -205,23 +209,23 @@ static : ../$(LIBNAME)
|
||||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
||||
@@ -590,6 +590,13 @@
|
||||
dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx,
|
||||
);
|
||||
|
||||
@lapack_deprecated_objs = (
|
||||
cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp,
|
||||
cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd,
|
||||
cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm,
|
||||
cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf,
|
||||
);
|
||||
|
||||
@lapackeobjs = (
|
||||
# LAPACK C interface routines.
|
||||
#
|
||||
@@ -2984,6 +2991,11 @@ if ($ARGV[8] == 1) {
|
||||
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
|
||||
};
|
||||
|
||||
if ($ARGV[11] == 1){
|
||||
#BUILD_LAPACK_DEPRECATED=1
|
||||
@underscore_objs =(@underscore_objs, @lapack_deprecated_objs);
|
||||
}
|
||||
|
||||
} else {
|
||||
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
|
||||
}
|
||||
|
||||
@@ -77,6 +77,7 @@ void NAME(char *TRANS, blasint *M, blasint *N,
|
||||
blasint incy = *INCY;
|
||||
|
||||
FLOAT *buffer;
|
||||
int buffer_size;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
#endif
|
||||
@@ -141,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
FLOAT *buffer;
|
||||
blasint lenx, leny;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
blasint info, t;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
@@ -230,7 +231,19 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incx < 0) x -= (lenx - 1) * incx * 2;
|
||||
if (incy < 0) y -= (leny - 1) * incy * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT) ;
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0
|
||||
// cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746)
|
||||
if(trans && stack_alloc_size)
|
||||
memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size));
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
@@ -253,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
||||
@@ -210,7 +210,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (incy < 0) y -= (n - 1) * incy * 2;
|
||||
if (incx < 0) x -= (m - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
STACK_ALLOC(2 * m, FLOAT, buffer);
|
||||
|
||||
#ifdef SMPTEST
|
||||
// Threshold chosen so that speed-up is > 1 on a Xeon E5-2630
|
||||
@@ -249,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n);
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
|
||||
blasint info;
|
||||
int uplo;
|
||||
int unit;
|
||||
int trans;
|
||||
int trans, buffer_size;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
|
||||
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {
|
||||
|
||||
int trans, uplo, unit;
|
||||
int trans, uplo, unit, buffer_size;
|
||||
blasint info;
|
||||
FLOAT *buffer;
|
||||
#ifdef SMP
|
||||
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx * 2;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
#ifdef SMP
|
||||
// Calibrated on a Xeon E5-2630
|
||||
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
|
||||
nthreads = num_cpu_avail(2);
|
||||
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 2;
|
||||
} else
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
|
||||
if(incx != 1)
|
||||
buffer_size += n * 2;
|
||||
}
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
}
|
||||
#endif
|
||||
|
||||
blas_memory_free(buffer);
|
||||
STACK_FREE(buffer);
|
||||
|
||||
FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);
|
||||
|
||||
|
||||
@@ -227,6 +227,28 @@ foreach (float_type ${FLOAT_TYPES})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
#gemm3m
|
||||
if (USE_GEMM3M)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM3MKERNEL}" "NN" "gemm3m_kernel" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_oncopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_oncopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_oncopyi" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_otcopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_otcopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_otcopyi" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_incopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_incopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_incopyi" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_itcopyb" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_itcopyr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_itcopyi" false "" "" false ${float_type})
|
||||
|
||||
endif()
|
||||
|
||||
else () #For real
|
||||
GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
|
||||
|
||||
|
||||
@@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_4x4.S
|
||||
DTRMMKERNEL = dtrmm_kernel_4x4.S
|
||||
CTRMMKERNEL = ctrmm_kernel_4x4.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_4x4.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_4x4.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
endif
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_4x4.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
|
||||
2044
kernel/arm64/cgemm_kernel_8x4.S
Executable file
2044
kernel/arm64/cgemm_kernel_8x4.S
Executable file
File diff suppressed because it is too large
Load Diff
2425
kernel/arm64/ctrmm_kernel_8x4.S
Executable file
2425
kernel/arm64/ctrmm_kernel_8x4.S
Executable file
File diff suppressed because it is too large
Load Diff
@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pA x15
|
||||
#define ppC x16
|
||||
#define ppCRow0 x17
|
||||
#define ppCRow1 x18
|
||||
#define ppCRow2 x19
|
||||
#define ppA x20
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define ppC x17
|
||||
#define ppCRow0 x18
|
||||
#define ppCRow1 x19
|
||||
#define ppCRow2 x20
|
||||
#define ppCRow3 x21
|
||||
#define ppA x22
|
||||
#define alpha x23
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaV0 v10.d[0]
|
||||
#define alpha1 d11
|
||||
#define alphaV1 v11.d[0]
|
||||
#define alpha2 d14
|
||||
#define alphaV2 v14.d[0]
|
||||
#define alpha3 d15
|
||||
#define alphaV3 v15.d[0]
|
||||
|
||||
#define A_PRE_SIZE 1024
|
||||
#define B_PRE_SIZE 1024
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pA
|
||||
// 16 ppC
|
||||
// 17 ppCRow0
|
||||
// 18 must save ppCRow1
|
||||
// 19 must save ppCRow2
|
||||
// 20 must save ppA
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 ppC
|
||||
// 18 must save ppCRow0
|
||||
// 19 must save ppCRow1
|
||||
// 20 must save ppCRow2
|
||||
// 21 must save ppCRow3
|
||||
// 22 must save ppA
|
||||
// 23 must save alpha
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//v08 must save pB00, pB01
|
||||
//v09 must save pB02, pB03
|
||||
//v10 must save ALPHA0
|
||||
//v11 must save ALPHA1
|
||||
//v11 must save
|
||||
//v12 must save pB10, pB11
|
||||
//v13 must save pB12, pB13
|
||||
//v14 must save ALPHA2
|
||||
//v15 must save ALPHA3
|
||||
//v14 must save
|
||||
//v15 must save
|
||||
//v16 must save C00, C01
|
||||
//v17 must save C02, C03
|
||||
//v18 ppC00, ppC01
|
||||
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_I
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v16.2d, v0.2d, v8.2d[0]
|
||||
fmul v29.2d, v1.2d, v9.2d[1]
|
||||
fmul v29.2d, v1.2d, v11.2d[0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppA]
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v20.2d, v0.2d, v8.2d[1]
|
||||
fmul v25.2d, v1.2d, v9.2d[0]
|
||||
fmul v20.2d, v0.2d, v9.2d[0]
|
||||
fmul v25.2d, v1.2d, v10.2d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmul v18.2d, v2.2d, v8.2d[0]
|
||||
fmul v31.2d, v3.2d, v9.2d[1]
|
||||
fmul v22.2d, v2.2d, v8.2d[1]
|
||||
fmul v27.2d, v3.2d, v9.2d[0]
|
||||
fmul v31.2d, v3.2d, v11.2d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
|
||||
|
||||
fmul v24.2d, v0.2d, v9.2d[0]
|
||||
fmul v21.2d, v1.2d, v8.2d[1]
|
||||
fmul v22.2d, v2.2d, v9.2d[0]
|
||||
fmul v27.2d, v3.2d, v10.2d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d} , [pA] // for next round
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v24.2d, v0.2d, v10.2d[0]
|
||||
fmul v21.2d, v1.2d, v9.2d[0]
|
||||
|
||||
ldp q4, q5, [pA] // for next round
|
||||
add pA, pA, #32
|
||||
|
||||
fmul v26.2d, v2.2d, v9.2d[0]
|
||||
fmul v23.2d, v3.2d, v8.2d[1]
|
||||
fmul v26.2d, v2.2d, v10.2d[0]
|
||||
fmul v23.2d, v3.2d, v9.2d[0]
|
||||
|
||||
ld1 {v6.2d, v7.2d} , [ppA] // for next round
|
||||
ldp q6, q7, [ppA] // for next round
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmul v28.2d, v0.2d, v9.2d[1]
|
||||
fmul v28.2d, v0.2d, v11.2d[0]
|
||||
fmul v17.2d, v1.2d, v8.2d[0]
|
||||
fmul v30.2d, v2.2d, v9.2d[1]
|
||||
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmul v30.2d, v2.2d, v11.2d[0]
|
||||
fmul v19.2d, v3.2d, v8.2d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M2
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v31.2d, v7.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v31.2d, v7.2d, v15.2d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #512]
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v22.2d, v6.2d, v12.2d[1]
|
||||
fmla v27.2d, v7.2d, v13.2d[0]
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
fmla v22.2d, v6.2d, v13.2d[0]
|
||||
fmla v27.2d, v7.2d, v14.2d[0]
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v26.2d, v6.2d, v13.2d[0]
|
||||
fmla v23.2d, v7.2d, v12.2d[1]
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v26.2d, v6.2d, v14.2d[0]
|
||||
fmla v23.2d, v7.2d, v13.2d[0]
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppA]
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v30.2d, v6.2d, v13.2d[1]
|
||||
fmla v30.2d, v6.2d, v15.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_M1
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pB] // for next round
|
||||
add pB, pB, #32
|
||||
ldp d12, d13, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v31.2d, v3.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v31.2d, v3.2d, v11.2d[0]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #512]
|
||||
ldp d14, d15, [pB]
|
||||
add pB, pB, #16
|
||||
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v27.2d, v3.2d, v9.2d[0]
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
|
||||
prfm PLDL1KEEP, [ppA, #512]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v22.2d, v2.2d, v9.2d[0]
|
||||
fmla v27.2d, v3.2d, v10.2d[0]
|
||||
|
||||
ld1 {v4.2d, v5.2d} , [pA] // for next round
|
||||
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
|
||||
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
|
||||
ldp q4, q5, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v26.2d, v2.2d, v9.2d[0]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v26.2d, v2.2d, v10.2d[0]
|
||||
fmla v23.2d, v3.2d, v9.2d[0]
|
||||
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
|
||||
ld1 {v6.2d, v7.2d} , [ppA] // for next round
|
||||
ldp q6, q7, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v30.2d, v2.2d, v9.2d[1]
|
||||
fmla v30.2d, v2.2d, v11.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_E
|
||||
fmla v16.2d, v4.2d, v12.2d[0]
|
||||
fmla v25.2d, v5.2d, v13.2d[0]
|
||||
fmla v25.2d, v5.2d, v14.2d[0]
|
||||
fmla v18.2d, v6.2d, v12.2d[0]
|
||||
fmla v27.2d, v7.2d, v13.2d[0]
|
||||
fmla v27.2d, v7.2d, v14.2d[0]
|
||||
|
||||
fmla v20.2d, v4.2d, v12.2d[1]
|
||||
fmla v29.2d, v5.2d, v13.2d[1]
|
||||
fmla v22.2d, v6.2d, v12.2d[1]
|
||||
fmla v31.2d, v7.2d, v13.2d[1]
|
||||
fmla v20.2d, v4.2d, v13.2d[0]
|
||||
fmla v29.2d, v5.2d, v15.2d[0]
|
||||
fmla v22.2d, v6.2d, v13.2d[0]
|
||||
fmla v31.2d, v7.2d, v15.2d[0]
|
||||
|
||||
fmla v24.2d, v4.2d, v13.2d[0]
|
||||
fmla v24.2d, v4.2d, v14.2d[0]
|
||||
fmla v17.2d, v5.2d, v12.2d[0]
|
||||
fmla v26.2d, v6.2d, v13.2d[0]
|
||||
fmla v26.2d, v6.2d, v14.2d[0]
|
||||
fmla v19.2d, v7.2d, v12.2d[0]
|
||||
|
||||
fmla v28.2d, v4.2d, v13.2d[1]
|
||||
fmla v21.2d, v5.2d, v12.2d[1]
|
||||
fmla v30.2d, v6.2d, v13.2d[1]
|
||||
fmla v23.2d, v7.2d, v12.2d[1]
|
||||
fmla v28.2d, v4.2d, v15.2d[0]
|
||||
fmla v21.2d, v5.2d, v13.2d[0]
|
||||
fmla v30.2d, v6.2d, v15.2d[0]
|
||||
fmla v23.2d, v7.2d, v13.2d[0]
|
||||
.endm
|
||||
|
||||
.macro KERNEL8x4_SUB
|
||||
ld1 {v8.2d, v9.2d}, [pB]
|
||||
add pB, pB, #32
|
||||
ld1 {v0.2d, v1.2d}, [pA]
|
||||
ldp d8, d9, [pB]
|
||||
add pB, pB, #16
|
||||
ldp d10, d11, [pB]
|
||||
add pB, pB, #16
|
||||
ldp q0, q1, [pA]
|
||||
add pA, pA, #32
|
||||
|
||||
fmla v16.2d, v0.2d, v8.2d[0]
|
||||
fmla v29.2d, v1.2d, v9.2d[1]
|
||||
fmla v20.2d, v0.2d, v8.2d[1]
|
||||
fmla v25.2d, v1.2d, v9.2d[0]
|
||||
fmla v29.2d, v1.2d, v11.2d[0]
|
||||
fmla v20.2d, v0.2d, v9.2d[0]
|
||||
fmla v25.2d, v1.2d, v10.2d[0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppA]
|
||||
ldp q2, q3, [ppA]
|
||||
add ppA, ppA, #32
|
||||
|
||||
fmla v24.2d, v0.2d, v9.2d[0]
|
||||
fmla v21.2d, v1.2d, v8.2d[1]
|
||||
fmla v28.2d, v0.2d, v9.2d[1]
|
||||
fmla v24.2d, v0.2d, v10.2d[0]
|
||||
fmla v21.2d, v1.2d, v9.2d[0]
|
||||
fmla v28.2d, v0.2d, v11.2d[0]
|
||||
fmla v17.2d, v1.2d, v8.2d[0]
|
||||
|
||||
fmla v18.2d, v2.2d, v8.2d[0]
|
||||
fmla v31.2d, v3.2d, v9.2d[1]
|
||||
fmla v22.2d, v2.2d, v8.2d[1]
|
||||
fmla v27.2d, v3.2d, v9.2d[0]
|
||||
fmla v31.2d, v3.2d, v11.2d[0]
|
||||
fmla v22.2d, v2.2d, v9.2d[0]
|
||||
fmla v27.2d, v3.2d, v10.2d[0]
|
||||
|
||||
fmla v26.2d, v2.2d, v9.2d[0]
|
||||
fmla v23.2d, v3.2d, v8.2d[1]
|
||||
fmla v30.2d, v2.2d, v9.2d[1]
|
||||
fmla v26.2d, v2.2d, v10.2d[0]
|
||||
fmla v23.2d, v3.2d, v9.2d[0]
|
||||
fmla v30.2d, v2.2d, v11.2d[0]
|
||||
fmla v19.2d, v3.2d, v8.2d[0]
|
||||
.endm
|
||||
|
||||
.macro SAVE8x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
add ppCRow0, pCRow0, #32
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pCRow0]
|
||||
ldp q0, q1, [pCRow0]
|
||||
fmla v0.2d, v16.2d, alphaV0
|
||||
fmla v1.2d, v17.2d, alphaV1
|
||||
st1 {v0.2d, v1.2d}, [pCRow0]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV2
|
||||
fmla v3.2d, v19.2d, alphaV3
|
||||
st1 {v2.2d, v3.2d}, [ppCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
add ppCRow1, ppCRow0, LDC
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV1
|
||||
st1 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV2
|
||||
fmla v7.2d, v23.2d, alphaV3
|
||||
st1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
add ppCRow2, ppCRow1, LDC
|
||||
|
||||
ld1 {v0.2d, v1.2d}, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0
|
||||
fmla v1.2d, v25.2d, alphaV1
|
||||
st1 {v0.2d, v1.2d}, [pCRow2]
|
||||
|
||||
ld1 {v2.2d, v3.2d}, [ppCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV2
|
||||
fmla v3.2d, v27.2d, alphaV3
|
||||
st1 {v2.2d, v3.2d}, [ppCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
add ppCRow1, ppCRow2, LDC
|
||||
|
||||
ld1 {v4.2d, v5.2d}, [pCRow1]
|
||||
fmla v4.2d, v28.2d, alphaV0
|
||||
fmla v5.2d, v29.2d, alphaV1
|
||||
st1 {v4.2d, v5.2d}, [pCRow1]
|
||||
|
||||
ld1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
fmla v6.2d, v30.2d, alphaV2
|
||||
fmla v7.2d, v31.2d, alphaV3
|
||||
st1 {v6.2d, v7.2d}, [ppCRow1]
|
||||
fmla v1.2d, v17.2d, alphaV0
|
||||
stp q0, q1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #64
|
||||
|
||||
ldp q2, q3, [ppCRow0]
|
||||
fmla v2.2d, v18.2d, alphaV0
|
||||
fmla v3.2d, v19.2d, alphaV0
|
||||
stp q2, q3, [ppCRow0]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
add ppCRow1, pCRow1, #32
|
||||
|
||||
ldp q4, q5, [pCRow1]
|
||||
fmla v4.2d, v20.2d, alphaV0
|
||||
fmla v5.2d, v21.2d, alphaV0
|
||||
stp q4, q5, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, #64
|
||||
|
||||
ldp q6, q7, [ppCRow1]
|
||||
fmla v6.2d, v22.2d, alphaV0
|
||||
fmla v7.2d, v23.2d, alphaV0
|
||||
stp q6, q7, [ppCRow1]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
add ppCRow2, pCRow2, #32
|
||||
|
||||
ldp q0, q1, [pCRow2]
|
||||
fmla v0.2d, v24.2d, alphaV0
|
||||
fmla v1.2d, v25.2d, alphaV0
|
||||
stp q0, q1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, #64
|
||||
|
||||
ldp q2, q3, [ppCRow2]
|
||||
fmla v2.2d, v26.2d, alphaV0
|
||||
fmla v3.2d, v27.2d, alphaV0
|
||||
stp q2, q3, [ppCRow2]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
add ppCRow3, pCRow3, #32
|
||||
|
||||
ldp q4, q5, [pCRow3]
|
||||
fmla v4.2d, v28.2d, alphaV0
|
||||
fmla v5.2d, v29.2d, alphaV0
|
||||
stp q4, q5, [pCRow3]
|
||||
|
||||
add pCRow3, pCRow3, #64
|
||||
|
||||
ldp q6, q7, [ppCRow3]
|
||||
fmla v6.2d, v30.2d, alphaV0
|
||||
fmla v7.2d, v31.2d, alphaV0
|
||||
stp q6, q7, [ppCRow3]
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
@@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE4x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV2
|
||||
fmla v13.2d, v21.2d, alphaV3
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
fmla v9.2d, v25.2d, alphaV1
|
||||
fmla v9.2d, v25.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v28.2d, alphaV2
|
||||
fmla v13.2d, v29.2d, alphaV3
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
fmla v13.2d, v29.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
@@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE2x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow2, pCRow1, LDC
|
||||
|
||||
ld1 {v8.2d}, [pCRow2]
|
||||
fmla v8.2d, v24.2d, alphaV2
|
||||
fmla v8.2d, v24.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow2]
|
||||
|
||||
add pCRow1, pCRow2, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v28.2d, alphaV3
|
||||
fmla v12.2d, v28.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE1x4
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
ld1 {v12.d}[0], [pCRow2]
|
||||
ld1 {v12.d}[1], [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.d}[0], [pCRow2]
|
||||
st1 {v12.d}[1], [pCRow1]
|
||||
|
||||
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE4x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d, v13.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV2
|
||||
fmla v13.2d, v21.2d, alphaV3
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
fmla v13.2d, v21.2d, alphaV0
|
||||
st1 {v12.2d, v13.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE2x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v12.2d}, [pCRow1]
|
||||
fmla v12.2d, v20.2d, alphaV1
|
||||
fmla v12.2d, v20.2d, alphaV0
|
||||
st1 {v12.2d}, [pCRow1]
|
||||
|
||||
add pCRow0, pCRow0, #16
|
||||
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE1x2
|
||||
fmov alpha0, alpha
|
||||
|
||||
add pCRow1 , pCRow0, LDC
|
||||
|
||||
ld1 {v8.d}[0], [pCRow0]
|
||||
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE4x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d, v9.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
fmla v9.2d, v17.2d, alphaV1
|
||||
fmla v9.2d, v17.2d, alphaV0
|
||||
st1 {v8.2d, v9.2d}, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, #32
|
||||
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE2x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ld1 {v8.2d}, [pCRow0]
|
||||
fmla v8.2d, v16.2d, alphaV0
|
||||
st1 {v8.2d}, [pCRow0]
|
||||
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endm
|
||||
|
||||
.macro SAVE1x1
|
||||
fmov alpha0, alpha
|
||||
|
||||
ldr d8, [pCRow0]
|
||||
fmadd d8, d16, alpha0, d8
|
||||
str d8, [pCRow0]
|
||||
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
fmov alpha0, d0
|
||||
fmov alpha1, d0
|
||||
fmov alpha2, d0
|
||||
fmov alpha3, d0
|
||||
fmov alpha, d0
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 8
|
||||
|
||||
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ble dgemm_kernel_L2_BEGIN
|
||||
|
||||
dgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC, pC, LDC, lsl #2
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
lsl temp, origK, #5 // k * 4 * 8
|
||||
mov pA, origPA // pA = start of A array
|
||||
add ppA, temp, pA
|
||||
prfm PLDL1KEEP, [ppA]
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
|
||||
cmp counterI, #0
|
||||
ble dgemm_kernel_L4_M4_BEGIN
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_20:
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #1 // L = K / 2
|
||||
cmp counterL , #2 // is there at least 4 to do?
|
||||
asr counterL , origK, #2 // L = K / 4
|
||||
cmp counterL , #2
|
||||
blt dgemm_kernel_L4_M8_32
|
||||
|
||||
KERNEL8x4_I // do one in the K
|
||||
KERNEL8x4_M2 // do another in the K
|
||||
KERNEL8x4_I
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble dgemm_kernel_L4_M8_22a
|
||||
|
||||
.align 5
|
||||
|
||||
dgemm_kernel_L4_M8_22:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt dgemm_kernel_L4_M8_22
|
||||
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_22a:
|
||||
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dgemm_kernel_L4_M8_44
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble dgemm_kernel_L4_M8_40
|
||||
|
||||
KERNEL8x4_I
|
||||
|
||||
KERNEL8x4_M2
|
||||
KERNEL8x4_M1
|
||||
KERNEL8x4_E
|
||||
|
||||
b dgemm_kernel_L4_M8_44
|
||||
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
|
||||
|
||||
dgemm_kernel_L4_M8_44:
|
||||
|
||||
ands counterL , origK, #1
|
||||
ands counterL , origK, #3
|
||||
ble dgemm_kernel_L4_M8_100
|
||||
|
||||
.align 5
|
||||
dgemm_kernel_L4_M8_46:
|
||||
|
||||
KERNEL8x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne dgemm_kernel_L4_M8_46
|
||||
|
||||
dgemm_kernel_L4_M8_100:
|
||||
lsl temp, origK, #5
|
||||
prfm PLDL1KEEP, [pA, temp]
|
||||
prfm PLDL1KEEP, [ppA, temp]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVE8x4
|
||||
|
||||
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
|
||||
subs counterI, counterI, #1
|
||||
bne dgemm_kernel_L4_M8_20
|
||||
|
||||
|
||||
dgemm_kernel_L4_M4_BEGIN:
|
||||
mov counterI, origM
|
||||
tst counterI , #7
|
||||
|
||||
1689
kernel/arm64/dgemm_kernel_4x8.S
Executable file
1689
kernel/arm64/dgemm_kernel_4x8.S
Executable file
File diff suppressed because it is too large
Load Diff
1570
kernel/arm64/dgemm_kernel_8x4.S
Executable file
1570
kernel/arm64/dgemm_kernel_8x4.S
Executable file
File diff suppressed because it is too large
Load Diff
2026
kernel/arm64/dtrmm_kernel_4x8.S
Executable file
2026
kernel/arm64/dtrmm_kernel_4x8.S
Executable file
File diff suppressed because it is too large
Load Diff
1849
kernel/arm64/dtrmm_kernel_8x4.S
Executable file
1849
kernel/arm64/dtrmm_kernel_8x4.S
Executable file
File diff suppressed because it is too large
Load Diff
1987
kernel/arm64/sgemm_kernel_16x4.S
Normal file
1987
kernel/arm64/sgemm_kernel_16x4.S
Normal file
File diff suppressed because it is too large
Load Diff
2305
kernel/arm64/sgemm_kernel_8x8.S
Normal file
2305
kernel/arm64/sgemm_kernel_8x8.S
Normal file
File diff suppressed because it is too large
Load Diff
2431
kernel/arm64/strmm_kernel_16x4.S
Executable file
2431
kernel/arm64/strmm_kernel_16x4.S
Executable file
File diff suppressed because it is too large
Load Diff
2795
kernel/arm64/strmm_kernel_8x8.S
Executable file
2795
kernel/arm64/strmm_kernel_8x8.S
Executable file
File diff suppressed because it is too large
Load Diff
@@ -147,12 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
fmla v4.4s, v3.4s, v17.4s
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ)
|
||||
|
||||
fmla v5.4s, v2.4s, v17.4s
|
||||
#else
|
||||
fmls v5.4s, v2.4s, v17.4s
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v5.4s, v3.4s, v16.4s
|
||||
#else
|
||||
fmls v5.4s, v3.4s, v16.4s
|
||||
#endif
|
||||
|
||||
st2 {v4.4s, v5.4s}, [Y], #32
|
||||
#else // DOUBLE
|
||||
@@ -165,12 +166,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
fmla v4.2d, v3.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
|
||||
fmla v5.2d, v2.2d, v17.2d
|
||||
#else
|
||||
fmls v5.2d, v2.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v5.2d, v3.2d, v16.2d
|
||||
#else
|
||||
fmls v5.2d, v3.2d, v16.2d
|
||||
#endif
|
||||
|
||||
st2 {v4.2d, v5.2d}, [Y], #32
|
||||
|
||||
@@ -183,13 +185,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
fmla v20.2d, v19.2d, v17.2d
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
fmla v21.2d, v18.2d, v17.2d
|
||||
#else
|
||||
fmls v21.2d, v18.2d, v17.2d
|
||||
#endif
|
||||
fmla v21.2d, v19.2d, v16.2d
|
||||
|
||||
fmla v21.2d, v18.2d, v17.2d
|
||||
#if !defined(CONJ)
|
||||
fmla v21.2d, v19.2d, v16.2d
|
||||
#else
|
||||
fmls v21.2d, v19.2d, v16.2d
|
||||
#endif
|
||||
st2 {v20.2d, v21.2d}, [Y], #32
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #512]
|
||||
|
||||
@@ -46,3 +46,7 @@ ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = zgemm_beta.S
|
||||
endif
|
||||
|
||||
ifndef DSDOTKERNEL
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
endif
|
||||
|
||||
|
||||
@@ -389,19 +389,19 @@ DGEMVTKERNEL = dgemv_t.S
|
||||
endif
|
||||
|
||||
ifndef CGEMVNKERNEL
|
||||
CGEMVNKERNEL = cgemv_n.S
|
||||
CGEMVNKERNEL = cgemv_n_4.c
|
||||
endif
|
||||
|
||||
ifndef CGEMVTKERNEL
|
||||
CGEMVTKERNEL = cgemv_t.S
|
||||
CGEMVTKERNEL = cgemv_t_4.c
|
||||
endif
|
||||
|
||||
ifndef ZGEMVNKERNEL
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
endif
|
||||
|
||||
ifndef ZGEMVTKERNEL
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
endif
|
||||
|
||||
ifndef QGEMVNKERNEL
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
|
||||
@@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||
|
||||
@@ -11,7 +11,7 @@ ZAXPYKERNEL = zaxpy.c
|
||||
SGEMVNKERNEL = sgemv_n_4.c
|
||||
SGEMVTKERNEL = sgemv_t_4.c
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||
|
||||
@@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
|
||||
DGEMVNKERNEL = dgemv_n_4.c
|
||||
DGEMVTKERNEL = dgemv_t_4.c
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||
ZGEMVNKERNEL = zgemv_t_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
DCOPYKERNEL = dcopy_bulldozer.S
|
||||
|
||||
@@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(HASWELL)
|
||||
#include "cgemv_n_microk_haswell-4.c"
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "cgemv_n_microk_bulldozer-4.c"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
541
kernel/x86_64/cgemv_n_microk_bulldozer-4.c
Normal file
541
kernel/x86_64/cgemv_n_microk_bulldozer-4.c
Normal file
@@ -0,0 +1,541 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
BLASLONG register n1 = n & -8 ;
|
||||
BLASLONG register n2 = n & 4 ;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
|
||||
"vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
|
||||
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
|
||||
"vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
|
||||
"vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
|
||||
"vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
|
||||
"vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||
|
||||
"prefetcht0 384(%5,%0,4) \n\t"
|
||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||
"vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 384(%6,%0,4) \n\t"
|
||||
"vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
|
||||
"vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 384(%7,%0,4) \n\t"
|
||||
"vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
|
||||
"vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vfmaddps %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddps %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vfmaddps %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddps %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 384(%3,%0,4) \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,4) \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %8 \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
|
||||
"vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n1), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (n2) // 8
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x2 1
|
||||
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
BLASLONG register n1 = n & -8 ;
|
||||
BLASLONG register n2 = n & 4 ;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
|
||||
"vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
|
||||
"vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||
|
||||
"prefetcht0 384(%5,%0,4) \n\t"
|
||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||
"vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 384(%3,%0,4) \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,4) \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %6 \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n1), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (n2) // 6
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x1 1
|
||||
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
BLASLONG register n1 = n & -8 ;
|
||||
BLASLONG register n2 = n & 4 ;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vbroadcastss (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 384(%3,%0,4) \n\t"
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"subq $8 , %1 \n\t"
|
||||
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
||||
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %5 \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n1), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
"r" (n2) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_ADDY 1
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
if ( inc_dest != 2 )
|
||||
{
|
||||
|
||||
FLOAT temp_r;
|
||||
FLOAT temp_i;
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
#if !defined(XCONJ)
|
||||
temp_r = alpha_r * src[0] - alpha_i * src[1];
|
||||
temp_i = alpha_r * src[1] + alpha_i * src[0];
|
||||
#else
|
||||
temp_r = alpha_r * src[0] + alpha_i * src[1];
|
||||
temp_i = -alpha_r * src[1] + alpha_i * src[0];
|
||||
#endif
|
||||
|
||||
*dest += temp_r;
|
||||
*(dest+1) += temp_i;
|
||||
|
||||
src+=2;
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
i=0;
|
||||
BLASLONG register n1 = n & -8 ;
|
||||
BLASLONG register n2 = n & 4 ;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vbroadcastss (%4), %%ymm0 \n\t" // alpha_r
|
||||
"vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
|
||||
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
|
||||
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from dest
|
||||
"vmovups 32(%3,%0,4), %%ymm11 \n\t"
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"subq $8 , %1 \n\t"
|
||||
"vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
|
||||
"vmovups %%ymm13,-32(%3,%0,4) \n\t"
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"cmpq $4, %6 \n\t"
|
||||
"jne 3f \n\t"
|
||||
|
||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
|
||||
|
||||
"vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%3,%0,4), %%ymm10 \n\t"
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
|
||||
|
||||
"3: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n1), // 1
|
||||
"r" (src), // 2
|
||||
"r" (dest), // 3
|
||||
"r" (&alpha_r), // 4
|
||||
"r" (&alpha_i), // 5
|
||||
"r" (n2) // 6
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 320(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
@@ -202,7 +202,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 320(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
@@ -322,7 +322,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 320(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
|
||||
@@ -454,7 +454,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
||||
"cmpq $0 , %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
|
||||
"vmovups 32(%2,%0,4), %%ymm9 \n\t"
|
||||
|
||||
@@ -30,6 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#if defined(HASWELL)
|
||||
#include "cgemv_t_microk_haswell-4.c"
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "cgemv_t_microk_bulldozer-4.c"
|
||||
#endif
|
||||
|
||||
#define NBMAX 2048
|
||||
|
||||
541
kernel/x86_64/cgemv_t_microk_bulldozer-4.c
Normal file
541
kernel/x86_64/cgemv_t_microk_bulldozer-4.c
Normal file
@@ -0,0 +1,541 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary froms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary from must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
|
||||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
||||
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
|
||||
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
|
||||
"vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
|
||||
"vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
|
||||
"vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
|
||||
"vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
||||
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
|
||||
"vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"prefetcht0 384(%5,%0,4) \n\t"
|
||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
||||
"prefetcht0 384(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"prefetcht0 384(%6,%0,4) \n\t"
|
||||
"vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
|
||||
"prefetcht0 384(%7,%0,4) \n\t"
|
||||
"vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
|
||||
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
||||
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
|
||||
"vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm12, %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm13, %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm14, %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm15, %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"addq $16 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
|
||||
"vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
|
||||
"vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
|
||||
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
|
||||
"vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t"
|
||||
"vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
|
||||
"vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t"
|
||||
"vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
|
||||
#endif
|
||||
|
||||
"vmovsd (%3), %%xmm4 \n\t" // read y
|
||||
"vmovsd 8(%3), %%xmm5 \n\t"
|
||||
"vmovsd 16(%3), %%xmm6 \n\t"
|
||||
"vmovsd 24(%3), %%xmm7 \n\t"
|
||||
|
||||
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
|
||||
"vextractf128 $1, %%ymm10, %%xmm11 \n\t"
|
||||
"vextractf128 $1, %%ymm12, %%xmm13 \n\t"
|
||||
"vextractf128 $1, %%ymm14, %%xmm15 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
|
||||
"vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
|
||||
"vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
|
||||
|
||||
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
|
||||
"vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
|
||||
"vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t"
|
||||
"vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
|
||||
"vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
|
||||
"vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
|
||||
|
||||
|
||||
"vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
"vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
"vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
"vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t"
|
||||
"vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
|
||||
"vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t"
|
||||
"vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t"
|
||||
"vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
|
||||
"vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t"
|
||||
"vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t"
|
||||
"vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
|
||||
#endif
|
||||
|
||||
|
||||
"vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
|
||||
"vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t"
|
||||
"vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t"
|
||||
"vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t"
|
||||
|
||||
"vmovsd %%xmm8 , (%3) \n\t"
|
||||
"vmovsd %%xmm10, 8(%3) \n\t"
|
||||
"vmovsd %%xmm12, 16(%3) \n\t"
|
||||
"vmovsd %%xmm14, 24(%3) \n\t"
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x2 1
|
||||
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
|
||||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
||||
"vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
|
||||
"vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
||||
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"prefetcht0 384(%5,%0,4) \n\t"
|
||||
"vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
||||
"prefetcht0 384(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
"vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
|
||||
|
||||
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
"vfmaddps %%ymm10, %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm11, %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"addq $16 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
|
||||
"vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
|
||||
"vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
|
||||
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
|
||||
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
|
||||
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
|
||||
#endif
|
||||
|
||||
"vmovsd (%3), %%xmm4 \n\t" // read y
|
||||
"vmovsd 8(%3), %%xmm5 \n\t"
|
||||
|
||||
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
|
||||
"vextractf128 $1, %%ymm10, %%xmm11 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
|
||||
|
||||
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
|
||||
"vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
|
||||
|
||||
"vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
"vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t"
|
||||
"vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
|
||||
"vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
|
||||
"vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
|
||||
#endif
|
||||
|
||||
|
||||
"vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
|
||||
"vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t"
|
||||
|
||||
"vmovsd %%xmm8 , (%3) \n\t"
|
||||
"vmovsd %%xmm10, 8(%3) \n\t"
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (alpha) // 6
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x1 1
|
||||
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
|
||||
"vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
||||
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
|
||||
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
|
||||
"prefetcht0 384(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
"vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
|
||||
"vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
|
||||
"vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
|
||||
"vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
|
||||
"vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
|
||||
|
||||
"vfmaddps %%ymm8 , %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
|
||||
"vfmaddps %%ymm9 , %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
|
||||
|
||||
|
||||
"addq $16 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
|
||||
"vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
|
||||
"vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
|
||||
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
|
||||
"vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
|
||||
"vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
|
||||
#endif
|
||||
|
||||
"vmovsd (%3), %%xmm4 \n\t" // read y
|
||||
|
||||
"vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
|
||||
"vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
|
||||
|
||||
"vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
|
||||
"vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
|
||||
"vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
|
||||
"vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
|
||||
#else
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
|
||||
"vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
|
||||
"vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
|
||||
#endif
|
||||
|
||||
|
||||
"vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
|
||||
|
||||
"vmovsd %%xmm8 , (%3) \n\t"
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
"r" (alpha) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
@@ -292,7 +292,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
@@ -446,7 +446,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,4) \n\t"
|
||||
"vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
|
||||
|
||||
@@ -82,7 +82,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"shufpd $0, %%xmm12, %%xmm12 \n\t"
|
||||
"shufpd $0, %%xmm13, %%xmm13 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
|
||||
"movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
|
||||
@@ -129,7 +129,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL_4x2
|
||||
#ifndef HAVE_KERNEL_4x1
|
||||
|
||||
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
@@ -144,7 +144,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||
"mulsd (%5), %%xmm12 \n\t" // alpha
|
||||
"shufpd $0, %%xmm12, %%xmm12 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
|
||||
"movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
|
||||
|
||||
@@ -52,7 +52,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"subq $4 , %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
|
||||
@@ -114,3 +114,78 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x2
|
||||
|
||||
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||
|
||||
"vmovups (%4,%0,8), %%ymm0 \n\t"
|
||||
"vmovups (%5,%0,8), %%ymm1 \n\t"
|
||||
|
||||
"vbroadcastsd (%6), %%ymm6 \n\t" // alpha
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
|
||||
"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
|
||||
"vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
|
||||
"vmovups (%4,%0,8), %%ymm0 \n\t"
|
||||
"vmovups (%5,%0,8), %%ymm1 \n\t"
|
||||
|
||||
"vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
|
||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
|
||||
"vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
"vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t"
|
||||
"vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t"
|
||||
|
||||
|
||||
"vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t"
|
||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
|
||||
"vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y
|
||||
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (alpha) // 6
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6",
|
||||
"%xmm8",
|
||||
"%xmm12", "%xmm13",
|
||||
"memory"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -95,7 +95,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%5,%0,8) , %%xmm14 \n\t" // x
|
||||
@@ -171,7 +171,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
@@ -245,7 +245,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||
"movsd (%2) , %%xmm10 \n\t"
|
||||
"shufpd $0 , %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%3,%0,8) , %%xmm12 \n\t"
|
||||
|
||||
@@ -59,7 +59,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
// "prefetcht0 384(%2,%0,8) \n\t"
|
||||
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
|
||||
|
||||
@@ -131,7 +131,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"shufps $0, %%xmm12, %%xmm12 \n\t"
|
||||
"shufps $0, %%xmm13, %%xmm13 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||
|
||||
@@ -189,7 +189,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 2f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
|
||||
"movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
|
||||
@@ -264,7 +264,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%2,%0,4) , %%xmm12 \n\t"
|
||||
|
||||
@@ -112,7 +112,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"je 4f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
@@ -246,7 +246,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"je 4f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
@@ -105,7 +105,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%5,%0,4) , %%xmm14 \n\t" // x
|
||||
@@ -183,7 +183,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"cmpq $0, %1 \n\t"
|
||||
"je 3f \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%3,%0,4) , %%xmm12 \n\t"
|
||||
@@ -258,7 +258,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||
"movss (%2) , %%xmm10 \n\t"
|
||||
"shufps $0 , %%xmm10 , %%xmm10 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"movups (%3,%0,4) , %%xmm12 \n\t"
|
||||
|
||||
@@ -75,7 +75,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
"je 4f \n\t"
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 384(%2,%0,4) \n\t"
|
||||
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
|
||||
|
||||
@@ -34,9 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "zgemv_n_microk_haswell-4.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "zgemv_n_microk_sandy-4.c"
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
|
||||
#include "zgemv_n_microk_bulldozer-4.c"
|
||||
#endif
|
||||
|
||||
|
||||
#define NBMAX 1024
|
||||
|
||||
#ifndef HAVE_KERNEL_4x4
|
||||
|
||||
514
kernel/x86_64/zgemv_n_microk_bulldozer-4.c
Normal file
514
kernel/x86_64/zgemv_n_microk_bulldozer-4.c
Normal file
@@ -0,0 +1,514 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
if ( n > 384 )
|
||||
{
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
|
||||
"vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
|
||||
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
|
||||
"vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
|
||||
"vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
|
||||
"vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
|
||||
"vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 512(%4,%0,8) \n\t"
|
||||
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
|
||||
|
||||
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
|
||||
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
|
||||
|
||||
"prefetcht0 512(%5,%0,8) \n\t"
|
||||
|
||||
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"prefetcht0 512(%6,%0,8) \n\t"
|
||||
|
||||
"vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
|
||||
"vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
|
||||
"vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
|
||||
|
||||
"vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"prefetcht0 512(%7,%0,8) \n\t"
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%3,%0,8), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
|
||||
"vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
|
||||
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
|
||||
"vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
|
||||
"vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
|
||||
"vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
|
||||
"vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
|
||||
|
||||
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
|
||||
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
|
||||
|
||||
|
||||
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
|
||||
"vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
|
||||
"vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
|
||||
"vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
|
||||
|
||||
"vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
|
||||
"vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%3,%0,8), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]), // 5
|
||||
"r" (ap[2]), // 6
|
||||
"r" (ap[3]) // 7
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
#define HAVE_KERNEL_4x2 1
|
||||
static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
|
||||
"vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
|
||||
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
|
||||
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
|
||||
|
||||
"vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
|
||||
"vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
|
||||
|
||||
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
|
||||
"vmovups (%3,%0,8), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
"r" (ap[1]) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define HAVE_KERNEL_4x1 1
|
||||
static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
|
||||
|
||||
static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
|
||||
BLASLONG register i = 0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
|
||||
|
||||
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%3,%0,8), %%ymm10 \n\t"
|
||||
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap) // 4
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#define HAVE_KERNEL_ADDY 1
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
|
||||
{
|
||||
BLASLONG i;
|
||||
|
||||
if ( inc_dest != 2 )
|
||||
{
|
||||
|
||||
FLOAT temp_r;
|
||||
FLOAT temp_i;
|
||||
for ( i=0; i<n; i++ )
|
||||
{
|
||||
#if !defined(XCONJ)
|
||||
temp_r = alpha_r * src[0] - alpha_i * src[1];
|
||||
temp_i = alpha_r * src[1] + alpha_i * src[0];
|
||||
#else
|
||||
temp_r = alpha_r * src[0] + alpha_i * src[1];
|
||||
temp_i = -alpha_r * src[1] + alpha_i * src[0];
|
||||
#endif
|
||||
|
||||
*dest += temp_r;
|
||||
*(dest+1) += temp_i;
|
||||
|
||||
src+=2;
|
||||
dest += inc_dest;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
i=0;
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
|
||||
"vzeroupper \n\t"
|
||||
|
||||
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha_r
|
||||
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
|
||||
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
|
||||
"vmovups 32(%2,%0,8), %%ymm9 \n\t"
|
||||
|
||||
"vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
|
||||
"vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
|
||||
"vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
|
||||
"vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
|
||||
|
||||
"vmovups (%3,%0,8), %%ymm10 \n\t" // 2 complex values from dest
|
||||
"vmovups 32(%3,%0,8), %%ymm11 \n\t"
|
||||
|
||||
#if !defined(XCONJ)
|
||||
"vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
|
||||
"vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
|
||||
#else
|
||||
"vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
|
||||
"vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
|
||||
"vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
|
||||
"vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
|
||||
#endif
|
||||
|
||||
"vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
|
||||
"vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
|
||||
|
||||
"vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
|
||||
"vmovups %%ymm13, 32(%3,%0,8) \n\t"
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
:
|
||||
"r" (i), // 0
|
||||
"r" (n), // 1
|
||||
"r" (src), // 2
|
||||
"r" (dest), // 3
|
||||
"r" (&alpha_r), // 4
|
||||
"r" (&alpha_i) // 5
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
"vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,8) \n\t"
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
@@ -152,7 +152,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
|
||||
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
|
||||
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,8) \n\t"
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
@@ -236,7 +236,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
|
||||
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%4,%0,8) \n\t"
|
||||
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
|
||||
@@ -338,7 +338,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
|
||||
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha_r
|
||||
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
"prefetcht0 192(%2,%0,8) \n\t"
|
||||
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
|
||||
|
||||
@@ -46,7 +46,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
|
||||
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"prefetcht0 192(%2,%0,8) \n\t"
|
||||
@@ -219,7 +219,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
|
||||
"vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"prefetcht0 192(%2,%0,8) \n\t"
|
||||
@@ -341,7 +341,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
|
||||
"vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
|
||||
"vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
|
||||
|
||||
".align 16 \n\t"
|
||||
// ".align 16 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"prefetcht0 192(%2,%0,8) \n\t"
|
||||
|
||||
@@ -51,8 +51,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
}
|
||||
#endif
|
||||
/* Allocate memory for working array(s) */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) );
|
||||
if( work == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
@@ -63,8 +62,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
res = LAPACKE_clantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda,
|
||||
work );
|
||||
/* Release memory and exit */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
LAPACKE_free( work );
|
||||
}
|
||||
exit_level_0:
|
||||
|
||||
@@ -51,8 +51,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
}
|
||||
#endif
|
||||
/* Allocate memory for working array(s) */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) );
|
||||
if( work == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
@@ -63,8 +62,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
res = LAPACKE_dlantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda,
|
||||
work );
|
||||
/* Release memory and exit */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
LAPACKE_free( work );
|
||||
}
|
||||
exit_level_0:
|
||||
|
||||
@@ -38,10 +38,10 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo,
|
||||
const double* a, lapack_int lda, double* work )
|
||||
{
|
||||
lapack_int info = 0;
|
||||
double res = 0.;
|
||||
double res = 0.;
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
|
||||
res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
|
||||
if( info < 0 ) {
|
||||
info = info - 1;
|
||||
}
|
||||
|
||||
@@ -74,11 +74,10 @@ lapack_int LAPACKE_dormbr_work( int matrix_layout, char vect, char side,
|
||||
}
|
||||
/* Allocate memory for temporary array(s) */
|
||||
if( LAPACKE_lsame( vect, 'q' ) ) {
|
||||
a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * k );
|
||||
a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,k) );
|
||||
} else {
|
||||
a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * nq );
|
||||
a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,nq) );
|
||||
}
|
||||
|
||||
if( a_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
@@ -89,11 +88,7 @@ lapack_int LAPACKE_dormbr_work( int matrix_layout, char vect, char side,
|
||||
goto exit_level_1;
|
||||
}
|
||||
/* Transpose input matrices */
|
||||
if( LAPACKE_lsame( vect, 'q' ) ) {
|
||||
LAPACKE_dge_trans( matrix_layout, nq, k, a, lda, a_t, lda_t );
|
||||
} else {
|
||||
LAPACKE_dge_trans( matrix_layout, k, nq, a, lda, a_t, lda_t );
|
||||
}
|
||||
LAPACKE_dge_trans( matrix_layout, r, MIN(nq,k), a, lda, a_t, lda_t );
|
||||
LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t );
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_dormbr( &vect, &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t,
|
||||
|
||||
@@ -87,12 +87,7 @@ lapack_int LAPACKE_dormlq_work( int matrix_layout, char side, char trans,
|
||||
goto exit_level_1;
|
||||
}
|
||||
/* Transpose input matrices */
|
||||
if( LAPACKE_lsame( side, 'l' ) ){
|
||||
LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t );
|
||||
} else {
|
||||
LAPACKE_dge_trans( matrix_layout, k, n, a, lda, a_t, lda_t );
|
||||
}
|
||||
|
||||
LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t );
|
||||
LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t );
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_dormlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t,
|
||||
|
||||
@@ -51,8 +51,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
}
|
||||
#endif
|
||||
/* Allocate memory for working array(s) */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,MAX(m,n)) );
|
||||
if( work == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
@@ -63,8 +62,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
res = LAPACKE_slantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda,
|
||||
work );
|
||||
/* Release memory and exit */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
LAPACKE_free( work );
|
||||
}
|
||||
exit_level_0:
|
||||
|
||||
@@ -41,7 +41,7 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo,
|
||||
float res = 0.;
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
|
||||
res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
|
||||
if( info < 0 ) {
|
||||
info = info - 1;
|
||||
}
|
||||
|
||||
@@ -73,8 +73,11 @@ lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side,
|
||||
return (info < 0) ? (info - 1) : info;
|
||||
}
|
||||
/* Allocate memory for temporary array(s) */
|
||||
a_t = (float*)
|
||||
LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MIN(nq,k)) );
|
||||
if( LAPACKE_lsame( vect, 'q' ) ) {
|
||||
a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,k) );
|
||||
} else {
|
||||
a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,nq) );
|
||||
}
|
||||
if( a_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
|
||||
@@ -72,7 +72,11 @@ lapack_int LAPACKE_sormlq_work( int matrix_layout, char side, char trans,
|
||||
return (info < 0) ? (info - 1) : info;
|
||||
}
|
||||
/* Allocate memory for temporary array(s) */
|
||||
a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) );
|
||||
if( LAPACKE_lsame( side, 'l' ) ) {
|
||||
a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) );
|
||||
} else {
|
||||
a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
|
||||
}
|
||||
if( a_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
|
||||
@@ -51,8 +51,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
}
|
||||
#endif
|
||||
/* Allocate memory for working array(s) */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,MAX(m,n)) );
|
||||
if( work == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
@@ -63,8 +62,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag,
|
||||
res = LAPACKE_zlantr_work( matrix_layout, norm, uplo, diag, m, n, a, lda,
|
||||
work );
|
||||
/* Release memory and exit */
|
||||
if( LAPACKE_lsame( norm, 'i' ) || LAPACKE_lsame( norm, '1' ) ||
|
||||
LAPACKE_lsame( norm, 'O' ) ) {
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
LAPACKE_free( work );
|
||||
}
|
||||
exit_level_0:
|
||||
|
||||
@@ -39,7 +39,7 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo,
|
||||
double* work )
|
||||
{
|
||||
lapack_int info = 0;
|
||||
double res = 0.;
|
||||
double res = 0.;
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
/* Call LAPACK function and adjust info */
|
||||
res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
|
||||
|
||||
@@ -405,9 +405,9 @@
|
||||
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
|
||||
END IF
|
||||
*
|
||||
* If INFO > 0 from CHSEQR, then quit
|
||||
* If INFO .NE. 0 from CHSEQR, then quit
|
||||
*
|
||||
IF( INFO.GT.0 )
|
||||
IF( INFO.NE.0 )
|
||||
$ GO TO 50
|
||||
*
|
||||
IF( WANTVL .OR. WANTVR ) THEN
|
||||
|
||||
@@ -145,15 +145,33 @@
|
||||
INTRINSIC ABS, CMPLX, MAX
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( N.EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Set constants to control overflow
|
||||
*
|
||||
INFO = 0
|
||||
EPS = SLAMCH( 'P' )
|
||||
SMLNUM = SLAMCH( 'S' ) / EPS
|
||||
BIGNUM = ONE / SMLNUM
|
||||
CALL SLABAD( SMLNUM, BIGNUM )
|
||||
*
|
||||
* Handle the case N=1 by itself
|
||||
*
|
||||
IF( N.EQ.1 ) THEN
|
||||
IPIV( 1 ) = 1
|
||||
JPIV( 1 ) = 1
|
||||
IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
|
||||
INFO = 1
|
||||
A( 1, 1 ) = CMPLX( SMLNUM, ZERO )
|
||||
END IF
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Factorize A using complete pivoting.
|
||||
* Set pivots less than SMIN to SMIN
|
||||
*
|
||||
|
||||
@@ -339,16 +339,16 @@
|
||||
$ LDVL, VR, LDVR, WORK, -1, IERR )
|
||||
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
|
||||
CALL CHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
|
||||
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK,
|
||||
$ -1, WORK, IERR )
|
||||
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
|
||||
$ RWORK, IERR )
|
||||
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
|
||||
ELSE
|
||||
CALL CGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL,
|
||||
$ VR, LDVR, WORK, -1, IERR )
|
||||
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
|
||||
CALL CHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
|
||||
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK,
|
||||
$ -1, WORK, IERR )
|
||||
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
|
||||
$ RWORK, IERR )
|
||||
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
|
||||
END IF
|
||||
WORK( 1 ) = CMPLX( LWKOPT )
|
||||
|
||||
@@ -418,9 +418,9 @@
|
||||
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
|
||||
END IF
|
||||
*
|
||||
* If INFO > 0 from DHSEQR, then quit
|
||||
* If INFO .NE. 0 from DHSEQR, then quit
|
||||
*
|
||||
IF( INFO.GT.0 )
|
||||
IF( INFO.NE.0 )
|
||||
$ GO TO 50
|
||||
*
|
||||
IF( WANTVL .OR. WANTVR ) THEN
|
||||
|
||||
@@ -145,15 +145,33 @@
|
||||
INTRINSIC ABS, MAX
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( N.EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Set constants to control overflow
|
||||
*
|
||||
INFO = 0
|
||||
EPS = DLAMCH( 'P' )
|
||||
SMLNUM = DLAMCH( 'S' ) / EPS
|
||||
BIGNUM = ONE / SMLNUM
|
||||
CALL DLABAD( SMLNUM, BIGNUM )
|
||||
*
|
||||
* Handle the case N=1 by itself
|
||||
*
|
||||
IF( N.EQ.1 ) THEN
|
||||
IPIV( 1 ) = 1
|
||||
JPIV( 1 ) = 1
|
||||
IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
|
||||
INFO = 1
|
||||
A( 1, 1 ) = SMLNUM
|
||||
END IF
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Factorize A using complete pivoting.
|
||||
* Set pivots less than SMIN to SMIN.
|
||||
*
|
||||
|
||||
@@ -418,9 +418,9 @@
|
||||
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
|
||||
END IF
|
||||
*
|
||||
* If INFO > 0 from SHSEQR, then quit
|
||||
* If INFO .NE. 0 from SHSEQR, then quit
|
||||
*
|
||||
IF( INFO.GT.0 )
|
||||
IF( INFO.NE.0 )
|
||||
$ GO TO 50
|
||||
*
|
||||
IF( WANTVL .OR. WANTVR ) THEN
|
||||
|
||||
@@ -145,15 +145,33 @@
|
||||
INTRINSIC ABS, MAX
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
INFO = 0
|
||||
*
|
||||
* Quick return if possible
|
||||
*
|
||||
IF( N.EQ.0 )
|
||||
$ RETURN
|
||||
*
|
||||
* Set constants to control overflow
|
||||
*
|
||||
INFO = 0
|
||||
EPS = SLAMCH( 'P' )
|
||||
SMLNUM = SLAMCH( 'S' ) / EPS
|
||||
BIGNUM = ONE / SMLNUM
|
||||
CALL SLABAD( SMLNUM, BIGNUM )
|
||||
*
|
||||
* Handle the case N=1 by itself
|
||||
*
|
||||
IF( N.EQ.1 ) THEN
|
||||
IPIV( 1 ) = 1
|
||||
JPIV( 1 ) = 1
|
||||
IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
|
||||
INFO = 1
|
||||
A( 1, 1 ) = SMLNUM
|
||||
END IF
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Factorize A using complete pivoting.
|
||||
* Set pivots less than SMIN to SMIN.
|
||||
*
|
||||
|
||||
@@ -404,9 +404,9 @@
|
||||
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
|
||||
END IF
|
||||
*
|
||||
* If INFO > 0 from ZHSEQR, then quit
|
||||
* If INFO .NE. 0 from ZHSEQR, then quit
|
||||
*
|
||||
IF( INFO.GT.0 )
|
||||
IF( INFO.NE.0 )
|
||||
$ GO TO 50
|
||||
*
|
||||
IF( WANTVL .OR. WANTVR ) THEN
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user