commit
8189a98d85
|
@ -91,3 +91,4 @@ benchmark/*.goto
|
||||||
benchmark/smallscaling
|
benchmark/smallscaling
|
||||||
CMakeCache.txt
|
CMakeCache.txt
|
||||||
CMakeFiles/*
|
CMakeFiles/*
|
||||||
|
.vscode
|
||||||
|
|
|
@ -212,7 +212,8 @@ Please note that it is not possible to combine support for different architectur
|
||||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||||
- **AIX**: Supported on PPC up to POWER8
|
- **AIX**: Supported on PPC up to POWER8
|
||||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
|
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||||
|
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|
17
appveyor.yml
17
appveyor.yml
|
@ -30,10 +30,11 @@ environment:
|
||||||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
||||||
matrix:
|
matrix:
|
||||||
- COMPILER: clang-cl
|
- COMPILER: clang-cl
|
||||||
WITH_FORTRAN: yes
|
WITH_FORTRAN: ON
|
||||||
|
USE_OPENMP: ON
|
||||||
- COMPILER: clang-cl
|
- COMPILER: clang-cl
|
||||||
DYNAMIC_ARCH: ON
|
DYNAMIC_ARCH: ON
|
||||||
WITH_FORTRAN: no
|
WITH_FORTRAN: OFF
|
||||||
- COMPILER: cl
|
- COMPILER: cl
|
||||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||||
DYNAMIC_ARCH: OFF
|
DYNAMIC_ARCH: OFF
|
||||||
|
@ -47,12 +48,7 @@ environment:
|
||||||
install:
|
install:
|
||||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake
|
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||||
|
|
||||||
- if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja
|
|
||||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja
|
|
||||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang
|
|
||||||
|
|
||||||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||||
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
||||||
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
|
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
|
||||||
|
@ -68,8 +64,9 @@ before_build:
|
||||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||||
|
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
|
||||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||||
|
|
||||||
build_script:
|
build_script:
|
||||||
|
|
|
@ -148,16 +148,20 @@ endif ()
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||||
if (DEFINED TARGET)
|
if (DEFINED TARGET)
|
||||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
# -march=cooperlake needs GCC 10.1 or newer. CMake compares version components as integers, so "10.09" would mean 10.9 and reject GCC 10.1-10.5.
if (NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS 10.1)
|
||||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||||
else()
|
else()
|
||||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||||
endif()
|
endif()
|
||||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
|
||||||
# endif()
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||||
|
else()
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||||
|
@ -233,6 +237,11 @@ if (BINARY64)
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if(EMBEDDED)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED")
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NEED_PIC)
|
if (NEED_PIC)
|
||||||
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
|
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
|
||||||
|
|
13
common.h
13
common.h
|
@ -122,7 +122,7 @@ extern "C" {
|
||||||
#define ATOM GOTO_ATOM
|
#define ATOM GOTO_ATOM
|
||||||
#undef GOTO_ATOM
|
#undef GOTO_ATOM
|
||||||
#endif
|
#endif
|
||||||
#else
|
#elif !defined(OS_EMBEDDED)
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#ifndef NO_SYSV_IPC
|
#ifndef NO_SYSV_IPC
|
||||||
#include <sys/shm.h>
|
#include <sys/shm.h>
|
||||||
|
@ -134,6 +134,9 @@ extern "C" {
|
||||||
#if defined(SMP) || defined(USE_LOCKING)
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#endif
|
#endif
|
||||||
|
#else
|
||||||
|
#include <time.h>
|
||||||
|
#include <math.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(OS_SUNOS)
|
#if defined(OS_SUNOS)
|
||||||
|
@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){
|
||||||
struct timespec ts;
|
struct timespec ts;
|
||||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
|
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
|
||||||
#else
|
#elif !defined(OS_EMBEDDED)
|
||||||
struct timeval tv;
|
struct timeval tv;
|
||||||
gettimeofday(&tv,NULL);
|
gettimeofday(&tv,NULL);
|
||||||
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
|
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#define RPCC_DEFINED
|
#define RPCC_DEFINED
|
||||||
|
@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
#include "common_linux.h"
|
#include "common_linux.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef OS_EMBEDDED
|
||||||
|
#define DTB_DEFAULT_ENTRIES 64
|
||||||
|
#endif
|
||||||
|
|
||||||
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
|
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
|
||||||
|
|
||||||
#ifdef __NetBSD__
|
#ifdef __NetBSD__
|
||||||
|
|
|
@ -1668,16 +1668,23 @@ void gotoblas_dummy_for_PGI(void) {
|
||||||
#ifndef MEM_LARGE_PAGES
|
#ifndef MEM_LARGE_PAGES
|
||||||
#define MEM_LARGE_PAGES 0x20000000
|
#define MEM_LARGE_PAGES 0x20000000
|
||||||
#endif
|
#endif
|
||||||
#else
|
#elif !defined(OS_EMBEDDED)
|
||||||
#define ALLOC_MMAP
|
#define ALLOC_MMAP
|
||||||
#define ALLOC_MALLOC
|
#define ALLOC_MALLOC
|
||||||
|
#else
|
||||||
|
#define ALLOC_MALLOC
|
||||||
|
|
||||||
|
inline int puts(const char *str) { return 0; }
|
||||||
|
inline int printf(const char *format, ...) { return 0; }
|
||||||
|
inline char *getenv(const char *name) { return ""; }
|
||||||
|
inline int atoi(const char *str) { return 0; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
|
|
||||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
|
||||||
#include <sys/mman.h>
|
#include <sys/mman.h>
|
||||||
#ifndef NO_SYSV_IPC
|
#ifndef NO_SYSV_IPC
|
||||||
#include <sys/shm.h>
|
#include <sys/shm.h>
|
||||||
|
|
|
@ -1634,10 +1634,10 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
|
||||||
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
|
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
cblas_crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c
|
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
|
||||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
cblas_zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c
|
cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
|
||||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||||
|
|
||||||
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
|
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
|
||||||
|
|
|
@ -0,0 +1,176 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2021, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_8 1
|
||||||
|
|
||||||
|
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
|
||||||
|
{
|
||||||
|
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
|
||||||
|
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||||
|
__asm__
|
||||||
|
(
|
||||||
|
"dcbt 0, %2 \n\t"
|
||||||
|
"xscvdpspn 32, %x3 \n\t"
|
||||||
|
"xxspltw 32, 32, 0 \n\t"
|
||||||
|
|
||||||
|
"lxvp 40, 0(%2) \n\t"
|
||||||
|
"lxvp 42, 32(%2) \n\t"
|
||||||
|
"lxvp 44, 64(%2) \n\t"
|
||||||
|
"lxvp 46, 96(%2) \n\t"
|
||||||
|
|
||||||
|
"addic. %1, %1, -16 \n\t"
|
||||||
|
"ble two%= \n\t"
|
||||||
|
|
||||||
|
".align 5 \n"
|
||||||
|
"one%=: \n\t"
|
||||||
|
|
||||||
|
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||||
|
"xvmulsp 49, 41, 32 \n\t"
|
||||||
|
"xvmulsp 50, 42, 32 \n\t"
|
||||||
|
"xvmulsp 51, 43, 32 \n\t"
|
||||||
|
"xvmulsp 52, 44, 32 \n\t"
|
||||||
|
"xvmulsp 53, 45, 32 \n\t"
|
||||||
|
"xvmulsp 54, 46, 32 \n\t"
|
||||||
|
"xvmulsp 55, 47, 32 \n\t"
|
||||||
|
|
||||||
|
"xxperm 34, 40, %x5 \n\t"
|
||||||
|
"xxperm 35, 41, %x5 \n\t"
|
||||||
|
"xxperm 36, 42, %x5 \n\t"
|
||||||
|
"xxperm 37, 43, %x5 \n\t"
|
||||||
|
"xxperm 38, 44, %x5 \n\t"
|
||||||
|
"xxperm 39, 45, %x5 \n\t"
|
||||||
|
"xxperm 56, 46, %x5 \n\t"
|
||||||
|
"xxperm 57, 47, %x5 \n\t"
|
||||||
|
|
||||||
|
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||||
|
"xvmulsp 35, 35, %x4 \n\t"
|
||||||
|
|
||||||
|
"lxvp 40, 128(%2) \n\t"
|
||||||
|
|
||||||
|
"xvmulsp 36, 36, %x4 \n\t"
|
||||||
|
"xvmulsp 37, 37, %x4 \n\t"
|
||||||
|
|
||||||
|
"lxvp 42, 160(%2) \n\t"
|
||||||
|
|
||||||
|
"xvmulsp 38, 38, %x4 \n\t"
|
||||||
|
"xvmulsp 39, 39, %x4 \n\t"
|
||||||
|
|
||||||
|
"lxvp 44, 192(%2) \n\t"
|
||||||
|
|
||||||
|
"xvmulsp 56, 56, %x4 \n\t"
|
||||||
|
"xvmulsp 57, 57, %x4 \n\t"
|
||||||
|
|
||||||
|
"lxvp 46, 224(%2) \n\t"
|
||||||
|
|
||||||
|
"xvaddsp 48, 48, 34 \n\t"
|
||||||
|
"xvaddsp 49, 49, 35 \n\t"
|
||||||
|
"xvaddsp 50, 50, 36 \n\t"
|
||||||
|
"xvaddsp 51, 51, 37 \n\t"
|
||||||
|
|
||||||
|
"stxvp 48, 0(%2) \n\t"
|
||||||
|
|
||||||
|
"xvaddsp 52, 52, 38 \n\t"
|
||||||
|
"xvaddsp 53, 53, 39 \n\t"
|
||||||
|
|
||||||
|
"stxvp 50, 32(%2) \n\t"
|
||||||
|
|
||||||
|
"xvaddsp 54, 54, 56 \n\t"
|
||||||
|
"xvaddsp 55, 55, 57 \n\t"
|
||||||
|
|
||||||
|
"stxvp 52, 64(%2) \n\t"
|
||||||
|
"stxvp 54, 96(%2) \n\t"
|
||||||
|
|
||||||
|
"addi %2, %2, 128 \n\t"
|
||||||
|
|
||||||
|
"addic. %1, %1, -16 \n\t"
|
||||||
|
"bgt one%= \n"
|
||||||
|
|
||||||
|
"two%=: \n\t"
|
||||||
|
|
||||||
|
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||||
|
"xvmulsp 49, 41, 32 \n\t"
|
||||||
|
"xvmulsp 50, 42, 32 \n\t"
|
||||||
|
"xvmulsp 51, 43, 32 \n\t"
|
||||||
|
"xvmulsp 52, 44, 32 \n\t"
|
||||||
|
"xvmulsp 53, 45, 32 \n\t"
|
||||||
|
"xvmulsp 54, 46, 32 \n\t"
|
||||||
|
"xvmulsp 55, 47, 32 \n\t"
|
||||||
|
|
||||||
|
"xxperm 34, 40, %x5 \n\t"
|
||||||
|
"xxperm 35, 41, %x5 \n\t"
|
||||||
|
"xxperm 36, 42, %x5 \n\t"
|
||||||
|
"xxperm 37, 43, %x5 \n\t"
|
||||||
|
"xxperm 38, 44, %x5 \n\t"
|
||||||
|
"xxperm 39, 45, %x5 \n\t"
|
||||||
|
"xxperm 56, 46, %x5 \n\t"
|
||||||
|
"xxperm 57, 47, %x5 \n\t"
|
||||||
|
|
||||||
|
|
||||||
|
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||||
|
"xvmulsp 35, 35, %x4 \n\t"
|
||||||
|
"xvmulsp 36, 36, %x4 \n\t"
|
||||||
|
"xvmulsp 37, 37, %x4 \n\t"
|
||||||
|
"xvmulsp 38, 38, %x4 \n\t"
|
||||||
|
"xvmulsp 39, 39, %x4 \n\t"
|
||||||
|
"xvmulsp 56, 56, %x4 \n\t"
|
||||||
|
"xvmulsp 57, 57, %x4 \n\t"
|
||||||
|
|
||||||
|
"xvaddsp 48, 48, 34 \n\t"
|
||||||
|
"xvaddsp 49, 49, 35 \n\t"
|
||||||
|
"xvaddsp 50, 50, 36 \n\t"
|
||||||
|
"xvaddsp 51, 51, 37 \n\t"
|
||||||
|
|
||||||
|
"stxvp 48, 0(%2) \n\t"
|
||||||
|
|
||||||
|
"xvaddsp 52, 52, 38 \n\t"
|
||||||
|
"xvaddsp 53, 53, 39 \n\t"
|
||||||
|
|
||||||
|
"stxvp 50, 32(%2) \n\t"
|
||||||
|
|
||||||
|
"xvaddsp 54, 54, 56 \n\t"
|
||||||
|
"xvaddsp 55, 55, 57 \n\t"
|
||||||
|
|
||||||
|
"stxvp 52, 64(%2) \n\t"
|
||||||
|
"stxvp 54, 96(%2) \n\t"
|
||||||
|
|
||||||
|
"#n=%1 x=%0=%2 alpha=(%3,%4)\n"
|
||||||
|
:
|
||||||
|
"+m" (*x),
|
||||||
|
"+r" (n), // 1
|
||||||
|
"+b" (x) // 2
|
||||||
|
:
|
||||||
|
"f" (alpha_r), // 3
|
||||||
|
"wa" (t0), // 4
|
||||||
|
"wa" (mask) // 5
|
||||||
|
:
|
||||||
|
"cr0",
|
||||||
|
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||||
|
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||||
|
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||||
|
"vs56","vs57"
|
||||||
|
);
|
||||||
|
}
|
|
@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#pragma GCC optimize "O1"
|
#pragma GCC optimize "O1"
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
|
||||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||||
|
#if defined(POWER8) || defined(POWER9)
|
||||||
#if defined(DOUBLE)
|
#if defined(DOUBLE)
|
||||||
#include "zscal_microk_power8.c"
|
#include "zscal_microk_power8.c"
|
||||||
#endif
|
#endif
|
||||||
|
#elif defined(POWER10)
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
#include "zscal_microk_power8.c"
|
||||||
|
#else
|
||||||
|
#include "cscal_microk_power10.c"
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
|
||||||
{
|
{
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
n1 = n & -8;
|
n1 = n & -8;
|
||||||
|
#else
|
||||||
|
n1 = n & -16;
|
||||||
|
#endif
|
||||||
if ( n1 > 0 )
|
if ( n1 > 0 )
|
||||||
{
|
{
|
||||||
zscal_kernel_8(n1, x, da_r, da_i);
|
zscal_kernel_8(n1, x, da_r, da_i);
|
||||||
|
|
|
@ -320,12 +320,13 @@
|
||||||
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
||||||
COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
||||||
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
||||||
$ U12, X
|
$ U12, X, ABI12, Y
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
|
COMPLEX CLADIV
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
REAL CLANHS, SLAMCH
|
REAL CLANHS, SLAMCH
|
||||||
EXTERNAL LSAME, CLANHS, SLAMCH
|
EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH
|
||||||
* ..
|
* ..
|
||||||
* .. External Subroutines ..
|
* .. External Subroutines ..
|
||||||
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA
|
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA
|
||||||
|
@ -729,15 +730,21 @@
|
||||||
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
|
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
|
||||||
$ ( BSCALE*T( ILAST, ILAST ) )
|
$ ( BSCALE*T( ILAST, ILAST ) )
|
||||||
ABI22 = AD22 - U12*AD21
|
ABI22 = AD22 - U12*AD21
|
||||||
|
ABI12 = AD12 - U12*AD11
|
||||||
*
|
*
|
||||||
T1 = HALF*( AD11+ABI22 )
|
SHIFT = ABI22
|
||||||
RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 )
|
CTEMP = SQRT( ABI12 )*SQRT( AD21 )
|
||||||
TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) +
|
TEMP = ABS1( CTEMP )
|
||||||
$ AIMAG( T1-ABI22 )*AIMAG( RTDISC )
|
IF( CTEMP.NE.ZERO ) THEN
|
||||||
IF( TEMP.LE.ZERO ) THEN
|
X = HALF*( AD11-SHIFT )
|
||||||
SHIFT = T1 + RTDISC
|
TEMP2 = ABS1( X )
|
||||||
ELSE
|
TEMP = MAX( TEMP, ABS1( X ) )
|
||||||
SHIFT = T1 - RTDISC
|
Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 )
|
||||||
|
IF( TEMP2.GT.ZERO ) THEN
|
||||||
|
IF( REAL( X / TEMP2 )*REAL( Y )+
|
||||||
|
$ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y
|
||||||
|
END IF
|
||||||
|
SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) )
|
||||||
END IF
|
END IF
|
||||||
ELSE
|
ELSE
|
||||||
*
|
*
|
||||||
|
|
|
@ -320,12 +320,13 @@
|
||||||
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
||||||
COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
||||||
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
||||||
$ U12, X
|
$ U12, X, ABI12, Y
|
||||||
* ..
|
* ..
|
||||||
* .. External Functions ..
|
* .. External Functions ..
|
||||||
|
COMPLEX*16 ZLADIV
|
||||||
LOGICAL LSAME
|
LOGICAL LSAME
|
||||||
DOUBLE PRECISION DLAMCH, ZLANHS
|
DOUBLE PRECISION DLAMCH, ZLANHS
|
||||||
EXTERNAL LSAME, DLAMCH, ZLANHS
|
EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS
|
||||||
* ..
|
* ..
|
||||||
* .. External Subroutines ..
|
* .. External Subroutines ..
|
||||||
EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL
|
EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL
|
||||||
|
@ -730,15 +731,21 @@
|
||||||
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
|
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
|
||||||
$ ( BSCALE*T( ILAST, ILAST ) )
|
$ ( BSCALE*T( ILAST, ILAST ) )
|
||||||
ABI22 = AD22 - U12*AD21
|
ABI22 = AD22 - U12*AD21
|
||||||
|
ABI12 = AD12 - U12*AD11
|
||||||
*
|
*
|
||||||
T1 = HALF*( AD11+ABI22 )
|
SHIFT = ABI22
|
||||||
RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 )
|
CTEMP = SQRT( ABI12 )*SQRT( AD21 )
|
||||||
TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) +
|
TEMP = ABS1( CTEMP )
|
||||||
$ DIMAG( T1-ABI22 )*DIMAG( RTDISC )
|
IF( CTEMP.NE.ZERO ) THEN
|
||||||
IF( TEMP.LE.ZERO ) THEN
|
X = HALF*( AD11-SHIFT )
|
||||||
SHIFT = T1 + RTDISC
|
TEMP2 = ABS1( X )
|
||||||
ELSE
|
TEMP = MAX( TEMP, ABS1( X ) )
|
||||||
SHIFT = T1 - RTDISC
|
Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 )
|
||||||
|
IF( TEMP2.GT.ZERO ) THEN
|
||||||
|
IF( DBLE( X / TEMP2 )*DBLE( Y )+
|
||||||
|
$ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y
|
||||||
|
END IF
|
||||||
|
SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) )
|
||||||
END IF
|
END IF
|
||||||
ELSE
|
ELSE
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in New Issue