Merge pull request #1 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2019-05-08 19:46:44 +02:00 committed by GitHub
commit ede3cab6e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
46 changed files with 8469 additions and 78 deletions

View File

@ -25,6 +25,15 @@ matrix:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 6.dev)
set(OpenBLAS_PATCH_VERSION 7.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions

View File

@ -167,4 +167,7 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel

View File

@ -1,4 +1,82 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.6
29-Apr-2019
common:
* the build tools now check that a given cpu TARGET is actually valid
* the build-time check of system features (c_check) has been made
less dependent on particular perl features (this should mainly
benefit building on Windows)
* several problems with the ReLAPACK integration were fixed,
including INTERFACE64 support and building a shared library
* building with CMAKE on BSD systems was improved
* a non-absolute SUM function was added based on the
existing optimized code for ASUM
* CBLAS interfaces to the IxMIN and IxMAX functions were added
* a name clash between LAPACKE and BOOST headers was resolved
* CMAKE builds with OpenMP now include the appropriate getrf_parallel
kernels (previously they failed to include them)
* a crash on thread (key) deletion with the USE_TLS=1 memory management
option was fixed
* restored several earlier fixes, in particular for OpenMP performance,
building on BSD, and calling fork on CYGWIN, which had inadvertently
been dropped in the 0.3.3 rewrite of the memory management code.
x86_64:
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
* building with old versions of MSVC was fixed
* it is now possible to build a static library on Windows with CMAKE
* accessing environment variables on CYGWIN at run time was fixed
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
with CMAKE as well
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
* assembly bugs involving undeclared modification of input operands were fixed
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
* a similar bug was fixed in the blas_quickdivide code used to split workloads
in most functions
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
environment does not support AVX512
* improved GEMM performance on ZEN targets
x86:
* build failures caused by the recently added checks for AVX512 were fixed
* an inline assembly bug involving undeclared modification of an input argument was
fixed in the blas_quickdivide code used to split workloads in most functions
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
MIPS32:
* a bug in the IMIN implementation made it return the result of IMAX
POWER:
* single precision BLAS1/2 functions have received optimized POWER8 kernels
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
* building on PPC970 systems under OSX Leopard or Tiger is now supported
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
* building a shared library on AIX is now supported for POWER6
* DYNAMIC_ARCH support has been added for POWER6 and newer
ARMv7:
* corrected xDOT behaviour with zero INC_X or INC_Y
* a bug in the IMIN implementation made it return the result of IMAX
ARMv8:
* added support for HiSilicon TSV110 cpus
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* cross-compilation with CMAKE now works again
* a bug in the IMIN implementation made it return the result of IMAX
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
IBM Z:
* optimized microkernels for single precision BLAS1/2 functions have been added
for both Z13 and Z14
====================================================================
Version 0.3.5
31-Dec-2018

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.6.dev
VERSION = 0.3.7.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -181,17 +181,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT
# system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory
# Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-

View File

@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS

40
azure-pipelines.yml Normal file
View File

@ -0,0 +1,40 @@
# Starter pipeline
# Start with a minimal pipeline that you can customize to build and deploy your code.
# Add steps that build, run tests, deploy, and more:
# https://aka.ms/yaml
trigger:
- master
pool:
vmImage: 'ubuntu-latest'
steps:
- script: echo Hello, world!
displayName: 'Run a one-line script'
#- script: |
# docker run --rm --privileged multiarch/qemu-user-static:register --reset
# ls /proc/sys/fs/binfmt_misc/
# condition: not(startsWith(variables['CONFIG'], 'linux_64'))
# displayName: 'Configure binfmt_misc'
# ARMV6 build step: writes a Dockerfile on the fly (via echo ... > Dockerfile)
# that starts FROM openblas/alpine:arm32, copies the checkout to /tmp/openblas,
# configures it with CMake (DYNAMIC_ARCH off, TARGET=ARMV6, shared libs,
# LAPACK and CBLAS disabled, Release build type) and builds it; the image is
# then built with `docker build .`, so a failing compile fails this step.
- script: |
echo "FROM openblas/alpine:arm32
COPY . /tmp/openblas
RUN mkdir /tmp/openblas/build && \
cd /tmp/openblas/build && \
CC=gcc cmake -D DYNAMIC_ARCH=OFF \
-D TARGET=ARMV6 \
-D BUILD_SHARED_LIBS=ON \
-D BUILD_WITHOUT_LAPACK=ON \
-D BUILD_WITHOUT_CBLAS=ON \
-D CMAKE_BUILD_TYPE=Release ../ && \
cmake --build ." > Dockerfile
docker build .
displayName: Run ARMV6 docker build
#- script: |
# echo Add other tasks to build, test, and deploy your project.
# echo See https://aka.ms/yaml
# displayName: 'Run a multi-line script'

View File

@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)

View File

@ -283,7 +283,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")

View File

@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
* Choosing a SIZE too small will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \

View File

@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@ -150,7 +150,7 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */
/* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;

View File

@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
/* We need this global for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */

View File

@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
/* if number of threads is larger than inital condition */
/* if number of threads is larger than initial condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums;
int ret;
@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
} else {
@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
common->num_procs = CPU_COUNT(&cpuset);
}
#endif

View File

@ -229,7 +229,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -1772,7 +1772,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif

View File

@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;

View File

@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c

View File

@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@ -3,12 +3,12 @@
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = strmm_kernel_16x8_power8.S
STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c

View File

@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i;
BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3};
#else

View File

@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
BLASLONG i;
BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}

View File

@ -0,0 +1,286 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Register aliases and constants for the POWER9 SGEMM/STRMM kernel. */
#define LOAD ld
/* Bytes reserved on the stack by the prologue for the register save area. */
#define STACKSIZE (512 )
/* Kernel arguments as they arrive in general-purpose registers.
   NOTE(review): presumably M/N/K are the GEMM dimensions, A/B/C the matrix
   pointers and LDC the leading dimension of C - confirm against the caller. */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
/* Only loaded (from the caller's frame) when TRMMKERNEL is defined. */
#define OFFSET r6
/* alpha converted to single precision and splatted across the vector. */
#define alpha_r vs20
/* Permute control vectors; filled in the prologue from the .equ constants below. */
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0
/* Scratch and loop registers. T1/T2 are used in this file to build the 64-bit
   permute constants; the remaining names are consumed by the included
   macro/logic files (not visible here). */
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31
#include "sgemm_macros_power9.S"
/* 64-bit halves of the 128-bit permute control vectors. Each pair is combined
   with mtvsrdd in the prologue: the *2 / *_12 / *_22 constant is passed as the
   first source (doubleword 0) and the *1 / *_11 / *_21 constant as the second
   source (doubleword 1). */
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM
/* Kernel entry point: saves callee registers, prepares alpha and the permute
   control vectors, runs the compute logic from sgemm_logic_power9.S, then
   restores the saved registers and returns 0 in r3. */
PROLOGUE
PROFCODE
/* Reserve STACKSIZE bytes below the incoming stack pointer for the save area. */
addi SP, SP, -STACKSIZE
li r0, 0
/* Save floating-point registers f14-f31 at offsets 0..136. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Save general-purpose registers r14-r31 at offsets 144..280. */
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
/* Save twelve 16-byte vector registers at offsets 288..464.
   NOTE(review): stxv takes a VSX register operand, and the VSX names of vector
   registers v20-v31 are vs52-vs63. Whether the assembler encodes "v20" here as
   VSR 52 or as VSR 20 depends on the toolchain / register-name definitions -
   confirm that the intended registers are the ones actually saved. */
stxv v20, 288(SP)
stxv v21, 304(SP)
stxv v22, 320(SP)
stxv v23, 336(SP)
stxv v24, 352(SP)
stxv v25, 368(SP)
stxv v26, 384(SP)
stxv v27, 400(SP)
stxv v28, 416(SP)
stxv v29, 432(SP)
stxv v30, 448(SP)
stxv v31, 464(SP)
#if defined(TRMMKERNEL)
/* TRMM build only: OFFSET is read from the caller's frame, which now lies
   STACKSIZE bytes above SP. */
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
/* Multiply LDC by 4 (shift left by 2).
   NOTE(review): presumably converts a leading dimension counted in 4-byte
   floats into a byte stride - confirm. */
slwi LDC, LDC, 2
/* cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
*/
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
/* Each 64-bit constant below is materialised 16 bits at a time: lis/ori load
   the upper 32 bits, rldicr moves them into the high word (clearing the low
   word), and oris/ori fill in the lower 32 bits. */
lis T2, perm_const2@highest
ori T2, T2, perm_const2@higher
rldicr T2, T2, 32, 31
oris T2, T2, perm_const2@h
ori T2, T2, perm_const2@l
lis T1, perm_const1@highest
ori T1, T1, perm_const1@higher
rldicr T1, T1, 32, 31
oris T1, T1, perm_const1@h
ori T1, T1, perm_const1@l
/* permute_mask = T2 (doubleword 0) : T1 (doubleword 1). */
mtvsrdd permute_mask,T2,T1
lis T2, save_permute_12@highest
ori T2, T2, save_permute_12@higher
rldicr T2, T2, 32, 31
oris T2, T2, save_permute_12@h
ori T2, T2, save_permute_12@l
lis T1, save_permute_11@highest
ori T1, T1, save_permute_11@higher
rldicr T1, T1, 32, 31
oris T1, T1, save_permute_11@h
ori T1, T1, save_permute_11@l
/* save_permute_1 = T2 (doubleword 0) : T1 (doubleword 1). */
mtvsrdd save_permute_1,T2,T1
lis T2, save_permute_22@highest
ori T2, T2, save_permute_22@higher
rldicr T2, T2, 32, 31
oris T2, T2, save_permute_22@h
ori T2, T2, save_permute_22@l
lis T1, save_permute_21@highest
ori T1, T1, save_permute_21@higher
rldicr T1, T1, 32, 31
oris T1, T1, save_permute_21@h
ori T1, T1, save_permute_21@l
/* save_permute_2 = T2 (doubleword 0) : T1 (doubleword 1). */
mtvsrdd save_permute_2,T2,T1
/* The compute loops live in the included file (not visible here). */
#include "sgemm_logic_power9.S"
/* Common exit: set r3 to 0, then restore everything saved in the prologue
   from the same stack offsets. */
.L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
/* NOTE(review): same register-naming question as for the stxv saves in the
   prologue - the operands must name the same registers that were saved. */
lxv v20, 288(SP)
lxv v21, 304(SP)
lxv v22, 320(SP)
lxv v23, 336(SP)
lxv v24, 352(SP)
lxv v25, 368(SP)
lxv v26, 384(SP)
lxv v27, 400(SP)
lxv v28, 416(SP)
lxv v29, 432(SP)
lxv v30, 448(SP)
lxv v31, 464(SP)
/* Release the save area and return to the caller. */
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -9,8 +9,8 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c
DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c
#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c
DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c

View File

@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 1280
#define SGEMM_DEFAULT_P 640
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_Q 640
#define SGEMM_DEFAULT_Q 1408
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640

View File

@ -36,8 +36,8 @@
// allow malloc in xsygst for improved performance
#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
// allow malloc in xsytrf if the passed work buffer is too small
#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
#define XSYTRF_ALLOW_MALLOC 0
////////////////////////////////
// LAPACK routine replacement //

View File

@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec(
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
//RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
if (*info)
*info += n1;
// shift pivots

View File

@ -22,7 +22,7 @@ void RELAPACK_cgetrf(
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const blasint minfo = -*info;

View File

@ -1,5 +1,6 @@
#include "relapack.h"
#include "stdlib.h"
#include <stdlib.h>
#include <stdio.h>
static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *,
const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *,
const blasint *, blasint *);
@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec(
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
if (*info)
*info += n1;
// shift pivots

View File

@ -15,16 +15,15 @@ void RELAPACK_dgetrf(
double *A, const blasint *ldA, blasint *ipiv,
blasint *info
) {
// Check arguments
*info = 0;
if (*m < 0)
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
if (*info!=0) {
const blasint minfo = -*info;
LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF"));
return;

View File

@ -55,15 +55,16 @@ void RELAPACK_sgbtrf(
// Allocate work space
const blasint n1 = SREC_SPLIT(*n);
const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const blasint nWorkl = (kv > n1) ? n1 : kv;
const blasint mWorku = (*kl > n1) ? n1 : *kl;
const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv );
const blasint nWorkl = abs( (kv > n1) ? n1 : kv );
const blasint mWorku = abs( (*kl > n1) ? n1 : *kl );
const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl );
float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
float *Worku = malloc(mWorku * nWorku * sizeof(float));
LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
// Recursive kernel
RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec(
blasint *info
) {
if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
// Unblocked
LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec(
float *const A_BRbl = A_BR + m21;
float *const A_BRbr = A_BR + *ldA * n21 + m21;
// recursion(Ab_L, ipiv_T)
RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec(
}
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
//cause of infinite recursion here ?
// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
if (*info)
*info += n1;
// shift pivots

View File

@ -1,5 +1,4 @@
#include "relapack.h"
static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *,
blasint *, blasint *);
@ -22,16 +21,14 @@ void RELAPACK_sgetrf(
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const blasint minfo = -*info;
LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF"));
return;
}
const blasint sn = MIN(*m, *n);
RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
// Right remainder
@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec(
float *A, const blasint *ldA, blasint *ipiv,
blasint *info
) {
if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
// Unblocked
LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec(
const blasint n1 = SREC_SPLIT(*n);
const blasint n2 = *n - n1;
const blasint m2 = *m - n1;
// A_L A_R
float *const A_L = A;
float *const A_R = A + *ldA * n1;

View File

@ -56,10 +56,10 @@ void RELAPACK_zgbtrf(
// Allocate work space
const blasint n1 = ZREC_SPLIT(*n);
const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
const blasint nWorkl = (kv > n1) ? n1 : kv;
const blasint mWorku = (*kl > n1) ? n1 : *kl;
const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv);
const blasint nWorkl = abs ( (kv > n1) ? n1 : kv);
const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl);
const blasint nWorku = abs ( (*kl > n1) ? MAX(0, *n - *kl) : *kl);
double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec(
}
// recursion(Ab_BR, ipiv_B)
RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
// RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info);
if (*info)
*info += n1;
// shift pivots

View File

@ -22,7 +22,7 @@ void RELAPACK_zgetrf(
*info = -1;
else if (*n < 0)
*info = -2;
else if (*ldA < MAX(1, *n))
else if (*ldA < MAX(1, *m))
*info = -4;
if (*info) {
const blasint minfo = -*info;

View File

@ -576,7 +576,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -991,7 +991,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -946,7 +946,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -576,7 +576,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*