Merge pull request #2213 from xianyi/develop

Update from develop in preparation of the 0.3.7 release
This commit is contained in:
Martin Kroeker 2019-08-11 23:14:49 +02:00 committed by GitHub
commit 20d417762f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
165 changed files with 20231 additions and 595 deletions

143
.drone.yml Normal file
View File

@ -0,0 +1,143 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest

View File

@ -25,6 +25,15 @@ matrix:
- TARGET_BOX=LINUX64 - TARGET_BOX=LINUX64
- BTYPE="BINARY=64" - BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu - <<: *test-ubuntu
env: env:
- TARGET_BOX=LINUX64 - TARGET_BOX=LINUX64
@ -164,42 +173,6 @@ matrix:
env: env:
- BTYPE="BINARY=32" - BTYPE="BINARY=32"
- &emulated-arm
dist: trusty
sudo: required
services: docker
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
name: "Emulated Build for ARMV6 with gcc"
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
script: |
echo "FROM openblas/alpine:${IMAGE_ARCH}
COPY . /tmp/openblas
RUN mkdir /tmp/openblas/build && \
cd /tmp/openblas/build && \
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
-D TARGET=${TARGET_ARCH} \
-D BUILD_SHARED_LIBS=ON \
-D BUILD_WITHOUT_LAPACK=ON \
-D BUILD_WITHOUT_CBLAS=ON \
-D CMAKE_BUILD_TYPE=Release ../ && \
cmake --build ." > Dockerfile
docker build .
- <<: *emulated-arm
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
name: "Emulated Build for ARMV6 with clang"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
name: "Emulated Build for ARMV8 with gcc"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
name: "Emulated Build for ARMV8 with clang"
allow_failures:
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
# whitelist # whitelist
branches: branches:
only: only:

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM) project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 6) set(OpenBLAS_PATCH_VERSION 7.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions # Adhere to GNU filesystem layout conventions
@ -20,9 +20,14 @@ if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif() endif()
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library. # Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using # Avoids conflicts with other BLAS libraries, especially when using
@ -206,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif() endif()
if (MSVC OR NOT NOFORTRAN) #if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
# Broken without fortran on unix # Broken without fortran on unix
add_subdirectory(utest) add_subdirectory(utest)
endif() endif()

View File

@ -167,4 +167,7 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13 * [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel

View File

@ -34,7 +34,7 @@ endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install .PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@ -109,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn @$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll @$(MAKE) -C exports dll
@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME) touch $(LIBNAME)
ifndef NO_FBLAS ifndef NO_FBLAS
$(MAKE) -C test all $(MAKE) -C test all
$(MAKE) -C utest all
endif endif
$(MAKE) -C utest all
ifndef NO_CBLAS ifndef NO_CBLAS
$(MAKE) -C ctest all $(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif endif
endif endif

View File

@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android) ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a CCOMMON_OPT += -mfpu=neon
FCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon
else else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@ -9,11 +9,6 @@ endif
endif endif
ifeq ($(CORE), ARMV6) ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6 CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
endif endif

View File

@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"

View File

@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif endif
endif endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib FLAMEPATH = $(HOME)/flame/lib

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.3.6 VERSION = 0.3.7.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -58,6 +58,12 @@ VERSION = 0.3.6
# For force setting for multi threaded, specify USE_THREAD = 1 # For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0 # USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in. # If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1 # USE_OPENMP = 1
@ -157,6 +163,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1 # NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make. # Don't use parallel make.
# NO_PARALLEL_MAKE = 1 # NO_PARALLEL_MAKE = 1
@ -181,17 +191,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30 # time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26, # which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT # system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26 # CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory # Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it. # to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1 # DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only). # If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1 # CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# with single thread. (Actually in recent versions this is a factor proportional to the # with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer # number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi- # an individual dimension). You can use this setting to avoid the overhead of multi-
@ -239,6 +249,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX= # SYMBOLPREFIX=
# SYMBOLSUFFIX= # SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
# #
# End of user configuration # End of user configuration
# #

View File

@ -9,6 +9,11 @@ ifndef TOPDIR
TOPDIR = . TOPDIR = .
endif endif
# If ARCH is not set, we use the host system's architecture.
ifndef ARCH
ARCH := $(shell uname -m)
endif
# Catch conflicting usage of ARCH in some BSD environments # Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64) ifeq ($(ARCH), amd64)
override ARCH=x86_64 override ARCH=x86_64
@ -137,7 +142,12 @@ endif
endif endif
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
ifeq ($(ARCH), x86_64)
ifneq ($(C_COMPILER), PGI)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0) ifneq ($(INTERFACE64), 0)
@ -237,6 +247,10 @@ SMP = 1
endif endif
endif endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC ifndef NEED_PIC
NEED_PIC = 1 NEED_PIC = 1
endif endif
@ -253,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS. # When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1) ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1 NO_LAPACK = 1
override FEXTRALIB =
endif endif
# #
@ -388,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
# #
# Architecture dependent settings # Architecture dependent settings
# #
@ -744,6 +765,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847 # make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1) ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran EXTRALIB += -lgfortran
@ -1049,7 +1072,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif endif
ifdef USE_TLS ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS CCOMMON_OPT += -DUSE_TLS
endif endif
@ -1102,8 +1125,12 @@ endif
endif endif
ifdef NO_AFFINITY ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)
override undefine NO_AFFINITY
else
CCOMMON_OPT += -DNO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY
endif endif
endif
ifdef FUNCTION_PROFILE ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE

View File

@ -28,11 +28,15 @@ endif
ifeq ($(CORE), HASWELL) ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH ifndef DYNAMIC_ARCH
ifndef NO_AVX2 ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2 CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2
endif endif
endif endif
endif endif
endif

View File

@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction ## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>. Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages ## Binary Packages
@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source ## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git. using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies ### Dependencies
@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional) ### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown: The library can be installed as shown:
@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64 #### MIPS64
@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64 #### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System #### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS ### Supported OS

View File

@ -35,6 +35,13 @@ environment:
DYNAMIC_ARCH: ON DYNAMIC_ARCH: ON
WITH_FORTRAN: no WITH_FORTRAN: no
- COMPILER: cl - COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- COMPILER: MinGW64-gcc-7.2.0
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install: install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
@ -52,7 +59,14 @@ install:
before_build: before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build } - ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build - cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@ -64,3 +78,4 @@ test_script:
- echo Running Test - echo Running Test
- cd utest - cd utest
- openblas_utest - openblas_utest

51
azure-pipelines.yml Normal file
View File

@ -0,0 +1,51 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
} else { } else {
$no_avx512 = 0; $no_avx512 = 0;
} }
unlink("tmpf.o"); unlink("$tmpf.o");
} }
} }

View File

@ -73,6 +73,7 @@ if (DYNAMIC_ARCH)
endif () endif ()
if (NOT NO_AVX512) if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
endif () endif ()
if (DYNAMIC_LIST) if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
@ -80,7 +81,8 @@ if (DYNAMIC_ARCH)
endif () endif ()
if (NOT DYNAMIC_CORE) if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH) message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
endif () endif ()
endif () endif ()

View File

@ -44,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN") if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran") set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt # helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. # Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1) macro(SetDefaultL1)
set(SAMAXKERNEL amax.S) set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S)

View File

@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_") set(FU "_")
endif() endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU") if (${COMPILER_ID} STREQUAL "GNU")
@ -82,6 +85,11 @@ endif ()
# f_check # f_check
if (NOT NOFORTRAN) if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif () endif ()
# Cannot run getarch on target if we are cross-compiling # Cannot run getarch on target if we are cross-compiling

View File

@ -65,6 +65,18 @@ if (DEFINED TARGET)
set(GETARCH_FLAGS "-DFORCE_${TARGET}") set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif () endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available
if (X86 OR X86_64)
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
endif ()
endif ()
if (INTERFACE64) if (INTERFACE64)
message(STATUS "Using 64-bit integers.") message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@ -136,10 +148,16 @@ endif ()
if (USE_THREAD) if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif () endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC) if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1) set(NEED_PIC 1)
endif () endif ()
@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN) if (NOT NOFORTRAN)
# Fortran Compiler dependent settings # Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif () endif ()
if (BINARY64) if (BINARY64)
@ -181,10 +202,15 @@ if (NEED_PIC)
endif () endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER) if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif () endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif () endif ()
if (DYNAMIC_LIST) if (DYNAMIC_LIST)
@ -283,7 +309,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles # TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440") if (${CORE} STREQUAL "PPC440")

View File

@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
if(${OPERATING_SYSTEM} MATCHES "Android") if(${OPERATING_SYSTEM} MATCHES "Android")
set(HOST_OS ANDROID) set(HOST_OS ANDROID)
endif(${OPERATING_SYSTEM} MATCHES "Android") endif()
endif() endif()

View File

@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction () endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition # generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from # @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@ -131,7 +131,7 @@ extern "C" {
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#include <math.h> #include <math.h>
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h> #include <pthread.h>
#endif #endif
#endif #endif
@ -200,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!" #error "You can't specify both LOCK operation!"
#endif #endif
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK #define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK #undef USE_PTHREAD_SPINLOCK
#endif #endif

View File

@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH #define HAVE_PREFETCH
#endif #endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#define DCBT_ARG 0 #define DCBT_ARG 0
#else #else
#define DCBT_ARG 8 #define DCBT_ARG 8
@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM) #if defined(ASSEMBLER) && !defined(NEEDPARAM)
#ifdef OS_LINUX #if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__ #ifndef __64BIT__
#define PROLOGUE \ #define PROLOGUE \
.section .text;\ .section .text;\
@ -784,7 +784,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023 #define HALT mfspr r0, 1023
#ifdef OS_LINUX #if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2) #if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER #undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1 #define MAX_CPU_NUMBER 1
@ -829,7 +829,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON #define MAP_ANONYMOUS MAP_ANON
#endif #endif
#ifdef OS_LINUX #if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__ #ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8) #define FRAMESLOT(X) (((X) * 4) + 8)
#else #else

View File

@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be: * SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation * - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel * - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing. * Choosing a SIZE too small will lead to a stack smashing.
*/ */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \ /* make it volatile because some function (ex: dgemv_n.S) */ \

View File

@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif #endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona. //Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION #define BARCELONA_OPTIMIZATION
#endif #endif

View File

@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2]; *ecx=cpuinfo[2];
*edx=cpuinfo[3]; *edx=cpuinfo[3];
#else #else
__asm__ __volatile__("cpuid" __asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
: "=a" (*eax), : "=a" (*eax),
"=b" (*ebx), "=b" (*ebx),
"=c" (*ecx), "=c" (*ecx),
"=d" (*edx) "=d" (*edx)
: "0" (op), "c"(0)); : "0" (op));
#endif #endif
} }
@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER #ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona. //Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION #define BARCELONA_OPTIMIZATION
#endif #endif

14
cpp_thread_test/Makefile Normal file
View File

@ -0,0 +1,14 @@
include ../Makefile.rule
all :: dgemv_tester dgemm_tester
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
clean ::
rm -f dgemv_tester dgemm_tester

View File

@ -0,0 +1,55 @@
inline void pauser(){
/// a portable way to pause a program
std::string dummy;
std::cout << "Press enter to continue...";
std::getline(std::cin, dummy);
}
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for(uint32_t i=0; i<numMat; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
matBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
for(uint32_t j=0; j<numMat; j++){
matBlock[i+j] = matBlock[j];
}
}
}
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
for(uint32_t i=0; i<numVec; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
vecBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
for(uint32_t j=0; j<numVec; j++){
vecBlock[i+j] = vecBlock[j];
}
}
}
std::mt19937_64 InitPRNG(){
std::random_device rd;
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
return PRNG;
}
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
std::cout<<i<<std::endl;
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
}
std::cout<<std::endl;
}
std::cout<<std::endl;
}
}

View File

@ -0,0 +1,92 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs[0]);
numConcurrentThreads = std::stoul(cliArgs[1]);
numTestRounds = std::stoul(cliArgs[2]);
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMM thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
matBlock[i].resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMM thread safety\n";
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMM round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -0,0 +1,101 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs.at(0));
numConcurrentThreads = std::stoul(cliArgs.at(1));
numTestRounds = std::stoul(cliArgs.at(2));
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMV thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
matBlock.at(i).resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Allocating vectors..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
vecBlock.at(i).resize(randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
std::cout<<"done\n";
std::cout<<"Filling vectors with random numbers..."<<std::flush;
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMV round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -94,7 +94,7 @@ int get_feature(char *search)
if( p == NULL ) return 0; if( p == NULL ) return 0;
t = strtok(p," "); t = strtok(p," ");
while( t = strtok(NULL," ")) while( (t = strtok(NULL," ")))
{ {
if (!strcmp(t, search)) { return(1); } if (!strcmp(t, search)) { return(1); }
} }
@ -344,7 +344,7 @@ void get_features(void)
if( p == NULL ) return; if( p == NULL ) return;
t = strtok(p," "); t = strtok(p," ");
while( t = strtok(NULL," ")) while( (t = strtok(NULL," ")))
{ {
} }

View File

@ -1211,7 +1211,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2; return CPUTYPE_CORE2;
} }
break; break;
case 1: case 1: // family 6 exmodel 1
switch (model) { switch (model) {
case 6: case 6:
return CPUTYPE_CORE2; return CPUTYPE_CORE2;
@ -1228,7 +1228,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON; return CPUTYPE_DUNNINGTON;
} }
break; break;
case 2: case 2: // family 6 exmodel 2
switch (model) { switch (model) {
case 5: case 5:
//Intel Core (Clarkdale) / Core (Arrandale) //Intel Core (Clarkdale) / Core (Arrandale)
@ -1257,7 +1257,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 3: case 3: // family 6 exmodel 3
switch (model) { switch (model) {
case 7: case 7:
// Bay Trail // Bay Trail
@ -1287,7 +1287,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 4: case 4: // family 6 exmodel 4
switch (model) { switch (model) {
case 5: case 5:
case 6: case 6:
@ -1321,7 +1321,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 5: case 5: // family 6 exmodel 5
switch (model) { switch (model) {
case 6: case 6:
//Broadwell //Broadwell
@ -1364,7 +1364,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 6: case 6: // family 6 exmodel 6
switch (model) { switch (model) {
case 6: // Cannon Lake case 6: // Cannon Lake
if(support_avx512()) if(support_avx512())
@ -1377,6 +1377,19 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 7: // family 6 exmodel 7
switch (model) {
case 14: // Ice Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9: case 9:
case 8: case 8:
switch (model) { switch (model) {

View File

@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS override CFLAGS += -DADD$(BU) -DCBLAS
override TARGET_ARCH=
override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME) LIB = $(TOPDIR)/$(LIBNAME)

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */ /* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */ /* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */ /* Local Variables */
@ -150,7 +150,7 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR #ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */ /* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */ /* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread; static pthread_t monitor_thread;

View File

@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */ /* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */ /* Thread server common information */
typedef struct{ typedef struct{
CRITICAL_SECTION lock; CRITICAL_SECTION lock;
HANDLE filled; HANDLE filled;
@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t; } blas_pool_t;
/* We need this global for cheking if initialization is finished. */ /* We need this global for checking if initialization is finished. */
int blas_server_avail = 0; int blas_server_avail = 0;
/* Local Variables */ /* Local Variables */

View File

@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){
} }
} }
return NULL; return NULL;
case 7:
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9: case 9:
case 8: case 8:
if (model == 14 ) { // Kaby Lake if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2()) if(support_avx2())
return &gotoblas_HASWELL; return &gotoblas_HASWELL;
if(support_avx()) { if(support_avx()) {

View File

@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1; int mynode = 1;
/* if number of threads is larger than inital condition */ /* if number of threads is larger than initial condition */
if (pos < 0) { if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0; return 0;
@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid; common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) { if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp; cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums; int nums;
int ret; int ret;
@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
} }
CPU_FREE(cpusetp); CPU_FREE(cpusetp);
#else #else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) { if (ret!=0) {
common->num_procs = nums; common->num_procs = nums;
} else { } else {
@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i; int i;
int n = 0; int n = 0;
for (i=0;i<nums;i++) for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++; if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n; common->num_procs = n;
} }
#else #else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); common->num_procs = CPU_COUNT(&cpuset);
} }
#endif #endif

View File

@ -229,7 +229,7 @@ int get_num_procs(void) {
n=0; n=0;
#if !__GLIBC_PREREQ(2, 6) #if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++) for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++; if (CPU_ISSET(i,&cpuset)) n++;
nums=n; nums=n;
#else #else
nums = CPU_COUNT(sizeof(cpuset),&cpuset); nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init(); gotoblas_init();
gotoblas_quit(); gotoblas_quit();
#if __PGIC__ < 19
#if 0 #if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif #endif
#endif
} }
#endif #endif
@ -1772,7 +1774,7 @@ int get_num_procs(void) {
n=0; n=0;
#if !__GLIBC_PREREQ(2, 6) #if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++) for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++; if (CPU_ISSET(i,&cpuset)) n++;
nums=n; nums=n;
#else #else
nums = CPU_COUNT(sizeof(cpuset),&cpuset); nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
static void alloc_mmap_free(struct release_t *release){ static void alloc_mmap_free(struct release_t *release){
if (!release->address) return;
if (munmap(release -> address, BUFFER_SIZE)) { if (munmap(release -> address, BUFFER_SIZE)) {
printf("OpenBLAS : munmap failed\n"); int errsv=errno;
perror("OpenBLAS : munmap failed:");
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
} }
} }
@ -2062,14 +2068,20 @@ static void *alloc_mmap(void *address){
} }
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif #endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif
} else {
#ifdef DEBUG
int errsv=errno;
perror("OpenBLAS : mmap failed:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
#endif #endif
} }
@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){
#endif #endif
if (map_address != (void *)-1) { if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif #endif
release_info[release_pos].address = map_address; release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free; release_info[release_pos].func = alloc_mmap_free;
release_pos ++; release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif #endif
} }
@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){
position = 0; position = 0;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif #endif
do { do {
@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){
position ++; position ++;
} while (position < NUM_BUFFERS); } while (position < NUM_BUFFERS);
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif #endif
goto error; goto error;
@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){
#endif #endif
memory[position].used = 1; memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#else #else
blas_unlock(&memory[position].lock); blas_unlock(&memory[position].lock);
@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER #ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
} }
#endif #endif
@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1); } while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif #endif
memory[position].addr = map_address; memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif #endif
@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){
#endif #endif
position = 0; position = 0;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock); LOCK_COMMAND(&alloc_lock);
#endif #endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){
WMB; WMB;
memory[position].used = 0; memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif #endif
@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++) for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif #endif
#if defined(SMP) && !defined(USE_OPENMP) #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);
#endif #endif
return; return;
@ -2924,7 +2936,7 @@ void blas_shutdown(void){
#if defined(OS_LINUX) && !defined(NO_WARMUP) #if defined(OS_LINUX) && !defined(NO_WARMUP)
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
#if defined(USE_PTHREAD_LOCK) #if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK) #elif defined(USE_PTHREAD_SPINLOCK)
@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
if (hot_alloc != 2) { if (hot_alloc != 2) {
#endif #endif
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
LOCK_COMMAND(&init_lock); LOCK_COMMAND(&init_lock);
#endif #endif
@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size -= PAGESIZE; size -= PAGESIZE;
} }
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
UNLOCK_COMMAND(&init_lock); UNLOCK_COMMAND(&init_lock);
#endif #endif
@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init(); gotoblas_init();
gotoblas_quit(); gotoblas_quit();
#if __PGIC__ < 19
#if 0 #if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif #endif
#endif
} }
#endif #endif

View File

@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
libgoto_hpl.def : gensymbol libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def $(LIBDYNNAME) : ../$(LIBNAME) osx.def
else else
@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif endif
ifneq (,$(filter 1 2,$(NOFORTRAN))) ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran #only build without Fortran
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else else
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif endif
dllinit.$(SUFFIX) : dllinit.c dllinit.$(SUFFIX) : dllinit.c

View File

@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp"; $openmp = "-openmp";
} }
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores. # for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) { if ($data =~ / zho_ge__/) {
$need2bu = 1; $need2bu = 1;

View File

@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c axpby.c
) )
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f # TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions # these all have 'z' sources for complex versions
set(BLAS2_SOURCES set(BLAS2_SOURCES
gemv.c ger.c gemv.c ger.c

View File

@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
// //
//Temporarily work-around the low performance issue with small imput size & //Temporarily work-around the low performance issue with small input size &
//multithreads. //multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1; nthreads = 1;

View File

@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
//disable multi-thread when incx==0 or incy==0 //disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent. //In that case, the threads would be dependent.
// //
//Temporarily work-around the low performance issue with small imput size & //Temporarily work-around the low performance issue with small input size &
//multithreads. //multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1; nthreads = 1;

View File

@ -1,30 +1,30 @@
include $(KERNELDIR)/KERNEL.ARMV5 include $(KERNELDIR)/KERNEL.ARMV5
SAMAXKERNEL = iamax_vfp.S SAMAXKERNEL = amax_vfp.S
DAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = amax_vfp.S
CAMAXKERNEL = iamax_vfp.S #CAMAXKERNEL = amax_vfp.S
ZAMAXKERNEL = iamax_vfp.S #ZAMAXKERNEL = amax_vfp.S
SAMINKERNEL = iamax_vfp.S SAMINKERNEL = amax_vfp.S
DAMINKERNEL = iamax_vfp.S DAMINKERNEL = amax_vfp.S
CAMINKERNEL = iamax_vfp.S #CAMINKERNEL = amax_vfp.S
ZAMINKERNEL = iamax_vfp.S #ZAMINKERNEL = amax_vfp.S
SMAXKERNEL = iamax_vfp.S SMAXKERNEL = amax_vfp.S
DMAXKERNEL = iamax_vfp.S DMAXKERNEL = amax_vfp.S
SMINKERNEL = iamax_vfp.S SMINKERNEL = amax_vfp.S
DMINKERNEL = iamax_vfp.S DMINKERNEL = amax_vfp.S
ISAMAXKERNEL = iamax_vfp.S ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S #ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S #IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S #ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S #IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S

445
kernel/arm/amax_vfp.S Normal file
View File

@ -0,0 +1,445 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define N r0
#define X r1
#define INC_X r2
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(USE_ABS)
#if defined(DOUBLE)
#define VABS(x0,x1) vabs.f64 x0, x1
#else
#define VABS(x0,x1) vabs.f32 x0, x1
#endif
#else
#define VABS(x0,x1) nop
#endif
/*****************************************************************************************/
#if defined(USE_MIN)
#define MOVCOND movlt
#if defined(DOUBLE)
#define VMOVCOND vmovlt.f64
#else
#define VMOVCOND vmovlt.f32
#endif
#else
#define MOVCOND movgt
#if defined(DOUBLE)
#define VMOVCOND vmovgt.f64
#else
#define VMOVCOND vmovgt.f32
#endif
#endif
/*****************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
.macro INIT_F
vldmia.f64 X!, { d0 }
VABS( d0, d0 )
.endm
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
.endm
.macro INIT_S
vldmia.f64 X, { d0 }
VABS( d0, d0 )
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f64 X, { d4 }
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
add X, X, INC_X
.endm
#else
.macro INIT_F
vldmia.f32 X!, { s0 }
VABS( s0, s0 )
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
.endm
.macro INIT_S
vldmia.f32 X, { s0 }
VABS( s0, s0 )
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 }
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
add X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
.macro INIT_F
vldmia.f64 X!, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
.endm
.macro KERNEL_F1
vldmia.f64 X!, { d4 - d5 }
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
.endm
.macro INIT_S
vldmia.f64 X, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f64 X, { d4 - d5 }
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
add X, X, INC_X
.endm
#else
.macro INIT_F
vldmia.f32 X!, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 - s5 }
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
.endm
.macro INIT_S
vldmia.f32 X, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 - s5 }
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
PROLOGUE
.align 5
movs r12, #0 // clear floating point register
vmov s0, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
#endif
cmp N, #0
ble amax_kernel_L999
cmp INC_X, #0
beq amax_kernel_L999
cmp INC_X, #1
bne amax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
INIT_F
subs N, N , #1
ble amax_kernel_L999
asrs I, N, #2 // I = N / 4
ble amax_kernel_F1
.align 5
amax_kernel_F4:
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
ble amax_kernel_F1
#if defined(COMPLEX) || defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
bne amax_kernel_F4
amax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
b amax_kernel_L999
amax_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
INIT_S
subs N, N , #1
ble amax_kernel_L999
asrs I, N, #2 // I = N / 4
ble amax_kernel_S1
.align 5
amax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
amax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
amax_kernel_L999:
#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
vmov r0, r1, d0
#else
vmov r0, s0
#endif
#endif
bx lr
EPILOGUE

View File

@ -3,12 +3,12 @@
#CGEMM_BETA = ../generic/zgemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = strmm_kernel_16x8_power8.S STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S CTRMMKERNEL = cgemm_kernel_power9.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S ZTRMMKERNEL = zgemm_kernel_power9.S
SGEMMKERNEL = sgemm_kernel_16x8_power8.S SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_8.c
@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMKERNEL = cgemm_kernel_power9.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define N r3 #define N r3
#define X r6 #define X r6

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define N r3 #define N r3
#define X r6 #define X r6

View File

@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP stfs f2, ALPHA_I_SP
// stw r0, FZERO // stw r0, FZERO
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif
@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif #endif

View File

@ -0,0 +1,293 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* Abdelrauf(quickwritereader@gmail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define alpha_r vs19
#define alpha_i vs20
#define save_permute_1 vs21
#define permute_mask vs22
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define PRE r29
#define T12 r30
#define T13 r31
#include "cgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#ifdef TRMMKERNEL
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
slwi LDC, LDC, ZBASE_SHIFT
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xscvdpspn alpha_i,vs2
xxspltw alpha_r,alpha_r,0
xxspltw alpha_i,alpha_i,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
li r0,0
li PRE,512
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
xvnegsp alpha_r,alpha_r
xvnegsp alpha_i,alpha_i
#endif
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
/*mask is reverse permute so we have to make it inner permute */
xxpermdi permute_mask, permute_mask, permute_mask,2
#include "cgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP stfs f2, ALPHA_I_SP
// stw r0, FZERO // stw r0, FZERO
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__ #ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP) ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#ifdef TRMMKERNEL #ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif #endif

View File

@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -271,7 +271,7 @@ li r11,0
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP) std r14, 280(SP)
stxv v20, 288(SP) stxv vs52, 288(SP)
stxv v21, 304(SP) stxv vs53, 304(SP)
stxv v22, 320(SP) stxv vs54, 320(SP)
stxv v23, 336(SP) stxv vs55, 336(SP)
stxv v24, 352(SP) stxv vs56, 352(SP)
stxv v25, 368(SP) stxv vs57, 368(SP)
stxv v26, 384(SP) stxv vs58, 384(SP)
stxv v27, 400(SP) stxv vs59, 400(SP)
stxv v28, 416(SP) stxv vs60, 416(SP)
stxv v29, 432(SP) stxv vs61, 432(SP)
stxv v30, 448(SP) stxv vs62, 448(SP)
stxv v31, 464(SP) stxv vs63, 464(SP)
stfd f1, ALPHA_SP stfd f1, ALPHA_SP
@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP) ld r15, 272(SP)
ld r14, 280(SP) ld r14, 280(SP)
lxv v20, 288(SP) lxv vs52, 288(SP)
lxv v21, 304(SP) lxv vs53, 304(SP)
lxv v22, 320(SP) lxv vs54, 320(SP)
lxv v23, 336(SP) lxv vs55, 336(SP)
lxv v24, 352(SP) lxv vs56, 352(SP)
lxv v25, 368(SP) lxv vs57, 368(SP)
lxv v26, 384(SP) lxv vs58, 384(SP)
lxv v27, 400(SP) lxv vs59, 400(SP)
lxv v28, 416(SP) lxv vs60, 416(SP)
lxv v29, 432(SP) lxv vs61, 432(SP)
lxv v30, 448(SP) lxv vs62, 448(SP)
lxv v31, 464(SP) lxv vs63, 464(SP)
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE
blr blr

View File

@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stvx v31, r11, r0 stvx v31, r11, r0
li r11,0 li r11,0
stw r31, 144(SP)
stfd f1, ALPHA_SP stfd f1, ALPHA_SP
stw r0, FZERO stw r0, FZERO
@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -61,7 +61,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -217,7 +217,7 @@ li r11,0
#endif #endif
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -62,7 +62,7 @@
stfd f31, 16(SP) stfd f31, 16(SP)
stw r0, 24(SP) stw r0, 24(SP)
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else #else

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -186,7 +186,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -228,7 +228,7 @@
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -58,7 +58,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7

View File

@ -58,7 +58,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7

View File

@ -58,7 +58,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -192,7 +192,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -226,7 +226,7 @@
li PREC, 4 * SIZE li PREC, 4 * SIZE
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -184,7 +184,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -46,7 +46,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#define A r6 #define A r6
#define B r7 #define B r7
#define C r8 #define C r8

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -187,7 +187,7 @@
li PREC, 4 * SIZE li PREC, 4 * SIZE
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define N r4 #define N r4
@ -252,7 +252,7 @@
stw r27, 196(SP) stw r27, 196(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define N r4 #define N r4
@ -199,7 +199,7 @@
stw r23, 180(SP) stw r23, 180(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define N r4 #define N r4
@ -260,7 +260,7 @@
stw r29, 220(SP) stw r29, 220(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define N r4 #define N r4
@ -190,7 +190,7 @@
stw r22, 192(SP) stw r22, 192(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -47,7 +47,7 @@
#endif #endif
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define N r4 #define N r4
@ -224,7 +224,7 @@
stw r27, 196(SP) stw r27, 196(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index; BLASLONG index;
BLASLONG i; BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS) #if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int static_index0 = {0,1,2,3};
#else #else

View File

@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index; BLASLONG index;
BLASLONG i; BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}

View File

@ -43,7 +43,7 @@
#define XX r4 #define XX r4
#define PREA r5 #define PREA r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define X r6 #define X r6
#define INCX r7 #define INCX r7

View File

@ -43,7 +43,7 @@
#define XX r4 #define XX r4
#define PRE r5 #define PRE r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define X r6 #define X r6
#define INCX r7 #define INCX r7

View File

@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, 2 slwi LDC, LDC, 2
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif #endif

View File

@ -0,0 +1,272 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#define alpha_r vs20
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31
#include "sgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
slwi LDC, LDC, 2
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l
li r0,0
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6
#include "sgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) #if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define N r3 #define N r3
#define X r6 #define X r6

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define N r4 #define N r4
@ -248,7 +248,7 @@
stw r27, 196(SP) stw r27, 196(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else #else

View File

@ -39,7 +39,7 @@
#define ASSEMBLER #define ASSEMBLER
#include "common.h" #include "common.h"
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define M r3 #define M r3
#define IS r4 #define IS r4
@ -247,7 +247,7 @@
stw r27, 196(SP) stw r27, 196(SP)
#endif #endif
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else #else

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -236,7 +236,7 @@
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -257,7 +257,7 @@
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -254,7 +254,7 @@
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -231,7 +231,7 @@
li PREC, -4 * SIZE li PREC, -4 * SIZE
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -257,7 +257,7 @@
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif
@ -231,7 +231,7 @@
li PREC, -4 * SIZE li PREC, -4 * SIZE
#else #else
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
mr PREA, r10 mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -46,7 +46,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#define A r6 #define A r6
#define B r7 #define B r7
#define C r8 #define C r8

View File

@ -46,7 +46,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#define A r6 #define A r6
#define B r7 #define B r7
#define C r8 #define C r8

View File

@ -46,7 +46,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#define A r6 #define A r6
#define B r7 #define B r7
#define C r8 #define C r8

View File

@ -59,7 +59,7 @@
#define N r4 #define N r4
#define K r5 #define K r5
#ifdef linux #if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__ #ifndef __64BIT__
#define A r6 #define A r6
#define B r7 #define B r7
@ -179,7 +179,7 @@
slwi LDC, LDC, BASE_SHIFT slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__) #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif #endif

Some files were not shown because too many files have changed in this diff Show More