Merge pull request #2213 from xianyi/develop

Update from develop in preparation of the 0.3.7 release
This commit is contained in:
Martin Kroeker 2019-08-11 23:14:49 +02:00 committed by GitHub
commit 20d417762f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
165 changed files with 20231 additions and 595 deletions

143
.drone.yml Normal file
View File

@ -0,0 +1,143 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest

View File

@ -25,6 +25,15 @@ matrix:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
@ -164,42 +173,6 @@ matrix:
env:
- BTYPE="BINARY=32"
- &emulated-arm
dist: trusty
sudo: required
services: docker
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
name: "Emulated Build for ARMV6 with gcc"
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
script: |
echo "FROM openblas/alpine:${IMAGE_ARCH}
COPY . /tmp/openblas
RUN mkdir /tmp/openblas/build && \
cd /tmp/openblas/build && \
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
-D TARGET=${TARGET_ARCH} \
-D BUILD_SHARED_LIBS=ON \
-D BUILD_WITHOUT_LAPACK=ON \
-D BUILD_WITHOUT_CBLAS=ON \
-D CMAKE_BUILD_TYPE=Release ../ && \
cmake --build ." > Dockerfile
docker build .
- <<: *emulated-arm
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
name: "Emulated Build for ARMV6 with clang"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
name: "Emulated Build for ARMV8 with gcc"
- <<: *emulated-arm
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
name: "Emulated Build for ARMV8 with clang"
allow_failures:
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
# whitelist
branches:
only:

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 6)
set(OpenBLAS_PATCH_VERSION 7.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@ -20,9 +20,14 @@ if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
@ -206,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif()
if (MSVC OR NOT NOFORTRAN)
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
endif()

View File

@ -167,4 +167,7 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel

View File

@ -34,7 +34,7 @@ endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@ -109,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
$(MAKE) -C utest all
endif
$(MAKE) -C utest all
ifndef NO_CBLAS
$(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif
endif

View File

@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
CCOMMON_OPT += -mfpu=neon
FCOMMON_OPT += -mfpu=neon
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@ -9,11 +9,6 @@ endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp
endif

View File

@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"

View File

@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif
endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.6
VERSION = 0.3.7.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -58,6 +58,12 @@ VERSION = 0.3.6
# For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1
@ -157,6 +163,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
@ -181,17 +191,17 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT
# system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory
# Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
@ -239,6 +249,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX=
# SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
#
# End of user configuration
#

View File

@ -9,6 +9,11 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture.
ifndef ARCH
ARCH := $(shell uname -m)
endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
@ -137,7 +142,12 @@ endif
endif
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
ifeq ($(ARCH), x86_64)
ifneq ($(C_COMPILER), PGI)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@ -237,6 +247,10 @@ SMP = 1
endif
endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC
NEED_PIC = 1
endif
@ -253,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS.
# When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
override FEXTRALIB =
endif
#
@ -388,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
#
# Architecture dependent settings
#
@ -744,6 +765,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
@ -1049,7 +1072,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
ifdef USE_TLS
ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
@ -1102,8 +1125,12 @@ endif
endif
ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)
override undefine NO_AFFINITY
else
CCOMMON_OPT += -DNO_AFFINITY
endif
endif
ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE

View File

@ -28,11 +28,15 @@ endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif

View File

@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64
@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
### Supported OS

View File

@ -35,7 +35,14 @@ environment:
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- COMPILER: MinGW64-gcc-7.2.0
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
@ -52,7 +59,14 @@ install:
before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@ -64,3 +78,4 @@ test_script:
- echo Running Test
- cd utest
- openblas_utest

51
azure-pipelines.yml Normal file
View File

@ -0,0 +1,51 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
fprintf(stderr, " SIZE Flops Time\n");
for (i = from; i <= to; i += step) {

View File

@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
} else {
$no_avx512 = 0;
}
unlink("tmpf.o");
unlink("$tmpf.o");
}
}

View File

@ -73,14 +73,16 @@ if (DYNAMIC_ARCH)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
endif ()
endif ()

View File

@ -44,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1)
set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S)

View File

@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_")
endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
@ -82,6 +85,11 @@ endif ()
# f_check
if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif ()
# Cannot run getarch on target if we are cross-compiling

View File

@ -65,6 +65,18 @@ if (DEFINED TARGET)
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available
if (X86 OR X86_64)
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
endif ()
endif ()
if (INTERFACE64)
message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@ -136,10 +148,16 @@ endif ()
if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1)
endif ()
@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif ()
if (BINARY64)
@ -181,9 +202,14 @@ if (NEED_PIC)
endif ()
if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif ()
@ -283,7 +309,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")

View File

@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
if(${OPERATING_SYSTEM} MATCHES "Android")
set(HOST_OS ANDROID)
endif(${OPERATING_SYSTEM} MATCHES "Android")
endif()
endif()

View File

@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@ -131,7 +131,7 @@ extern "C" {
#include <time.h>
#include <unistd.h>
#include <math.h>
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h>
#endif
#endif
@ -200,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!"
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK
#endif

View File

@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define PROLOGUE \
.section .text;\
@ -784,7 +784,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1
@ -829,7 +829,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON
#endif
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8)
#else

View File

@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
* Choosing a SIZE too small will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \

View File

@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

View File

@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2];
*edx=cpuinfo[3];
#else
__asm__ __volatile__("cpuid"
__asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
: "=a" (*eax),
"=b" (*ebx),
"=c" (*ecx),
"=d" (*edx)
: "0" (op), "c"(0));
: "0" (op));
#endif
}
@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona.
//Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION
#endif

14
cpp_thread_test/Makefile Normal file
View File

@ -0,0 +1,14 @@
include ../Makefile.rule
all :: dgemv_tester dgemm_tester
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
clean ::
rm -f dgemv_tester dgemm_tester

View File

@ -0,0 +1,55 @@
inline void pauser(){
/// a portable way to pause a program
std::string dummy;
std::cout << "Press enter to continue...";
std::getline(std::cin, dummy);
}
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for(uint32_t i=0; i<numMat; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
matBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
for(uint32_t j=0; j<numMat; j++){
matBlock[i+j] = matBlock[j];
}
}
}
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
for(uint32_t i=0; i<numVec; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
vecBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
for(uint32_t j=0; j<numVec; j++){
vecBlock[i+j] = vecBlock[j];
}
}
}
std::mt19937_64 InitPRNG(){
std::random_device rd;
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
return PRNG;
}
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
std::cout<<i<<std::endl;
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
}
std::cout<<std::endl;
}
std::cout<<std::endl;
}
}

View File

@ -0,0 +1,92 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs[0]);
numConcurrentThreads = std::stoul(cliArgs[1]);
numTestRounds = std::stoul(cliArgs[2]);
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMM thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
matBlock[i].resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMM thread safety\n";
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMM round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -0,0 +1,101 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs.at(0));
numConcurrentThreads = std::stoul(cliArgs.at(1));
numTestRounds = std::stoul(cliArgs.at(2));
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMV thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
matBlock.at(i).resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Allocating vectors..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
vecBlock.at(i).resize(randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
std::cout<<"done\n";
std::cout<<"Filling vectors with random numbers..."<<std::flush;
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMV round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -94,7 +94,7 @@ int get_feature(char *search)
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
if (!strcmp(t, search)) { return(1); }
}
@ -344,7 +344,7 @@ void get_features(void)
if( p == NULL ) return;
t = strtok(p," ");
while( t = strtok(NULL," "))
while( (t = strtok(NULL," ")))
{
}

View File

@ -1211,7 +1211,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2;
}
break;
case 1:
case 1: // family 6 exmodel 1
switch (model) {
case 6:
return CPUTYPE_CORE2;
@ -1228,7 +1228,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON;
}
break;
case 2:
case 2: // family 6 exmodel 2
switch (model) {
case 5:
//Intel Core (Clarkdale) / Core (Arrandale)
@ -1257,7 +1257,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 3:
case 3: // family 6 exmodel 3
switch (model) {
case 7:
// Bay Trail
@ -1287,7 +1287,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 4:
case 4: // family 6 exmodel 4
switch (model) {
case 5:
case 6:
@ -1321,7 +1321,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 5:
case 5: // family 6 exmodel 5
switch (model) {
case 6:
//Broadwell
@ -1364,7 +1364,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 6:
case 6: // family 6 exmodel 6
switch (model) {
case 6: // Cannon Lake
if(support_avx512())
@ -1376,7 +1376,20 @@ int get_cpuname(void){
else
return CPUTYPE_NEHALEM;
}
break;
break;
case 7: // family 6 exmodel 7
switch (model) {
case 14: // Ice Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {

View File

@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
override TARGET_ARCH=
override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME)

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 *****************************
*
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
*

View File

@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */
@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */
/* Usually it turns off and it's for debugging. */
/* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER];

View File

@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t;
/* We need this global for cheking if initialization is finished. */
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */

View File

@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){
}
}
return NULL;
case 7:
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {

View File

@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
int mynode = 1;
/* if number of threads is larger than inital condition */
/* if number of threads is larger than initial condition */
if (pos < 0) {
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
return 0;
@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#if defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 7)
cpu_set_t *cpusetp;
#else
cpu_set_t cpuset;
#endif
#endif
int nums;
int ret;
@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
}
CPU_FREE(cpusetp);
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
} else {
@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
int i;
int n = 0;
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
#else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
common->num_procs = CPU_COUNT(&cpuset);
}
#endif

View File

@ -229,7 +229,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif
@ -1772,7 +1774,7 @@ int get_num_procs(void) {
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpuset)) n++;
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
static void alloc_mmap_free(struct release_t *release){
if (!release->address) return;
if (munmap(release -> address, BUFFER_SIZE)) {
printf("OpenBLAS : munmap failed\n");
int errsv=errno;
perror("OpenBLAS : munmap failed:");
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
}
}
@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
} else {
#ifdef DEBUG
int errsv=errno;
perror("OpenBLAS : mmap failed:");
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
#endif
}
#ifdef OS_LINUX
@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){
#endif
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){
position ++;
} while (position < NUM_BUFFERS);
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;
@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&memory[position].lock);
@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif
@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
@ -2924,7 +2936,7 @@ void blas_shutdown(void){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
if (hot_alloc != 2) {
#endif
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
LOCK_COMMAND(&init_lock);
#endif
@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
size -= PAGESIZE;
}
#ifdef SMP
#if defined(SMP) || defined(USE_LOCKING)
UNLOCK_COMMAND(&init_lock);
#endif
@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
gotoblas_init();
gotoblas_quit();
#if __PGIC__ < 19
#if 0
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif

View File

@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
else
@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c

View File

@ -125,7 +125,7 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ / zho_ge__/) {
$need2bu = 1;

View File

@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
axpby.c
)
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
# these all have 'z' sources for complex versions
set(BLAS2_SOURCES
gemv.c ger.c

View File

@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
//disable multi-thread when incx==0 or incy==0
//In that case, the threads would be dependent.
//
//Temporarily work-around the low performance issue with small imput size &
//Temporarily work-around the low performance issue with small input size &
//multithreads.
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
nthreads = 1;

View File

@ -1,30 +1,30 @@
include $(KERNELDIR)/KERNEL.ARMV5
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMAXKERNEL = amax_vfp.S
DAMAXKERNEL = amax_vfp.S
#CAMAXKERNEL = amax_vfp.S
#ZAMAXKERNEL = amax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SAMINKERNEL = amax_vfp.S
DAMINKERNEL = amax_vfp.S
#CAMINKERNEL = amax_vfp.S
#ZAMINKERNEL = amax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMAXKERNEL = amax_vfp.S
DMAXKERNEL = amax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
SMINKERNEL = amax_vfp.S
DMINKERNEL = amax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
#ICAMAXKERNEL = iamax_vfp.S
#IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
#ICAMINKERNEL = iamax_vfp.S
#IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S

445
kernel/arm/amax_vfp.S Normal file
View File

@ -0,0 +1,445 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define N r0
#define X r1
#define INC_X r2
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(USE_ABS)
#if defined(DOUBLE)
#define VABS(x0,x1) vabs.f64 x0, x1
#else
#define VABS(x0,x1) vabs.f32 x0, x1
#endif
#else
#define VABS(x0,x1) nop
#endif
/*****************************************************************************************/
#if defined(USE_MIN)
#define MOVCOND movlt
#if defined(DOUBLE)
#define VMOVCOND vmovlt.f64
#else
#define VMOVCOND vmovlt.f32
#endif
#else
#define MOVCOND movgt
#if defined(DOUBLE)
#define VMOVCOND vmovgt.f64
#else
#define VMOVCOND vmovgt.f32
#endif
#endif
/*****************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
.macro INIT_F
vldmia.f64 X!, { d0 }
VABS( d0, d0 )
.endm
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
.endm
.macro INIT_S
vldmia.f64 X, { d0 }
VABS( d0, d0 )
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f64 X, { d4 }
VABS( d4, d4 )
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
add X, X, INC_X
.endm
#else
.macro INIT_F
vldmia.f32 X!, { s0 }
VABS( s0, s0 )
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
.endm
.macro INIT_S
vldmia.f32 X, { s0 }
VABS( s0, s0 )
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 }
VABS( s4, s4 )
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
add X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
.macro INIT_F
vldmia.f64 X!, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
.endm
.macro KERNEL_F1
vldmia.f64 X!, { d4 - d5 }
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
.endm
.macro INIT_S
vldmia.f64 X, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f64 X, { d4 - d5 }
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
add X, X, INC_X
.endm
#else
.macro INIT_F
vldmia.f32 X!, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 - s5 }
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
.endm
.macro INIT_S
vldmia.f32 X, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 - s5 }
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
PROLOGUE
.align 5
movs r12, #0 // clear floating point register
vmov s0, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
#endif
cmp N, #0
ble amax_kernel_L999
cmp INC_X, #0
beq amax_kernel_L999
cmp INC_X, #1
bne amax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
INIT_F
subs N, N , #1
ble amax_kernel_L999
asrs I, N, #2 // I = N / 4
ble amax_kernel_F1
.align 5
amax_kernel_F4:
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
ble amax_kernel_F1
#if defined(COMPLEX) || defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
bne amax_kernel_F4
amax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
b amax_kernel_L999
amax_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
INIT_S
subs N, N , #1
ble amax_kernel_L999
asrs I, N, #2 // I = N / 4
ble amax_kernel_S1
.align 5
amax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
amax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
amax_kernel_L999:
#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
vmov r0, r1, d0
#else
vmov r0, s0
#endif
#endif
bx lr
EPILOGUE

View File

@ -3,12 +3,12 @@
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = strmm_kernel_16x8_power8.S
STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
CTRMMKERNEL = cgemm_kernel_power9.S
ZTRMMKERNEL = zgemm_kernel_power9.S
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
SGEMMKERNEL = sgemm_kernel_power9.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_power8.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMKERNEL = cgemm_kernel_power9.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMKERNEL = zgemm_kernel_power9.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6

View File

@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP
// stw r0, FZERO
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif
@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
#endif

View File

@ -0,0 +1,293 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* Abdelrauf(quickwritereader@gmail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define alpha_r vs19
#define alpha_i vs20
#define save_permute_1 vs21
#define permute_mask vs22
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define PRE r29
#define T12 r30
#define T13 r31
#include "cgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#ifdef TRMMKERNEL
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
slwi LDC, LDC, ZBASE_SHIFT
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xscvdpspn alpha_i,vs2
xxspltw alpha_r,alpha_r,0
xxspltw alpha_i,alpha_i,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
li r0,0
li PRE,512
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
xvnegsp alpha_r,alpha_r
xvnegsp alpha_i,alpha_i
#endif
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
/*mask is reverse permute so we have to make it inner permute */
xxpermdi permute_mask, permute_mask, permute_mask,2
#include "cgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stfs f2, ALPHA_I_SP
// stw r0, FZERO
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif

View File

@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -271,7 +271,7 @@ li r11,0
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
std r14, 280(SP)
stxv v20, 288(SP)
stxv v21, 304(SP)
stxv v22, 320(SP)
stxv v23, 336(SP)
stxv v24, 352(SP)
stxv v25, 368(SP)
stxv v26, 384(SP)
stxv v27, 400(SP)
stxv v28, 416(SP)
stxv v29, 432(SP)
stxv v30, 448(SP)
stxv v31, 464(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
stfd f1, ALPHA_SP
@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld r15, 272(SP)
ld r14, 280(SP)
lxv v20, 288(SP)
lxv v21, 304(SP)
lxv v22, 320(SP)
lxv v23, 336(SP)
lxv v24, 352(SP)
lxv v25, 368(SP)
lxv v26, 384(SP)
lxv v27, 400(SP)
lxv v28, 416(SP)
lxv v29, 432(SP)
lxv v30, 448(SP)
lxv v31, 464(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr

View File

@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stvx v31, r11, r0
li r11,0
stw r31, 144(SP)
stfd f1, ALPHA_SP
stw r0, FZERO
@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -61,7 +61,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -217,7 +217,7 @@ li r11,0
#endif
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -62,7 +62,7 @@
stfd f31, 16(SP)
stw r0, 24(SP)
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -186,7 +186,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -228,7 +228,7 @@
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -58,7 +58,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7

View File

@ -58,7 +58,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7

View File

@ -58,7 +58,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -192,7 +192,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -226,7 +226,7 @@
li PREC, 4 * SIZE
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -184,7 +184,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -46,7 +46,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -187,7 +187,7 @@
li PREC, 4 * SIZE
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -183,7 +183,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@ -252,7 +252,7 @@
stw r27, 196(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@ -199,7 +199,7 @@
stw r23, 180(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@ -260,7 +260,7 @@
stw r29, 220(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@ -190,7 +190,7 @@
stw r22, 192(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -47,7 +47,7 @@
#endif
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@ -224,7 +224,7 @@
stw r27, 196(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)

View File

@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
BLASLONG i;
BLASLONG i=0;
#if defined(USE_MASK_PERMUTATIONS)
register __vector unsigned int static_index0 = {0,1,2,3};
#else

View File

@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
BLASLONG i;
BLASLONG i=0;
register __vector unsigned int static_index0 = {0,1,2,3};
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}

View File

@ -43,7 +43,7 @@
#define XX r4
#define PREA r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7

View File

@ -43,7 +43,7 @@
#define XX r4
#define PRE r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define X r6
#define INCX r7

View File

@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, 2
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
#endif

View File

@ -0,0 +1,272 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#define alpha_r vs20
#define save_permute_1 vs21
#define save_permute_2 vs22
#define permute_mask vs23
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define T11 r29
#define T12 r30
#define T13 r31
#include "sgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_11, 0x1415161718191a1b
.equ save_permute_12, 0x0405060708090a0b
.equ save_permute_21, 0x101112131c1d1e1f
.equ save_permute_22, 0x000102030c0d0e0f
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
#if defined(TRMMKERNEL)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
slwi LDC, LDC, 2
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xxspltw alpha_r,alpha_r,0
/*load reverse permute mask for big endian
uint128 = 0xc0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
lis T5, save_permute_22@highest
lis T6, save_permute_21@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
ori T5, T5, save_permute_22@higher
ori T6, T6, save_permute_21@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
rldicr T5, T5, 32, 31
rldicr T6, T6, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
oris T5, T5, save_permute_22@h
oris T6, T6, save_permute_21@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
ori T5, T5, save_permute_22@l
ori T6, T6, save_permute_21@l
li r0,0
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
mtvsrdd save_permute_2,T5,T6
#include "sgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define N r3
#define X r6

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define N r4
@ -248,7 +248,7 @@
stw r27, 196(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M r3
#define IS r4
@ -247,7 +247,7 @@
stw r27, 196(SP)
#endif
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
#else

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -236,7 +236,7 @@
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -257,7 +257,7 @@
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -254,7 +254,7 @@
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -231,7 +231,7 @@
li PREC, -4 * SIZE
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -257,7 +257,7 @@
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -59,7 +59,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6
#define B r7
@ -180,7 +180,7 @@
slwi LDC, LDC, BASE_SHIFT
#if defined(linux) && defined(__64BIT__)
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
@ -231,7 +231,7 @@
li PREC, -4 * SIZE
#else
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
mr PREA, r10
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)

View File

@ -46,7 +46,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8

View File

@ -46,7 +46,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8

View File

@ -46,7 +46,7 @@
#define N r4
#define K r5
#ifdef linux
#if defined(linux) || defined(__FreeBSD__)
#define A r6
#define B r7
#define C r8

Some files were not shown because too many files have changed in this diff Show More