Merge pull request #2213 from xianyi/develop
Update from develop in preparation of the 0.3.7 release
This commit is contained in:
commit
20d417762f
|
@ -0,0 +1,143 @@
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_gcc_make
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:19.04
|
||||||
|
environment:
|
||||||
|
CC: gcc
|
||||||
|
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||||
|
commands:
|
||||||
|
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC gfortran perl
|
||||||
|
- $CC --version
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||||
|
- make -C test $COMMON_FLAGS
|
||||||
|
- make -C ctest $COMMON_FLAGS
|
||||||
|
- make -C utest $COMMON_FLAGS
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm32_gcc_make
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:19.04
|
||||||
|
environment:
|
||||||
|
CC: gcc
|
||||||
|
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
|
||||||
|
commands:
|
||||||
|
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC gfortran perl
|
||||||
|
- $CC --version
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||||
|
- make -C test $COMMON_FLAGS
|
||||||
|
- make -C ctest $COMMON_FLAGS
|
||||||
|
- make -C utest $COMMON_FLAGS
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_clang_make
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: clang
|
||||||
|
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||||
|
commands:
|
||||||
|
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC gfortran perl
|
||||||
|
- $CC --version
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||||
|
- make -C test $COMMON_FLAGS
|
||||||
|
- make -C ctest $COMMON_FLAGS
|
||||||
|
- make -C utest $COMMON_FLAGS
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm32_clang_cmake
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: clang
|
||||||
|
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||||
|
commands:
|
||||||
|
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC g++ perl cmake
|
||||||
|
- $CC --version
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake $CMAKE_FLAGS ..
|
||||||
|
- make -j
|
||||||
|
- ctest
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_gcc_cmake
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: gcc
|
||||||
|
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||||
|
commands:
|
||||||
|
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC g++ perl cmake
|
||||||
|
- $CC --version
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake $CMAKE_FLAGS ..
|
||||||
|
- make -j
|
||||||
|
- ctest
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_clang_cmake
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: clang
|
||||||
|
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||||
|
commands:
|
||||||
|
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC g++ perl cmake
|
||||||
|
- $CC --version
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake $CMAKE_FLAGS ..
|
||||||
|
- make -j
|
||||||
|
- ctest
|
45
.travis.yml
45
.travis.yml
|
@ -25,6 +25,15 @@ matrix:
|
||||||
- TARGET_BOX=LINUX64
|
- TARGET_BOX=LINUX64
|
||||||
- BTYPE="BINARY=64"
|
- BTYPE="BINARY=64"
|
||||||
|
|
||||||
|
- <<: *test-ubuntu
|
||||||
|
os: linux-ppc64le
|
||||||
|
before_script:
|
||||||
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||||
|
env:
|
||||||
|
# for matrix annotation only
|
||||||
|
- TARGET_BOX=PPC64LE_LINUX
|
||||||
|
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||||
|
|
||||||
- <<: *test-ubuntu
|
- <<: *test-ubuntu
|
||||||
env:
|
env:
|
||||||
- TARGET_BOX=LINUX64
|
- TARGET_BOX=LINUX64
|
||||||
|
@ -164,42 +173,6 @@ matrix:
|
||||||
env:
|
env:
|
||||||
- BTYPE="BINARY=32"
|
- BTYPE="BINARY=32"
|
||||||
|
|
||||||
- &emulated-arm
|
|
||||||
dist: trusty
|
|
||||||
sudo: required
|
|
||||||
services: docker
|
|
||||||
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
|
|
||||||
name: "Emulated Build for ARMV6 with gcc"
|
|
||||||
before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset
|
|
||||||
script: |
|
|
||||||
echo "FROM openblas/alpine:${IMAGE_ARCH}
|
|
||||||
COPY . /tmp/openblas
|
|
||||||
RUN mkdir /tmp/openblas/build && \
|
|
||||||
cd /tmp/openblas/build && \
|
|
||||||
CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \
|
|
||||||
-D TARGET=${TARGET_ARCH} \
|
|
||||||
-D BUILD_SHARED_LIBS=ON \
|
|
||||||
-D BUILD_WITHOUT_LAPACK=ON \
|
|
||||||
-D BUILD_WITHOUT_CBLAS=ON \
|
|
||||||
-D CMAKE_BUILD_TYPE=Release ../ && \
|
|
||||||
cmake --build ." > Dockerfile
|
|
||||||
docker build .
|
|
||||||
- <<: *emulated-arm
|
|
||||||
env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
|
|
||||||
name: "Emulated Build for ARMV6 with clang"
|
|
||||||
- <<: *emulated-arm
|
|
||||||
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
|
|
||||||
name: "Emulated Build for ARMV8 with gcc"
|
|
||||||
- <<: *emulated-arm
|
|
||||||
env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
|
|
||||||
name: "Emulated Build for ARMV8 with clang"
|
|
||||||
|
|
||||||
allow_failures:
|
|
||||||
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc
|
|
||||||
- env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang
|
|
||||||
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc
|
|
||||||
- env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang
|
|
||||||
|
|
||||||
# whitelist
|
# whitelist
|
||||||
branches:
|
branches:
|
||||||
only:
|
only:
|
||||||
|
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||||
project(OpenBLAS C ASM)
|
project(OpenBLAS C ASM)
|
||||||
set(OpenBLAS_MAJOR_VERSION 0)
|
set(OpenBLAS_MAJOR_VERSION 0)
|
||||||
set(OpenBLAS_MINOR_VERSION 3)
|
set(OpenBLAS_MINOR_VERSION 3)
|
||||||
set(OpenBLAS_PATCH_VERSION 6)
|
set(OpenBLAS_PATCH_VERSION 7.dev)
|
||||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||||
|
|
||||||
# Adhere to GNU filesystem layout conventions
|
# Adhere to GNU filesystem layout conventions
|
||||||
|
@ -20,9 +20,14 @@ if(MSVC)
|
||||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||||
endif()
|
endif()
|
||||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF)
|
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||||
option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||||
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||||
|
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||||
|
else()
|
||||||
|
set(NO_AFFINITY 1)
|
||||||
|
endif()
|
||||||
|
|
||||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||||
# Avoids conflicts with other BLAS libraries, especially when using
|
# Avoids conflicts with other BLAS libraries, especially when using
|
||||||
|
@ -206,7 +211,8 @@ if (USE_THREAD)
|
||||||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MSVC OR NOT NOFORTRAN)
|
#if (MSVC OR NOT NOFORTRAN)
|
||||||
|
if (NOT NO_CBLAS)
|
||||||
# Broken without fortran on unix
|
# Broken without fortran on unix
|
||||||
add_subdirectory(utest)
|
add_subdirectory(utest)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -167,4 +167,7 @@ In chronological order:
|
||||||
* [2017-02-26] ztrmm kernel for IBM z13
|
* [2017-02-26] ztrmm kernel for IBM z13
|
||||||
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
||||||
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
|
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
|
||||||
|
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
|
||||||
|
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||||
|
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||||
|
* [2019-04-29] power9 sgemm/strmm kernel
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -34,7 +34,7 @@ endif
|
||||||
|
|
||||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||||
|
|
||||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||||
|
|
||||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||||
|
@ -109,6 +109,7 @@ endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@$(MAKE) -C exports dyn
|
@$(MAKE) -C exports dyn
|
||||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||||
|
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@$(MAKE) -C exports dll
|
@$(MAKE) -C exports dll
|
||||||
|
@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
touch $(LIBNAME)
|
touch $(LIBNAME)
|
||||||
ifndef NO_FBLAS
|
ifndef NO_FBLAS
|
||||||
$(MAKE) -C test all
|
$(MAKE) -C test all
|
||||||
$(MAKE) -C utest all
|
|
||||||
endif
|
endif
|
||||||
|
$(MAKE) -C utest all
|
||||||
ifndef NO_CBLAS
|
ifndef NO_CBLAS
|
||||||
$(MAKE) -C ctest all
|
$(MAKE) -C ctest all
|
||||||
|
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||||
|
$(MAKE) -C cpp_thread_test all
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
13
Makefile.arm
13
Makefile.arm
|
@ -1,7 +1,7 @@
|
||||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||||
ifeq ($(OSNAME), Android)
|
ifeq ($(OSNAME), Android)
|
||||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
CCOMMON_OPT += -mfpu=neon
|
||||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
FCOMMON_OPT += -mfpu=neon
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||||
|
@ -9,11 +9,6 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), ARMV6)
|
ifeq ($(CORE), ARMV6)
|
||||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
CCOMMON_OPT += -mfpu=vfp
|
||||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
FCOMMON_OPT += -mfpu=vfp
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(CORE), ARMV5)
|
|
||||||
CCOMMON_OPT += -march=armv5
|
|
||||||
FCOMMON_OPT += -march=armv5
|
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
|
||||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||||
|
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||||
|
|
|
@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# workaround for C->FORTRAN ABI violation in LAPACKE
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
|
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||||
|
endif
|
||||||
|
|
||||||
FLAMEPATH = $(HOME)/flame/lib
|
FLAMEPATH = $(HOME)/flame/lib
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.6
|
VERSION = 0.3.7.dev
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -58,6 +58,12 @@ VERSION = 0.3.6
|
||||||
# For force setting for multi threaded, specify USE_THREAD = 1
|
# For force setting for multi threaded, specify USE_THREAD = 1
|
||||||
# USE_THREAD = 0
|
# USE_THREAD = 0
|
||||||
|
|
||||||
|
# If you want to build a single-threaded OpenBLAS, but expect to call this
|
||||||
|
# from several concurrent threads in some other program, comment this in for
|
||||||
|
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
|
||||||
|
# be necessary when USE_OPENMP=1)
|
||||||
|
# USE_LOCKING = 1
|
||||||
|
|
||||||
# If you're going to use this library with OpenMP, please comment it in.
|
# If you're going to use this library with OpenMP, please comment it in.
|
||||||
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
||||||
# USE_OPENMP = 1
|
# USE_OPENMP = 1
|
||||||
|
@ -157,6 +163,10 @@ NO_AFFINITY = 1
|
||||||
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
||||||
# NO_AVX2 = 1
|
# NO_AVX2 = 1
|
||||||
|
|
||||||
|
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
|
||||||
|
# system will try to determine this automatically)
|
||||||
|
# NO_AVX512 = 1
|
||||||
|
|
||||||
# Don't use parallel make.
|
# Don't use parallel make.
|
||||||
# NO_PARALLEL_MAKE = 1
|
# NO_PARALLEL_MAKE = 1
|
||||||
|
|
||||||
|
@ -181,17 +191,17 @@ NO_AFFINITY = 1
|
||||||
# time out to improve performance. This number should be from 4 to 30
|
# time out to improve performance. This number should be from 4 to 30
|
||||||
# which corresponds to (1 << n) cycles. For example, if you set to 26,
|
# which corresponds to (1 << n) cycles. For example, if you set to 26,
|
||||||
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
|
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
|
||||||
# system). Also you can control this mumber by THREAD_TIMEOUT
|
# system). Also you can control this number by THREAD_TIMEOUT
|
||||||
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
|
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
|
||||||
|
|
||||||
# Using special device driver for mapping physically contigous memory
|
# Using special device driver for mapping physically contiguous memory
|
||||||
# to the user space. If bigphysarea is enabled, it will use it.
|
# to the user space. If bigphysarea is enabled, it will use it.
|
||||||
# DEVICEDRIVER_ALLOCATION = 1
|
# DEVICEDRIVER_ALLOCATION = 1
|
||||||
|
|
||||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||||
# CONSISTENT_FPCSR = 1
|
# CONSISTENT_FPCSR = 1
|
||||||
|
|
||||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
|
||||||
# with single thread. (Actually in recent versions this is a factor proportional to the
|
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||||
# number of floating point operations necessary for the given problem size, no longer
|
# number of floating point operations necessary for the given problem size, no longer
|
||||||
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||||
|
@ -239,6 +249,21 @@ COMMON_PROF = -pg
|
||||||
# SYMBOLPREFIX=
|
# SYMBOLPREFIX=
|
||||||
# SYMBOLSUFFIX=
|
# SYMBOLSUFFIX=
|
||||||
|
|
||||||
|
# Run a C++ based thread safety tester after the build is done.
|
||||||
|
# This is mostly intended as a developer feature to spot regressions, but users and
|
||||||
|
# package maintainers can enable this if they have doubts about the thread safety of
|
||||||
|
# the library, given the configuration in this file.
|
||||||
|
# By default, the thread safety tester launches 52 concurrent calculations at the same
|
||||||
|
# time.
|
||||||
|
#
|
||||||
|
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
|
||||||
|
#
|
||||||
|
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
|
||||||
|
# an OpenMP implementation. If you are cross-compiling this test will probably not
|
||||||
|
# work at all.
|
||||||
|
#
|
||||||
|
# CPP_THREAD_SAFETY_TEST = 1
|
||||||
|
|
||||||
#
|
#
|
||||||
# End of user configuration
|
# End of user configuration
|
||||||
#
|
#
|
||||||
|
|
|
@ -9,6 +9,11 @@ ifndef TOPDIR
|
||||||
TOPDIR = .
|
TOPDIR = .
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# If ARCH is not set, we use the host system's architecture.
|
||||||
|
ifndef ARCH
|
||||||
|
ARCH := $(shell uname -m)
|
||||||
|
endif
|
||||||
|
|
||||||
# Catch conflicting usage of ARCH in some BSD environments
|
# Catch conflicting usage of ARCH in some BSD environments
|
||||||
ifeq ($(ARCH), amd64)
|
ifeq ($(ARCH), amd64)
|
||||||
override ARCH=x86_64
|
override ARCH=x86_64
|
||||||
|
@ -137,7 +142,12 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||||
|
ifeq ($(ARCH), x86_64)
|
||||||
|
ifneq ($(C_COMPILER), PGI)
|
||||||
|
GETARCH_FLAGS += -march=native
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
ifneq ($(INTERFACE64), 0)
|
ifneq ($(INTERFACE64), 0)
|
||||||
|
@ -237,6 +247,10 @@ SMP = 1
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SMP), 1)
|
||||||
|
USE_LOCKING =
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef NEED_PIC
|
ifndef NEED_PIC
|
||||||
NEED_PIC = 1
|
NEED_PIC = 1
|
||||||
endif
|
endif
|
||||||
|
@ -253,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||||
OBJCONV = $(CROSS_SUFFIX)objconv
|
OBJCONV = $(CROSS_SUFFIX)objconv
|
||||||
|
|
||||||
|
|
||||||
# For detect fortran failed, only build BLAS.
|
# When fortran support was either not detected or actively deselected, only build BLAS.
|
||||||
ifeq ($(NOFORTRAN), 1)
|
ifeq ($(NOFORTRAN), 1)
|
||||||
NO_LAPACK = 1
|
NO_LAPACK = 1
|
||||||
|
override FEXTRALIB =
|
||||||
endif
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -388,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
|
||||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef USE_LOCKING
|
||||||
|
ifneq ($(USE_LOCKING), 0)
|
||||||
|
CCOMMON_OPT += -DUSE_LOCKING
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# Architecture dependent settings
|
# Architecture dependent settings
|
||||||
#
|
#
|
||||||
|
@ -744,6 +765,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||||
FCOMMON_OPT += -Wall
|
FCOMMON_OPT += -Wall
|
||||||
# make single-threaded LAPACK calls thread-safe #1847
|
# make single-threaded LAPACK calls thread-safe #1847
|
||||||
FCOMMON_OPT += -frecursive
|
FCOMMON_OPT += -frecursive
|
||||||
|
# work around ABI problem with passing single-character arguments
|
||||||
|
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
ifneq ($(NO_LAPACK), 1)
|
ifneq ($(NO_LAPACK), 1)
|
||||||
EXTRALIB += -lgfortran
|
EXTRALIB += -lgfortran
|
||||||
|
@ -1049,7 +1072,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef USE_TLS
|
ifeq ($(USE_TLS), 1)
|
||||||
CCOMMON_OPT += -DUSE_TLS
|
CCOMMON_OPT += -DUSE_TLS
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -1102,8 +1125,12 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef NO_AFFINITY
|
ifdef NO_AFFINITY
|
||||||
|
ifeq ($(NO_AFFINITY), 0)
|
||||||
|
override undefine NO_AFFINITY
|
||||||
|
else
|
||||||
CCOMMON_OPT += -DNO_AFFINITY
|
CCOMMON_OPT += -DNO_AFFINITY
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef FUNCTION_PROFILE
|
ifdef FUNCTION_PROFILE
|
||||||
CCOMMON_OPT += -DFUNCTION_PROFILE
|
CCOMMON_OPT += -DFUNCTION_PROFILE
|
||||||
|
|
|
@ -28,11 +28,15 @@ endif
|
||||||
ifeq ($(CORE), HASWELL)
|
ifeq ($(CORE), HASWELL)
|
||||||
ifndef DYNAMIC_ARCH
|
ifndef DYNAMIC_ARCH
|
||||||
ifndef NO_AVX2
|
ifndef NO_AVX2
|
||||||
|
ifeq ($(C_COMPILER), GCC)
|
||||||
CCOMMON_OPT += -mavx2
|
CCOMMON_OPT += -mavx2
|
||||||
|
endif
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
FCOMMON_OPT += -mavx2
|
FCOMMON_OPT += -mavx2
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
15
README.md
15
README.md
|
@ -6,11 +6,13 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||||
|
|
||||||
|
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||||
|
|
||||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||||
|
|
||||||
## Binary Packages
|
## Binary Packages
|
||||||
|
|
||||||
|
@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
||||||
|
|
||||||
## Installation from Source
|
## Installation from Source
|
||||||
|
|
||||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||||
|
|
||||||
### Dependencies
|
### Dependencies
|
||||||
|
@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`.
|
||||||
|
|
||||||
### Compile with MASS support on Power CPU (optional)
|
### Compile with MASS support on Power CPU (optional)
|
||||||
|
|
||||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
|
||||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
|
||||||
are tuned for optimum performance on POWER architectures.
|
|
||||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||||
The library can be installed as shown:
|
The library can be installed as shown:
|
||||||
|
|
||||||
|
@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||||
|
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
||||||
|
|
||||||
#### MIPS64
|
#### MIPS64
|
||||||
|
|
||||||
|
@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||||
|
|
||||||
#### PPC/PPC64
|
#### PPC/PPC64
|
||||||
|
|
||||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
|
||||||
|
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||||
|
|
||||||
#### IBM zEnterprise System
|
#### IBM zEnterprise System
|
||||||
|
|
||||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||||
|
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
|
||||||
|
|
||||||
### Supported OS
|
### Supported OS
|
||||||
|
|
||||||
|
|
17
appveyor.yml
17
appveyor.yml
|
@ -35,7 +35,14 @@ environment:
|
||||||
DYNAMIC_ARCH: ON
|
DYNAMIC_ARCH: ON
|
||||||
WITH_FORTRAN: no
|
WITH_FORTRAN: no
|
||||||
- COMPILER: cl
|
- COMPILER: cl
|
||||||
|
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||||
|
DYNAMIC_ARCH: OFF
|
||||||
|
WITH_FORTRAN: ignore
|
||||||
|
- COMPILER: MinGW64-gcc-7.2.0
|
||||||
|
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||||
|
COMPILER: MinGW-gcc-5.3.0
|
||||||
|
WITH_FORTRAN: ignore
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||||
|
@ -52,7 +59,14 @@ install:
|
||||||
before_build:
|
before_build:
|
||||||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||||
- cd build
|
- cd build
|
||||||
|
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||||
|
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||||
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||||
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||||
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||||
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
|
||||||
|
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||||
|
@ -64,3 +78,4 @@ test_script:
|
||||||
- echo Running Test
|
- echo Running Test
|
||||||
- cd utest
|
- cd utest
|
||||||
- openblas_utest
|
- openblas_utest
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
trigger:
|
||||||
|
# start a new build for every push
|
||||||
|
batch: False
|
||||||
|
branches:
|
||||||
|
include:
|
||||||
|
- develop
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# manylinux1 is useful to test because the
|
||||||
|
# standard Docker container uses an old version
|
||||||
|
# of gcc / glibc
|
||||||
|
- job: manylinux1_gcc
|
||||||
|
pool:
|
||||||
|
vmImage: 'ubuntu-16.04'
|
||||||
|
steps:
|
||||||
|
- script: |
|
||||||
|
echo "FROM quay.io/pypa/manylinux1_x86_64
|
||||||
|
COPY . /tmp/openblas
|
||||||
|
RUN cd /tmp/openblas && \
|
||||||
|
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
|
||||||
|
BTYPE='BINARY=64' CC=gcc && \
|
||||||
|
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
|
||||||
|
make -C test $COMMON_FLAGS $BTYPE && \
|
||||||
|
make -C ctest $COMMON_FLAGS $BTYPE && \
|
||||||
|
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
|
||||||
|
docker build .
|
||||||
|
displayName: Run manylinux1 docker build
|
||||||
|
- job: Intel_SDE_skx
|
||||||
|
pool:
|
||||||
|
vmImage: 'ubuntu-16.04'
|
||||||
|
steps:
|
||||||
|
- script: |
|
||||||
|
# at the time of writing the available Azure Ubuntu vm image
|
||||||
|
# does not support AVX512VL, so use more recent LTS version
|
||||||
|
echo "FROM ubuntu:bionic
|
||||||
|
COPY . /tmp/openblas
|
||||||
|
RUN apt-get -y update && apt-get -y install \\
|
||||||
|
cmake \\
|
||||||
|
gfortran \\
|
||||||
|
make \\
|
||||||
|
wget
|
||||||
|
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
|
||||||
|
mkdir sde-external-8.35.0-2019-03-11-lin && \\
|
||||||
|
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
|
||||||
|
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
|
||||||
|
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
|
||||||
|
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
|
||||||
|
docker build -t intel_sde .
|
||||||
|
# we need a privileged docker run for sde process attachment
|
||||||
|
docker run --privileged intel_sde
|
||||||
|
displayName: 'Run AVX512 SkylakeX docker build / test'
|
|
@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
|
||||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, " SIZE Flops Time\n");
|
fprintf(stderr, " SIZE Flops Time\n");
|
||||||
|
|
||||||
for (i = from; i <= to; i += step) {
|
for (i = from; i <= to; i += step) {
|
||||||
|
|
2
c_check
2
c_check
|
@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||||
} else {
|
} else {
|
||||||
$no_avx512 = 0;
|
$no_avx512 = 0;
|
||||||
}
|
}
|
||||||
unlink("tmpf.o");
|
unlink("$tmpf.o");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -73,14 +73,16 @@ if (DYNAMIC_ARCH)
|
||||||
endif ()
|
endif ()
|
||||||
if (NOT NO_AVX512)
|
if (NOT NO_AVX512)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||||
endif ()
|
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||||
|
endif ()
|
||||||
if (DYNAMIC_LIST)
|
if (DYNAMIC_LIST)
|
||||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NOT DYNAMIC_CORE)
|
if (NOT DYNAMIC_CORE)
|
||||||
unset(DYNAMIC_ARCH)
|
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
|
||||||
|
unset(DYNAMIC_ARCH CACHE)
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,10 @@ endif ()
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||||
|
# ensure reentrancy of lapack codes
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||||
|
# work around ABI violation in passing string arguments from C
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
if (NOT NO_LAPACK)
|
if (NOT NO_LAPACK)
|
||||||
set(EXTRALIB "{EXTRALIB} -lgfortran")
|
set(EXTRALIB "{EXTRALIB} -lgfortran")
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# helper functions for the kernel CMakeLists.txt
|
# helper functions for the kernel CMakeLists.txt
|
||||||
|
|
||||||
|
|
||||||
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
|
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
|
||||||
macro(SetDefaultL1)
|
macro(SetDefaultL1)
|
||||||
set(SAMAXKERNEL amax.S)
|
set(SAMAXKERNEL amax.S)
|
||||||
set(DAMAXKERNEL amax.S)
|
set(DAMAXKERNEL amax.S)
|
||||||
|
|
|
@ -59,6 +59,9 @@ set(FU "")
|
||||||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
||||||
set(FU "_")
|
set(FU "_")
|
||||||
endif()
|
endif()
|
||||||
|
if(MINGW AND NOT MINGW64)
|
||||||
|
set(FU "_")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
||||||
if (${COMPILER_ID} STREQUAL "GNU")
|
if (${COMPILER_ID} STREQUAL "GNU")
|
||||||
|
@ -82,6 +85,11 @@ endif ()
|
||||||
# f_check
|
# f_check
|
||||||
if (NOT NOFORTRAN)
|
if (NOT NOFORTRAN)
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
||||||
|
else ()
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define BUNDERSCORE _\n"
|
||||||
|
"#define NEEDBUNDERSCORE 1\n")
|
||||||
|
set(BU "_")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# Cannot run getarch on target if we are cross-compiling
|
# Cannot run getarch on target if we are cross-compiling
|
||||||
|
|
|
@ -65,6 +65,18 @@ if (DEFINED TARGET)
|
||||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||||
|
if (X86_64)
|
||||||
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
# On x86 no AVX support is available
|
||||||
|
if (X86 OR X86_64)
|
||||||
|
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
|
||||||
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (INTERFACE64)
|
if (INTERFACE64)
|
||||||
message(STATUS "Using 64-bit integers.")
|
message(STATUS "Using 64-bit integers.")
|
||||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
||||||
|
@ -136,10 +148,16 @@ endif ()
|
||||||
|
|
||||||
if (USE_THREAD)
|
if (USE_THREAD)
|
||||||
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
|
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
|
||||||
|
else()
|
||||||
|
if (${USE_LOCKING})
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||||
|
if (DEFINED BINARY)
|
||||||
|
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||||
|
endif ()
|
||||||
if (NOT DEFINED NEED_PIC)
|
if (NOT DEFINED NEED_PIC)
|
||||||
set(NEED_PIC 1)
|
set(NEED_PIC 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||||
if (NOT NOFORTRAN)
|
if (NOT NOFORTRAN)
|
||||||
# Fortran Compiler dependent settings
|
# Fortran Compiler dependent settings
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||||
|
else ()
|
||||||
|
set(NO_LAPACK 1)
|
||||||
|
set(NO_LAPACKE 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
|
@ -181,9 +202,14 @@ if (NEED_PIC)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||||
if (DYNAMIC_OLDER)
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
if (DYNAMIC_OLDER)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||||
|
endif ()
|
||||||
|
else ()
|
||||||
|
unset (DYNAMIC_ARCH)
|
||||||
|
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
@ -283,7 +309,7 @@ endif ()
|
||||||
|
|
||||||
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
||||||
|
|
||||||
# TODO: nead to convert these Makefiles
|
# TODO: need to convert these Makefiles
|
||||||
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
||||||
|
|
||||||
if (${CORE} STREQUAL "PPC440")
|
if (${CORE} STREQUAL "PPC440")
|
||||||
|
|
|
@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
|
||||||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||||
if(${OPERATING_SYSTEM} MATCHES "Android")
|
if(${OPERATING_SYSTEM} MATCHES "Android")
|
||||||
set(HOST_OS ANDROID)
|
set(HOST_OS ANDROID)
|
||||||
endif(${OPERATING_SYSTEM} MATCHES "Android")
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
|
||||||
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
|
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
|
||||||
endfunction ()
|
endfunction ()
|
||||||
|
|
||||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
|
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
|
||||||
# @param sources_in the source files to build from
|
# @param sources_in the source files to build from
|
||||||
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
|
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
|
||||||
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
|
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
|
||||||
|
|
4
common.h
4
common.h
|
@ -131,7 +131,7 @@ extern "C" {
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
@ -200,7 +200,7 @@ extern "C" {
|
||||||
#error "You can't specify both LOCK operation!"
|
#error "You can't specify both LOCK operation!"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
#define USE_PTHREAD_LOCK
|
#define USE_PTHREAD_LOCK
|
||||||
#undef USE_PTHREAD_SPINLOCK
|
#undef USE_PTHREAD_SPINLOCK
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define HAVE_PREFETCH
|
#define HAVE_PREFETCH
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
|
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
|
||||||
#define DCBT_ARG 0
|
#define DCBT_ARG 0
|
||||||
#else
|
#else
|
||||||
#define DCBT_ARG 8
|
#define DCBT_ARG 8
|
||||||
|
@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
|
|
||||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.section .text;\
|
.section .text;\
|
||||||
|
@ -784,7 +784,7 @@ Lmcount$lazy_ptr:
|
||||||
|
|
||||||
#define HALT mfspr r0, 1023
|
#define HALT mfspr r0, 1023
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||||
#if defined(PPC440) || defined(PPC440FP2)
|
#if defined(PPC440) || defined(PPC440FP2)
|
||||||
#undef MAX_CPU_NUMBER
|
#undef MAX_CPU_NUMBER
|
||||||
#define MAX_CPU_NUMBER 1
|
#define MAX_CPU_NUMBER 1
|
||||||
|
@ -829,7 +829,7 @@ Lmcount$lazy_ptr:
|
||||||
#define MAP_ANONYMOUS MAP_ANON
|
#define MAP_ANONYMOUS MAP_ANON
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define FRAMESLOT(X) (((X) * 4) + 8)
|
#define FRAMESLOT(X) (((X) * 4) + 8)
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* SIZE must be carefully chosen to be:
|
* SIZE must be carefully chosen to be:
|
||||||
* - as small as possible to maximize the number of stack allocation
|
* - as small as possible to maximize the number of stack allocation
|
||||||
* - large enough to support all architectures and kernel
|
* - large enough to support all architectures and kernel
|
||||||
* Chosing a too small SIZE will lead to a stack smashing.
|
* Choosing a SIZE too small will lead to a stack smashing.
|
||||||
*/
|
*/
|
||||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
||||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||||
|
|
|
@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
//Enable some optimazation for barcelona.
|
//Enable some optimization for barcelona.
|
||||||
#define BARCELONA_OPTIMIZATION
|
#define BARCELONA_OPTIMIZATION
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||||
*ecx=cpuinfo[2];
|
*ecx=cpuinfo[2];
|
||||||
*edx=cpuinfo[3];
|
*edx=cpuinfo[3];
|
||||||
#else
|
#else
|
||||||
__asm__ __volatile__("cpuid"
|
__asm__ __volatile__("mov $0, %%ecx;"
|
||||||
|
"cpuid"
|
||||||
: "=a" (*eax),
|
: "=a" (*eax),
|
||||||
"=b" (*ebx),
|
"=b" (*ebx),
|
||||||
"=c" (*ecx),
|
"=c" (*ecx),
|
||||||
"=d" (*edx)
|
"=d" (*edx)
|
||||||
: "0" (op), "c"(0));
|
: "0" (op));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#ifdef ASSEMBLER
|
#ifdef ASSEMBLER
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
//Enable some optimazation for barcelona.
|
//Enable some optimization for barcelona.
|
||||||
#define BARCELONA_OPTIMIZATION
|
#define BARCELONA_OPTIMIZATION
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
include ../Makefile.rule
|
||||||
|
|
||||||
|
all :: dgemv_tester dgemm_tester
|
||||||
|
|
||||||
|
dgemv_tester :
|
||||||
|
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
|
||||||
|
./dgemv_tester
|
||||||
|
|
||||||
|
dgemm_tester : dgemv_tester
|
||||||
|
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
|
||||||
|
./dgemm_tester
|
||||||
|
|
||||||
|
clean ::
|
||||||
|
rm -f dgemv_tester dgemm_tester
|
|
@ -0,0 +1,55 @@
|
||||||
|
inline void pauser(){
|
||||||
|
/// a portable way to pause a program
|
||||||
|
std::string dummy;
|
||||||
|
std::cout << "Press enter to continue...";
|
||||||
|
std::getline(std::cin, dummy);
|
||||||
|
}
|
||||||
|
|
||||||
|
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
||||||
|
for(uint32_t i=0; i<numMat; i++){
|
||||||
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||||
|
matBlock[i][j] = rngdist(PRNG);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
|
||||||
|
for(uint32_t j=0; j<numMat; j++){
|
||||||
|
matBlock[i+j] = matBlock[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
|
||||||
|
for(uint32_t i=0; i<numVec; i++){
|
||||||
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||||
|
vecBlock[i][j] = rngdist(PRNG);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
|
||||||
|
for(uint32_t j=0; j<numVec; j++){
|
||||||
|
vecBlock[i+j] = vecBlock[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::mt19937_64 InitPRNG(){
|
||||||
|
std::random_device rd;
|
||||||
|
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
|
||||||
|
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||||
|
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
|
||||||
|
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
|
||||||
|
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
|
||||||
|
return PRNG;
|
||||||
|
}
|
||||||
|
|
||||||
|
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
|
||||||
|
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
|
||||||
|
std::cout<<i<<std::endl;
|
||||||
|
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||||
|
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
|
||||||
|
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,92 @@
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <random>
|
||||||
|
#include <future>
|
||||||
|
#include <omp.h>
|
||||||
|
#include "../cblas.h"
|
||||||
|
#include "cpp_thread_safety_common.h"
|
||||||
|
|
||||||
|
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
|
||||||
|
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]){
|
||||||
|
blasint randomMatSize = 1024; //dimension of the random square matrices used
|
||||||
|
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||||
|
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||||
|
|
||||||
|
if (argc > 4){
|
||||||
|
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
if(argc == 4){
|
||||||
|
std::vector<std::string> cliArgs;
|
||||||
|
for (int i = 1; i < argc; i++){
|
||||||
|
cliArgs.push_back(argv[i]);
|
||||||
|
std::cout<<argv[i]<<std::endl;
|
||||||
|
}
|
||||||
|
randomMatSize = std::stoul(cliArgs[0]);
|
||||||
|
numConcurrentThreads = std::stoul(cliArgs[1]);
|
||||||
|
numTestRounds = std::stoul(cliArgs[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||||
|
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
|
||||||
|
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||||
|
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"| DGEMM thread safety tester |\n";
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
|
||||||
|
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||||
|
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||||
|
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||||
|
std::mt19937_64 PRNG = InitPRNG();
|
||||||
|
std::cout<<"done\n";
|
||||||
|
|
||||||
|
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
|
||||||
|
std::cout<<"Allocating matrices..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
|
||||||
|
matBlock[i].resize(randomMatSize*randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
//pauser();
|
||||||
|
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||||
|
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
|
||||||
|
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Testing CBLAS DGEMM thread safety\n";
|
||||||
|
omp_set_num_threads(numConcurrentThreads);
|
||||||
|
for(uint32_t R=0; R<numTestRounds; R++){
|
||||||
|
std::cout<<"DGEMM round #"<<R<<std::endl;
|
||||||
|
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||||
|
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
|
||||||
|
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i].get();
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||||
|
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||||
|
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
|
||||||
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||||
|
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
|
||||||
|
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
|
||||||
|
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<"OK!\n"<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,101 @@
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <random>
|
||||||
|
#include <future>
|
||||||
|
#include <omp.h>
|
||||||
|
#include "../cblas.h"
|
||||||
|
#include "cpp_thread_safety_common.h"
|
||||||
|
|
||||||
|
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
|
||||||
|
const blasint inc = 1;
|
||||||
|
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]){
|
||||||
|
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
|
||||||
|
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||||
|
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||||
|
|
||||||
|
if (argc > 4){
|
||||||
|
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
if(argc == 4){
|
||||||
|
std::vector<std::string> cliArgs;
|
||||||
|
for (int i = 1; i < argc; i++){
|
||||||
|
cliArgs.push_back(argv[i]);
|
||||||
|
std::cout<<argv[i]<<std::endl;
|
||||||
|
}
|
||||||
|
randomMatSize = std::stoul(cliArgs.at(0));
|
||||||
|
numConcurrentThreads = std::stoul(cliArgs.at(1));
|
||||||
|
numTestRounds = std::stoul(cliArgs.at(2));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||||
|
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
|
||||||
|
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
|
||||||
|
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||||
|
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"| DGEMV thread safety tester |\n";
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
|
||||||
|
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||||
|
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||||
|
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||||
|
std::mt19937_64 PRNG = InitPRNG();
|
||||||
|
std::cout<<"done\n";
|
||||||
|
|
||||||
|
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
|
||||||
|
std::cout<<"Allocating matrices..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
matBlock.at(i).resize(randomMatSize*randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Allocating vectors..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
|
||||||
|
vecBlock.at(i).resize(randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
//pauser();
|
||||||
|
|
||||||
|
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||||
|
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
|
||||||
|
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Filling vectors with random numbers..."<<std::flush;
|
||||||
|
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
|
||||||
|
std::cout<<"done\n";
|
||||||
|
|
||||||
|
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
|
||||||
|
omp_set_num_threads(numConcurrentThreads);
|
||||||
|
for(uint32_t R=0; R<numTestRounds; R++){
|
||||||
|
std::cout<<"DGEMV round #"<<R<<std::endl;
|
||||||
|
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||||
|
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i].get();
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||||
|
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
|
||||||
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||||
|
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
|
||||||
|
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
|
||||||
|
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<"OK!\n"<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -94,7 +94,7 @@ int get_feature(char *search)
|
||||||
if( p == NULL ) return 0;
|
if( p == NULL ) return 0;
|
||||||
|
|
||||||
t = strtok(p," ");
|
t = strtok(p," ");
|
||||||
while( t = strtok(NULL," "))
|
while( (t = strtok(NULL," ")))
|
||||||
{
|
{
|
||||||
if (!strcmp(t, search)) { return(1); }
|
if (!strcmp(t, search)) { return(1); }
|
||||||
}
|
}
|
||||||
|
@ -344,7 +344,7 @@ void get_features(void)
|
||||||
if( p == NULL ) return;
|
if( p == NULL ) return;
|
||||||
|
|
||||||
t = strtok(p," ");
|
t = strtok(p," ");
|
||||||
while( t = strtok(NULL," "))
|
while( (t = strtok(NULL," ")))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
27
cpuid_x86.c
27
cpuid_x86.c
|
@ -1211,7 +1211,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_CORE2;
|
return CPUTYPE_CORE2;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1: // family 6 exmodel 1
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 6:
|
case 6:
|
||||||
return CPUTYPE_CORE2;
|
return CPUTYPE_CORE2;
|
||||||
|
@ -1228,7 +1228,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_DUNNINGTON;
|
return CPUTYPE_DUNNINGTON;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2: // family 6 exmodel 2
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 5:
|
case 5:
|
||||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||||
|
@ -1257,7 +1257,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3: // family 6 exmodel 3
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 7:
|
case 7:
|
||||||
// Bay Trail
|
// Bay Trail
|
||||||
|
@ -1287,7 +1287,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4: // family 6 exmodel 4
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 5:
|
case 5:
|
||||||
case 6:
|
case 6:
|
||||||
|
@ -1321,7 +1321,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 5:
|
case 5: // family 6 exmodel 5
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 6:
|
case 6:
|
||||||
//Broadwell
|
//Broadwell
|
||||||
|
@ -1364,7 +1364,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 6:
|
case 6: // family 6 exmodel 6
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 6: // Cannon Lake
|
case 6: // Cannon Lake
|
||||||
if(support_avx512())
|
if(support_avx512())
|
||||||
|
@ -1376,7 +1376,20 @@ int get_cpuname(void){
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 7: // family 6 exmodel 7
|
||||||
|
switch (model) {
|
||||||
|
case 14: // Ice Lake
|
||||||
|
if(support_avx512())
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
if(support_avx2())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 9:
|
case 9:
|
||||||
case 8:
|
case 8:
|
||||||
switch (model) {
|
switch (model) {
|
||||||
|
|
|
@ -6,6 +6,8 @@ TOPDIR = ..
|
||||||
include $(TOPDIR)/Makefile.system
|
include $(TOPDIR)/Makefile.system
|
||||||
|
|
||||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||||
|
override TARGET_ARCH=
|
||||||
|
override TARGET_MACH=
|
||||||
|
|
||||||
LIB = $(TOPDIR)/$(LIBNAME)
|
LIB = $(TOPDIR)/$(LIBNAME)
|
||||||
|
|
||||||
|
|
|
@ -577,7 +577,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -653,7 +653,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -653,7 +653,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -577,7 +577,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
|
||||||
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
|
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
|
||||||
/* jobs is queued. */
|
/* jobs is queued. */
|
||||||
|
|
||||||
/* We need this grobal for cheking if initialization is finished. */
|
/* We need this global for checking if initialization is finished. */
|
||||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||||
|
|
||||||
/* Local Variables */
|
/* Local Variables */
|
||||||
|
@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
||||||
/* Monitor is a function to see thread's status for every seconds. */
|
/* Monitor is a function to see thread's status for every second. */
|
||||||
/* Usually it turns off and it's for debugging. */
|
/* Usually it turns off and it's for debugging. */
|
||||||
|
|
||||||
static pthread_t monitor_thread;
|
static pthread_t monitor_thread;
|
||||||
static int main_status[MAX_CPU_NUMBER];
|
static int main_status[MAX_CPU_NUMBER];
|
||||||
|
|
|
@ -50,7 +50,7 @@
|
||||||
|
|
||||||
/* This is a thread implementation for Win32 lazy implementation */
|
/* This is a thread implementation for Win32 lazy implementation */
|
||||||
|
|
||||||
/* Thread server common infomation */
|
/* Thread server common information */
|
||||||
typedef struct{
|
typedef struct{
|
||||||
CRITICAL_SECTION lock;
|
CRITICAL_SECTION lock;
|
||||||
HANDLE filled;
|
HANDLE filled;
|
||||||
|
@ -61,7 +61,7 @@ typedef struct{
|
||||||
|
|
||||||
} blas_pool_t;
|
} blas_pool_t;
|
||||||
|
|
||||||
/* We need this global for cheking if initialization is finished. */
|
/* We need this global for checking if initialization is finished. */
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
/* Local Variables */
|
/* Local Variables */
|
||||||
|
|
|
@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
|
case 7:
|
||||||
|
if (model == 14) {
|
||||||
|
// Ice Lake
|
||||||
|
if (support_avx512())
|
||||||
|
return &gotoblas_SKYLAKEX;
|
||||||
|
if(support_avx2()){
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||||
|
return &gotoblas_HASWELL;
|
||||||
|
}
|
||||||
|
if(support_avx()) {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||||
|
return &gotoblas_SANDYBRIDGE;
|
||||||
|
} else {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||||
|
return &gotoblas_NEHALEM;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
case 9:
|
case 9:
|
||||||
case 8:
|
case 8:
|
||||||
if (model == 14 ) { // Kaby Lake
|
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||||
if(support_avx2())
|
if(support_avx2())
|
||||||
return &gotoblas_HASWELL;
|
return &gotoblas_HASWELL;
|
||||||
if(support_avx()) {
|
if(support_avx()) {
|
||||||
|
|
|
@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) {
|
||||||
|
|
||||||
int mynode = 1;
|
int mynode = 1;
|
||||||
|
|
||||||
/* if number of threads is larger than inital condition */
|
/* if number of threads is larger than initial condition */
|
||||||
if (pos < 0) {
|
if (pos < 0) {
|
||||||
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
|
sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) {
|
||||||
common -> shmid = pshmid;
|
common -> shmid = pshmid;
|
||||||
|
|
||||||
if (common -> magic != SH_MAGIC) {
|
if (common -> magic != SH_MAGIC) {
|
||||||
|
|
||||||
|
#if defined(__GLIBC_PREREQ)
|
||||||
|
#if __GLIBC_PREREQ(2, 7)
|
||||||
cpu_set_t *cpusetp;
|
cpu_set_t *cpusetp;
|
||||||
|
#else
|
||||||
|
cpu_set_t cpuset;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
int nums;
|
int nums;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) {
|
||||||
}
|
}
|
||||||
CPU_FREE(cpusetp);
|
CPU_FREE(cpusetp);
|
||||||
#else
|
#else
|
||||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
|
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
|
||||||
if (ret!=0) {
|
if (ret!=0) {
|
||||||
common->num_procs = nums;
|
common->num_procs = nums;
|
||||||
} else {
|
} else {
|
||||||
|
@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) {
|
||||||
int i;
|
int i;
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (i=0;i<nums;i++)
|
for (i=0;i<nums;i++)
|
||||||
if (CPU_ISSET(i,cpusetp)) n++;
|
if (CPU_ISSET(i,&cpuset)) n++;
|
||||||
common->num_procs = n;
|
common->num_procs = n;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
|
common->num_procs = CPU_COUNT(&cpuset);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -229,7 +229,7 @@ int get_num_procs(void) {
|
||||||
n=0;
|
n=0;
|
||||||
#if !__GLIBC_PREREQ(2, 6)
|
#if !__GLIBC_PREREQ(2, 6)
|
||||||
for (i=0;i<nums;i++)
|
for (i=0;i<nums;i++)
|
||||||
if (CPU_ISSET(i,cpuset)) n++;
|
if (CPU_ISSET(i,&cpuset)) n++;
|
||||||
nums=n;
|
nums=n;
|
||||||
#else
|
#else
|
||||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||||
|
@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||||
gotoblas_init();
|
gotoblas_init();
|
||||||
gotoblas_quit();
|
gotoblas_quit();
|
||||||
|
|
||||||
|
#if __PGIC__ < 19
|
||||||
#if 0
|
#if 0
|
||||||
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
||||||
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
||||||
|
@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||||
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
||||||
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1772,7 +1774,7 @@ int get_num_procs(void) {
|
||||||
n=0;
|
n=0;
|
||||||
#if !__GLIBC_PREREQ(2, 6)
|
#if !__GLIBC_PREREQ(2, 6)
|
||||||
for (i=0;i<nums;i++)
|
for (i=0;i<nums;i++)
|
||||||
if (CPU_ISSET(i,cpuset)) n++;
|
if (CPU_ISSET(i,&cpuset)) n++;
|
||||||
nums=n;
|
nums=n;
|
||||||
#else
|
#else
|
||||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||||
|
@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
|
||||||
|
|
||||||
static void alloc_mmap_free(struct release_t *release){
|
static void alloc_mmap_free(struct release_t *release){
|
||||||
|
|
||||||
|
if (!release->address) return;
|
||||||
|
|
||||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||||
printf("OpenBLAS : munmap failed\n");
|
int errsv=errno;
|
||||||
|
perror("OpenBLAS : munmap failed:");
|
||||||
|
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){
|
||||||
}
|
}
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
if (map_address != (void *)-1) {
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
LOCK_COMMAND(&alloc_lock);
|
LOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
release_info[release_pos].address = map_address;
|
release_info[release_pos].address = map_address;
|
||||||
release_info[release_pos].func = alloc_mmap_free;
|
release_info[release_pos].func = alloc_mmap_free;
|
||||||
release_pos ++;
|
release_pos ++;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
|
} else {
|
||||||
|
#ifdef DEBUG
|
||||||
|
int errsv=errno;
|
||||||
|
perror("OpenBLAS : mmap failed:");
|
||||||
|
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#ifdef OS_LINUX
|
||||||
|
@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (map_address != (void *)-1) {
|
if (map_address != (void *)-1) {
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
LOCK_COMMAND(&alloc_lock);
|
LOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
release_info[release_pos].address = map_address;
|
release_info[release_pos].address = map_address;
|
||||||
release_info[release_pos].func = alloc_mmap_free;
|
release_info[release_pos].func = alloc_mmap_free;
|
||||||
release_pos ++;
|
release_pos ++;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){
|
||||||
|
|
||||||
position = 0;
|
position = 0;
|
||||||
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
LOCK_COMMAND(&alloc_lock);
|
LOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
do {
|
do {
|
||||||
|
@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){
|
||||||
position ++;
|
position ++;
|
||||||
|
|
||||||
} while (position < NUM_BUFFERS);
|
} while (position < NUM_BUFFERS);
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
goto error;
|
goto error;
|
||||||
|
@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
memory[position].used = 1;
|
memory[position].used = 1;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#else
|
#else
|
||||||
blas_unlock(&memory[position].lock);
|
blas_unlock(&memory[position].lock);
|
||||||
|
@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){
|
||||||
|
|
||||||
#ifdef ALLOC_DEVICEDRIVER
|
#ifdef ALLOC_DEVICEDRIVER
|
||||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||||
fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n");
|
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){
|
||||||
|
|
||||||
} while ((BLASLONG)map_address == -1);
|
} while ((BLASLONG)map_address == -1);
|
||||||
|
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
LOCK_COMMAND(&alloc_lock);
|
LOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
memory[position].addr = map_address;
|
memory[position].addr = map_address;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
position = 0;
|
position = 0;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
LOCK_COMMAND(&alloc_lock);
|
LOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||||
|
@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){
|
||||||
WMB;
|
WMB;
|
||||||
|
|
||||||
memory[position].used = 0;
|
memory[position].used = 0;
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){
|
||||||
for (position = 0; position < NUM_BUFFERS; position++)
|
for (position = 0; position < NUM_BUFFERS; position++)
|
||||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||||
#endif
|
#endif
|
||||||
#if defined(SMP) && !defined(USE_OPENMP)
|
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
|
@ -2924,7 +2936,7 @@ void blas_shutdown(void){
|
||||||
|
|
||||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||||
|
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
#if defined(USE_PTHREAD_LOCK)
|
#if defined(USE_PTHREAD_LOCK)
|
||||||
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
#elif defined(USE_PTHREAD_SPINLOCK)
|
#elif defined(USE_PTHREAD_SPINLOCK)
|
||||||
|
@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
if (hot_alloc != 2) {
|
if (hot_alloc != 2) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
LOCK_COMMAND(&init_lock);
|
LOCK_COMMAND(&init_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
size -= PAGESIZE;
|
size -= PAGESIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
UNLOCK_COMMAND(&init_lock);
|
UNLOCK_COMMAND(&init_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||||
|
|
||||||
gotoblas_init();
|
gotoblas_init();
|
||||||
gotoblas_quit();
|
gotoblas_quit();
|
||||||
|
#if __PGIC__ < 19
|
||||||
#if 0
|
#if 0
|
||||||
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
|
||||||
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
|
||||||
|
@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||||
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
|
||||||
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
|
||||||
libgoto_hpl.def : gensymbol
|
libgoto_hpl.def : gensymbol
|
||||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||||
|
|
||||||
|
ifeq ($(OSNAME), Darwin)
|
||||||
|
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||||
else
|
else
|
||||||
|
@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||||
endif
|
endif
|
||||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||||
#only build without Fortran
|
#only build without Fortran
|
||||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
else
|
else
|
||||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
dllinit.$(SUFFIX) : dllinit.c
|
dllinit.$(SUFFIX) : dllinit.c
|
||||||
|
|
2
f_check
2
f_check
|
@ -125,7 +125,7 @@ if ($compiler eq "") {
|
||||||
$openmp = "-openmp";
|
$openmp = "-openmp";
|
||||||
}
|
}
|
||||||
|
|
||||||
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
|
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||||
if ($data =~ / zho_ge__/) {
|
if ($data =~ / zho_ge__/) {
|
||||||
$need2bu = 1;
|
$need2bu = 1;
|
||||||
|
|
|
@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES
|
||||||
axpby.c
|
axpby.c
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f
|
# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f
|
||||||
# these all have 'z' sources for complex versions
|
# these all have 'z' sources for complex versions
|
||||||
set(BLAS2_SOURCES
|
set(BLAS2_SOURCES
|
||||||
gemv.c ger.c
|
gemv.c ger.c
|
||||||
|
|
|
@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
//
|
//
|
||||||
//Temporarily work-around the low performance issue with small imput size &
|
//Temporarily work-around the low performance issue with small input size &
|
||||||
//multithreads.
|
//multithreads.
|
||||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
|
|
@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||||
//disable multi-thread when incx==0 or incy==0
|
//disable multi-thread when incx==0 or incy==0
|
||||||
//In that case, the threads would be dependent.
|
//In that case, the threads would be dependent.
|
||||||
//
|
//
|
||||||
//Temporarily work-around the low performance issue with small imput size &
|
//Temporarily work-around the low performance issue with small input size &
|
||||||
//multithreads.
|
//multithreads.
|
||||||
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL)
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
|
|
|
@ -1,30 +1,30 @@
|
||||||
include $(KERNELDIR)/KERNEL.ARMV5
|
include $(KERNELDIR)/KERNEL.ARMV5
|
||||||
|
|
||||||
SAMAXKERNEL = iamax_vfp.S
|
SAMAXKERNEL = amax_vfp.S
|
||||||
DAMAXKERNEL = iamax_vfp.S
|
DAMAXKERNEL = amax_vfp.S
|
||||||
CAMAXKERNEL = iamax_vfp.S
|
#CAMAXKERNEL = amax_vfp.S
|
||||||
ZAMAXKERNEL = iamax_vfp.S
|
#ZAMAXKERNEL = amax_vfp.S
|
||||||
|
|
||||||
SAMINKERNEL = iamax_vfp.S
|
SAMINKERNEL = amax_vfp.S
|
||||||
DAMINKERNEL = iamax_vfp.S
|
DAMINKERNEL = amax_vfp.S
|
||||||
CAMINKERNEL = iamax_vfp.S
|
#CAMINKERNEL = amax_vfp.S
|
||||||
ZAMINKERNEL = iamax_vfp.S
|
#ZAMINKERNEL = amax_vfp.S
|
||||||
|
|
||||||
SMAXKERNEL = iamax_vfp.S
|
SMAXKERNEL = amax_vfp.S
|
||||||
DMAXKERNEL = iamax_vfp.S
|
DMAXKERNEL = amax_vfp.S
|
||||||
|
|
||||||
SMINKERNEL = iamax_vfp.S
|
SMINKERNEL = amax_vfp.S
|
||||||
DMINKERNEL = iamax_vfp.S
|
DMINKERNEL = amax_vfp.S
|
||||||
|
|
||||||
ISAMAXKERNEL = iamax_vfp.S
|
ISAMAXKERNEL = iamax_vfp.S
|
||||||
IDAMAXKERNEL = iamax_vfp.S
|
IDAMAXKERNEL = iamax_vfp.S
|
||||||
ICAMAXKERNEL = iamax_vfp.S
|
#ICAMAXKERNEL = iamax_vfp.S
|
||||||
IZAMAXKERNEL = iamax_vfp.S
|
#IZAMAXKERNEL = iamax_vfp.S
|
||||||
|
|
||||||
ISAMINKERNEL = iamax_vfp.S
|
ISAMINKERNEL = iamax_vfp.S
|
||||||
IDAMINKERNEL = iamax_vfp.S
|
IDAMINKERNEL = iamax_vfp.S
|
||||||
ICAMINKERNEL = iamax_vfp.S
|
#ICAMINKERNEL = iamax_vfp.S
|
||||||
IZAMINKERNEL = iamax_vfp.S
|
#IZAMINKERNEL = iamax_vfp.S
|
||||||
|
|
||||||
ISMAXKERNEL = iamax_vfp.S
|
ISMAXKERNEL = iamax_vfp.S
|
||||||
IDMAXKERNEL = iamax_vfp.S
|
IDMAXKERNEL = iamax_vfp.S
|
||||||
|
|
|
@ -0,0 +1,445 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* 2013/11/14 Saar
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
*
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define STACKSIZE 256
|
||||||
|
|
||||||
|
#define N r0
|
||||||
|
#define X r1
|
||||||
|
#define INC_X r2
|
||||||
|
|
||||||
|
#define I r12
|
||||||
|
|
||||||
|
#define X_PRE 512
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(USE_ABS)
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
#define VABS(x0,x1) vabs.f64 x0, x1
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define VABS(x0,x1) vabs.f32 x0, x1
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define VABS(x0,x1) nop
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*****************************************************************************************/
|
||||||
|
|
||||||
|
#if defined(USE_MIN)
|
||||||
|
|
||||||
|
#define MOVCOND movlt
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
#define VMOVCOND vmovlt.f64
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define VMOVCOND vmovlt.f32
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define MOVCOND movgt
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
#define VMOVCOND vmovgt.f64
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define VMOVCOND vmovgt.f32
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*****************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#if !defined(COMPLEX)
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
.macro INIT_F
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d0 }
|
||||||
|
VABS( d0, d0 )
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d4 }
|
||||||
|
VABS( d4, d4 )
|
||||||
|
vcmpe.f64 d4, d0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND d0, d4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro INIT_S
|
||||||
|
|
||||||
|
vldmia.f64 X, { d0 }
|
||||||
|
VABS( d0, d0 )
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 }
|
||||||
|
VABS( d4, d4 )
|
||||||
|
vcmpe.f64 d4, d0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
.macro INIT_F
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s0 }
|
||||||
|
VABS( s0, s0 )
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 }
|
||||||
|
VABS( s4, s4 )
|
||||||
|
vcmpe.f32 s4, s0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND s0, s4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro INIT_S
|
||||||
|
|
||||||
|
vldmia.f32 X, { s0 }
|
||||||
|
VABS( s0, s0 )
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 }
|
||||||
|
VABS( s4, s4 )
|
||||||
|
vcmpe.f32 s4, s0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
.macro INIT_F
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d0 -d1 }
|
||||||
|
vabs.f64 d0, d0
|
||||||
|
vabs.f64 d1, d1
|
||||||
|
vadd.f64 d0 , d0, d1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d4 - d5 }
|
||||||
|
vabs.f64 d4, d4
|
||||||
|
vabs.f64 d5, d5
|
||||||
|
vadd.f64 d4 , d4, d5
|
||||||
|
vcmpe.f64 d4, d0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND d0, d4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro INIT_S
|
||||||
|
|
||||||
|
vldmia.f64 X, { d0 -d1 }
|
||||||
|
vabs.f64 d0, d0
|
||||||
|
vabs.f64 d1, d1
|
||||||
|
vadd.f64 d0 , d0, d1
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 - d5 }
|
||||||
|
vabs.f64 d4, d4
|
||||||
|
vabs.f64 d5, d5
|
||||||
|
vadd.f64 d4 , d4, d5
|
||||||
|
vcmpe.f64 d4, d0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
.macro INIT_F
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s0 -s1 }
|
||||||
|
vabs.f32 s0, s0
|
||||||
|
vabs.f32 s1, s1
|
||||||
|
vadd.f32 s0 , s0, s1
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 - s5 }
|
||||||
|
vabs.f32 s4, s4
|
||||||
|
vabs.f32 s5, s5
|
||||||
|
vadd.f32 s4 , s4, s5
|
||||||
|
vcmpe.f32 s4, s0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND s0, s4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro INIT_S
|
||||||
|
|
||||||
|
vldmia.f32 X, { s0 -s1 }
|
||||||
|
vabs.f32 s0, s0
|
||||||
|
vabs.f32 s1, s1
|
||||||
|
vadd.f32 s0 , s0, s1
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 - s5 }
|
||||||
|
vabs.f32 s4, s4
|
||||||
|
vabs.f32 s5, s5
|
||||||
|
vadd.f32 s4 , s4, s5
|
||||||
|
vcmpe.f32 s4, s0
|
||||||
|
vmrs APSR_nzcv, fpscr
|
||||||
|
VMOVCOND s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
movs r12, #0 // clear floating point register
|
||||||
|
vmov s0, r12
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
vcvt.f64.f32 d0, s0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
cmp N, #0
|
||||||
|
ble amax_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_X, #0
|
||||||
|
beq amax_kernel_L999
|
||||||
|
|
||||||
|
|
||||||
|
cmp INC_X, #1
|
||||||
|
bne amax_kernel_S_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
amax_kernel_F_BEGIN:
|
||||||
|
|
||||||
|
INIT_F
|
||||||
|
|
||||||
|
subs N, N , #1
|
||||||
|
ble amax_kernel_L999
|
||||||
|
|
||||||
|
asrs I, N, #2 // I = N / 4
|
||||||
|
ble amax_kernel_F1
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
amax_kernel_F4:
|
||||||
|
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
KERNEL_F1
|
||||||
|
KERNEL_F1
|
||||||
|
#if defined(COMPLEX) && defined(DOUBLE)
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
#endif
|
||||||
|
KERNEL_F1
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
ble amax_kernel_F1
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(COMPLEX) || defined(DOUBLE)
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
#endif
|
||||||
|
KERNEL_F1
|
||||||
|
KERNEL_F1
|
||||||
|
#if defined(COMPLEX) && defined(DOUBLE)
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
#endif
|
||||||
|
KERNEL_F1
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne amax_kernel_F4
|
||||||
|
|
||||||
|
amax_kernel_F1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble amax_kernel_L999
|
||||||
|
|
||||||
|
amax_kernel_F10:
|
||||||
|
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne amax_kernel_F10
|
||||||
|
|
||||||
|
b amax_kernel_L999
|
||||||
|
|
||||||
|
amax_kernel_S_BEGIN:
|
||||||
|
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
|
||||||
|
#else
|
||||||
|
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
lsl INC_X, INC_X, #3 // INC_X * SIZE
|
||||||
|
#else
|
||||||
|
lsl INC_X, INC_X, #2 // INC_X * SIZE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
INIT_S
|
||||||
|
|
||||||
|
subs N, N , #1
|
||||||
|
ble amax_kernel_L999
|
||||||
|
|
||||||
|
asrs I, N, #2 // I = N / 4
|
||||||
|
ble amax_kernel_S1
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
amax_kernel_S4:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne amax_kernel_S4
|
||||||
|
|
||||||
|
amax_kernel_S1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble amax_kernel_L999
|
||||||
|
|
||||||
|
amax_kernel_S10:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne amax_kernel_S10
|
||||||
|
|
||||||
|
|
||||||
|
amax_kernel_L999:
|
||||||
|
#if !defined(__ARM_PCS_VFP)
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
vmov r0, r1, d0
|
||||||
|
#else
|
||||||
|
vmov r0, s0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
bx lr
|
||||||
|
|
||||||
|
EPILOGUE
|
||||||
|
|
|
@ -3,12 +3,12 @@
|
||||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
|
|
||||||
STRMMKERNEL = strmm_kernel_16x8_power8.S
|
STRMMKERNEL = sgemm_kernel_power9.S
|
||||||
DTRMMKERNEL = dgemm_kernel_power9.S
|
DTRMMKERNEL = dgemm_kernel_power9.S
|
||||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
CTRMMKERNEL = cgemm_kernel_power9.S
|
||||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
ZTRMMKERNEL = zgemm_kernel_power9.S
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_16x8_power8.S
|
SGEMMKERNEL = sgemm_kernel_power9.S
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
CGEMMKERNEL = cgemm_kernel_power9.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
ZGEMMKERNEL = zgemm_kernel_power9.S
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define N r3
|
#define N r3
|
||||||
#define X r6
|
#define X r6
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define N r3
|
#define N r3
|
||||||
#define X r6
|
#define X r6
|
||||||
|
|
|
@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stfs f2, ALPHA_I_SP
|
stfs f2, ALPHA_I_SP
|
||||||
// stw r0, FZERO
|
// stw r0, FZERO
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifdef __64BIT__
|
#ifdef __64BIT__
|
||||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||||
#endif
|
#endif
|
||||||
|
@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,293 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Abdelrauf(quickwritereader@gmail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD ld
|
||||||
|
#define STACKSIZE (512 )
|
||||||
|
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||||
|
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define K r5
|
||||||
|
|
||||||
|
|
||||||
|
#define A r8
|
||||||
|
#define B r9
|
||||||
|
#define C r10
|
||||||
|
#define LDC r6
|
||||||
|
#define OFFSET r7
|
||||||
|
|
||||||
|
|
||||||
|
#define alpha_r vs19
|
||||||
|
#define alpha_i vs20
|
||||||
|
#define save_permute_1 vs21
|
||||||
|
#define permute_mask vs22
|
||||||
|
#define o0 0
|
||||||
|
|
||||||
|
|
||||||
|
#define T1 r11
|
||||||
|
#define T2 r12
|
||||||
|
#define T3 r14
|
||||||
|
#define T4 r15
|
||||||
|
#define T5 r16
|
||||||
|
#define T6 r17
|
||||||
|
#define L r18
|
||||||
|
#define T7 r19
|
||||||
|
#define T8 r20
|
||||||
|
#define TEMP_REG r21
|
||||||
|
#define I r22
|
||||||
|
#define J r23
|
||||||
|
#define AO r24
|
||||||
|
#define BO r25
|
||||||
|
#define CO r26
|
||||||
|
#define T9 r27
|
||||||
|
#define T10 r28
|
||||||
|
#define PRE r29
|
||||||
|
|
||||||
|
#define T12 r30
|
||||||
|
#define T13 r31
|
||||||
|
|
||||||
|
#include "cgemm_macros_power9.S"
|
||||||
|
|
||||||
|
.equ perm_const1, 0x0405060700010203
|
||||||
|
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||||
|
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||||
|
.equ save_permute_11, 0x0405060714151617
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef NEEDPARAM
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
|
||||||
|
addi SP, SP, -STACKSIZE
|
||||||
|
mflr r0
|
||||||
|
|
||||||
|
|
||||||
|
stfd f14, 0(SP)
|
||||||
|
stfd f15, 8(SP)
|
||||||
|
stfd f16, 16(SP)
|
||||||
|
stfd f17, 24(SP)
|
||||||
|
|
||||||
|
stfd f18, 32(SP)
|
||||||
|
stfd f19, 40(SP)
|
||||||
|
stfd f20, 48(SP)
|
||||||
|
stfd f21, 56(SP)
|
||||||
|
|
||||||
|
stfd f22, 64(SP)
|
||||||
|
stfd f23, 72(SP)
|
||||||
|
stfd f24, 80(SP)
|
||||||
|
stfd f25, 88(SP)
|
||||||
|
|
||||||
|
stfd f26, 96(SP)
|
||||||
|
stfd f27, 104(SP)
|
||||||
|
stfd f28, 112(SP)
|
||||||
|
stfd f29, 120(SP)
|
||||||
|
|
||||||
|
stfd f30, 128(SP)
|
||||||
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
|
||||||
|
std r31, 144(SP)
|
||||||
|
std r30, 152(SP)
|
||||||
|
std r29, 160(SP)
|
||||||
|
std r28, 168(SP)
|
||||||
|
std r27, 176(SP)
|
||||||
|
std r26, 184(SP)
|
||||||
|
std r25, 192(SP)
|
||||||
|
std r24, 200(SP)
|
||||||
|
std r23, 208(SP)
|
||||||
|
std r22, 216(SP)
|
||||||
|
std r21, 224(SP)
|
||||||
|
std r20, 232(SP)
|
||||||
|
std r19, 240(SP)
|
||||||
|
std r18, 248(SP)
|
||||||
|
std r17, 256(SP)
|
||||||
|
std r16, 264(SP)
|
||||||
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
|
stxv vs52, 288(SP)
|
||||||
|
stxv vs53, 304(SP)
|
||||||
|
stxv vs54, 320(SP)
|
||||||
|
stxv vs55, 336(SP)
|
||||||
|
stxv vs56, 352(SP)
|
||||||
|
stxv vs57, 368(SP)
|
||||||
|
stxv vs58, 384(SP)
|
||||||
|
stxv vs59, 400(SP)
|
||||||
|
stxv vs60, 416(SP)
|
||||||
|
stxv vs61, 432(SP)
|
||||||
|
stxv vs62, 448(SP)
|
||||||
|
stxv vs63, 464(SP)
|
||||||
|
std r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef TRMMKERNEL
|
||||||
|
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
#endif
|
||||||
|
slwi LDC, LDC, ZBASE_SHIFT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*alpha is stored in f1. convert to single and splat*/
|
||||||
|
xscvdpspn alpha_r,vs1
|
||||||
|
xscvdpspn alpha_i,vs2
|
||||||
|
xxspltw alpha_r,alpha_r,0
|
||||||
|
xxspltw alpha_i,alpha_i,0
|
||||||
|
/*load reverse permute mask for big endian
|
||||||
|
uint128 = 0xc0d0e0f08090a0b0405060700010203
|
||||||
|
*/
|
||||||
|
|
||||||
|
lis T2, perm_const2@highest
|
||||||
|
lis T1, perm_const1@highest
|
||||||
|
lis T3, save_permute_12@highest
|
||||||
|
lis T4, save_permute_11@highest
|
||||||
|
|
||||||
|
|
||||||
|
ori T2, T2, perm_const2@higher
|
||||||
|
ori T1, T1, perm_const1@higher
|
||||||
|
ori T3, T3, save_permute_12@higher
|
||||||
|
ori T4, T4, save_permute_11@higher
|
||||||
|
|
||||||
|
|
||||||
|
rldicr T2, T2, 32, 31
|
||||||
|
rldicr T1, T1, 32, 31
|
||||||
|
rldicr T3, T3, 32, 31
|
||||||
|
rldicr T4, T4, 32, 31
|
||||||
|
|
||||||
|
oris T2, T2, perm_const2@h
|
||||||
|
oris T1, T1, perm_const1@h
|
||||||
|
oris T3, T3, save_permute_12@h
|
||||||
|
oris T4, T4, save_permute_11@h
|
||||||
|
|
||||||
|
|
||||||
|
ori T2, T2, perm_const2@l
|
||||||
|
ori T1, T1, perm_const1@l
|
||||||
|
ori T3, T3, save_permute_12@l
|
||||||
|
ori T4, T4, save_permute_11@l
|
||||||
|
|
||||||
|
|
||||||
|
li r0,0
|
||||||
|
li PRE,512
|
||||||
|
|
||||||
|
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||||
|
/*negate for this case as we will use addition -1*(a+b) */
|
||||||
|
xvnegsp alpha_r,alpha_r
|
||||||
|
xvnegsp alpha_i,alpha_i
|
||||||
|
#endif
|
||||||
|
|
||||||
|
mtvsrdd permute_mask,T2,T1
|
||||||
|
mtvsrdd save_permute_1,T3,T4
|
||||||
|
|
||||||
|
/*mask is reverse permute so we have to make it inner permute */
|
||||||
|
xxpermdi permute_mask, permute_mask, permute_mask,2
|
||||||
|
|
||||||
|
#include "cgemm_logic_power9.S"
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
lfd f14, 0(SP)
|
||||||
|
lfd f15, 8(SP)
|
||||||
|
lfd f16, 16(SP)
|
||||||
|
lfd f17, 24(SP)
|
||||||
|
|
||||||
|
lfd f18, 32(SP)
|
||||||
|
lfd f19, 40(SP)
|
||||||
|
lfd f20, 48(SP)
|
||||||
|
lfd f21, 56(SP)
|
||||||
|
|
||||||
|
lfd f22, 64(SP)
|
||||||
|
lfd f23, 72(SP)
|
||||||
|
lfd f24, 80(SP)
|
||||||
|
lfd f25, 88(SP)
|
||||||
|
|
||||||
|
lfd f26, 96(SP)
|
||||||
|
lfd f27, 104(SP)
|
||||||
|
lfd f28, 112(SP)
|
||||||
|
lfd f29, 120(SP)
|
||||||
|
|
||||||
|
lfd f30, 128(SP)
|
||||||
|
lfd f31, 136(SP)
|
||||||
|
|
||||||
|
ld r31, 144(SP)
|
||||||
|
ld r30, 152(SP)
|
||||||
|
ld r29, 160(SP)
|
||||||
|
ld r28, 168(SP)
|
||||||
|
ld r27, 176(SP)
|
||||||
|
ld r26, 184(SP)
|
||||||
|
ld r25, 192(SP)
|
||||||
|
ld r24, 200(SP)
|
||||||
|
ld r23, 208(SP)
|
||||||
|
ld r22, 216(SP)
|
||||||
|
ld r21, 224(SP)
|
||||||
|
ld r20, 232(SP)
|
||||||
|
ld r19, 240(SP)
|
||||||
|
ld r18, 248(SP)
|
||||||
|
ld r17, 256(SP)
|
||||||
|
ld r16, 264(SP)
|
||||||
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
|
|
||||||
|
ld r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
lxv vs52, 288(SP)
|
||||||
|
lxv vs53, 304(SP)
|
||||||
|
lxv vs54, 320(SP)
|
||||||
|
lxv vs55, 336(SP)
|
||||||
|
lxv vs56, 352(SP)
|
||||||
|
lxv vs57, 368(SP)
|
||||||
|
lxv vs58, 384(SP)
|
||||||
|
lxv vs59, 400(SP)
|
||||||
|
mtlr r0
|
||||||
|
lxv vs60, 416(SP)
|
||||||
|
lxv vs61, 432(SP)
|
||||||
|
lxv vs62, 448(SP)
|
||||||
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
|
addi SP, SP, STACKSIZE
|
||||||
|
blr
|
||||||
|
|
||||||
|
|
||||||
|
EPILOGUE
|
||||||
|
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stfs f2, ALPHA_I_SP
|
stfs f2, ALPHA_I_SP
|
||||||
// stw r0, FZERO
|
// stw r0, FZERO
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifdef __64BIT__
|
#ifdef __64BIT__
|
||||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -271,7 +271,7 @@ li r11,0
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
std r14, 280(SP)
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
stxv v20, 288(SP)
|
stxv vs52, 288(SP)
|
||||||
stxv v21, 304(SP)
|
stxv vs53, 304(SP)
|
||||||
stxv v22, 320(SP)
|
stxv vs54, 320(SP)
|
||||||
stxv v23, 336(SP)
|
stxv vs55, 336(SP)
|
||||||
stxv v24, 352(SP)
|
stxv vs56, 352(SP)
|
||||||
stxv v25, 368(SP)
|
stxv vs57, 368(SP)
|
||||||
stxv v26, 384(SP)
|
stxv vs58, 384(SP)
|
||||||
stxv v27, 400(SP)
|
stxv vs59, 400(SP)
|
||||||
stxv v28, 416(SP)
|
stxv vs60, 416(SP)
|
||||||
stxv v29, 432(SP)
|
stxv vs61, 432(SP)
|
||||||
stxv v30, 448(SP)
|
stxv vs62, 448(SP)
|
||||||
stxv v31, 464(SP)
|
stxv vs63, 464(SP)
|
||||||
|
|
||||||
|
|
||||||
stfd f1, ALPHA_SP
|
stfd f1, ALPHA_SP
|
||||||
|
@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ld r15, 272(SP)
|
ld r15, 272(SP)
|
||||||
ld r14, 280(SP)
|
ld r14, 280(SP)
|
||||||
|
|
||||||
lxv v20, 288(SP)
|
lxv vs52, 288(SP)
|
||||||
lxv v21, 304(SP)
|
lxv vs53, 304(SP)
|
||||||
lxv v22, 320(SP)
|
lxv vs54, 320(SP)
|
||||||
lxv v23, 336(SP)
|
lxv vs55, 336(SP)
|
||||||
lxv v24, 352(SP)
|
lxv vs56, 352(SP)
|
||||||
lxv v25, 368(SP)
|
lxv vs57, 368(SP)
|
||||||
lxv v26, 384(SP)
|
lxv vs58, 384(SP)
|
||||||
lxv v27, 400(SP)
|
lxv vs59, 400(SP)
|
||||||
lxv v28, 416(SP)
|
lxv vs60, 416(SP)
|
||||||
lxv v29, 432(SP)
|
lxv vs61, 432(SP)
|
||||||
lxv v30, 448(SP)
|
lxv vs62, 448(SP)
|
||||||
lxv v31, 464(SP)
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
|
@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stvx v31, r11, r0
|
stvx v31, r11, r0
|
||||||
li r11,0
|
li r11,0
|
||||||
|
|
||||||
stw r31, 144(SP)
|
|
||||||
|
|
||||||
stfd f1, ALPHA_SP
|
stfd f1, ALPHA_SP
|
||||||
stw r0, FZERO
|
stw r0, FZERO
|
||||||
|
|
||||||
|
@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -61,7 +61,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -217,7 +217,7 @@ li r11,0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -62,7 +62,7 @@
|
||||||
stfd f31, 16(SP)
|
stfd f31, 16(SP)
|
||||||
stw r0, 24(SP)
|
stw r0, 24(SP)
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -186,7 +186,7 @@
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -228,7 +228,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
|
|
@ -58,7 +58,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -192,7 +192,7 @@
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -226,7 +226,7 @@
|
||||||
li PREC, 4 * SIZE
|
li PREC, 4 * SIZE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -184,7 +184,7 @@
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
#define C r8
|
#define C r8
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -187,7 +187,7 @@
|
||||||
li PREC, 4 * SIZE
|
li PREC, 4 * SIZE
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -183,7 +183,7 @@
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -183,7 +183,7 @@
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -252,7 +252,7 @@
|
||||||
stw r27, 196(SP)
|
stw r27, 196(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -199,7 +199,7 @@
|
||||||
stw r23, 180(SP)
|
stw r23, 180(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -260,7 +260,7 @@
|
||||||
stw r29, 220(SP)
|
stw r29, 220(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -190,7 +190,7 @@
|
||||||
stw r22, 192(SP)
|
stw r22, 192(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
|
|
@ -47,7 +47,7 @@
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -224,7 +224,7 @@
|
||||||
stw r27, 196(SP)
|
stw r27, 196(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||||
|
|
|
@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector
|
||||||
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
|
||||||
|
|
||||||
BLASLONG index;
|
BLASLONG index;
|
||||||
BLASLONG i;
|
BLASLONG i=0;
|
||||||
#if defined(USE_MASK_PERMUTATIONS)
|
#if defined(USE_MASK_PERMUTATIONS)
|
||||||
register __vector unsigned int static_index0 = {0,1,2,3};
|
register __vector unsigned int static_index0 = {0,1,2,3};
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||||
|
|
||||||
BLASLONG index;
|
BLASLONG index;
|
||||||
BLASLONG i;
|
BLASLONG i=0;
|
||||||
register __vector unsigned int static_index0 = {0,1,2,3};
|
register __vector unsigned int static_index0 = {0,1,2,3};
|
||||||
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
|
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register
|
||||||
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
|
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}
|
||||||
|
|
|
@ -43,7 +43,7 @@
|
||||||
#define XX r4
|
#define XX r4
|
||||||
#define PREA r5
|
#define PREA r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define X r6
|
#define X r6
|
||||||
#define INCX r7
|
#define INCX r7
|
||||||
|
|
|
@ -43,7 +43,7 @@
|
||||||
#define XX r4
|
#define XX r4
|
||||||
#define PRE r5
|
#define PRE r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define X r6
|
#define X r6
|
||||||
#define INCX r7
|
#define INCX r7
|
||||||
|
|
|
@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
slwi LDC, LDC, 2
|
slwi LDC, LDC, 2
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,272 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD ld
|
||||||
|
#define STACKSIZE (512 )
|
||||||
|
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||||
|
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define K r5
|
||||||
|
|
||||||
|
|
||||||
|
#define A r7
|
||||||
|
#define B r8
|
||||||
|
#define C r9
|
||||||
|
#define LDC r10
|
||||||
|
#define OFFSET r6
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define alpha_r vs20
|
||||||
|
#define save_permute_1 vs21
|
||||||
|
#define save_permute_2 vs22
|
||||||
|
#define permute_mask vs23
|
||||||
|
#define o0 0
|
||||||
|
|
||||||
|
|
||||||
|
#define T1 r11
|
||||||
|
#define T2 r12
|
||||||
|
#define T3 r14
|
||||||
|
#define T4 r15
|
||||||
|
#define T5 r16
|
||||||
|
#define T6 r17
|
||||||
|
#define L r18
|
||||||
|
#define T7 r19
|
||||||
|
#define T8 r20
|
||||||
|
#define TEMP_REG r21
|
||||||
|
#define I r22
|
||||||
|
#define J r23
|
||||||
|
#define AO r24
|
||||||
|
#define BO r25
|
||||||
|
#define CO r26
|
||||||
|
#define T9 r27
|
||||||
|
#define T10 r28
|
||||||
|
#define T11 r29
|
||||||
|
|
||||||
|
#define T12 r30
|
||||||
|
#define T13 r31
|
||||||
|
|
||||||
|
#include "sgemm_macros_power9.S"
|
||||||
|
|
||||||
|
.equ perm_const1, 0x0405060700010203
|
||||||
|
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||||
|
.equ save_permute_11, 0x1415161718191a1b
|
||||||
|
.equ save_permute_12, 0x0405060708090a0b
|
||||||
|
.equ save_permute_21, 0x101112131c1d1e1f
|
||||||
|
.equ save_permute_22, 0x000102030c0d0e0f
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef NEEDPARAM
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
addi SP, SP, -STACKSIZE
|
||||||
|
mflr r0
|
||||||
|
|
||||||
|
|
||||||
|
stfd f14, 0(SP)
|
||||||
|
stfd f15, 8(SP)
|
||||||
|
stfd f16, 16(SP)
|
||||||
|
stfd f17, 24(SP)
|
||||||
|
|
||||||
|
stfd f18, 32(SP)
|
||||||
|
stfd f19, 40(SP)
|
||||||
|
stfd f20, 48(SP)
|
||||||
|
stfd f21, 56(SP)
|
||||||
|
|
||||||
|
stfd f22, 64(SP)
|
||||||
|
stfd f23, 72(SP)
|
||||||
|
stfd f24, 80(SP)
|
||||||
|
stfd f25, 88(SP)
|
||||||
|
|
||||||
|
stfd f26, 96(SP)
|
||||||
|
stfd f27, 104(SP)
|
||||||
|
stfd f28, 112(SP)
|
||||||
|
stfd f29, 120(SP)
|
||||||
|
|
||||||
|
stfd f30, 128(SP)
|
||||||
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
|
||||||
|
std r31, 144(SP)
|
||||||
|
std r30, 152(SP)
|
||||||
|
std r29, 160(SP)
|
||||||
|
std r28, 168(SP)
|
||||||
|
std r27, 176(SP)
|
||||||
|
std r26, 184(SP)
|
||||||
|
std r25, 192(SP)
|
||||||
|
std r24, 200(SP)
|
||||||
|
std r23, 208(SP)
|
||||||
|
std r22, 216(SP)
|
||||||
|
std r21, 224(SP)
|
||||||
|
std r20, 232(SP)
|
||||||
|
std r19, 240(SP)
|
||||||
|
std r18, 248(SP)
|
||||||
|
std r17, 256(SP)
|
||||||
|
std r16, 264(SP)
|
||||||
|
std r15, 272(SP)
|
||||||
|
std r14, 280(SP)
|
||||||
|
|
||||||
|
|
||||||
|
stxv vs52, 288(SP)
|
||||||
|
stxv vs53, 304(SP)
|
||||||
|
stxv vs54, 320(SP)
|
||||||
|
stxv vs55, 336(SP)
|
||||||
|
stxv vs56, 352(SP)
|
||||||
|
stxv vs57, 368(SP)
|
||||||
|
stxv vs58, 384(SP)
|
||||||
|
stxv vs59, 400(SP)
|
||||||
|
stxv vs60, 416(SP)
|
||||||
|
stxv vs61, 432(SP)
|
||||||
|
stxv vs62, 448(SP)
|
||||||
|
stxv vs63, 464(SP)
|
||||||
|
std r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
#endif
|
||||||
|
slwi LDC, LDC, 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*alpha is stored in f1. convert to single and splat*/
|
||||||
|
xscvdpspn alpha_r,vs1
|
||||||
|
xxspltw alpha_r,alpha_r,0
|
||||||
|
|
||||||
|
/*load reverse permute mask for big endian
|
||||||
|
uint128 = 0xc0d0e0f08090a0b0405060700010203
|
||||||
|
*/
|
||||||
|
|
||||||
|
lis T2, perm_const2@highest
|
||||||
|
lis T1, perm_const1@highest
|
||||||
|
lis T3, save_permute_12@highest
|
||||||
|
lis T4, save_permute_11@highest
|
||||||
|
lis T5, save_permute_22@highest
|
||||||
|
lis T6, save_permute_21@highest
|
||||||
|
ori T2, T2, perm_const2@higher
|
||||||
|
ori T1, T1, perm_const1@higher
|
||||||
|
ori T3, T3, save_permute_12@higher
|
||||||
|
ori T4, T4, save_permute_11@higher
|
||||||
|
ori T5, T5, save_permute_22@higher
|
||||||
|
ori T6, T6, save_permute_21@higher
|
||||||
|
rldicr T2, T2, 32, 31
|
||||||
|
rldicr T1, T1, 32, 31
|
||||||
|
rldicr T3, T3, 32, 31
|
||||||
|
rldicr T4, T4, 32, 31
|
||||||
|
rldicr T5, T5, 32, 31
|
||||||
|
rldicr T6, T6, 32, 31
|
||||||
|
oris T2, T2, perm_const2@h
|
||||||
|
oris T1, T1, perm_const1@h
|
||||||
|
oris T3, T3, save_permute_12@h
|
||||||
|
oris T4, T4, save_permute_11@h
|
||||||
|
oris T5, T5, save_permute_22@h
|
||||||
|
oris T6, T6, save_permute_21@h
|
||||||
|
ori T2, T2, perm_const2@l
|
||||||
|
ori T1, T1, perm_const1@l
|
||||||
|
ori T3, T3, save_permute_12@l
|
||||||
|
ori T4, T4, save_permute_11@l
|
||||||
|
ori T5, T5, save_permute_22@l
|
||||||
|
ori T6, T6, save_permute_21@l
|
||||||
|
li r0,0
|
||||||
|
mtvsrdd permute_mask,T2,T1
|
||||||
|
mtvsrdd save_permute_1,T3,T4
|
||||||
|
mtvsrdd save_permute_2,T5,T6
|
||||||
|
|
||||||
|
#include "sgemm_logic_power9.S"
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
lfd f14, 0(SP)
|
||||||
|
lfd f15, 8(SP)
|
||||||
|
lfd f16, 16(SP)
|
||||||
|
lfd f17, 24(SP)
|
||||||
|
|
||||||
|
lfd f18, 32(SP)
|
||||||
|
lfd f19, 40(SP)
|
||||||
|
lfd f20, 48(SP)
|
||||||
|
lfd f21, 56(SP)
|
||||||
|
|
||||||
|
lfd f22, 64(SP)
|
||||||
|
lfd f23, 72(SP)
|
||||||
|
lfd f24, 80(SP)
|
||||||
|
lfd f25, 88(SP)
|
||||||
|
|
||||||
|
lfd f26, 96(SP)
|
||||||
|
lfd f27, 104(SP)
|
||||||
|
lfd f28, 112(SP)
|
||||||
|
lfd f29, 120(SP)
|
||||||
|
|
||||||
|
lfd f30, 128(SP)
|
||||||
|
lfd f31, 136(SP)
|
||||||
|
|
||||||
|
ld r31, 144(SP)
|
||||||
|
ld r30, 152(SP)
|
||||||
|
ld r29, 160(SP)
|
||||||
|
ld r28, 168(SP)
|
||||||
|
ld r27, 176(SP)
|
||||||
|
ld r26, 184(SP)
|
||||||
|
ld r25, 192(SP)
|
||||||
|
ld r24, 200(SP)
|
||||||
|
ld r23, 208(SP)
|
||||||
|
ld r22, 216(SP)
|
||||||
|
ld r21, 224(SP)
|
||||||
|
ld r20, 232(SP)
|
||||||
|
ld r19, 240(SP)
|
||||||
|
ld r18, 248(SP)
|
||||||
|
ld r17, 256(SP)
|
||||||
|
ld r16, 264(SP)
|
||||||
|
ld r15, 272(SP)
|
||||||
|
ld r14, 280(SP)
|
||||||
|
|
||||||
|
ld r0, FLINK_SAVE(SP)
|
||||||
|
|
||||||
|
lxv vs52, 288(SP)
|
||||||
|
lxv vs53, 304(SP)
|
||||||
|
lxv vs54, 320(SP)
|
||||||
|
lxv vs55, 336(SP)
|
||||||
|
lxv vs56, 352(SP)
|
||||||
|
lxv vs57, 368(SP)
|
||||||
|
lxv vs58, 384(SP)
|
||||||
|
lxv vs59, 400(SP)
|
||||||
|
mtlr r0
|
||||||
|
lxv vs60, 416(SP)
|
||||||
|
lxv vs61, 432(SP)
|
||||||
|
lxv vs62, 448(SP)
|
||||||
|
lxv vs63, 464(SP)
|
||||||
|
|
||||||
|
addi SP, SP, STACKSIZE
|
||||||
|
blr
|
||||||
|
|
||||||
|
|
||||||
|
EPILOGUE
|
||||||
|
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define N r3
|
#define N r3
|
||||||
#define X r6
|
#define X r6
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define N r4
|
#define N r4
|
||||||
|
@ -248,7 +248,7 @@
|
||||||
stw r27, 196(SP)
|
stw r27, 196(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#define ASSEMBLER
|
#define ASSEMBLER
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define M r3
|
#define M r3
|
||||||
#define IS r4
|
#define IS r4
|
||||||
|
@ -247,7 +247,7 @@
|
||||||
stw r27, 196(SP)
|
stw r27, 196(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
|
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -236,7 +236,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
|
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -257,7 +257,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
|
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -254,7 +254,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
|
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -231,7 +231,7 @@
|
||||||
li PREC, -4 * SIZE
|
li PREC, -4 * SIZE
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
|
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -257,7 +257,7 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -59,7 +59,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
|
@ -180,7 +180,7 @@
|
||||||
|
|
||||||
slwi LDC, LDC, BASE_SHIFT
|
slwi LDC, LDC, BASE_SHIFT
|
||||||
|
|
||||||
#if defined(linux) && defined(__64BIT__)
|
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -231,7 +231,7 @@
|
||||||
li PREC, -4 * SIZE
|
li PREC, -4 * SIZE
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
mr PREA, r10
|
mr PREA, r10
|
||||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
#define C r8
|
#define C r8
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
#define C r8
|
#define C r8
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
#define N r4
|
#define N r4
|
||||||
#define K r5
|
#define K r5
|
||||||
|
|
||||||
#ifdef linux
|
#if defined(linux) || defined(__FreeBSD__)
|
||||||
#define A r6
|
#define A r6
|
||||||
#define B r7
|
#define B r7
|
||||||
#define C r8
|
#define C r8
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue