Merge pull request #3536 from xianyi/develop
Update from develop for release 0.3.20
commit 15ff556862
@@ -251,12 +251,14 @@ if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
else ()
set (CMAKE_C_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
endif ()
endif()

@@ -201,3 +201,9 @@ In chronological order:
* Bine Brank <https://github.com/binebrank>
  * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
  * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
  * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions
  * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions
  * [2022-01-18] SVE kernels and copy functions for TRSM

* Ilya Kurdyukov <https://github.com/ilyakurdyukov>
  * [2021-02-21] Add basic support for the Elbrus E2000 architecture

@@ -1,4 +1,39 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.20
20-Feb-2022

general:
- some code cleanup, with added casts etc.
- fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset
- fixed pivot index calculation by ?LASWP for negative increments other than one
- fixed input argument check in LAPACK ?GEQRT2
- improved the check for a Fortran compiler in CMAKE builds
- disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1
- fixed building of LAPACK on certain distributed filesystems with parallel gmake
- fixed building the shared library on MacOS with classic flang

x86_64:
- fixed cross-compilation with CMAKE for CORE2 target
- fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds
- added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS

E2K:
- add new architecture (Russian Elbrus E2000 family)

SPARC:
- fix IMIN/IMAX

ARMV8:
- added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX
- added support for Neoverse N2 and V1 cpus

MIPS,MIPS64:
- fixed autodetection of MSA capability

LOONGARCH64:
- added an optimized DGEMM kernel

====================================================================
Version 0.3.19
19-Dec-2021

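Note: the "fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset" entry above corresponds to the memory.c hunks further down in this commit, where omp_get_num_places() is only trusted when it returns a positive value. A minimal standalone C sketch of that pattern (the helper name cpu_count_with_openmp is illustrative, not OpenBLAS code):

#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif

/* Illustrative helper: keep the previously detected count unless OpenMP
 * reports a positive number of places.  With OMP_PROC_BIND unset,
 * omp_get_num_places() can legitimately return 0. */
static int cpu_count_with_openmp(int detected)
{
    int nums = detected;
#if defined(_OPENMP) && _OPENMP >= 201511
    int ret = omp_get_num_places();
    if (ret > 0) nums = ret;
#endif
    return nums;
}

int main(void)
{
    printf("cpus: %d\n", cpu_count_with_openmp(4));
    return 0;
}
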
@@ -78,6 +78,66 @@ endif
endif
endif

# Use a72 tunings because Neoverse-V1 is only available
# in GCC>=9.4
ifeq ($(CORE), NEOVERSEV1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
else
CCOMMON_OPT += -march=armv8.4-a -mtune=native
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=native
endif
endif
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
endif

# Use a72 tunings because Neoverse-N2 is only available
# in GCC>=9.4
ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
endif
else
CCOMMON_OPT += -march=armv8.5-a -mtune=native
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a -mtune=native
endif
endif
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
endif

# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))

@@ -0,0 +1 @@
COPT = -Wall -O2 # -DGEMMTEST

@@ -3,6 +3,10 @@
export BINARY
export USE_OPENMP

ifdef DYNAMIC_ARCH
override HOST_CFLAGS += -DDYNAMIC_ARCH
endif

ifdef TARGET_CORE
TARGET_MAKE = Makefile_kernel.conf
TARGET_CONF = config_kernel.h

@@ -3,7 +3,7 @@
#

# This library's version
VERSION = 0.3.19
VERSION = 0.3.19.dev

# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

@@ -277,7 +277,7 @@ HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)

# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)

ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf

@@ -374,6 +374,7 @@ else
endif
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif

@@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += NEOVERSEN1
DYNAMIC_CORE += NEOVERSEV1
DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX

@@ -93,6 +93,8 @@ CORTEXA57
CORTEXA72
CORTEXA73
NEOVERSEN1
NEOVERSEV1
NEOVERSEN2
CORTEXA55
EMAG8180
FALKOR

@@ -113,3 +115,7 @@ C910V

11.LOONGARCH64:
LOONGSON3R5

12. Elbrus E2000:
E2K

@@ -224,7 +224,7 @@ jobs:

- job: OSX_IOS_ARMV8
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
variables:
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0

c_check
@@ -84,6 +84,7 @@ $os = Haiku if ($data =~ /OS_HAIKU/);

$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);

@@ -124,6 +125,11 @@ if ($architecture eq "zarch") {
$binary = 64;
}

if ($architecture eq "e2k") {
$defined = 1;
$binary = 64;
}

if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;

@@ -223,6 +229,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {

$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);

@@ -44,7 +44,7 @@ endif ()

if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110)
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
endif ()

@@ -20,19 +20,16 @@
# NEEDBUNDERSCORE
# NEED2UNDERSCORES

if (NOT NO_LAPACK)
include(CheckLanguage)
check_language(Fortran)
if(CMAKE_Fortran_COMPILER)
include(CheckLanguage)
check_language(Fortran)
if(CMAKE_Fortran_COMPILER)
enable_language(Fortran)
else()
else()
if (NOT NO_LAPACK)
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
endif()
set (NOFORTRAN 1)
set (NO_LAPACK 1)
endif()
else()
include(CMakeForceCompiler)
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
endif()

if (NOT ONLY_CBLAS)

@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
|
@ -243,11 +247,11 @@ endif ()
|
|||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
|
@ -263,6 +267,62 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "NEOVERSEV1")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define HAVE_SVE\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define HAVE_SVE\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
|
|
|
@@ -125,7 +125,7 @@ macro(ParseMakefileVars MAKEFILE_IN)
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}")
set (ElseSeen 0)
if (DEFINED ${CMAKE_MATCH_2})
if (${CMAKE_MATCH_2})
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
#message (STATUS "condition is true")
set (IfElse 1)

common.h
@@ -474,6 +474,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_loongarch64.h"
#endif

#ifdef ARCH_E2K
#include "common_e2k.h"
#endif

#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];

@ -0,0 +1,64 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_E2K
|
||||
#define COMMON_E2K
|
||||
|
||||
#ifdef ASSEMBLER
|
||||
#error
|
||||
#endif
|
||||
|
||||
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define RMB
|
||||
|
||||
#define INLINE __attribute__((__always_inline__)) inline
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y) {
|
||||
return x / y;
|
||||
}
|
||||
|
||||
#ifndef PAGESIZE
|
||||
#define PAGESIZE ( 4 << 10)
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
#ifndef BUFFERSIZE
|
||||
#define BUFFER_SIZE (32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#endif
|
||||
|
|
@@ -2611,7 +2611,7 @@

#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|| defined(ARCH_LOONGARCH64)
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

@ -43,6 +43,8 @@ size_t length64=sizeof(value64);
|
|||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
#define CPU_NEOVERSEN1 11
|
||||
#define CPU_NEOVERSEV1 16
|
||||
#define CPU_NEOVERSEN2 17
|
||||
// Qualcomm
|
||||
#define CPU_FALKOR 6
|
||||
// Cavium
|
||||
|
@ -71,6 +73,8 @@ static char *cpuname[] = {
|
|||
"TSV110",
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1",
|
||||
"NEOVERSEV1"
|
||||
"NEOVERSEN2"
|
||||
"THUNDERX3T110",
|
||||
"VORTEX",
|
||||
"CORTEXA55",
|
||||
|
@ -90,6 +94,8 @@ static char *cpuname_lower[] = {
|
|||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"neoversev1",
|
||||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"vortex",
|
||||
"cortexa55",
|
||||
|
@ -170,6 +176,10 @@ int detect(void)
|
|||
return CPU_CORTEXA73;
|
||||
else if (strstr(cpu_part, "0xd0c"))
|
||||
return CPU_NEOVERSEN1;
|
||||
else if (strstr(cpu_part, "0xd40"))
|
||||
return CPU_NEOVERSEV1;
|
||||
else if (strstr(cpu_part, "0xd49"))
|
||||
return CPU_NEOVERSEN2;
|
||||
else if (strstr(cpu_part, "0xd05"))
|
||||
return CPU_CORTEXA55;
|
||||
}
|
||||
|
@ -338,8 +348,38 @@ void get_cpuconfig(void)
|
|||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_NEOVERSEV1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_NEOVERSEN2:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
|
|
|
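Note: the cpuid_arm64.c hunks above key the new Neoverse V1/N2 entries off the "CPU part" field of /proc/cpuinfo. A self-contained sketch of that lookup, restricted to the part numbers visible in the diff (0xd0c, 0xd40, 0xd49, 0xd05); the table and function name are illustrative, not OpenBLAS code:

#include <stdio.h>
#include <string.h>

/* Illustrative mapping of ARM MIDR part numbers to core names,
 * limited to the IDs handled in the hunk above. */
static const char *core_from_part(const char *cpu_part)
{
    if (strstr(cpu_part, "0xd0c")) return "NEOVERSEN1";
    if (strstr(cpu_part, "0xd40")) return "NEOVERSEV1";
    if (strstr(cpu_part, "0xd49")) return "NEOVERSEN2";
    if (strstr(cpu_part, "0xd05")) return "CORTEXA55";
    return "ARMV8";                       /* generic fallback */
}

int main(void)
{
    /* e.g. the value parsed from a "CPU part : 0xd40" line in /proc/cpuinfo */
    printf("%s\n", core_from_part("0xd40"));
    return 0;
}
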
@ -165,7 +165,7 @@ void get_cpuconfig(void){
|
|||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
if (!get_feature(msa)) printf("#define NO_MSA\n");
|
||||
if (!get_feature("msa")) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
@ -193,7 +193,7 @@ int get_feature(char *search)
|
|||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("Features", buffer, 8))
|
||||
if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
|
@ -207,7 +207,7 @@ int get_feature(char *search)
|
|||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
if (strstr(t, search)) { return(1); }
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -201,7 +201,7 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}
|
||||
if (!get_feature(msa)) printf("#define NO_MSA\n");
|
||||
if (!get_feature("msa")) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
@ -233,7 +233,7 @@ int get_feature(char *search)
|
|||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if (!strncmp("Features", buffer, 8))
|
||||
if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
|
@ -247,7 +247,7 @@ int get_feature(char *search)
|
|||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
if (strstr(t, search)) { return(1); }
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
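Note: the two cpuid_mips*.c hunks above widen the /proc/cpuinfo scan to the "ASEs implemented" line and switch from strcmp to strstr so that "msa" is found even when it is not an exact standalone token. A hedged, standalone sketch of that parsing logic (the helper name and single-line test input are illustrative assumptions, not the OpenBLAS code):

#include <stdio.h>
#include <string.h>

/* Illustrative re-implementation of the widened feature scan: accept both
 * the "Features" and the "ASEs implemented" headers, then do a substring
 * match on the token list after the colon. */
static int line_has_feature(const char *buffer, const char *search)
{
    if (strncmp("Features", buffer, 8) &&
        strncmp("ASEs implemented", buffer, 16))
        return 0;
    const char *p = strchr(buffer, ':');
    return p != NULL && strstr(p, search) != NULL;
}

int main(void)
{
    printf("%d\n", line_has_feature("ASEs implemented : msa\n", "msa"));
    return 0;
}
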
cpuid_x86.c
@ -323,9 +323,11 @@ int get_vendor(void){
|
|||
|
||||
int get_cputype(int gettype){
|
||||
int eax, ebx, ecx, edx;
|
||||
/*
|
||||
int extend_family, family;
|
||||
int extend_model, model;
|
||||
int type, stepping;
|
||||
*/
|
||||
int feature = 0;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
@ -428,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
cpuid(0, &cpuid_level, &ebx, &ecx, &edx);
|
||||
|
||||
if (cpuid_level > 1) {
|
||||
int numcalls =0 ;
|
||||
int numcalls;
|
||||
|
||||
cpuid(2, &eax, &ebx, &ecx, &edx);
|
||||
numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries
|
||||
info[ 0] = BITMASK(eax, 8, 0xff);
|
||||
|
@ -1492,6 +1495,10 @@ int get_cpuname(void){
|
|||
switch (model) {
|
||||
case 7: // Alder Lake desktop
|
||||
case 10: // Alder Lake mobile
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
|
@ -1637,7 +1644,6 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
case 10: // Zen3
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
|
@ -2193,7 +2199,6 @@ int get_coretype(void){
|
|||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
|
||||
case 7:
|
||||
if (model == 10)
|
||||
|
|
ctest.c
@@ -165,3 +165,7 @@ ARCH_LOONGARCH64
HAVE_C11
#endif

#if defined(__e2k__)
ARCH_E2K
#endif

@ -64,9 +64,9 @@ CBLASOBJS += \
|
|||
chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \
|
||||
chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \
|
||||
chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \
|
||||
csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \
|
||||
cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \
|
||||
csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \
|
||||
csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \
|
||||
cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \
|
||||
csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \
|
||||
ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \
|
||||
ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \
|
||||
ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \
|
||||
|
@ -92,6 +92,13 @@ CBLASOBJS += \
|
|||
ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \
|
||||
ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX)
|
||||
|
||||
ifndef NO_LAPACK
|
||||
CBLASOBJS += \
|
||||
cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \
|
||||
cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \
|
||||
csyr_U.$(SUFFIX) csyr_L.$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZBLASOBJS += \
|
||||
zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \
|
||||
zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \
|
||||
|
|
|
@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
double *, BLASLONG, void *) = func;
|
||||
double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG,
|
||||
double *, BLASLONG, double *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((double *)args -> alpha)[0],
|
||||
|
@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / Single */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *) = func;
|
||||
float *, BLASLONG, void *) = (void (*)
|
||||
(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((float *)args -> alpha)[0],
|
||||
|
@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
bfloat16 *, BLASLONG, void *) = func;
|
||||
bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
bfloat16 *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((bfloat16 *)args -> alpha)[0],
|
||||
|
@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / BLAS_STOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
float *, BLASLONG, void *) = func;
|
||||
float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
float *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((float *)args -> alpha)[0],
|
||||
|
@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / BLAS_DTOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
double *, BLASLONG, void *) = func;
|
||||
double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
double *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((double *)args -> alpha)[0],
|
||||
|
@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* COMPLEX / Extended Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
|
||||
xdouble *, BLASLONG, xdouble *, BLASLONG,
|
||||
xdouble *, BLASLONG, void *) = func;
|
||||
xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
|
||||
xdouble *, BLASLONG, xdouble *, BLASLONG,
|
||||
xdouble *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((xdouble *)args -> alpha)[0],
|
||||
|
@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* COMPLEX / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
double *, BLASLONG, void *) = func;
|
||||
double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
double *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((double *)args -> alpha)[0],
|
||||
|
@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* COMPLEX / Single */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *) = func;
|
||||
float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((float *)args -> alpha)[0],
|
||||
|
@ -425,7 +441,7 @@ blas_queue_t *tscq;
|
|||
#endif
|
||||
|
||||
if (queue) {
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
|
||||
|
||||
atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
|
||||
|
||||
|
@ -503,7 +519,7 @@ blas_queue_t *tscq;
|
|||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||
} else
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||
|
@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
fprintf(STDERR, "\n");
|
||||
#endif
|
||||
|
||||
routine = queue -> routine;
|
||||
routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine;
|
||||
|
||||
if (queue -> mode & BLAS_LEGACY) {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||
} else
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
||||
|
|
|
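Note: the blas_server.c hunks above (and the interface/*.c hunks further down) are purely cast additions: the queue keeps one generic routine pointer, and each call site now converts it back to the concrete signature explicitly instead of relying on an implicit conversion that newer compilers warn about. A minimal illustration of the pattern; the types and names here are made up, not the OpenBLAS ones:

#include <stdio.h>

/* A generic "routine" slot, analogous to the queue -> routine field. */
typedef int (*generic_routine)(void);

/* A hypothetical worker with its real signature. */
static int worker(int n) { return 2 * n; }

int main(void)
{
    /* Store with an explicit cast ... */
    generic_routine slot = (generic_routine) worker;
    /* ... and cast back to the real type before calling.  Calling through an
     * incompatible pointer type would be undefined, so the round-trip cast is
     * what keeps this well defined. */
    int (*call)(int) = (int (*)(int)) slot;
    printf("%d\n", call(21));
    return 0;
}
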
@@ -708,8 +708,11 @@ static gotoblas_t *get_coretype(void){

case 9:
if (model == 7 || model == 10) { // Alder Lake
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {

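Note: the dynamic.c hunk above (mirroring the cpuid_x86.c change earlier in this commit) selects a kernel set for Alder Lake by probing features at run time: AVX512-BF16 gets the Cooper Lake kernels, plain AVX-512 (the "incidental" hardware mentioned in the ChangeLog, when enabled in BIOS) gets Skylake-X, and otherwise the code falls back to Haswell or older kernels. A schematic C sketch of that decision ladder; the probe functions below are stand-ins for the real support_avx*() helpers:

#include <stdio.h>

/* Stand-in feature probes; the real ones query CPUID. */
static int has_avx512_bf16(void) { return 0; }
static int has_avx512(void)      { return 1; }
static int has_avx2(void)        { return 1; }

static const char *alderlake_kernel_set(void)
{
    if (has_avx512_bf16()) return "COOPERLAKE";
    if (has_avx512())      return "SKYLAKEX";  /* AVX-512 enabled in BIOS   */
    if (has_avx2())        return "HASWELL";   /* the usual Alder Lake case */
    return "NEHALEM";                          /* older generic fallback    */
}

int main(void)
{
    printf("%s\n", alderlake_kernel_set());
    return 0;
}
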
@@ -147,6 +147,8 @@ static char *corename[] = {
"tsv110",
"emag8180",
"neoversen1",
"neoversev1",
"neoversen2",
"thunderx3t110",
"cortexa55",
"unknown"

@ -232,11 +232,11 @@ int get_num_procs(void);
|
|||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
|
||||
int ret;
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
|
@ -249,7 +249,8 @@ int get_num_procs(void) {
|
|||
|
||||
#if defined(USE_OPENMP)
|
||||
#if _OPENMP >= 201511
|
||||
nums = omp_get_num_places();
|
||||
ret = omp_get_num_places();
|
||||
if (ret >0 ) nums = ret;
|
||||
#endif
|
||||
return nums;
|
||||
#endif
|
||||
|
@ -1800,11 +1801,12 @@ int get_num_procs(void);
|
|||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
|
@ -1818,7 +1820,8 @@ int get_num_procs(void) {
|
|||
#if defined(USE_OPENMP)
|
||||
/* if (omp_get_proc_bind() != omp_proc_bind_false) */
|
||||
#if _OPENMP >= 201511
|
||||
nums = omp_get_num_places();
|
||||
ret = omp_get_num_places();
|
||||
if (ret >0 ) nums = ret;
|
||||
#endif
|
||||
return nums;
|
||||
#endif
|
||||
|
|
|
@ -142,10 +142,14 @@ ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
|||
else
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def
|
||||
else
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
|
f_check
@@ -361,6 +361,7 @@ if ($link ne "") {
($flags =~ /^\-l/)
&& ($flags !~ /ibrary/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /flangmain/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
&& ($flags !~ /crt[0-9]/)

getarch.c
@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \
|
||||
"-march=armv8.2-a -mtune=cortex-a72"
|
||||
"-march=armv8.2-a -mtune=neoverse-n1"
|
||||
#define LIBNAME "neoversen1"
|
||||
#define CORENAME "NEOVERSEN1"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_NEOVERSEV1
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "NEOVERSEV1"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DNEOVERSEV1 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
|
||||
"-march=armv8.4-a -mtune=neoverse-v1"
|
||||
#define LIBNAME "neoversev1"
|
||||
#define CORENAME "NEOVERSEV1"
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_NEOVERSEN2
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "NEOVERSEN2"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DNEOVERSEN2 " \
|
||||
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
|
||||
"-march=armv8.5-a -mtune=neoverse-n2"
|
||||
#define LIBNAME "neoversen2"
|
||||
#define CORENAME "NEOVERSEN2"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA55
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
|
@ -1501,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(FORCE_E2K) || defined(__e2k__)
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "E2K"
|
||||
#define ARCHCONFIG "-DGENERIC " \
|
||||
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "generic"
|
||||
#define CORENAME "generic"
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
|
|
|
@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES
|
|||
# these all have 'z' sources for complex versions
|
||||
set(BLAS2_SOURCES
|
||||
gemv.c ger.c
|
||||
trsv.c trmv.c symv.c
|
||||
syr.c syr2.c gbmv.c
|
||||
sbmv.c spmv.c
|
||||
spr.c spr2.c
|
||||
trsv.c trmv.c
|
||||
syr2.c gbmv.c
|
||||
sbmv.c
|
||||
spr2.c
|
||||
tbsv.c tbmv.c
|
||||
tpsv.c tpmv.c
|
||||
)
|
||||
|
||||
set(BLAS2_REAL_ONLY_SOURCES
|
||||
symv.c syr.c spmv.c spr.c
|
||||
)
|
||||
set(BLAS2_COMPLEX_LAPACK_SOURCES
|
||||
symv.c syr.c spmv.c spr.c
|
||||
)
|
||||
|
||||
set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES
|
||||
hemv.c hbmv.c
|
||||
her.c her2.c
|
||||
|
@ -78,6 +85,10 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS})
|
|||
GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1)
|
||||
GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
|
||||
GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
|
||||
GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1)
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
|
||||
endif ()
|
||||
GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4)
|
||||
GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX})
|
||||
GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
|
||||
|
|
|
@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c
|
|||
qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifndef NO_LAPACK
|
||||
csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c
|
|||
qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifndef NO_LAPACK
|
||||
csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c
|
|||
qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifndef NO_LAPACK
|
||||
cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c
|
|||
qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifndef NO_LAPACK
|
||||
cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
|
|
@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
|
|||
#endif
|
||||
|
||||
blas_level1_thread(mode, n, 0, 0, &alpha,
|
||||
x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads);
|
||||
x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
|
|||
#else
|
||||
&alpha,
|
||||
#endif
|
||||
x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads);
|
||||
x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
|||
|
||||
blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0,
|
||||
#ifndef CONJ
|
||||
(void *)AXPYU_K,
|
||||
(int (*)(void))AXPYU_K,
|
||||
#else
|
||||
(void *)AXPYC_K,
|
||||
(int (*)(void))AXPYC_K,
|
||||
#endif
|
||||
nthreads);
|
||||
}
|
||||
|
|
|
@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
|
|||
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads);
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -323,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
|
||||
|
||||
#hemm
|
||||
GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type})
|
||||
if (NOT DEFINED ${float_char}HEMMUTCOPY_M)
|
||||
set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}")
|
||||
set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}")
|
||||
endif()
|
||||
GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type})
|
||||
|
||||
# symm for c and z
|
||||
if (NOT DEFINED ${float_char}SYMMUCOPY_M)
|
||||
set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}")
|
||||
set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}")
|
||||
endif()
|
||||
GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type})
|
||||
|
||||
if (NOT DEFINED ${float_char}TRMMUNCOPY_M)
|
||||
set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}")
|
||||
set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}")
|
||||
set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}")
|
||||
set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}")
|
||||
endif ()
|
||||
GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type})
|
||||
|
||||
if (NOT DEFINED ZTRSMCOPYLN_M)
|
||||
set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}")
|
||||
set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}")
|
||||
set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}")
|
||||
set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}")
|
||||
endif ()
|
||||
GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
|
@ -465,23 +503,35 @@ endif ()
|
|||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type})
|
||||
|
||||
if (NOT DEFINED TRSMCOPYLN_M)
|
||||
set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c")
|
||||
else ()
|
||||
set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}")
|
||||
set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}")
|
||||
set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}")
|
||||
set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}")
|
||||
endif ()
|
||||
GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type})
|
||||
|
||||
|
|
|
@ -617,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
|
|||
$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
ifeq ($(ARCH), E2K)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
|
||||
|
@ -1691,29 +1695,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N
|
|||
$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef CTRMMUNCOPY_M
|
||||
$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
ifdef CTRMMLNCOPY_M
|
||||
$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef CTRMMUTCOPY_M
|
||||
$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
ifdef CTRMMLTCOPY_M
|
||||
$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
@ -1739,29 +1775,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_
|
|||
$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef ZTRMMUNCOPY_M
|
||||
$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRMMLNCOPY_M
|
||||
$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRMMUTCOPY_M
|
||||
$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRMMLTCOPY_M
|
||||
$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
@ -1897,11 +1965,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N)
|
|||
$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
|
||||
|
||||
ifdef CSYMMUCOPY_M
|
||||
$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
else
|
||||
$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
endif
|
||||
|
||||
ifdef CSYMMLCOPY_M
|
||||
$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
else
|
||||
$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
|
||||
|
@ -1909,11 +1987,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N)
|
|||
$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
|
||||
|
||||
ifdef ZSYMMUCOPY_M
|
||||
$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
else
|
||||
$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZSYMMLCOPY_M
|
||||
$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
else
|
||||
$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
|
||||
|
@ -1933,11 +2021,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N
|
|||
$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
|
||||
|
||||
ifdef CHEMMUTCOPY_M
|
||||
$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
|
||||
else
|
||||
$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
|
||||
endif
|
||||
|
||||
ifdef CHEMMLTCOPY_M
|
||||
$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||
else
|
||||
$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
|
||||
|
@ -1945,11 +2043,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N
|
|||
$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
|
||||
|
||||
ifdef ZHEMMUTCOPY_M
|
||||
$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
|
||||
else
|
||||
$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
|
||||
endif
|
||||
|
||||
ifdef ZHEMMLTCOPY_M
|
||||
$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||
else
|
||||
$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
|
||||
|
@ -2287,29 +2395,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR
|
|||
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
|
||||
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
|
||||
|
||||
ifdef TRSMCOPYUN_M
|
||||
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef TRSMCOPYLN_M
|
||||
$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef TRSMCOPYUT_M
|
||||
$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef TRSMCOPYLT_M
|
||||
$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
@ -2335,29 +2475,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N
|
|||
$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef TRSMCOPYUN_M
|
||||
$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef TRSMCOPYLN_M
|
||||
$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef TRSMCOPYUT_M
|
||||
$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef TRSMCOPYLT_M
|
||||
$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
@ -2431,29 +2603,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N
|
|||
$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef ZTRSMCOPYUN_M
|
||||
$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRSMCOPYLN_M
|
||||
$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRSMCOPYUT_M
|
||||
$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRSMCOPYLT_M
|
||||
$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
@ -2479,29 +2683,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_
|
|||
$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
|
||||
|
||||
ifdef ZTRSMCOPYUN_M
|
||||
$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRSMCOPYLN_M
|
||||
$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRSMCOPYUT_M
|
||||
$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
ifdef ZTRSMCOPYLT_M
|
||||
$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
else
|
||||
$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
|
||||
|
||||
$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
|
||||
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
|
||||
|
|
|
@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c
|
|||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
TRSMCOPYLN_M = trsm_lncopy_sve.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_sve.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_sve.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
|
@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
|||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
|
|
@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c
|
|||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
TRSMCOPYLN_M = trsm_lncopy_sve.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_sve.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_sve.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
|
@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
|||
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
|||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
@ -0,0 +1,189 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
@ -0,0 +1,874 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* X0 X1 X2 s0 X3 x4 x5 x6 */
|
||||
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */
|
||||
|
||||
#define origM x0
|
||||
#define origN x1
|
||||
#define origK x2
|
||||
#define origPA x3
|
||||
#define origPB x4
|
||||
#define pC x5
|
||||
#define LDC x6
|
||||
#define temp x7
|
||||
#define counterL x8
|
||||
#define counterI x9
|
||||
#define counterJ x10
|
||||
#define pB x11
|
||||
#define pCRow0 x12
|
||||
#define pCRow1 x13
|
||||
#define pCRow2 x14
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define lanes x17
|
||||
|
||||
#define alphaR w19
|
||||
#define alphaI w20
|
||||
|
||||
#define alphaz_R z6.s
|
||||
#define alphaz_I z7.s
|
||||
#define alpha0_R s4
|
||||
#define alpha0_I s5
|
||||
|
||||
|
||||
#define A_PRE_SIZE 2560
|
||||
#define B_PRE_SIZE 448
|
||||
#define C_PRE_SIZE 128
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define OP_rr fmla
|
||||
#define OP_ii fmls
|
||||
#define OP_ri fmla
|
||||
#define OP_ir fmla
|
||||
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define OP_rr fmla
|
||||
#define OP_ii fmla
|
||||
#define OP_ri fmls
|
||||
#define OP_ir fmla
|
||||
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define OP_rr fmla
|
||||
#define OP_ii fmla
|
||||
#define OP_ri fmla
|
||||
#define OP_ir fmls
|
||||
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#define OP_rr fmla
|
||||
#define OP_ii fmls
|
||||
#define OP_ri fmls
|
||||
#define OP_ir fmls
|
||||
#endif
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
// 02 origK
|
||||
// 03 origPA
|
||||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 offset -> temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
// 11 pB
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 lanes
|
||||
// 18 must save
|
||||
// 19 must save alphaR
|
||||
// 20 must save alphaI
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
// 27 must save
|
||||
// 28 must save
|
||||
// 29 frame
|
||||
// 30 link
|
||||
// 31 sp
|
||||
|
||||
//v00 ALPHA_R -> pA00_R, pA01_R
|
||||
//v01 ALPHA_I -> pA00_I, pA01_I
|
||||
//v02 pA02_R, pA03_R
|
||||
//v03 pA02_I, pA03_I
|
||||
//v04 pA10_R, pA11_R
|
||||
//v05 pA10_I, pA11_I
|
||||
//v06 pA12_R, pA13_R
|
||||
//v07 pA12_I, pA13_I
|
||||
//v08 must save pB00_R, pB01_R
|
||||
//v09 must save pB00_I, pB01_I
|
||||
//v10 must save pB02_R, pB03_R OR ALPHA0_R
|
||||
//v11 must save pB02_I, pB03_I OR ALPHA0_I
|
||||
//v12 must save pB10_R, pB11_R
|
||||
//v13 must save pB10_I, pB11_I
|
||||
//v14 must save pB12_R, pB13_R OR ALPHA1_R
|
||||
//v15 must save pB12_I, pB13_I OR ALPHA1_R
|
||||
//v16 pC0R
|
||||
//v17 pC0I
|
||||
//v18 pC1R
|
||||
//v19 pC1I
|
||||
//v20 pC2R
|
||||
//v21 pC2I
|
||||
//v22 pC3R
|
||||
//v23 pC3I
|
||||
//v24 C row 0 real parts (loaded in SAVE)
|
||||
//v25 C row 0 imag parts (loaded in SAVE)
|
||||
//v26 C row 1 real parts (loaded in SAVE)
|
||||
//v27 C row 1 imag parts (loaded in SAVE)
|
||||
//v28 C row 2 real parts (loaded in SAVE)
|
||||
//v29 C row 2 imag parts (loaded in SAVE)
|
||||
//v30 C row 3 real parts (loaded in SAVE)
|
||||
//v31 C row 3 imag parts (loaded in SAVE)
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
.macro INITv1x4
|
||||
dup z16.s, #0
|
||||
dup z17.s, #0
|
||||
dup z18.s, #0
|
||||
dup z19.s, #0
|
||||
dup z20.s, #0
|
||||
dup z21.s, #0
|
||||
dup z22.s, #0
|
||||
dup z23.s, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_I
|
||||
ld2w {z0.s, z1.s}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA += lanes*2*4
|
||||
ld2w {z2.s, z3.s}, p1/z, [pA] // next one
|
||||
add pA, pA, lanes, lsl #3 // pA += lanes*2*4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
fmla z16.s, p1/m, z0.s, z8.s
|
||||
OP_ir z17.s, p1/m, z1.s, z8.s
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z17.16b, z17.16b, z17.16b
|
||||
fmls z17.s, p1/m, z0.s, z9.s
|
||||
#else
|
||||
fmla z17.s, p1/m, z0.s, z9.s
|
||||
#endif
|
||||
OP_ii z16.s, p1/m, z1.s, z9.s
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
|
||||
|
||||
fmla z18.s, p1/m, z0.s, z10.s
|
||||
OP_ir z19.s, p1/m, z1.s, z10.s
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
OP_ii z18.s, p1/m, z1.s, z11.s
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z19.16b, z21.16b, z21.16b
|
||||
fmls z19.s, p1/m, z0.s, z11.s
|
||||
#else
|
||||
fmla z19.s, p1/m, z0.s, z11.s
|
||||
#endif
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
|
||||
|
||||
fmla z20.s, p1/m, z0.s, z12.s
|
||||
OP_ir z21.s, p1/m, z1.s, z12.s
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z21.16b, z23.16b, z23.16b
|
||||
fmls z21.s, p1/m, z0.s, z13.s
|
||||
#else
|
||||
fmla z21.s, p1/m, z0.s, z13.s
|
||||
#endif
|
||||
OP_ii z20.s, p1/m, z1.s, z13.s
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
|
||||
|
||||
fmla z22.s, p1/m, z0.s, z14.s
|
||||
OP_ir z23.s, p1/m, z1.s, z14.s
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z23.16b, z19.16b, z19.16b
|
||||
fmls z23.s, p1/m, z0.s, z15.s
|
||||
#else
|
||||
fmla z23.s, p1/m, z0.s, z15.s
|
||||
#endif
|
||||
OP_ii z22.s, p1/m, z1.s, z15.s
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M1
|
||||
ld2w {z2.s, z3.s}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4
|
||||
|
||||
OP_rr z16.s, p1/m, z0.s, z8.s
|
||||
OP_ir z17.s, p1/m, z1.s, z8.s
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
OP_ii z16.s, p1/m, z1.s, z9.s
|
||||
OP_ri z17.s, p1/m, z0.s, z9.s
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
|
||||
OP_rr z18.s, p1/m, z0.s, z10.s
|
||||
OP_ir z19.s, p1/m, z1.s, z10.s
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
OP_ii z18.s, p1/m, z1.s, z11.s
|
||||
OP_ri z19.s, p1/m, z0.s, z11.s
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
|
||||
OP_rr z20.s, p1/m, z0.s, z12.s
|
||||
OP_ir z21.s, p1/m, z1.s, z12.s
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
OP_ii z20.s, p1/m, z1.s, z13.s
|
||||
OP_ri z21.s, p1/m, z0.s, z13.s
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
|
||||
OP_rr z22.s, p1/m, z0.s, z14.s
|
||||
OP_ir z23.s, p1/m, z1.s, z14.s
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
OP_ii z22.s, p1/m, z1.s, z15.s
|
||||
OP_ri z23.s, p1/m, z0.s, z15.s
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M2
|
||||
ld2w {z0.s, z1.s}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4
|
||||
|
||||
OP_rr z16.s, p1/m, z2.s, z8.s
|
||||
OP_ir z17.s, p1/m, z3.s, z8.s
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
OP_ii z16.s, p1/m, z3.s, z9.s
|
||||
OP_ri z17.s, p1/m, z2.s, z9.s
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
|
||||
OP_rr z18.s, p1/m, z2.s, z10.s
|
||||
OP_ir z19.s, p1/m, z3.s, z10.s
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
OP_ii z18.s, p1/m, z3.s, z11.s
|
||||
OP_ri z19.s, p1/m, z2.s, z11.s
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
|
||||
OP_rr z20.s, p1/m, z2.s, z12.s
|
||||
OP_ir z21.s, p1/m, z3.s, z12.s
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
OP_ii z20.s, p1/m, z3.s, z13.s
|
||||
OP_ri z21.s, p1/m, z2.s, z13.s
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
|
||||
OP_rr z22.s, p1/m, z2.s, z14.s
|
||||
OP_ir z23.s, p1/m, z3.s, z14.s
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
OP_ii z22.s, p1/m, z3.s, z15.s
|
||||
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_E
|
||||
OP_rr z16.s, p1/m, z2.s, z8.s
|
||||
OP_ir z17.s, p1/m, z3.s, z8.s
|
||||
OP_ii z16.s, p1/m, z3.s, z9.s
|
||||
OP_ri z17.s, p1/m, z2.s, z9.s
|
||||
|
||||
OP_rr z18.s, p1/m, z2.s, z10.s
|
||||
OP_ir z19.s, p1/m, z3.s, z10.s
|
||||
OP_ii z18.s, p1/m, z3.s, z11.s
|
||||
OP_ri z19.s, p1/m, z2.s, z11.s
|
||||
|
||||
OP_rr z20.s, p1/m, z2.s, z12.s
|
||||
OP_ir z21.s, p1/m, z3.s, z12.s
|
||||
OP_ii z20.s, p1/m, z3.s, z13.s
|
||||
OP_ri z21.s, p1/m, z2.s, z13.s
|
||||
|
||||
OP_rr z22.s, p1/m, z2.s, z14.s
|
||||
OP_ir z23.s, p1/m, z3.s, z14.s
|
||||
OP_ii z22.s, p1/m, z3.s, z15.s
|
||||
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
ld2w {z0.s, z1.s}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
|
||||
OP_rr z16.s, p1/m, z0.s, z8.s
|
||||
OP_ir z17.s, p1/m, z1.s, z8.s
|
||||
OP_ii z16.s, p1/m, z1.s, z9.s
|
||||
OP_ri z17.s, p1/m, z0.s, z9.s
|
||||
|
||||
ld1rw z12.s, p0/z, [pB, 16]
|
||||
ld1rw z13.s, p0/z, [pB, 20]
|
||||
ld1rw z14.s, p0/z, [pB, 24]
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
OP_rr z18.s, p1/m, z0.s, z10.s
|
||||
OP_ir z19.s, p1/m, z1.s, z10.s
|
||||
OP_ii z18.s, p1/m, z1.s, z11.s
|
||||
OP_ri z19.s, p1/m, z0.s, z11.s
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
OP_rr z20.s, p1/m, z0.s, z12.s
|
||||
OP_ir z21.s, p1/m, z1.s, z12.s
|
||||
OP_ii z20.s, p1/m, z1.s, z13.s
|
||||
OP_ri z21.s, p1/m, z0.s, z13.s
|
||||
|
||||
OP_rr z22.s, p1/m, z0.s, z14.s
|
||||
OP_ir z23.s, p1/m, z1.s, z14.s
|
||||
OP_ii z22.s, p1/m, z1.s, z15.s
|
||||
OP_ri z23.s, p1/m, z0.s, z15.s
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
fmla z25.s, p1/m, z16.s, alphaz_I
|
||||
fmla z25.s, p1/m, z17.s, alphaz_R
|
||||
st2w {z24.s, z25.s}, p1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3
|
||||
|
||||
ld2w {z26.s, z27.s}, p1/z, [pCRow1]
|
||||
fmla z26.s, p1/m, z18.s, alphaz_R
|
||||
fmls z26.s, p1/m, z19.s, alphaz_I
|
||||
fmla z27.s, p1/m, z18.s, alphaz_I
|
||||
fmla z27.s, p1/m, z19.s, alphaz_R
|
||||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #3
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
|
||||
fmla z28.s, p1/m, z20.s, alphaz_R
|
||||
fmls z28.s, p1/m, z21.s, alphaz_I
|
||||
fmla z29.s, p1/m, z20.s, alphaz_I
|
||||
fmla z29.s, p1/m, z21.s, alphaz_R
|
||||
st2w {z28.s, z29.s}, p1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, lanes, lsl #3
|
||||
|
||||
ld2w {z30.s, z31.s}, p1/z, [pCRow3]
|
||||
fmla z30.s, p1/m, z22.s, alphaz_R
|
||||
fmls z30.s, p1/m, z23.s, alphaz_I
|
||||
fmla z31.s, p1/m, z22.s, alphaz_I
|
||||
fmla z31.s, p1/m, z23.s, alphaz_R
|
||||
st2w {z30.s, z31.s}, p1, [pCRow3]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
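In scalar terms (an illustrative sketch; scale_and_store is a hypothetical name), every element touched by SAVEv1x4, and by SAVEv1x2/SAVEv1x1 further below, is updated as C = C + alpha * acc, with the complex alpha broadcast into z6/z7:

/* Per-element update performed by the SAVE macros: C += alpha * acc.      */
static inline void scale_and_store(float alpha_r, float alpha_i,
                                   float acc_r, float acc_i,
                                   float *c_r, float *c_i) {
    *c_r += acc_r * alpha_r - acc_i * alpha_i;   /* fmla, then fmls        */
    *c_i += acc_r * alpha_i + acc_i * alpha_r;   /* fmla, then fmla        */
}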
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
.macro INITv1x2
|
||||
dup z16.s, #0
|
||||
dup z17.s, #0
|
||||
dup z18.s, #0
|
||||
dup z19.s, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x2_SUB
|
||||
ld2w {z0.s, z1.s}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
ld1rw z10.s, p0/z, [pB, 8]
|
||||
ld1rw z11.s, p0/z, [pB, 12]
|
||||
|
||||
OP_rr z16.s, p1/m, z0.s, z8.s
|
||||
OP_ir z17.s, p1/m, z1.s, z8.s
|
||||
OP_ii z16.s, p1/m, z1.s, z9.s
|
||||
OP_ri z17.s, p1/m, z0.s, z9.s
|
||||
|
||||
OP_rr z18.s, p1/m, z0.s, z10.s
|
||||
OP_ir z19.s, p1/m, z1.s, z10.s
|
||||
OP_ii z18.s, p1/m, z1.s, z11.s
|
||||
OP_ri z19.s, p1/m, z0.s, z11.s
|
||||
|
||||
add pB, pB, 16
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
fmla z25.s, p1/m, z16.s, alphaz_I
|
||||
fmla z25.s, p1/m, z17.s, alphaz_R
|
||||
st2w {z24.s, z25.s}, p1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3
|
||||
|
||||
ld2w {z26.s, z27.s}, p1/z, [pCRow1]
|
||||
fmla z26.s, p1/m, z18.s, alphaz_R
|
||||
fmls z26.s, p1/m, z19.s, alphaz_I
|
||||
fmla z27.s, p1/m, z18.s, alphaz_I
|
||||
fmla z27.s, p1/m, z19.s, alphaz_R
|
||||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #3
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
.macro INITv1x1
|
||||
dup z16.s, #0
|
||||
dup z17.s, #0
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNELv1x1_SUB
|
||||
ld2w {z0.s, z1.s}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
|
||||
|
||||
ld1rw z8.s, p0/z, [pB]
|
||||
ld1rw z9.s, p0/z, [pB, 4]
|
||||
|
||||
add pB, pB, 8
|
||||
|
||||
OP_rr z16.s, p1/m, z0.s, z8.s
|
||||
OP_ir z17.s, p1/m, z1.s, z8.s
|
||||
OP_ii z16.s, p1/m, z1.s, z9.s
|
||||
OP_ri z17.s, p1/m, z0.s, z9.s
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
fmla z25.s, p1/m, z16.s, alphaz_I
|
||||
fmla z25.s, p1/m, z17.s, alphaz_R
|
||||
st2w {z24.s, z25.s}, p1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, s0
|
||||
dup alphaz_R, alphaR
|
||||
fmov alphaI, s1
|
||||
dup alphaz_I, alphaI
|
||||
|
||||
lsl LDC, LDC, #3 // ldc = ldc * 2 * 4
|
||||
ptrue p0.s // create true predicate
|
||||
|
||||
mov pB, origPB
|
||||
|
||||
// Loop over N
|
||||
mov counterJ, origN
|
||||
asr counterJ, counterJ, #2 // J = J / 4
|
||||
cmp counterJ, #0
|
||||
ble .Lcgemm_kernel_L2_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
.Lcgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_BEGIN:
|
||||
|
||||
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM
|
||||
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
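The same vector-length-agnostic structure, written as a C intrinsics sketch (illustrative; m_loop_sketch is a hypothetical helper and the body of a sweep is elided), shows how whilelt/cntp make the final M % SVE_LEN rows fall out of the ordinary loop:

#include <arm_sve.h>
#include <stdint.h>

static void m_loop_sketch(int64_t m) {
    int64_t i = 0;
    svbool_t p1 = svwhilelt_b32(i, m);                  /* lanes with i+lane < m   */
    while (svptest_any(svptrue_b32(), p1)) {
        uint64_t lanes = svcntp_b32(svptrue_b32(), p1); /* active lanes this sweep */
        (void)lanes;                                    /* ... one predicated sweep over the block ... */
        i += svcntw();                                  /* advance by a full vector length */
        p1 = svwhilelt_b32(i, m);                       /* partial (or empty) on the last sweep */
    }
}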
|
||||
|
||||
.align 5
|
||||
.Lcgemm_kernel_L4_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x4 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3
|
||||
cmp counterL , #2
|
||||
blt .Lcgemm_kernel_L4_Mv1_32
|
||||
|
||||
KERNELv1x4_I
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble .Lcgemm_kernel_L4_Mv1_22a
|
||||
|
||||
.align 5
|
||||
.Lcgemm_kernel_L4_Mv1_22:
|
||||
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lcgemm_kernel_L4_Mv1_22
|
||||
|
||||
.align 5
|
||||
.Lcgemm_kernel_L4_Mv1_22a:
|
||||
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_E
|
||||
|
||||
b .Lcgemm_kernel_L4_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Lcgemm_kernel_L4_Mv1_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble .Lcgemm_kernel_L4_Mv1_40
|
||||
|
||||
KERNELv1x4_I
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_E
|
||||
|
||||
b .Lcgemm_kernel_L4_Mv1_44
|
||||
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_40:
|
||||
|
||||
INITv1x4
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Lcgemm_kernel_L4_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Lcgemm_kernel_L4_Mv1_46:
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Lcgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_END:
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s // lanes contains the number of active SVE lanes in the M dimension
|
||||
b.any .Lcgemm_kernel_L4_Mv1_20
|
||||
|
||||
|
||||
|
||||
.Lcgemm_kernel_L4_END:
|
||||
|
||||
lsl temp, origK, #5
|
||||
add origPB, origPB, temp // B = B + K * 4 * 4 * 2
|
||||
|
||||
subs counterJ, counterJ , #1 // j--
|
||||
bgt .Lcgemm_kernel_L4_BEGIN
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #3
|
||||
ble .Lcgemm_kernel_L999
|
||||
|
||||
tst counterJ , #2
|
||||
ble .Lcgemm_kernel_L1_BEGIN
|
||||
|
||||
mov pCRow0, pC // pCRow0 = pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
add pC,pC,LDC, lsl #1
|
||||
|
||||
mov pA, origPA // pA = A
|
||||
|
||||
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_20:
|
||||
|
||||
INITv1x2
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL,#0
|
||||
ble .Lcgemm_kernel_L2_Mv1_40
|
||||
.align 5
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_22:
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lcgemm_kernel_L2_Mv1_22
|
||||
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble .Lcgemm_kernel_L2_Mv1_100
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_42:
|
||||
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lcgemm_kernel_L2_Mv1_42
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_100:
|
||||
|
||||
SAVEv1x2
|
||||
|
||||
.Lcgemm_kernel_L2_Mv1_END:
|
||||
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
b.any .Lcgemm_kernel_L2_Mv1_20
|
||||
|
||||
|
||||
.Lcgemm_kernel_L2_END:
|
||||
lsl temp, origK, #4
|
||||
add origPB, origPB, temp // B = B + K * 2 * 4 * 2
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Lcgemm_kernel_L1_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #1
|
||||
ble .Lcgemm_kernel_L999 // done
|
||||
|
||||
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC , pC , LDC // Update pC to point to next
|
||||
|
||||
mov pA, origPA // pA = A
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_20:
|
||||
|
||||
INITv1x1
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL , #0
|
||||
ble .Lcgemm_kernel_L1_Mv1_40
|
||||
.align 5
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_22:
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lcgemm_kernel_L1_Mv1_22
|
||||
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble .Lcgemm_kernel_L1_Mv1_100
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_42:
|
||||
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lcgemm_kernel_L1_Mv1_42
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_100:
|
||||
|
||||
SAVEv1x1
|
||||
|
||||
.Lcgemm_kernel_L1_Mv1_END:
|
||||
|
||||
incw counterI
|
||||
whilelt p1.s, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.s
|
||||
b.any .Lcgemm_kernel_L1_Mv1_20
|
||||
|
||||
.Lcgemm_kernel_L1_END:
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Lcgemm_kernel_L999:
|
||||
mov x0, #0 // set return value
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
svint32_t lda_vec = svindex_s32(0, lda * 2);
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint32_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
|
||||
svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec);
|
||||
svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag));
|
||||
aoffset1 += 2;
|
||||
boffset += active * 2;
|
||||
}
|
||||
aoffset += active * lda * 2;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
return 0;
|
||||
}
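A scalar reference for one outer iteration of this gather-based packing (illustrative helper; assumes column-major storage with leading dimension lda and the BLASLONG type from common.h, as in the routine above): for a block of 'active' consecutive j values starting at some j0, every row i contributes the complex elements a(i, j0..j0+active-1), stored contiguously with real and imaginary parts interleaved.

static void pack_block_scalar(BLASLONG m, BLASLONG active, BLASLONG j0,
                              BLASLONG lda, const float *a, float *b) {
    for (BLASLONG i = 0; i < m; i++)
        for (BLASLONG c = 0; c < active; c++) {
            const float *src = a + ((j0 + c) * lda + i) * 2;  /* element (i, j0+c) */
            *b++ = src[0];                                    /* real part         */
            *b++ = src[1];                                    /* imaginary part    */
        }
}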
|
|
@ -0,0 +1,75 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
|
||||
BLASLONG j;
|
||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
aoffset1 = aoffset;
|
||||
|
||||
uint32_t i_cnt = m;
|
||||
while (i_cnt--) {
|
||||
svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1);
|
||||
svst2_f32(pg, (float *) boffset, a_vec);
|
||||
aoffset1 += lda * 2;
|
||||
boffset += active * 2;
|
||||
}
|
||||
aoffset += active * 2;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
return 0;
|
||||
}
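The analogous scalar reference for this variant (illustrative helper, same assumptions as the sketch after the previous file): here each block's 'active' elements are contiguous in memory, which is why plain svld2/svst2 suffice where the previous routine needed gather loads.

static void copy_block_scalar(BLASLONG m, BLASLONG active, BLASLONG j0,
                              BLASLONG lda, const float *a, float *b) {
    for (BLASLONG i = 0; i < m; i++)                 /* runs are lda*2 floats apart */
        for (BLASLONG c = 0; c < active; c++) {
            const float *src = a + (i * lda + j0 + c) * 2;
            *b++ = src[0];                           /* real part      */
            *b++ = src[1];                           /* imaginary part */
        }
}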
|
File diff suppressed because it is too large
|
@ -0,0 +1,320 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_L
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
a += (m - 1) * m;
|
||||
b += (m - 1) * n;
|
||||
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
|
||||
aa = *(a + i);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb = *(c + i + j * ldc);
|
||||
bb *= aa;
|
||||
*b = bb;
|
||||
*(c + i + j * ldc) = bb;
|
||||
b ++;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
*(c + k + j * ldc) -= bb * *(a + k);
|
||||
}
|
||||
|
||||
}
|
||||
a -= m;
|
||||
b -= 2 * n;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
a += (m - 1) * m * 2;
|
||||
b += (m - 1) * n * 2;
|
||||
|
||||
for (i = m - 1; i >= 0; i--) {
|
||||
|
||||
aa1 = *(a + i * 2 + 0);
|
||||
aa2 = *(a + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb1 = *(c + i * 2 + 0 + j * ldc);
|
||||
bb2 = *(c + i * 2 + 1 + j * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = aa1 * bb2 - aa2 * bb1;
|
||||
#endif
|
||||
|
||||
|
||||
*(b + 0) = cc1;
|
||||
*(b + 1) = cc2;
|
||||
*(c + i * 2 + 0 + j * ldc) = cc1;
|
||||
*(c + i * 2 + 1 + j * ldc) = cc2;
|
||||
b += 2;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#else
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
a -= m * 2;
|
||||
b -= 4 * n;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
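A minimal scalar illustration of the substitution pattern the solve() routines above implement, under the assumption (consistent with OpenBLAS' TRSM packing routines) that the diagonal entries of the packed triangular panel are stored pre-inverted, so the bb *= aa step is effectively a division. The 2x2 numbers are made up, and the real code differs in loop direction and elimination order depending on the variant.

#include <stdio.h>

int main(void) {
    /* Solve L * x = rhs for a 2x2 lower-triangular L whose diagonal is
     * supplied as reciprocals, the way the packed panel provides it.    */
    float l10 = 3.0f;                          /* off-diagonal entry     */
    float inv_l00 = 0.5f, inv_l11 = 0.25f;     /* 1/2 and 1/4, packed    */
    float rhs0 = 4.0f, rhs1 = 22.0f;
    float x0 = rhs0 * inv_l00;                 /* the  bb *= aa  step    */
    float x1 = (rhs1 - l10 * x0) * inv_l11;    /* eliminate, then scale  */
    printf("x = (%g, %g)\n", x0, x1);          /* prints x = (2, 4)      */
    return 0;
}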
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
BLASLONG i, j;
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
#ifdef DOUBLE
|
||||
int sve_size = svcntd();
|
||||
#else
|
||||
int sve_size = svcntw();
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n",
|
||||
m, n, k, offset);
|
||||
#endif
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
kk = m + offset;
|
||||
|
||||
i = m % sve_size;
|
||||
if (i) {
|
||||
aa = a + (m - i) * k * COMPSIZE;
|
||||
cc = c + (m - i) * COMPSIZE;
|
||||
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + (kk - i) * i * COMPSIZE,
|
||||
b + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
kk -= i;
|
||||
|
||||
}
|
||||
|
||||
int mod = i;
|
||||
i = sve_size;
|
||||
if (i <= m) {
|
||||
aa = a + (m - mod - sve_size) * k * COMPSIZE;
|
||||
cc = c + (m - mod - sve_size) * COMPSIZE;
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + sve_size * kk * COMPSIZE,
|
||||
b + GEMM_UNROLL_N * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(sve_size, GEMM_UNROLL_N,
|
||||
aa + (kk - sve_size) * sve_size * COMPSIZE,
|
||||
b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa -= sve_size * k * COMPSIZE;
|
||||
cc -= sve_size * COMPSIZE;
|
||||
kk -= sve_size;
|
||||
|
||||
i += sve_size;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
kk = m + offset;
|
||||
|
||||
i = m % sve_size;
|
||||
if (i) {
|
||||
aa = a + (m - i) * k * COMPSIZE;
|
||||
cc = c + (m - i) * COMPSIZE;
|
||||
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(i, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + i * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc, ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + (kk - i) * i * COMPSIZE,
|
||||
b + (kk - i) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
kk -= i;
|
||||
|
||||
}
|
||||
|
||||
int mod = i;
|
||||
i = sve_size;
|
||||
if (i <= m) {
|
||||
aa = a + (m - mod - sve_size) * k * COMPSIZE;
|
||||
cc = c + (m - mod - sve_size) * COMPSIZE;
|
||||
|
||||
do {
|
||||
if (k - kk > 0) {
|
||||
GEMM_KERNEL(sve_size, j, k - kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa + sve_size * kk * COMPSIZE,
|
||||
b + j * kk * COMPSIZE,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(sve_size, j,
|
||||
aa + (kk - sve_size) * sve_size * COMPSIZE,
|
||||
b + (kk - sve_size) * j * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa -= sve_size * k * COMPSIZE;
|
||||
cc -= sve_size * COMPSIZE;
|
||||
kk -= sve_size;
|
||||
|
||||
i += sve_size;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,295 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_L
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
aa = *(a + i);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb = *(c + i + j * ldc);
|
||||
bb *= aa;
|
||||
*b = bb;
|
||||
*(c + i + j * ldc) = bb;
|
||||
b ++;
|
||||
|
||||
for (k = i + 1; k < m; k ++){
|
||||
*(c + k + j * ldc) -= bb * *(a + k);
|
||||
}
|
||||
|
||||
}
|
||||
a += m;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
|
||||
aa1 = *(a + i * 2 + 0);
|
||||
aa2 = *(a + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < n; j ++) {
|
||||
bb1 = *(c + i * 2 + 0 + j * ldc);
|
||||
bb2 = *(c + i * 2 + 1 + j * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = aa1 * bb2 - aa2 * bb1;
|
||||
#endif
|
||||
|
||||
*(b + 0) = cc1;
|
||||
*(b + 1) = cc2;
|
||||
*(c + i * 2 + 0 + j * ldc) = cc1;
|
||||
*(c + i * 2 + 1 + j * ldc) = cc2;
|
||||
b += 2;
|
||||
|
||||
for (k = i + 1; k < m; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#else
|
||||
*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
|
||||
*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
a += m * 2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
BLASLONG i, j, jj;
|
||||
#ifdef DOUBLE
|
||||
int sve_size = svcntd();
|
||||
#else
|
||||
int sve_size = svcntw();
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n",
|
||||
m, n, k, offset);
|
||||
#endif
|
||||
|
||||
jj = 0;
|
||||
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = sve_size;
|
||||
|
||||
while (i <= m) {
|
||||
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
|
||||
solve(sve_size, GEMM_UNROLL_N,
|
||||
aa + kk * sve_size * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += sve_size * k * COMPSIZE;
|
||||
cc += sve_size * COMPSIZE;
|
||||
kk += sve_size;
|
||||
i += sve_size;
|
||||
}
|
||||
|
||||
i = m % sve_size;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
kk += i;
|
||||
|
||||
}
|
||||
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
jj += sve_size;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = sve_size;
|
||||
|
||||
while (i <= m) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(sve_size, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(sve_size, j,
|
||||
aa + kk * sve_size * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += sve_size * k * COMPSIZE;
|
||||
cc += sve_size * COMPSIZE;
|
||||
kk += sve_size;
|
||||
i += sve_size;
|
||||
}
|
||||
|
||||
i = m % sve_size;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
kk += i;
|
||||
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,293 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_R
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
|
||||
bb = *(b + i);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa = *(c + j + i * ldc);
|
||||
aa *= bb;
|
||||
*a = aa;
|
||||
*(c + j + i * ldc) = aa;
|
||||
a ++;
|
||||
|
||||
for (k = i + 1; k < n; k ++){
|
||||
*(c + j + k * ldc) -= aa * *(b + k);
|
||||
}
|
||||
|
||||
}
|
||||
b += n;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa1, aa2;
|
||||
FLOAT bb1, bb2;
|
||||
FLOAT cc1, cc2;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
ldc *= 2;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
|
||||
bb1 = *(b + i * 2 + 0);
|
||||
bb2 = *(b + i * 2 + 1);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa1 = *(c + j * 2 + 0 + i * ldc);
|
||||
aa2 = *(c + j * 2 + 1 + i * ldc);
|
||||
|
||||
#ifndef CONJ
|
||||
cc1 = aa1 * bb1 - aa2 * bb2;
|
||||
cc2 = aa1 * bb2 + aa2 * bb1;
|
||||
#else
|
||||
cc1 = aa1 * bb1 + aa2 * bb2;
|
||||
cc2 = -aa1 * bb2 + aa2 * bb1;
|
||||
#endif
|
||||
|
||||
*(a + 0) = cc1;
|
||||
*(a + 1) = cc2;
|
||||
*(c + j * 2 + 0 + i * ldc) = cc1;
|
||||
*(c + j * 2 + 1 + i * ldc) = cc2;
|
||||
a += 2;
|
||||
|
||||
for (k = i + 1; k < n; k ++){
|
||||
#ifndef CONJ
|
||||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
|
||||
*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
|
||||
#else
|
||||
*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
|
||||
*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
b += n * 2;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
|
||||
#ifdef COMPLEX
|
||||
FLOAT dummy2,
|
||||
#endif
|
||||
FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
|
||||
|
||||
FLOAT *aa, *cc;
|
||||
BLASLONG kk;
|
||||
BLASLONG i, j, jj;
|
||||
#ifdef DOUBLE
|
||||
int sve_size = svcntd();
|
||||
#else
|
||||
int sve_size = svcntw();
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n",
|
||||
m, n, k, offset);
|
||||
#endif
|
||||
|
||||
jj = 0;
|
||||
j = (n >> GEMM_UNROLL_N_SHIFT);
|
||||
kk = -offset;
|
||||
|
||||
while (j > 0) {
|
||||
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = sve_size;
|
||||
|
||||
if (i <= m) {
|
||||
do {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
|
||||
solve(sve_size, GEMM_UNROLL_N,
|
||||
aa + kk * sve_size * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += sve_size * k * COMPSIZE;
|
||||
cc += sve_size * COMPSIZE;
|
||||
i += sve_size;
|
||||
} while (i <= m);
|
||||
}
|
||||
|
||||
|
||||
i = m % sve_size;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa, b, cc, ldc);
|
||||
}
|
||||
solve(i, GEMM_UNROLL_N,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * GEMM_UNROLL_N * COMPSIZE,
|
||||
cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
|
||||
}
|
||||
|
||||
kk += GEMM_UNROLL_N;
|
||||
b += GEMM_UNROLL_N * k * COMPSIZE;
|
||||
c += GEMM_UNROLL_N * ldc * COMPSIZE;
|
||||
j --;
|
||||
jj += sve_size;
|
||||
}
|
||||
|
||||
if (n & (GEMM_UNROLL_N - 1)) {
|
||||
|
||||
j = (GEMM_UNROLL_N >> 1);
|
||||
while (j > 0) {
|
||||
if (n & j) {
|
||||
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
i = sve_size;
|
||||
|
||||
while (i <= m) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(sve_size, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(sve_size, j,
|
||||
aa + kk * sve_size * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += sve_size * k * COMPSIZE;
|
||||
cc += sve_size * COMPSIZE;
|
||||
i += sve_size;
|
||||
}
|
||||
|
||||
i = m % sve_size;
|
||||
if (i) {
|
||||
if (kk > 0) {
|
||||
GEMM_KERNEL(i, j, kk, dm1,
|
||||
#ifdef COMPLEX
|
||||
ZERO,
|
||||
#endif
|
||||
aa,
|
||||
b,
|
||||
cc,
|
||||
ldc);
|
||||
}
|
||||
|
||||
solve(i, j,
|
||||
aa + kk * i * COMPSIZE,
|
||||
b + kk * j * COMPSIZE, cc, ldc);
|
||||
|
||||
aa += i * k * COMPSIZE;
|
||||
cc += i * COMPSIZE;
|
||||
|
||||
}
|
||||
|
||||
b += j * k * COMPSIZE;
|
||||
c += j * ldc * COMPSIZE;
|
||||
kk += j;
|
||||
}
|
||||
j >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,317 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
static FLOAT dm1 = -1.;
|
||||
|
||||
#ifdef CONJ
|
||||
#define GEMM_KERNEL GEMM_KERNEL_R
|
||||
#else
|
||||
#define GEMM_KERNEL GEMM_KERNEL_N
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 1
|
||||
#define GEMM_UNROLL_N_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 2
|
||||
#define GEMM_UNROLL_N_SHIFT 1
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 4
|
||||
#define GEMM_UNROLL_N_SHIFT 2
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 8
|
||||
#define GEMM_UNROLL_N_SHIFT 3
|
||||
#endif
|
||||
|
||||
#if GEMM_DEFAULT_UNROLL_N == 16
|
||||
#define GEMM_UNROLL_N_SHIFT 4
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef COMPLEX
|
||||
|
||||
static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
|
||||
|
||||
FLOAT aa, bb;
|
||||
|
||||
int i, j, k;
|
||||
|
||||
a += (n - 1) * m;
|
||||
b += (n - 1) * n;
|
||||
|
||||
for (i = n - 1; i >= 0; i--) {
|
||||
|
||||
bb = *(b + i);
|
||||
|
||||
for (j = 0; j < m; j ++) {
|
||||
aa = *(c + j + i * ldc);
|
||||
aa *= bb;
|
||||
*a = aa;
|
||||
*(c + j + i * ldc) = aa;
|
||||
a ++;
|
||||
|
||||
for (k = 0; k < i; k ++){
|
||||
*(c + j + k * ldc) -= aa * *(b + k);
|
||||
}
|
||||
|
||||
}
|
||||
b -= n;
|
||||
a -= 2 * m;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#else

static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

  FLOAT aa1, aa2;
  FLOAT bb1, bb2;
  FLOAT cc1, cc2;

  int i, j, k;

  ldc *= 2;

  a += (n - 1) * m * 2;
  b += (n - 1) * n * 2;

  for (i = n - 1; i >= 0; i--) {

    bb1 = *(b + i * 2 + 0);
    bb2 = *(b + i * 2 + 1);

    for (j = 0; j < m; j ++) {

      aa1 = *(c + j * 2 + 0 + i * ldc);
      aa2 = *(c + j * 2 + 1 + i * ldc);

#ifndef CONJ
      cc1 = aa1 * bb1 - aa2 * bb2;
      cc2 = aa1 * bb2 + aa2 * bb1;
#else
      cc1 = aa1 * bb1 + aa2 * bb2;
      cc2 = - aa1 * bb2 + aa2 * bb1;
#endif

      *(a + 0) = cc1;
      *(a + 1) = cc2;

      *(c + j * 2 + 0 + i * ldc) = cc1;
      *(c + j * 2 + 1 + i * ldc) = cc2;
      a += 2;

      for (k = 0; k < i; k ++){
#ifndef CONJ
        *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
        *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#else
        *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
        *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#endif
      }

    }
    b -= n * 2;
    a -= 4 * m;
  }

}

#endif

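/*
 * Driver below: c and b are first advanced past the end of the block and the
 * N dimension is walked backwards, handling the n % GEMM_UNROLL_N remainder
 * one power-of-two slice at a time (j = 1, 2, 4, ...) before the full
 * GEMM_UNROLL_N panels. Within each panel the M dimension is swept in chunks
 * of one SVE vector (sve_size lanes) plus an m % sve_size tail; for every
 * chunk, GEMM_KERNEL first applies the already-solved trailing k - kk columns
 * and solve() then finishes the triangular part.
 */
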
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
          FLOAT dummy2,
#endif
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){

  BLASLONG i, j;
  FLOAT *aa, *cc;
  BLASLONG kk;
#ifdef DOUBLE
  int sve_size = svcntd();
#else
  int sve_size = svcntw();
#endif

#if 0
  fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n",
          m, n, k, offset);
#endif

  kk = n - offset;
  c += n * ldc * COMPSIZE;
  b += n * k * COMPSIZE;

  if (n & (GEMM_UNROLL_N - 1)) {

    j = 1;
    while (j < GEMM_UNROLL_N) {
      if (n & j) {

        aa = a;
        b -= j * k * COMPSIZE;
        c -= j * ldc * COMPSIZE;
        cc = c;

        i = sve_size;
        if (i <= m) {

          do {
            if (k - kk > 0) {
              GEMM_KERNEL(sve_size, j, k - kk, dm1,
#ifdef COMPLEX
                          ZERO,
#endif
                          aa + sve_size * kk * COMPSIZE,
                          b + j * kk * COMPSIZE,
                          cc,
                          ldc);
            }

            solve(sve_size, j,
                  aa + (kk - j) * sve_size * COMPSIZE,
                  b + (kk - j) * j * COMPSIZE,
                  cc, ldc);

            aa += sve_size * k * COMPSIZE;
            cc += sve_size * COMPSIZE;
            i += sve_size;
          } while (i <= m);
        }

        i = m % sve_size;
        if (i) {
          if (k - kk > 0) {
            GEMM_KERNEL(i, j, k - kk, dm1,
#ifdef COMPLEX
                        ZERO,
#endif
                        aa + i * kk * COMPSIZE,
                        b + j * kk * COMPSIZE,
                        cc, ldc);
          }

          solve(i, j,
                aa + (kk - j) * i * COMPSIZE,
                b + (kk - j) * j * COMPSIZE,
                cc, ldc);

          aa += i * k * COMPSIZE;
          cc += i * COMPSIZE;

        }
        kk -= j;
      }
      j <<= 1;
    }
  }

  j = (n >> GEMM_UNROLL_N_SHIFT);

  if (j > 0) {

    do {
      aa = a;
      b -= GEMM_UNROLL_N * k * COMPSIZE;
      c -= GEMM_UNROLL_N * ldc * COMPSIZE;
      cc = c;

      i = sve_size;
      if (i <= m) {
        do {
          if (k - kk > 0) {
            GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
                        ZERO,
#endif
                        aa + sve_size * kk * COMPSIZE,
                        b + GEMM_UNROLL_N * kk * COMPSIZE,
                        cc,
                        ldc);
          }

          solve(sve_size, GEMM_UNROLL_N,
                aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE,
                b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
                cc, ldc);

          aa += sve_size * k * COMPSIZE;
          cc += sve_size * COMPSIZE;
          i += sve_size;
        } while (i <= m);
      }

      i = m % sve_size;
      if (i) {
        if (k - kk > 0) {
          GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
                      ZERO,
#endif
                      aa + i * kk * COMPSIZE,
                      b + GEMM_UNROLL_N * kk * COMPSIZE,
                      cc,
                      ldc);
        }

        solve(i, GEMM_UNROLL_N,
              aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE,
              b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
              cc, ldc);

        aa += i * k * COMPSIZE;
        cc += i * COMPSIZE;

      }

      kk -= GEMM_UNROLL_N;
      j --;
    } while (j > 0);
  }

  return 0;
}

@ -0,0 +1,119 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
#include "common.h"
#include "arm_sve.h"

#ifndef UNIT
#define INV(a) (ONE / (a))
#else
#define INV(a) (ONE)
#endif

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG i, ii, jj;

  FLOAT *ao;

  jj = offset;
#ifdef DOUBLE
  int64_t js = 0;
  svint64_t index = svindex_s64(0LL, lda);
  svbool_t pn = svwhilelt_b64(js, n);
  int n_active = svcntp_b64(svptrue_b64(), pn);
#else
  int32_t N = n;
  int32_t js = 0;
  svint32_t index = svindex_s32(0, lda);
  svbool_t pn = svwhilelt_b32(js, N);
  int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
  do {

    ao = a;

    i = 0;
    ii = 0;
    do {

      if (ii == jj) {
        for (int j = 0; j < n_active; j++) {
          for (int k = 0; k < j; k++) {
            *(b + j * n_active + k) = *(ao + k * lda + j);
          }
          *(b + j * n_active + j) = INV(*(ao + j * lda + j));
        }
        ao += n_active;
        b += n_active * n_active;
        i += n_active;
        ii += n_active;
      } else {
        if (ii > jj) {
#ifdef DOUBLE
          svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
#else
          svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
#endif
          svst1(pn, b, aj_vec);
        }
        ao++;
        b += n_active;
        i++;
        ii++;
      }
    } while (i < m);

    a += n_active * lda;
    jj += n_active;

    js += n_active;
#ifdef DOUBLE
    pn = svwhilelt_b64(js, n);
    n_active = svcntp_b64(svptrue_b64(), pn);
  } while (svptest_any(svptrue_b64(), pn));
#else
    pn = svwhilelt_b32(js, N);
    n_active = svcntp_b32(svptrue_b32(), pn);
  } while (svptest_any(svptrue_b32(), pn));
#endif

  return 0;
}
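
/*
 * This packing routine prepares the triangular panel consumed by the TRSM
 * solve kernels: on the diagonal block (ii == jj) it emits an
 * n_active x n_active tile whose diagonal entries are stored as INV(), i.e.
 * as reciprocals (or ONE for unit-diagonal matrices), so the solver can
 * multiply instead of divide; off-diagonal elements are gathered across the
 * leading dimension (stride lda) one predicated vector at a time and stored
 * contiguously in b.
 */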
@ -0,0 +1,117 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
*(b + j * n_active + k) = *(ao + j * lda + k);
|
||||
}
|
||||
}
|
||||
b += n_active * n_active;
|
||||
ao += lda * n_active;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii < jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1(pn, ao);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1(pn, ao);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
}
|
||||
ao += lda;
|
||||
b += n_active;
|
||||
i ++;
|
||||
ii ++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
@ -0,0 +1,119 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
*(b + j * n_active + k) = *(ao + k * lda + j);
|
||||
}
|
||||
}
|
||||
ao += n_active;
|
||||
b += n_active * n_active;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii < jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1_gather_index(pn, ao, index);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
}
|
||||
ao++;
|
||||
b += n_active;
|
||||
i++;
|
||||
ii++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active * lda;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
@ -0,0 +1,117 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
#ifndef UNIT
|
||||
#define INV(a) (ONE / (a))
|
||||
#else
|
||||
#define INV(a) (ONE)
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0; k < j; k++) {
|
||||
*(b + j * n_active + k) = *(ao + j * lda + k);
|
||||
}
|
||||
*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
}
|
||||
ao += lda * n_active;
|
||||
b += n_active * n_active;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii > jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec = svld1(pn, ao);
|
||||
#else
|
||||
svfloat32_t aj_vec = svld1(pn, ao);
|
||||
#endif
|
||||
svst1(pn, b, aj_vec);
|
||||
}
|
||||
ao += lda;
|
||||
b += n_active;
|
||||
i ++;
|
||||
ii ++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
@ -0,0 +1,874 @@
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
#include "common.h"

/* X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */

#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define temp x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pCRow3 x15
#define pA x16
#define lanes x17

#define alphaR x19
#define alphaI x20

#define alphaz_R z6.d
#define alphaz_I z7.d
#define alpha0_R d6
#define alpha0_I d7

#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
#define OP_ii fmls
#define OP_ri fmla
#define OP_ir fmla
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define OP_rr fmla
#define OP_ii fmla
#define OP_ri fmls
#define OP_ir fmla
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define OP_rr fmla
#define OP_ii fmla
#define OP_ri fmla
#define OP_ir fmls
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define OP_rr fmla
#define OP_ii fmls
#define OP_ri fmls
#define OP_ir fmls
#endif
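
// The OP_* selections above encode the four complex multiply-accumulate sign
// conventions. With a = ar + i*ai (from A) and b = br + i*bi (from B), a plain
// product contributes
//   acc_r += ar*br - ai*bi
//   acc_i += ar*bi + ai*br
// which is the NN/NT/TN/TT case (OP_ii = fmls, the others fmla). Conjugating B
// flips the sign of the ar*bi term, conjugating A flips the ai*br term, and
// conjugating both negates the whole imaginary part; the fmla/fmls choices for
// OP_rr/OP_ii/OP_ri/OP_ir in each branch implement exactly those sign flips.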
|
||||
|
||||
// 00 origM
|
||||
// 01 origN
|
||||
// 02 origK
|
||||
// 03 origPA
|
||||
// 04 origPB
|
||||
// 05 pC
|
||||
// 06 origLDC -> LDC
|
||||
// 07 offset -> temp
|
||||
// 08 counterL
|
||||
// 09 counterI
|
||||
// 10 counterJ
|
||||
// 11 pB
|
||||
// 12 pCRow0
|
||||
// 13 pCRow1
|
||||
// 14 pCRow2
|
||||
// 15 pCRow3
|
||||
// 16 pA
|
||||
// 17 alpha_save_R
|
||||
// 18 must save alpha_save_I
|
||||
// 19 must save
|
||||
// 20 must save
|
||||
// 21 must save
|
||||
// 22 must save
|
||||
// 23 must save
|
||||
// 24 must save
|
||||
// 25 must save
|
||||
// 26 must save
|
||||
// 27 must save
|
||||
// 28 must save
|
||||
// 29 frame
|
||||
// 30 link
|
||||
// 31 sp
|
||||
|
||||
//v00 ALPHA_R -> pA00_R, pA01_R
|
||||
//v01 ALPHA_I -> pA00_I, pA01_I
|
||||
//v02 pA02_R, pA03_R
|
||||
//v03 pA02_I, pA03_I
|
||||
//v04 pA10_R, pA11_R
|
||||
//v05 pA10_I, pA11_I
|
||||
//v06 pA12_R, pA13_R
|
||||
//v07 pA12_I, pA13_I
|
||||
//v08 must save pB00_R, pB01_R
|
||||
//v09 must save pB00_I, pB01_I
|
||||
//v10 must save pB02_R, pB03_R OR ALPHA0_R
|
||||
//v11 must save pB02_I, pB03_I OR ALPHA0_I
|
||||
//v12 must save pB10_R, pB11_R
|
||||
//v13 must save pB10_I, pB11_I
|
||||
//v14 must save pB12_R, pB13_R OR ALPHA1_R
|
||||
//v15 must save pB12_I, pB13_I OR ALPHA1_I
|
||||
//v16 pC0R
|
||||
//v17 pC0I
|
||||
//v18 pC1R
|
||||
//v19 pC1I
|
||||
//v20 pC2R
|
||||
//v21 pC2I
|
||||
//v22 pC3R
|
||||
//v23 pC3I
|
||||
//v24 pC3R
|
||||
//v25 pC3I
|
||||
//v26 pC22_R, pC23_R
|
||||
//v27 pC22_I, pC23_I
|
||||
//v28 pC30_R, pC31_R
|
||||
//v29 pC30_I, pC31_I
|
||||
//v30 pC32_R, pC33_R
|
||||
//v31 pC32_I, pC33_I
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
.macro INITv1x4
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
dup z18.d, #0
|
||||
dup z19.d, #0
|
||||
dup z20.d, #0
|
||||
dup z21.d, #0
|
||||
dup z22.d, #0
|
||||
dup z23.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_I
|
||||
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #4 // pA += lanes*2*8
|
||||
ld2d {z2.d, z3.d}, p1/z, [pA] // next one
|
||||
add pA, pA, lanes, lsl #4 // pA += lanes*2*8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
fmla z16.d, p1/m, z0.d, z8.d
|
||||
OP_ir z17.d, p1/m, z1.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z17.16b, z17.16b, z17.16b
|
||||
fmls z17.d, p1/m, z0.d, z9.d
|
||||
#else
|
||||
fmla z17.d, p1/m, z0.d, z9.d
|
||||
#endif
|
||||
OP_ii z16.d, p1/m, z1.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
||||
|
||||
fmla z18.d, p1/m, z0.d, z10.d
|
||||
OP_ir z19.d, p1/m, z1.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
OP_ii z18.d, p1/m, z1.d, z11.d
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z19.16b, z21.16b, z21.16b
|
||||
fmls z19.d, p1/m, z0.d, z11.d
|
||||
#else
|
||||
fmla z19.d, p1/m, z0.d, z11.d
|
||||
#endif
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
|
||||
fmla z20.d, p1/m, z0.d, z12.d
|
||||
OP_ir z21.d, p1/m, z1.d, z12.d
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z21.16b, z23.16b, z23.16b
|
||||
fmls z21.d, p1/m, z0.d, z13.d
|
||||
#else
|
||||
fmla z21.d, p1/m, z0.d, z13.d
|
||||
#endif
|
||||
OP_ii z20.d, p1/m, z1.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
|
||||
|
||||
fmla z22.d, p1/m, z0.d, z14.d
|
||||
OP_ir z23.d, p1/m, z1.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
|
||||
defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#eor z23.16b, z19.16b, z19.16b
|
||||
fmls z23.d, p1/m, z0.d, z15.d
|
||||
#else
|
||||
fmla z23.d, p1/m, z0.d, z15.d
|
||||
#endif
|
||||
OP_ii z22.d, p1/m, z1.d, z15.d
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M1
|
||||
ld2d {z2.d, z3.d}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
|
||||
|
||||
OP_rr z16.d, p1/m, z0.d, z8.d
|
||||
OP_ir z17.d, p1/m, z1.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
OP_ii z16.d, p1/m, z1.d, z9.d
|
||||
OP_ri z17.d, p1/m, z0.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
||||
OP_rr z18.d, p1/m, z0.d, z10.d
|
||||
OP_ir z19.d, p1/m, z1.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
OP_ii z18.d, p1/m, z1.d, z11.d
|
||||
OP_ri z19.d, p1/m, z0.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
OP_rr z20.d, p1/m, z0.d, z12.d
|
||||
OP_ir z21.d, p1/m, z1.d, z12.d
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
OP_ii z20.d, p1/m, z1.d, z13.d
|
||||
OP_ri z21.d, p1/m, z0.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
|
||||
OP_rr z22.d, p1/m, z0.d, z14.d
|
||||
OP_ir z23.d, p1/m, z1.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
OP_ii z22.d, p1/m, z1.d, z15.d
|
||||
OP_ri z23.d, p1/m, z0.d, z15.d
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M2
|
||||
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8
|
||||
|
||||
OP_rr z16.d, p1/m, z2.d, z8.d
|
||||
OP_ir z17.d, p1/m, z3.d, z8.d
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
OP_ii z16.d, p1/m, z3.d, z9.d
|
||||
OP_ri z17.d, p1/m, z2.d, z9.d
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
||||
OP_rr z18.d, p1/m, z2.d, z10.d
|
||||
OP_ir z19.d, p1/m, z3.d, z10.d
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
OP_ii z18.d, p1/m, z3.d, z11.d
|
||||
OP_ri z19.d, p1/m, z2.d, z11.d
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
OP_rr z20.d, p1/m, z2.d, z12.d
|
||||
OP_ir z21.d, p1/m, z3.d, z12.d
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
OP_ii z20.d, p1/m, z3.d, z13.d
|
||||
OP_ri z21.d, p1/m, z2.d, z13.d
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
|
||||
OP_rr z22.d, p1/m, z2.d, z14.d
|
||||
OP_ir z23.d, p1/m, z3.d, z14.d
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
OP_ii z22.d, p1/m, z3.d, z15.d
|
||||
OP_ri z23.d, p1/m, z2.d, z15.d
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_E
|
||||
OP_rr z16.d, p1/m, z2.d, z8.d
|
||||
OP_ir z17.d, p1/m, z3.d, z8.d
|
||||
OP_ii z16.d, p1/m, z3.d, z9.d
|
||||
OP_ri z17.d, p1/m, z2.d, z9.d
|
||||
|
||||
OP_rr z18.d, p1/m, z2.d, z10.d
|
||||
OP_ir z19.d, p1/m, z3.d, z10.d
|
||||
OP_ii z18.d, p1/m, z3.d, z11.d
|
||||
OP_ri z19.d, p1/m, z2.d, z11.d
|
||||
|
||||
OP_rr z20.d, p1/m, z2.d, z12.d
|
||||
OP_ir z21.d, p1/m, z3.d, z12.d
|
||||
OP_ii z20.d, p1/m, z3.d, z13.d
|
||||
OP_ri z21.d, p1/m, z2.d, z13.d
|
||||
|
||||
OP_rr z22.d, p1/m, z2.d, z14.d
|
||||
OP_ir z23.d, p1/m, z3.d, z14.d
|
||||
OP_ii z22.d, p1/m, z3.d, z15.d
|
||||
OP_ri z23.d, p1/m, z2.d, z15.d
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
.endm
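
// The _I/_M1/_M2/_E macros form a two-stage software pipeline: KERNELv1x4_I
// primes it by loading both the current (z0/z1) and the next (z2/z3)
// de-interleaved A vectors plus the first B scalars, KERNELv1x4_M1 and _M2
// then alternate between the two register pairs so that the loads for the
// next step overlap the FMAs of the current one, and KERNELv1x4_E drains the
// pipeline without issuing further loads.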
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
OP_rr z16.d, p1/m, z0.d, z8.d
|
||||
OP_ir z17.d, p1/m, z1.d, z8.d
|
||||
OP_ii z16.d, p1/m, z1.d, z9.d
|
||||
OP_ri z17.d, p1/m, z0.d, z9.d
|
||||
|
||||
ld1rd z12.d, p0/z, [pB, 32]
|
||||
ld1rd z13.d, p0/z, [pB, 40]
|
||||
ld1rd z14.d, p0/z, [pB, 48]
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
OP_rr z18.d, p1/m, z0.d, z10.d
|
||||
OP_ir z19.d, p1/m, z1.d, z10.d
|
||||
OP_ii z18.d, p1/m, z1.d, z11.d
|
||||
OP_ri z19.d, p1/m, z0.d, z11.d
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
OP_rr z20.d, p1/m, z0.d, z12.d
|
||||
OP_ir z21.d, p1/m, z1.d, z12.d
|
||||
OP_ii z20.d, p1/m, z1.d, z13.d
|
||||
OP_ri z21.d, p1/m, z0.d, z13.d
|
||||
|
||||
OP_rr z22.d, p1/m, z0.d, z14.d
|
||||
OP_ir z23.d, p1/m, z1.d, z14.d
|
||||
OP_ii z22.d, p1/m, z1.d, z15.d
|
||||
OP_ri z23.d, p1/m, z0.d, z15.d
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaz_R
|
||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||
fmla z25.d, p1/m, z16.d, alphaz_I
|
||||
fmla z25.d, p1/m, z17.d, alphaz_R
|
||||
st2d {z24.d, z25.d}, p1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #4
|
||||
|
||||
ld2d {z26.d, z27.d}, p1/z, [pCRow1]
|
||||
fmla z26.d, p1/m, z18.d, alphaz_R
|
||||
fmls z26.d, p1/m, z19.d, alphaz_I
|
||||
fmla z27.d, p1/m, z18.d, alphaz_I
|
||||
fmla z27.d, p1/m, z19.d, alphaz_R
|
||||
st2d {z26.d, z27.d}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #4
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z28.d, z29.d}, p1/z, [pCRow2]
|
||||
fmla z28.d, p1/m, z20.d, alphaz_R
|
||||
fmls z28.d, p1/m, z21.d, alphaz_I
|
||||
fmla z29.d, p1/m, z20.d, alphaz_I
|
||||
fmla z29.d, p1/m, z21.d, alphaz_R
|
||||
st2d {z28.d, z29.d}, p1, [pCRow2]
|
||||
|
||||
add pCRow2, pCRow2, lanes, lsl #4
|
||||
|
||||
ld2d {z30.d, z31.d}, p1/z, [pCRow3]
|
||||
fmla z30.d, p1/m, z22.d, alphaz_R
|
||||
fmls z30.d, p1/m, z23.d, alphaz_I
|
||||
fmla z31.d, p1/m, z22.d, alphaz_I
|
||||
fmla z31.d, p1/m, z23.d, alphaz_R
|
||||
st2d {z30.d, z31.d}, p1, [pCRow3]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
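
// SAVEv1x4 applies C := C + alpha * AB in complex arithmetic: each column of
// C is loaded de-interleaved (ld2d) into a real/imag vector pair and updated as
//   c_r += ab_r*alpha_r - ab_i*alpha_i
//   c_i += ab_r*alpha_i + ab_i*alpha_r
// before being stored back interleaved with st2d.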
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
.macro INITv1x2
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
dup z18.d, #0
|
||||
dup z19.d, #0
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x2_SUB
|
||||
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
ld1rd z10.d, p0/z, [pB, 16]
|
||||
ld1rd z11.d, p0/z, [pB, 24]
|
||||
|
||||
OP_rr z16.d, p1/m, z0.d, z8.d
|
||||
OP_ir z17.d, p1/m, z1.d, z8.d
|
||||
OP_ii z16.d, p1/m, z1.d, z9.d
|
||||
OP_ri z17.d, p1/m, z0.d, z9.d
|
||||
|
||||
OP_rr z18.d, p1/m, z0.d, z10.d
|
||||
OP_ir z19.d, p1/m, z1.d, z10.d
|
||||
OP_ii z18.d, p1/m, z1.d, z11.d
|
||||
OP_ri z19.d, p1/m, z0.d, z11.d
|
||||
|
||||
add pB, pB, 32
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaz_R
|
||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||
fmla z25.d, p1/m, z16.d, alphaz_I
|
||||
fmla z25.d, p1/m, z17.d, alphaz_R
|
||||
st2d {z24.d, z25.d}, p1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #4
|
||||
|
||||
ld2d {z26.d, z27.d}, p1/z, [pCRow1]
|
||||
fmla z26.d, p1/m, z18.d, alphaz_R
|
||||
fmls z26.d, p1/m, z19.d, alphaz_I
|
||||
fmla z27.d, p1/m, z18.d, alphaz_I
|
||||
fmla z27.d, p1/m, z19.d, alphaz_R
|
||||
st2d {z26.d, z27.d}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #4
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
.macro INITv1x1
|
||||
dup z16.d, #0
|
||||
dup z17.d, #0
|
||||
.endm
|
||||
|
||||
|
||||
.macro KERNELv1x1_SUB
|
||||
ld2d {z0.d, z1.d}, p1/z, [pA]
|
||||
add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8
|
||||
|
||||
ld1rd z8.d, p0/z, [pB]
|
||||
ld1rd z9.d, p0/z, [pB, 8]
|
||||
|
||||
add pB, pB, 16
|
||||
|
||||
OP_rr z16.d, p1/m, z0.d, z8.d
|
||||
OP_ir z17.d, p1/m, z1.d, z8.d
|
||||
OP_ii z16.d, p1/m, z1.d, z9.d
|
||||
OP_ri z17.d, p1/m, z0.d, z9.d
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaz_R
|
||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||
fmla z25.d, p1/m, z16.d, alphaz_I
|
||||
fmla z25.d, p1/m, z17.d, alphaz_R
|
||||
st2d {z24.d, z25.d}, p1, [pCRow0]
|
||||
|
||||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.align 5
|
||||
add sp, sp, #-(11 * 16)
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
dup alphaz_R, alphaR
|
||||
fmov alphaI, d1
|
||||
dup alphaz_I, alphaI
|
||||
|
||||
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
|
||||
ptrue p0.d // create true predicate
|
||||
|
||||
mov pB, origPB
|
||||
|
||||
// Loop over N
|
||||
mov counterJ, origN
|
||||
asr counterJ, counterJ, #2 // J = J / 4
|
||||
cmp counterJ, #0
|
||||
ble .Lzgemm_kernel_L2_BEGIN
|
||||
|
||||
/******************************************************************************/
|
||||
.Lzgemm_kernel_L4_BEGIN:
|
||||
mov pCRow0, pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
add pCRow2, pCRow1, LDC
|
||||
add pCRow3, pCRow2, LDC
|
||||
|
||||
add pC, pCRow3, LDC
|
||||
|
||||
mov pA, origPA // pA = start of A array
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_BEGIN:
|
||||
|
||||
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM
|
||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||
|
||||
.align 5
|
||||
.Lzgemm_kernel_L4_Mv1_20:
|
||||
|
||||
mov pB, origPB
|
||||
INITv1x4 // fill with zeros
|
||||
|
||||
asr counterL , origK, #3
|
||||
cmp counterL , #2
|
||||
blt .Lzgemm_kernel_L4_Mv1_32
|
||||
|
||||
KERNELv1x4_I
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
|
||||
subs counterL, counterL, #2 // subtract 2
|
||||
ble .Lzgemm_kernel_L4_Mv1_22a
|
||||
|
||||
.align 5
|
||||
.Lzgemm_kernel_L4_Mv1_22:
|
||||
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lzgemm_kernel_L4_Mv1_22
|
||||
|
||||
.align 5
|
||||
.Lzgemm_kernel_L4_Mv1_22a:
|
||||
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_E
|
||||
|
||||
b .Lzgemm_kernel_L4_Mv1_44
|
||||
|
||||
.align 5
|
||||
.Lzgemm_kernel_L4_Mv1_32:
|
||||
|
||||
tst counterL, #1
|
||||
ble .Lzgemm_kernel_L4_Mv1_40
|
||||
|
||||
KERNELv1x4_I
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_M2
|
||||
KERNELv1x4_M1
|
||||
KERNELv1x4_E
|
||||
|
||||
b .Lzgemm_kernel_L4_Mv1_44
|
||||
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_40:
|
||||
|
||||
INITv1x4
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_44:
|
||||
|
||||
ands counterL , origK, #7
|
||||
ble .Lzgemm_kernel_L4_Mv1_100
|
||||
|
||||
.align 5
|
||||
.Lzgemm_kernel_L4_Mv1_46:
|
||||
KERNELv1x4_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bne .Lzgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
|
||||
b.any .Lzgemm_kernel_L4_Mv1_20
|
||||
|
||||
|
||||
|
||||
.Lzgemm_kernel_L4_END:
|
||||
|
||||
lsl temp, origK, #6
|
||||
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
|
||||
|
||||
subs counterJ, counterJ , #1 // j--
|
||||
bgt .Lzgemm_kernel_L4_BEGIN
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #3
|
||||
ble .Lzgemm_kernel_L999
|
||||
|
||||
tst counterJ , #2
|
||||
ble .Lzgemm_kernel_L1_BEGIN
|
||||
|
||||
mov pCRow0, pC // pCRow0 = pC
|
||||
add pCRow1, pCRow0, LDC
|
||||
|
||||
add pC,pC,LDC, lsl #1
|
||||
|
||||
mov pA, origPA // pA = A
|
||||
|
||||
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_20:
|
||||
|
||||
INITv1x2
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL,#0
|
||||
ble .Lzgemm_kernel_L2_Mv1_40
|
||||
.align 5
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_22:
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lzgemm_kernel_L2_Mv1_22
|
||||
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble .Lzgemm_kernel_L2_Mv1_100
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_42:
|
||||
|
||||
KERNELv1x2_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lzgemm_kernel_L2_Mv1_42
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_100:
|
||||
|
||||
SAVEv1x2
|
||||
|
||||
.Lzgemm_kernel_L2_Mv1_END:
|
||||
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Lzgemm_kernel_L2_Mv1_20
|
||||
|
||||
|
||||
.Lzgemm_kernel_L2_END:
|
||||
lsl temp, origK, #5
|
||||
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Lzgemm_kernel_L1_BEGIN:
|
||||
|
||||
mov counterJ , origN
|
||||
tst counterJ , #1
|
||||
ble .Lzgemm_kernel_L999 // done
|
||||
|
||||
|
||||
mov pCRow0, pC // pCRow0 = C
|
||||
add pC , pC , LDC // Update pC to point to next
|
||||
|
||||
mov pA, origPA // pA = A
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_BEGIN:
|
||||
|
||||
mov counterI, #0
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_20:
|
||||
|
||||
INITv1x1
|
||||
|
||||
mov pB, origPB
|
||||
asr counterL , origK, #3 // counterL = counterL / 8
|
||||
cmp counterL , #0
|
||||
ble .Lzgemm_kernel_L1_Mv1_40
|
||||
.align 5
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_22:
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lzgemm_kernel_L1_Mv1_22
|
||||
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_40:
|
||||
|
||||
ands counterL , origK, #7 // counterL = counterL % 8
|
||||
ble .Lzgemm_kernel_L1_Mv1_100
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_42:
|
||||
|
||||
KERNELv1x1_SUB
|
||||
|
||||
subs counterL, counterL, #1
|
||||
bgt .Lzgemm_kernel_L1_Mv1_42
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_100:
|
||||
|
||||
SAVEv1x1
|
||||
|
||||
.Lzgemm_kernel_L1_Mv1_END:
|
||||
|
||||
incd counterI
|
||||
whilelt p1.d, counterI, origM //SVE instruction
|
||||
cntp lanes, p0, p1.d
|
||||
b.any .Lzgemm_kernel_L1_Mv1_20
|
||||
|
||||
.Lzgemm_kernel_L1_END:
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.Lzgemm_kernel_L999:
|
||||
mov x0, #0 // set return value
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16)
|
||||
ret
|
||||
|
||||
EPILOGUE
@ -0,0 +1,79 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>

// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG j;
  IFLOAT *aoffset, *aoffset1, *boffset;

  svint64_t lda_vec = svindex_s64(0LL, lda * 2);

  aoffset = a;
  boffset = b;

  j = 0;
  svbool_t pg = svwhilelt_b64(j, n);
  uint64_t active = svcntp_b64(svptrue_b64(), pg);
  do {

    aoffset1 = aoffset;

    uint64_t i_cnt = m;
    while (i_cnt--) {
      svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
      svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec);
      svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag));
      aoffset1 += 2;
      boffset += active * 2;
    }
    aoffset += active * lda * 2;

    j += svcntd();
    pg = svwhilelt_b64(j, n);
    active = svcntp_b64(svptrue_b64(), pg);

  } while (svptest_any(svptrue_b64(), pg));

  return 0;
}
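
/*
 * For each group of `active` columns (one SVE vector of the N dimension) this
 * routine walks all m rows; per row it gathers the real and imaginary parts
 * of those columns with a stride of lda complex elements and stores them
 * interleaved (svst2) into the packed buffer, so the GEMM kernel can stream
 * them contiguously.
 */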
@ -0,0 +1,75 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>

// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG j;
  IFLOAT *aoffset, *aoffset1, *boffset;

  aoffset = a;
  boffset = b;

  j = 0;
  svbool_t pg = svwhilelt_b64(j, n);
  uint64_t active = svcntp_b64(svptrue_b64(), pg);
  do {

    aoffset1 = aoffset;

    uint64_t i_cnt = m;
    while (i_cnt--) {
      svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1);
      svst2_f64(pg, (double *) boffset, a_vec);
      aoffset1 += lda * 2;
      boffset += active * 2;
    }
    aoffset += active * 2;

    j += svcntd();
    pg = svwhilelt_b64(j, n);
    active = svcntp_b64(svptrue_b64(), pg);

  } while (svptest_any(svptrue_b64(), pg));

  return 0;
}
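
/*
 * Complementary traversal to the gather-based copy above: here one svld2
 * reads `active` consecutive complex elements directly (the vectorized
 * dimension is contiguous in memory in this variant), the pointer steps by
 * lda complex elements between iterations of the m loop, and the data is
 * again emitted as an interleaved packed panel of `active` elements per step.
 */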
@ -0,0 +1,172 @@
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
#if defined(DOUBLE)
|
||||
BLASLONG offset, i;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmul_z(pg, temp, 2);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
|
||||
svint64_t temp2 = svmul_z(pg, temp, lda_vec);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, 2);
|
||||
svint64_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b64(offset, 0LL);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
// dealing with ZERO separately
|
||||
if (offset > -active && offset < 1)
|
||||
b[ -2*offset + 1 ] = ZERO;
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
#else
|
||||
|
||||
int offset, i;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
uint32_t sve_size = svcntw();
|
||||
svint32_t posY_vec = svdup_s32(posY);
|
||||
svint32_t posX_vec = svdup_s32(posX);
|
||||
svint32_t lda_vec = svdup_s32(lda);
|
||||
svint32_t one_vec = svdup_s32(1);
|
||||
|
||||
int32_t j = 0;
|
||||
int32_t N = n;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint32_t vec_off = svdup_s32(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint32_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint32_t temp1 = svmul_z(pg, temp, 2);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
|
||||
svint32_t temp2 = svmul_z(pg, temp, lda_vec);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, 2);
|
||||
svint32_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b32(offset, 0);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
      // the diagonal element is handled separately: its imaginary part is stored as ZERO
|
||||
if (offset > -active && offset < 1)
|
||||
b[ -2*offset + 1 ] = ZERO;
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,172 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
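/* Companion copy routine for the opposite triangle: the row/column gather
   paths in the index selection are swapped, and the imaginary parts are
   negated up front so the second, predicated negation below restores the
   original sign for the first -offset lanes while the remaining lanes stay
   conjugated. */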
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
#if defined(DOUBLE)
|
||||
BLASLONG offset, i;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmul_z(pg, temp, lda);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, 2);
|
||||
svint64_t temp2 = svmul_z(pg, temp, 2);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, lda);
|
||||
svint64_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, 2);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
data_vec_imag = svneg_z(pg, data_vec_imag);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b64(offset, 0LL);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
      // the diagonal element is handled separately: its imaginary part is stored as ZERO
|
||||
if (offset > -active && offset < 1)
|
||||
b[ -2*offset + 1 ] = ZERO;
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
#else
|
||||
int offset, i;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
uint32_t sve_size = svcntw();
|
||||
svint32_t posY_vec = svdup_s32(posY);
|
||||
svint32_t posX_vec = svdup_s32(posX);
|
||||
svint32_t lda_vec = svdup_s32(lda);
|
||||
svint32_t one_vec = svdup_s32(1);
|
||||
|
||||
int32_t j = 0;
|
||||
int32_t N = n;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint32_t vec_off = svdup_s32(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint32_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint32_t temp1 = svmul_z(pg, temp, lda);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, 2);
|
||||
svint32_t temp2 = svmul_z(pg, temp, 2);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, lda);
|
||||
svint32_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, 2);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
data_vec_imag = svneg_z(pg, data_vec_imag);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b32(offset, 0);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
      // the diagonal element is handled separately: its imaginary part is stored as ZERO
|
||||
if (offset > -active && offset < 1)
|
||||
b[ -2*offset + 1 ] = ZERO;
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
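/* Complex symmetric (SYMM-style) panel copy: same gather/select scheme as
   the Hermitian copies above, but the mirrored elements are copied verbatim,
   with no conjugation and no clearing of the diagonal imaginary part. */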
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, offset;
|
||||
lda *= 2;
|
||||
|
||||
#if defined(DOUBLE)
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmul_z(pg, temp, 2);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
|
||||
svint64_t temp2 = svmul_z(pg, temp, lda_vec);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, 2);
|
||||
svint64_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
#else
|
||||
uint32_t sve_size = svcntw();
|
||||
svint32_t posY_vec = svdup_s32(posY);
|
||||
svint32_t posX_vec = svdup_s32(posX);
|
||||
svint32_t lda_vec = svdup_s32(lda);
|
||||
svint32_t one_vec = svdup_s32(1);
|
||||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint32_t vec_off = svdup_s32(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint32_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint32_t temp1 = svmul_z(pg, temp, 2);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
|
||||
svint32_t temp2 = svmul_z(pg, temp, lda_vec);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, 2);
|
||||
svint32_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, offset;
|
||||
lda *= 2;
|
||||
|
||||
#if defined(DOUBLE)
|
||||
uint64_t sve_size = svcntd();
|
||||
svint64_t posY_vec = svdup_s64(posY);
|
||||
svint64_t posX_vec = svdup_s64(posX);
|
||||
svint64_t lda_vec = svdup_s64(lda);
|
||||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint64_t vec_off = svdup_s64(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint64_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint64_t temp1 = svmul_z(pg, temp, lda_vec);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, 2);
|
||||
svint64_t temp2 = svmul_z(pg, temp, 2);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, lda);
|
||||
svint64_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, 2);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
#else
|
||||
uint32_t sve_size = svcntw();
|
||||
svint32_t posY_vec = svdup_s32(posY);
|
||||
svint32_t posX_vec = svdup_s32(posX);
|
||||
svint32_t lda_vec = svdup_s32(lda);
|
||||
svint32_t one_vec = svdup_s32(1);
|
||||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
do {
|
||||
offset = posX - posY;
|
||||
svint32_t vec_off = svdup_s32(offset);
|
||||
svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
svint32_t temp = svadd_z(pg, posX_vec, index);
|
||||
svint32_t temp1 = svmul_z(pg, temp, lda_vec);
|
||||
temp1 = svmla_z(pg, temp1, posY_vec, 2);
|
||||
svint32_t temp2 = svmul_z(pg, temp, 2);
|
||||
temp2 = svmla_z(pg, temp2, posY_vec, lda);
|
||||
svint32_t gat_ind = svsel(cmp, temp1, temp2);
|
||||
|
||||
i = m;
|
||||
while (i>0) {
|
||||
svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
|
||||
svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
|
||||
|
||||
gat_ind = svadd_m(cmp, gat_ind, 2);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
|
||||
svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
|
||||
|
||||
b += active * 2;
|
||||
offset --;
|
||||
vec_off = svsub_z(pg, vec_off, one_vec);
|
||||
cmp = svcmpgt(pg, vec_off, index_neg);
|
||||
|
||||
i--;
|
||||
}
|
||||
|
||||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
File diff suppressed because it is too large
|
@ -0,0 +1,145 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
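/* Triangular (TRMM-style) panel copy: columns strictly on one side of the
   diagonal are copied with lda-strided gather loads, columns on the other
   side are skipped, and the diagonal block is expanded element by element,
   writing ONE/ZERO on the diagonal when UNIT is defined and zero-filling
   the unreferenced triangle of the block. */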
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
lda += lda;
|
||||
|
||||
js = 0;
|
||||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posY * 2 + posX * lda;
|
||||
} else {
|
||||
ao = a + posX * 2 + posY * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X > posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#else
|
||||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#endif
|
||||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
|
||||
ao += 2;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X < posY) {
|
||||
ao += lda;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = *(ao+k*lda+j*2);
|
||||
b[temp++] = *(ao+k*lda+j*2+1);
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
b[temp++] = ZERO;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k <= j; k++) {
|
||||
b[temp++] = *(ao+k*lda+j*2);
|
||||
b[temp++] = *(ao+k*lda+j*2+1);
|
||||
}
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active * 2;
|
||||
b += n_active*n_active * 2;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,143 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
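/* Companion TRMM copy for the transposed access pattern: the off-diagonal
   rows are loaded contiguously with svld2 (de-interleaving real/imaginary
   parts) instead of gathers, and the diagonal block is again expanded
   scalar-wise with optional UNIT handling. */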
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
lda += lda;
|
||||
|
||||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posY * 2 + posX * lda;
|
||||
} else {
|
||||
ao = a + posX * 2 + posY * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X > posY) {
|
||||
ao += 2;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X < posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64x2_t aj_vec = svld2(pn, ao);
|
||||
#else
|
||||
svfloat32x2_t aj_vec = svld2(pn, ao);
|
||||
#endif
|
||||
svst2(pn, b, aj_vec);
|
||||
ao += lda;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
b[temp++] = ZERO;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = *(ao+j*lda+k*2);
|
||||
b[temp++] = *(ao+j*lda+k*2+1);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
for (int k = j; k < n_active; k++) {
|
||||
b[temp++] = *(ao+j*lda+k*2);
|
||||
b[temp++] = *(ao+j*lda+k*2+1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active * lda;
|
||||
b += n_active*n_active * 2;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,145 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
lda += lda;
|
||||
|
||||
js = 0;
|
||||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posX * 2 + posY * lda;
|
||||
} else {
|
||||
ao = a + posY * 2 + posX * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X < posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#else
|
||||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#endif
|
||||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
|
||||
ao += 2;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X > posY) {
|
||||
ao += lda;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
b[temp++] = ZERO;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = *(ao+k*lda+j*2);
|
||||
b[temp++] = *(ao+k*lda+j*2+1);
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
for (int k = j; k < n_active; k++) {
|
||||
b[temp++] = *(ao+k*lda+j*2);
|
||||
b[temp++] = *(ao+k*lda+j*2+1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active * 2;
|
||||
b += n_active*n_active * 2;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,141 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
|
||||
|
||||
BLASLONG i, js;
|
||||
BLASLONG X;
|
||||
|
||||
lda += lda;
|
||||
|
||||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
{
|
||||
X = posX;
|
||||
|
||||
if (posX <= posY) {
|
||||
ao = a + posX * 2 + posY * lda;
|
||||
} else {
|
||||
ao = a + posY * 2 + posX * lda;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
do
|
||||
{
|
||||
if (X < posY) {
|
||||
ao += 2;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else
|
||||
if (X > posY) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64x2_t aj_vec = svld2(pn, ao);
|
||||
#else
|
||||
svfloat32x2_t aj_vec = svld2(pn, ao);
|
||||
#endif
|
||||
svst2(pn, b, aj_vec);
|
||||
ao += lda;
|
||||
b += n_active * 2;
|
||||
X ++;
|
||||
i ++;
|
||||
} else {
|
||||
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
|
||||
#ifdef UNIT
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k < j; k++) {
|
||||
b[temp++] = *(ao+j*lda+k*2);
|
||||
b[temp++] = *(ao+j*lda+k*2+1);
|
||||
}
|
||||
b[temp++] = ONE;
|
||||
b[temp++] = ZERO;
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#else
|
||||
int temp = 0;
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0 ; k <= j; k++) {
|
||||
b[temp++] = *(ao+j*lda+k*2);
|
||||
b[temp++] = *(ao+j*lda+k*2+1);
|
||||
}
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
b[temp++] = ZERO;
|
||||
b[temp++] = ZERO;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ao += n_active * lda;
|
||||
b += n_active*n_active * 2;
|
||||
X += n_active;
|
||||
i += n_active;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0; k < j; k++) {
|
||||
*(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j);
|
||||
*(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1);
|
||||
}
|
||||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
|
||||
//*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
}
|
||||
ao += n_active * 2;
|
||||
b += n_active * n_active * 2;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii > jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#else
|
||||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#endif
|
||||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
|
||||
}
|
||||
ao += 2;
|
||||
b += n_active * 2;
|
||||
i++;
|
||||
ii++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active * lda;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
|
||||
//*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
*(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k);
|
||||
*(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1);
|
||||
}
|
||||
}
|
||||
b += n_active * n_active * 2;
|
||||
ao += lda * n_active;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii < jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64x2_t aj_vec = svld2(pn, ao);
|
||||
#else
|
||||
svfloat32x2_t aj_vec = svld2(pn, ao);
|
||||
#endif
|
||||
svst2(pn, b, aj_vec);
|
||||
}
|
||||
ao += lda;
|
||||
b += n_active * 2;
|
||||
i ++;
|
||||
ii ++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active * 2;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
|
||||
//*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
for (int k = j+1; k < n_active; k++) {
|
||||
*(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j);
|
||||
*(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1);
|
||||
}
|
||||
}
|
||||
ao += n_active * 2;
|
||||
b += n_active * n_active * 2;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii < jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#else
|
||||
svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
|
||||
svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
|
||||
#endif
|
||||
svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
|
||||
}
|
||||
ao += 2;
|
||||
b += n_active * 2;
|
||||
i++;
|
||||
ii++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active * lda;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include "arm_sve.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
|
||||
|
||||
BLASLONG i, ii, jj;
|
||||
|
||||
FLOAT *ao;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
||||
ao = a;
|
||||
|
||||
i = 0;
|
||||
ii = 0;
|
||||
do {
|
||||
|
||||
if (ii == jj) {
|
||||
for (int j = 0; j < n_active; j++) {
|
||||
for (int k = 0; k < j; k++) {
|
||||
*(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k);
|
||||
*(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1);
|
||||
}
|
||||
compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1));
|
||||
//*(b + j * n_active + j) = INV(*(ao + j * lda + j));
|
||||
}
|
||||
ao += lda * n_active;
|
||||
b += n_active * n_active * 2;
|
||||
i += n_active;
|
||||
ii += n_active;
|
||||
} else {
|
||||
if (ii > jj) {
|
||||
#ifdef DOUBLE
|
||||
svfloat64x2_t aj_vec = svld2(pn, ao);
|
||||
#else
|
||||
svfloat32x2_t aj_vec = svld2(pn, ao);
|
||||
#endif
|
||||
svst2(pn, b, aj_vec);
|
||||
}
|
||||
ao += lda;
|
||||
b += n_active * 2;
|
||||
i ++;
|
||||
ii ++;
|
||||
}
|
||||
} while (i < m);
|
||||
|
||||
|
||||
a += n_active * 2;
|
||||
jj += n_active;
|
||||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,149 @@
|
|||
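# Kernel selection table: maps each BLAS routine to the source file that
# implements it for this target.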
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = ../arm/iamax.c
|
||||
ICAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = ../arm/asum.c
|
||||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../arm/copy.c
|
||||
DCOPYKERNEL = ../arm/copy.c
|
||||
CCOPYKERNEL = ../arm/zcopy.c
|
||||
ZCOPYKERNEL = ../arm/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../arm/dot.c
|
||||
DDOTKERNEL = ../arm/dot.c
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
SSCALKERNEL = ../arm/scal.c
|
||||
DSCALKERNEL = ../arm/scal.c
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../arm/swap.c
|
||||
DSWAPKERNEL = ../arm/swap.c
|
||||
CSWAPKERNEL = ../arm/zswap.c
|
||||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
|
|
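All of the entries above point at the portable C fallbacks under ../arm and ../generic, so this kernel list builds the target entirely from generic code. As rough orientation for what such a generic copy file does, here is a hedged C sketch in the spirit of the 2-wide packing routines referenced above (gemm_ncopy_2.c and friends); it is illustrative only, not the literal contents of any file in this tree.

/* Pack a column-major m x n matrix (leading dimension lda) into
   contiguous 2-column panels so a 2x2 GEMM microkernel can stream it.
   Sketch only: the real generic copy files differ in details. */
static int gemm_ncopy_2_sketch(long m, long n, const double *a, long lda,
                               double *b)
{
    long j = 0;
    for (; j + 1 < n; j += 2) {            /* full 2-column panels       */
        const double *a0 = a + (j + 0) * lda;
        const double *a1 = a + (j + 1) * lda;
        for (long i = 0; i < m; i++) {
            *b++ = a0[i];                  /* interleave the two columns */
            *b++ = a1[i];
        }
    }
    if (j < n) {                           /* odd trailing column        */
        const double *a0 = a + j * lda;
        for (long i = 0; i < m; i++)
            *b++ = a0[i];
    }
    return 0;
}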
@@ -0,0 +1 @@
|
|||
clean ::
|
|
@@ -1 +1,14 @@
|
|||
#TODO: Add loongarch64 SIMD optimizations
|
||||
DGEMMKERNEL = dgemm_kernel_16x4.S
|
||||
DGEMMINCOPY = dgemm_ncopy_16.S
|
||||
DGEMMITCOPY = dgemm_tcopy_16.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4.S
|
||||
DGEMMOTCOPY = dgemm_tcopy_4.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
|
File diff suppressed because it is too large
|
@@ -0,0 +1,691 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define S9 $r20
|
||||
#define S10 $r23
|
||||
#define S11 $r24
|
||||
#define S12 $r25
|
||||
#define S13 $r26
|
||||
#define S14 $r27
|
||||
#define S15 $r28
|
||||
#define S16 $r29
|
||||
#define TD $r30
|
||||
#define TS $r31
|
||||
#define TL $r7
|
||||
#define T0 $r6
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define U8 $xr8
|
||||
#define U9 $xr9
|
||||
#define U10 $xr10
|
||||
#define U11 $xr11
|
||||
#define U12 $xr12
|
||||
#define U13 $xr13
|
||||
#define U14 $xr14
|
||||
#define U15 $xr15
|
||||
#define D0 $xr16
|
||||
#define D1 $xr17
|
||||
#define D2 $xr18
|
||||
#define D3 $xr19
|
||||
#define D4 $xr20
|
||||
#define D5 $xr21
|
||||
#define D6 $xr22
|
||||
#define D7 $xr23
|
||||
#define D8 $xr24
|
||||
#define D9 $xr25
|
||||
#define D10 $xr26
|
||||
#define D11 $xr27
|
||||
#define D12 $xr28
|
||||
#define D13 $xr29
|
||||
#define D14 $xr30
|
||||
#define D15 $xr31
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -0x90
|
||||
SDARG $r23, $sp, 0x00
|
||||
SDARG $r24, $sp, 0x08
|
||||
SDARG $r25, $sp, 0x10
|
||||
SDARG $r26, $sp, 0x18
|
||||
SDARG $r27, $sp, 0x20
|
||||
SDARG $r28, $sp, 0x28
|
||||
SDARG $r29, $sp, 0x30
|
||||
SDARG $r30, $sp, 0x38
|
||||
SDARG $r31, $sp, 0x40
|
||||
ST $f23, $sp, 0x48
|
||||
ST $f24, $sp, 0x50
|
||||
ST $f25, $sp, 0x58
|
||||
ST $f26, $sp, 0x60
|
||||
ST $f27, $sp, 0x68
|
||||
ST $f28, $sp, 0x70
|
||||
ST $f29, $sp, 0x78
|
||||
ST $f30, $sp, 0x80
|
||||
ST $f31, $sp, 0x88
|
||||
|
||||
move TD, DST
|
||||
move TS, SRC
|
||||
slli.d TL, LDA, 0x03
|
||||
slli.d T0, TL, 0x01
|
||||
srai.d J, N, 0x04
|
||||
beq J, ZERO, .L_N8
|
||||
|
||||
.L_J1: /* J-- */
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x03
|
||||
add.d S3, S2, TL
|
||||
addi.d J, J, -1
|
||||
add.d S4, S3, TL
|
||||
add.d S5, S3, T0
|
||||
add.d S6, S4, T0
|
||||
add.d S7, S5, T0
|
||||
add.d S8, S6, T0
|
||||
add.d S9, S7, T0
|
||||
add.d S10, S8, T0
|
||||
add.d S11, S9, T0
|
||||
add.d S12, S10, T0
|
||||
add.d S13, S11, T0
|
||||
add.d S14, S12, T0
|
||||
add.d S15, S13, T0
|
||||
add.d S16, S14, T0
|
||||
add.d TS, S15, T0
|
||||
beq I, ZERO, .L_I7
|
||||
|
||||
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
xvld U8, S9, 0x00
|
||||
xvld U9, S10, 0x00
|
||||
xvld U10, S11, 0x00
|
||||
xvld U11, S12, 0x00
|
||||
xvld U12, S13, 0x00
|
||||
xvld U13, S14, 0x00
|
||||
xvld U14, S15, 0x00
|
||||
xvld U15, S16, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvpackev.d D8, U9, U8
|
||||
xvpackod.d D9, U9, U8
|
||||
xvpackev.d D10, U11, U10
|
||||
xvpackod.d D11, U11, U10
|
||||
xvpackev.d D12, U13, U12
|
||||
xvpackod.d D13, U13, U12
|
||||
xvpackev.d D14, U15, U14
|
||||
xvpackod.d D15, U15, U14
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 4
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 5
|
||||
xvpermi.q D2, U0, 0x31 // 8
|
||||
xvpermi.q D6, U4, 0x31 // 9
|
||||
xvpermi.q D3, U1, 0x31 // 12
|
||||
xvpermi.q D7, U5, 0x31 // 13
|
||||
|
||||
xvand.v U8, D8, D8
|
||||
xvpermi.q D8, D10, 0x02 // 2
|
||||
xvand.v U12, D12, D12
|
||||
xvpermi.q D12, D14, 0x02 // 3
|
||||
xvand.v U9, D9, D9
|
||||
xvpermi.q D9, D11, 0x02 // 6
|
||||
xvand.v U13, D13, D13
|
||||
xvpermi.q D13, D15, 0x02 // 7
|
||||
xvpermi.q D10, U8, 0x31 // 10
|
||||
xvpermi.q D14, U12, 0x31 // 11
|
||||
xvpermi.q D11, U9, 0x31 // 14
|
||||
xvpermi.q D15, U13, 0x31 // 15
|
||||
|
||||
xvst D0, TD, 0x00 // 0
|
||||
xvst D4, TD, 0x20 // 1
|
||||
xvst D8, TD, 0x40 // 2
|
||||
xvst D12, TD, 0x60 // 3
|
||||
xvst D1, TD, 0x80 // 4
|
||||
xvst D5, TD, 0xA0 // 5
|
||||
xvst D9, TD, 0xC0 // 6
|
||||
xvst D13, TD, 0xE0 // 7
|
||||
addi.d TD, TD, 0x100
|
||||
xvst D2, TD, 0x00 // 8
|
||||
xvst D6, TD, 0x20 // 9
|
||||
xvst D10, TD, 0x40 // 10
|
||||
xvst D14, TD, 0x60 // 11
|
||||
xvst D3, TD, 0x80 // 12
|
||||
xvst D7, TD, 0xA0 // 13
|
||||
xvst D11, TD, 0xC0 // 14
|
||||
xvst D15, TD, 0xE0 // 15
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
xvld U0, S1, 0x20
|
||||
xvld U1, S2, 0x20
|
||||
xvld U2, S3, 0x20
|
||||
xvld U3, S4, 0x20
|
||||
xvld U4, S5, 0x20
|
||||
xvld U5, S6, 0x20
|
||||
xvld U6, S7, 0x20
|
||||
xvld U7, S8, 0x20
|
||||
xvld U8, S9, 0x20
|
||||
xvld U9, S10, 0x20
|
||||
xvld U10, S11, 0x20
|
||||
xvld U11, S12, 0x20
|
||||
xvld U12, S13, 0x20
|
||||
xvld U13, S14, 0x20
|
||||
xvld U14, S15, 0x20
|
||||
xvld U15, S16, 0x20
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvpackev.d D8, U9, U8
|
||||
xvpackod.d D9, U9, U8
|
||||
xvpackev.d D10, U11, U10
|
||||
xvpackod.d D11, U11, U10
|
||||
xvpackev.d D12, U13, U12
|
||||
xvpackod.d D13, U13, U12
|
||||
xvpackev.d D14, U15, U14
|
||||
xvpackod.d D15, U15, U14
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 4
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 5
|
||||
xvpermi.q D2, U0, 0x31 // 8
|
||||
xvpermi.q D6, U4, 0x31 // 9
|
||||
xvpermi.q D3, U1, 0x31 // 12
|
||||
xvpermi.q D7, U5, 0x31 // 13
|
||||
|
||||
xvand.v U8, D8, D8
|
||||
xvpermi.q D8, D10, 0x02 // 2
|
||||
xvand.v U12, D12, D12
|
||||
xvpermi.q D12, D14, 0x02 // 3
|
||||
xvand.v U9, D9, D9
|
||||
xvpermi.q D9, D11, 0x02 // 6
|
||||
xvand.v U13, D13, D13
|
||||
xvpermi.q D13, D15, 0x02 // 7
|
||||
xvpermi.q D10, U8, 0x31 // 10
|
||||
xvpermi.q D14, U12, 0x31 // 11
|
||||
xvpermi.q D11, U9, 0x31 // 14
|
||||
xvpermi.q D15, U13, 0x31 // 15
|
||||
|
||||
xvst D0, TD, 0x00 // 0
|
||||
xvst D4, TD, 0x20 // 1
|
||||
xvst D8, TD, 0x40 // 2
|
||||
xvst D12, TD, 0x60 // 3
|
||||
xvst D1, TD, 0x80 // 4
|
||||
xvst D5, TD, 0xA0 // 5
|
||||
xvst D9, TD, 0xC0 // 6
|
||||
xvst D13, TD, 0xE0 // 7
|
||||
addi.d TD, TD, 0x100
|
||||
xvst D2, TD, 0x00 // 8
|
||||
xvst D6, TD, 0x20 // 9
|
||||
xvst D10, TD, 0x40 // 10
|
||||
xvst D14, TD, 0x60 // 11
|
||||
xvst D3, TD, 0x80 // 12
|
||||
xvst D7, TD, 0xA0 // 13
|
||||
xvst D11, TD, 0xC0 // 14
|
||||
xvst D15, TD, 0xE0 // 15
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d S5, S5, 0x40
|
||||
addi.d S6, S6, 0x40
|
||||
addi.d S7, S7, 0x40
|
||||
addi.d S8, S8, 0x40
|
||||
addi.d S9, S9, 0x40
|
||||
addi.d S10, S10, 0x40
|
||||
addi.d S11, S11, 0x40
|
||||
addi.d S12, S12, 0x40
|
||||
addi.d S13, S13, 0x40
|
||||
addi.d S14, S14, 0x40
|
||||
addi.d S15, S15, 0x40
|
||||
addi.d S16, S16, 0x40
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
.L_I7:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_I0
|
||||
|
||||
.L_II1: /* I-- */
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
fld.d F4, S5, 0x00
|
||||
fld.d F5, S6, 0x00
|
||||
fld.d F6, S7, 0x00
|
||||
fld.d F7, S8, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
fst.d F4, TD, 0x20
|
||||
addi.d S5, S5, 0x08
|
||||
fst.d F5, TD, 0x28
|
||||
addi.d S6, S6, 0x08
|
||||
fst.d F6, TD, 0x30
|
||||
addi.d S7, S7, 0x08
|
||||
fst.d F7, TD, 0x38
|
||||
addi.d S8, S8, 0x08
|
||||
addi.d TD, TD, 0x40
|
||||
|
||||
fld.d F0, S9, 0x00
|
||||
fld.d F1, S10, 0x00
|
||||
fld.d F2, S11, 0x00
|
||||
fld.d F3, S12, 0x00
|
||||
fld.d F4, S13, 0x00
|
||||
fld.d F5, S14, 0x00
|
||||
fld.d F6, S15, 0x00
|
||||
fld.d F7, S16, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S9, S9, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S10, S10, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S11, S11, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S12, S12, 0x08
|
||||
fst.d F4, TD, 0x20
|
||||
addi.d S13, S13, 0x08
|
||||
fst.d F5, TD, 0x28
|
||||
addi.d S14, S14, 0x08
|
||||
fst.d F6, TD, 0x30
|
||||
addi.d S15, S15, 0x08
|
||||
fst.d F7, TD, 0x38
|
||||
addi.d S16, S16, 0x08
|
||||
addi.d TD, TD, 0x40
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_II1
|
||||
|
||||
.L_I0:
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
.L_N8:
|
||||
andi J, N, 0x08
|
||||
beq ZERO, J, .L_N4
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x03
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S2, T0
|
||||
add.d S5, S3, T0
|
||||
add.d S6, S4, T0
|
||||
add.d S7, S5, T0
|
||||
add.d S8, S6, T0
|
||||
add.d TS, S7, T0
|
||||
beq I, ZERO, .L_8I3
|
||||
|
||||
.L_8I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 2
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 3
|
||||
xvpermi.q D2, U0, 0x31 // 4
|
||||
xvpermi.q D6, U4, 0x31 // 5
|
||||
xvpermi.q D3, U1, 0x31 // 6
|
||||
xvpermi.q D7, U5, 0x31 // 7
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D4, TD, 0x20
|
||||
xvst D1, TD, 0x40
|
||||
xvst D5, TD, 0x60
|
||||
xvst D2, TD, 0x80
|
||||
xvst D6, TD, 0xA0
|
||||
xvst D3, TD, 0xC0
|
||||
xvst D7, TD, 0xE0
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
xvld U0, S1, 0x20
|
||||
xvld U1, S2, 0x20
|
||||
xvld U2, S3, 0x20
|
||||
xvld U3, S4, 0x20
|
||||
xvld U4, S5, 0x20
|
||||
xvld U5, S6, 0x20
|
||||
xvld U6, S7, 0x20
|
||||
xvld U7, S8, 0x20
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 2
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 3
|
||||
xvpermi.q D2, U0, 0x31 // 4
|
||||
xvpermi.q D6, U4, 0x31 // 5
|
||||
xvpermi.q D3, U1, 0x31 // 6
|
||||
xvpermi.q D7, U5, 0x31 // 7
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D4, TD, 0x20
|
||||
xvst D1, TD, 0x40
|
||||
xvst D5, TD, 0x60
|
||||
xvst D2, TD, 0x80
|
||||
xvst D6, TD, 0xA0
|
||||
xvst D3, TD, 0xC0
|
||||
xvst D7, TD, 0xE0
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d S5, S5, 0x40
|
||||
addi.d S6, S6, 0x40
|
||||
addi.d S7, S7, 0x40
|
||||
addi.d S8, S8, 0x40
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_8I1
|
||||
|
||||
.L_8I3:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_N4
|
||||
|
||||
.L_8I11:
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
fld.d F4, S5, 0x00
|
||||
fld.d F5, S6, 0x00
|
||||
fld.d F6, S7, 0x00
|
||||
fld.d F7, S8, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
fst.d F4, TD, 0x20
|
||||
addi.d S5, S5, 0x08
|
||||
fst.d F5, TD, 0x28
|
||||
addi.d S6, S6, 0x08
|
||||
fst.d F6, TD, 0x30
|
||||
addi.d S7, S7, 0x08
|
||||
fst.d F7, TD, 0x38
|
||||
addi.d S8, S8, 0x08
|
||||
|
||||
addi.d TD, TD, 0x40
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_8I11
|
||||
|
||||
.L_N4:
|
||||
andi J, N, 0x04
|
||||
beq ZERO, J, .L_N2
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x02
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S2, T0
|
||||
add.d TS, S3, T0
|
||||
beq I, ZERO, .L_I3
|
||||
|
||||
.L_4I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 1
|
||||
xvpermi.q D2, U0, 0x31 // 2
|
||||
xvpermi.q D3, U1, 0x31 // 3
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D1, TD, 0x20
|
||||
xvst D2, TD, 0x40
|
||||
xvst D3, TD, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d TD, TD, 0x80
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_4I1
|
||||
|
||||
.L_I3:
|
||||
andi I, M, 0x03
|
||||
beq I, ZERO, .L_N2
|
||||
|
||||
.L_4II1:
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
|
||||
addi.d TD, TD, 0x20
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_4II1
|
||||
|
||||
.L_N2:
|
||||
andi J, N, 0x02
|
||||
beq ZERO, J, .L_N1
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x01
|
||||
add.d TS, S2, TL
|
||||
beq I, ZERO, .L_NI1
|
||||
|
||||
.L_2I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
|
||||
xvpermi.q D0, D1, 0x02 // 0
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d TD, TD, 0x20
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_2I1
|
||||
|
||||
.L_NI1:
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_N1
|
||||
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
.L_N1:
|
||||
move S1, TS
|
||||
beq ZERO, M, .L_N0
|
||||
|
||||
.L_M1:
|
||||
fld.d F0, S1, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d TD, TD, 0x08
|
||||
addi.d M, M, -1
|
||||
blt ZERO, M, .L_M1
|
||||
|
||||
.L_N0:
|
||||
LDARG $r23, $sp, 0x00
|
||||
LDARG $r24, $sp, 0x08
|
||||
LDARG $r25, $sp, 0x10
|
||||
LDARG $r26, $sp, 0x18
|
||||
LDARG $r27, $sp, 0x20
|
||||
LDARG $r28, $sp, 0x28
|
||||
LDARG $r29, $sp, 0x30
|
||||
LDARG $r30, $sp, 0x38
|
||||
LDARG $r31, $sp, 0x40
|
||||
LD $f23, $sp, 0x48
|
||||
LD $f24, $sp, 0x50
|
||||
LD $f25, $sp, 0x58
|
||||
LD $f26, $sp, 0x60
|
||||
LD $f27, $sp, 0x68
|
||||
LD $f28, $sp, 0x70
|
||||
LD $f29, $sp, 0x78
|
||||
LD $f30, $sp, 0x80
|
||||
LD $f31, $sp, 0x88
|
||||
addi.d $sp, $sp, 0x90
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
|
@@ -0,0 +1,237 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define TD $r20
|
||||
#define TS $r11
|
||||
#define TL $r7
|
||||
#define T0 $r23
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define D0 $xr14
|
||||
#define D1 $xr8
|
||||
#define D2 $xr9
|
||||
#define D3 $xr10
|
||||
#define D4 $xr11
|
||||
#define D5 $xr12
|
||||
#define D6 $xr13
|
||||
#define D7 $xr15
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -8
|
||||
SDARG $r23, $sp, 0
|
||||
|
||||
move TD, DST
|
||||
move TS, SRC
|
||||
slli.d TL, LDA, 0x03
|
||||
slli.d T0, TL, 0x01
|
||||
srai.d J, N, 0x02
|
||||
beq J, ZERO, .L_N2
|
||||
|
||||
.L_J1: /* J-- */
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x02
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S2, T0
|
||||
add.d TS, S3, T0
|
||||
addi.d J, J, -1
|
||||
beq I, ZERO, .L_I3
|
||||
|
||||
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 1
|
||||
xvpermi.q D2, U0, 0x31 // 2
|
||||
xvpermi.q D3, U1, 0x31 // 3
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D1, TD, 0x20
|
||||
xvst D2, TD, 0x40
|
||||
xvst D3, TD, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d TD, TD, 0x80
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
.L_I3:
|
||||
andi I, M, 0x03
|
||||
beq I, ZERO, .L_I0
|
||||
|
||||
.L_II1:
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
|
||||
addi.d TD, TD, 0x20
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_II1
|
||||
|
||||
.L_I0:
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
.L_N2:
|
||||
andi J, N, 0x02
|
||||
beq ZERO, J, .L_N1
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x02
|
||||
add.d TS, S2, TL
|
||||
beq I, ZERO, .L_2I3
|
||||
|
||||
.L_2I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D1, 0x02 // 0
|
||||
xvpermi.q D1, U0, 0x31 // 1
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D1, TD, 0x20
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d TD, TD, 0x40
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_2I1
|
||||
|
||||
.L_2I3:
|
||||
andi I, M, 0x03
|
||||
beq ZERO, I, .L_N1
|
||||
|
||||
.L_2II1: /* I-- */
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d I, I, -1
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
blt ZERO, I, .L_2II1
|
||||
|
||||
.L_N1:
|
||||
andi J, N, 0x01
|
||||
beq ZERO, J, .L_N0
|
||||
|
||||
move S1, TS
|
||||
srai.d I, M, 0x02
|
||||
beq ZERO, I, .L_1I3
|
||||
|
||||
.L_1I1:
|
||||
xvld U0, S1, 0x00
|
||||
addi.d S1, S1, 0x20
|
||||
xvst U0, TD, 0x00
|
||||
addi.d I, I, -1
|
||||
addi.d TD, TD, 0x20
|
||||
blt ZERO, I, .L_1I1
|
||||
|
||||
.L_1I3:
|
||||
andi I, M, 0x03
|
||||
beq ZERO, I, .L_N0
|
||||
|
||||
.L_1II1:
|
||||
fld.d F0, S1, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d I, I, -1
|
||||
addi.d TD, TD, 0x08
|
||||
blt ZERO, I, .L_1II1
|
||||
|
||||
.L_N0:
|
||||
LDARG $r23, $sp, 0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
|
@@ -0,0 +1,710 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S0 $r11
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define P0 $r20
|
||||
#define P1 $r23
|
||||
#define P2 $r24
|
||||
#define P3 $r25
|
||||
#define P4 $r26
|
||||
#define P5 $r27
|
||||
#define T0 $r28
|
||||
#define T1 $r29
|
||||
#define TL $r7
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -56
|
||||
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
SDARG $r25, $sp, 16
|
||||
SDARG $r26, $sp, 24
|
||||
SDARG $r27, $sp, 32
|
||||
SDARG $r28, $sp, 40
|
||||
SDARG $r29, $sp, 48
|
||||
|
||||
move S0, SRC
|
||||
move P0, DST
|
||||
|
||||
srai.d T0, N, 0x04
|
||||
srai.d T1, N, 0x03
|
||||
slli.d T0, T0, 0x04
|
||||
slli.d T1, T1, 0x03
|
||||
mul.d P2, M, T0
|
||||
mul.d P3, M, T1
|
||||
slli.d P2, P2, 0x03
|
||||
slli.d P3, P3, 0x03
|
||||
add.d P2, DST, P2
|
||||
add.d P3, DST, P3
|
||||
|
||||
srai.d T0, N, 0x02
|
||||
srai.d T1, N, 0x01
|
||||
slli.d T0, T0, 0x02
|
||||
slli.d T1, T1, 0x01
|
||||
mul.d P4, M, T0
|
||||
mul.d P5, M, T1
|
||||
slli.d P4, P4, 0x03
|
||||
slli.d P5, P5, 0x03
|
||||
add.d P4, DST, P4
|
||||
add.d P5, DST, P5
|
||||
|
||||
slli.d TL, LDA, 0x03
|
||||
srai.d J, M, 0x03
|
||||
slli.d T0, TL, 0x01
|
||||
slli.d T1, M, 0x07
|
||||
beq ZERO, J, .L_M7
|
||||
|
||||
.L_J1: /* J-- */
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S3, S1, T0
|
||||
add.d S4, S2, T0
|
||||
add.d S5, S3, T0
|
||||
add.d S6, S4, T0
|
||||
add.d S7, S5, T0
|
||||
add.d S8, S6, T0
|
||||
add.d S0, S7, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x400
|
||||
|
||||
srai.d I, N, 0x04
|
||||
addi.d J, J, -1
|
||||
beq ZERO, I, .L_N15
|
||||
|
||||
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
xvld U4, S2, 0x00
|
||||
xvld U5, S2, 0x20
|
||||
xvld U6, S2, 0x40
|
||||
xvld U7, S2, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
xvld U0, S3, 0x00
|
||||
xvld U1, S3, 0x20
|
||||
xvld U2, S3, 0x40
|
||||
xvld U3, S3, 0x60
|
||||
xvld U4, S4, 0x00
|
||||
xvld U5, S4, 0x20
|
||||
xvld U6, S4, 0x40
|
||||
xvld U7, S4, 0x60
|
||||
|
||||
xvst U0, P1, 0x100
|
||||
xvst U1, P1, 0x120
|
||||
xvst U2, P1, 0x140
|
||||
xvst U3, P1, 0x160
|
||||
xvst U4, P1, 0x180
|
||||
xvst U5, P1, 0x1A0
|
||||
xvst U6, P1, 0x1C0
|
||||
xvst U7, P1, 0x1E0
|
||||
|
||||
xvld U0, S5, 0x00
|
||||
xvld U1, S5, 0x20
|
||||
xvld U2, S5, 0x40
|
||||
xvld U3, S5, 0x60
|
||||
xvld U4, S6, 0x00
|
||||
xvld U5, S6, 0x20
|
||||
xvld U6, S6, 0x40
|
||||
xvld U7, S6, 0x60
|
||||
|
||||
xvst U0, P1, 0x200
|
||||
xvst U1, P1, 0x220
|
||||
xvst U2, P1, 0x240
|
||||
xvst U3, P1, 0x260
|
||||
xvst U4, P1, 0x280
|
||||
xvst U5, P1, 0x2A0
|
||||
xvst U6, P1, 0x2C0
|
||||
xvst U7, P1, 0x2E0
|
||||
|
||||
xvld U0, S7, 0x00
|
||||
xvld U1, S7, 0x20
|
||||
xvld U2, S7, 0x40
|
||||
xvld U3, S7, 0x60
|
||||
xvld U4, S8, 0x00
|
||||
xvld U5, S8, 0x20
|
||||
xvld U6, S8, 0x40
|
||||
xvld U7, S8, 0x60
|
||||
|
||||
xvst U0, P1, 0x300
|
||||
xvst U1, P1, 0x320
|
||||
xvst U2, P1, 0x340
|
||||
xvst U3, P1, 0x360
|
||||
xvst U4, P1, 0x380
|
||||
xvst U5, P1, 0x3A0
|
||||
xvst U6, P1, 0x3C0
|
||||
xvst U7, P1, 0x3E0
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d S2, S2, 0x80
|
||||
addi.d S3, S3, 0x80
|
||||
addi.d S4, S4, 0x80
|
||||
addi.d S5, S5, 0x80
|
||||
addi.d S6, S6, 0x80
|
||||
addi.d S7, S7, 0x80
|
||||
addi.d S8, S8, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
.L_N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
xvld U4, S3, 0x00
|
||||
xvld U5, S3, 0x20
|
||||
xvld U6, S4, 0x00
|
||||
xvld U7, S4, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
xvst U2, P2, 0x40
|
||||
xvst U3, P2, 0x60
|
||||
xvst U4, P2, 0x80
|
||||
xvst U5, P2, 0xA0
|
||||
xvst U6, P2, 0xC0
|
||||
xvst U7, P2, 0xE0
|
||||
|
||||
xvld U0, S5, 0x00
|
||||
xvld U1, S5, 0x20
|
||||
xvld U2, S6, 0x00
|
||||
xvld U3, S6, 0x20
|
||||
xvld U4, S7, 0x00
|
||||
xvld U5, S7, 0x20
|
||||
xvld U6, S8, 0x00
|
||||
xvld U7, S8, 0x20
|
||||
|
||||
xvst U0, P2, 0x100
|
||||
xvst U1, P2, 0x120
|
||||
xvst U2, P2, 0x140
|
||||
xvst U3, P2, 0x160
|
||||
xvst U4, P2, 0x180
|
||||
xvst U5, P2, 0x1A0
|
||||
xvst U6, P2, 0x1C0
|
||||
xvst U7, P2, 0x1E0
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d S5, S5, 0x40
|
||||
addi.d S6, S6, 0x40
|
||||
addi.d S7, S7, 0x40
|
||||
addi.d S8, S8, 0x40
|
||||
addi.d P2, P2, 0x200
|
||||
|
||||
.L_N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
xvst U1, P3, 0x20
|
||||
xvst U2, P3, 0x40
|
||||
xvst U3, P3, 0x60
|
||||
xvst U4, P3, 0x80
|
||||
xvst U5, P3, 0xA0
|
||||
xvst U6, P3, 0xC0
|
||||
xvst U7, P3, 0xE0
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d S5, S5, 0x20
|
||||
addi.d S6, S6, 0x20
|
||||
addi.d S7, S7, 0x20
|
||||
addi.d S8, S8, 0x20
|
||||
addi.d P3, P3, 0x100
|
||||
|
||||
.L_N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
xvpermi.q U2, U3, 0x02
|
||||
xvpermi.q U4, U5, 0x02
|
||||
xvpermi.q U6, U7, 0x02
|
||||
|
||||
xvst U0, P4, 0x00
|
||||
xvst U2, P4, 0x20
|
||||
xvst U4, P4, 0x40
|
||||
xvst U6, P4, 0x60
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d S5, S5, 0x10
|
||||
addi.d S6, S6, 0x10
|
||||
addi.d S7, S7, 0x10
|
||||
addi.d S8, S8, 0x10
|
||||
addi.d P4, P4, 0x80
|
||||
|
||||
.L_N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N0
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
fld.d F4, S5, 0x00
|
||||
fld.d F5, S6, 0x00
|
||||
fld.d F6, S7, 0x00
|
||||
fld.d F7, S8, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
fst.d F1, P5, 0x08
|
||||
fst.d F2, P5, 0x10
|
||||
fst.d F3, P5, 0x18
|
||||
fst.d F4, P5, 0x20
|
||||
fst.d F5, P5, 0x28
|
||||
fst.d F6, P5, 0x30
|
||||
fst.d F7, P5, 0x38
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d S3, S3, 0x08
|
||||
addi.d S4, S4, 0x08
|
||||
addi.d S5, S5, 0x08
|
||||
addi.d S6, S6, 0x08
|
||||
addi.d S7, S7, 0x08
|
||||
addi.d S8, S8, 0x08
|
||||
addi.d P5, P5, 0x40
|
||||
|
||||
.L_N0:
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
.L_M7:
|
||||
andi J, M, 0x04
|
||||
beq ZERO, J, .L_M3
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S3, S1, T0
|
||||
add.d S4, S2, T0
|
||||
add.d S0, S3, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x200
|
||||
|
||||
srai.d I, N, 0x04
|
||||
beq ZERO, I, .L_4N15
|
||||
|
||||
.L_4I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
xvld U4, S2, 0x00
|
||||
xvld U5, S2, 0x20
|
||||
xvld U6, S2, 0x40
|
||||
xvld U7, S2, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
xvld U0, S3, 0x00
|
||||
xvld U1, S3, 0x20
|
||||
xvld U2, S3, 0x40
|
||||
xvld U3, S3, 0x60
|
||||
xvld U4, S4, 0x00
|
||||
xvld U5, S4, 0x20
|
||||
xvld U6, S4, 0x40
|
||||
xvld U7, S4, 0x60
|
||||
|
||||
xvst U0, P1, 0x100
|
||||
xvst U1, P1, 0x120
|
||||
xvst U2, P1, 0x140
|
||||
xvst U3, P1, 0x160
|
||||
xvst U4, P1, 0x180
|
||||
xvst U5, P1, 0x1A0
|
||||
xvst U6, P1, 0x1C0
|
||||
xvst U7, P1, 0x1E0
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d S2, S2, 0x80
|
||||
addi.d S3, S3, 0x80
|
||||
addi.d S4, S4, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_4I1
|
||||
|
||||
.L_4N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_4N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
xvld U4, S3, 0x00
|
||||
xvld U5, S3, 0x20
|
||||
xvld U6, S4, 0x00
|
||||
xvld U7, S4, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
xvst U2, P2, 0x40
|
||||
xvst U3, P2, 0x60
|
||||
xvst U4, P2, 0x80
|
||||
xvst U5, P2, 0xA0
|
||||
xvst U6, P2, 0xC0
|
||||
xvst U7, P2, 0xE0
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d P2, P2, 0x100
|
||||
|
||||
.L_4N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_4N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
xvst U1, P3, 0x20
|
||||
xvst U2, P3, 0x40
|
||||
xvst U3, P3, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d P3, P3, 0x80
|
||||
|
||||
.L_4N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_4N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
xvpermi.q U2, U3, 0x02
|
||||
|
||||
xvst U0, P4, 0x00
|
||||
xvst U2, P4, 0x20
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d P4, P4, 0x40
|
||||
|
||||
.L_4N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M3
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
fst.d F1, P5, 0x08
|
||||
fst.d F2, P5, 0x10
|
||||
fst.d F3, P5, 0x18
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d S3, S3, 0x08
|
||||
addi.d S4, S4, 0x08
|
||||
addi.d P5, P5, 0x20
|
||||
|
||||
.L_M3:
|
||||
andi J, M, 0x02
|
||||
beq ZERO, J, .L_M1
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S0, S0, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x100
|
||||
|
||||
srai.d I, N, 0x04
|
||||
beq ZERO, I, .L_2N15
|
||||
|
||||
.L_2I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
xvld U4, S2, 0x00
|
||||
xvld U5, S2, 0x20
|
||||
xvld U6, S2, 0x40
|
||||
xvld U7, S2, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d S2, S2, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_2I1
|
||||
|
||||
.L_2N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_2N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
xvst U2, P2, 0x40
|
||||
xvst U3, P2, 0x60
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d P2, P2, 0x80
|
||||
|
||||
.L_2N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_2N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
xvst U1, P3, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d P3, P3, 0x40
|
||||
|
||||
.L_2N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_2N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
|
||||
xvst U0, P4, 0x00
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d P4, P4, 0x20
|
||||
|
||||
.L_2N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M1
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
fst.d F1, P5, 0x08
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d P5, P5, 0x10
|
||||
|
||||
.L_M1:
|
||||
andi J, M, 0x01
|
||||
beq ZERO, J, .L_M0
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x80
|
||||
|
||||
srai.d I, N, 0x04
|
||||
beq ZERO, I, .L_1N15
|
||||
|
||||
.L_1I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_1I1
|
||||
|
||||
.L_1N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_1N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d P2, P2, 0x40
|
||||
|
||||
.L_1N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_1N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d P3, P3, 0x20
|
||||
|
||||
.L_1N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_1N1
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S1, 0x08
|
||||
|
||||
fst.d F0, P4, 0x00
|
||||
fst.d F1, P4, 0x08
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d P4, P4, 0x10
|
||||
|
||||
.L_1N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M0
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d P5, P5, 0x08
|
||||
|
||||
.L_M0:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
LDARG $r25, $sp, 16
|
||||
LDARG $r26, $sp, 24
|
||||
LDARG $r27, $sp, 32
|
||||
LDARG $r28, $sp, 40
|
||||
LDARG $r29, $sp, 48
|
||||
addi.d $sp, $sp, 56
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
|
@@ -0,0 +1,270 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S0 $r11
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define P0 $r16
|
||||
#define P1 $r17
|
||||
#define P2 $r18
|
||||
#define P3 $r19
|
||||
#define T0 $r20
|
||||
#define T1 $r23
|
||||
#define TL $r7
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -8
|
||||
SDARG $r23, $sp, 0
|
||||
|
||||
move S0, SRC
|
||||
move P0, DST
|
||||
|
||||
srai.d T0, N, 0x02
|
||||
slli.d T0, T0, 0x02
|
||||
srai.d T1, N, 0x01
|
||||
slli.d T1, T1, 0x01
|
||||
mul.d T0, M, T0
|
||||
mul.d T1, M, T1
|
||||
slli.d T0, T0, 0x03
|
||||
slli.d T1, T1, 0x03
|
||||
add.d P2, DST, T0
|
||||
add.d P3, DST, T1
|
||||
|
||||
slli.d TL, LDA, 0x03
|
||||
srai.d J, M, 0x02
|
||||
slli.d T0, TL, 0x01
|
||||
slli.d T1, M, 0x05
|
||||
beq ZERO, J, .L_M3
|
||||
|
||||
.L_J1: /* J-- */
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S3, S1, T0
|
||||
add.d S4, S2, T0
|
||||
add.d S0, S3, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x80
|
||||
|
||||
srai.d I, N, 0x02
|
||||
addi.d J, J, -1
|
||||
beq ZERO, I, .L_N3
|
||||
|
||||
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
add.d P1, P1, T1
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
.L_N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
xvpermi.q U2, U3, 0x02
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U2, P2, 0x20
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d P2, P2, 0x40
|
||||
|
||||
.L_N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N0
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
|
||||
fst.d F0, P3, 0x00
|
||||
fst.d F1, P3, 0x08
|
||||
fst.d F2, P3, 0x10
|
||||
fst.d F3, P3, 0x18
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d S3, S3, 0x08
|
||||
addi.d S4, S4, 0x08
|
||||
addi.d P3, P3, 0x20
|
||||
|
||||
.L_N0:
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
.L_M3:
|
||||
andi J, M, 0x02
|
||||
beq ZERO, J, .L_M1
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S0, S0, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x40
|
||||
|
||||
srai.d I, N, 0x02
|
||||
beq ZERO, I, .L_2N3
|
||||
|
||||
.L_2I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
|
||||
blt ZERO, I, .L_2I1
|
||||
|
||||
.L_2N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_2N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d P2, P2, 0x20
|
||||
|
||||
.L_2N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M1
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
|
||||
fst.d F0, P3, 0x00
|
||||
fst.d F1, P3, 0x08
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d P3, P3, 0x10
|
||||
|
||||
.L_M1:
|
||||
andi J, M, 0x01
|
||||
beq ZERO, J, .L_M0
|
||||
|
||||
move S1, S0
|
||||
move P1, P0
|
||||
|
||||
srai.d I, N, 0x02
|
||||
beq ZERO, I, .L_1N3
|
||||
|
||||
.L_1I1:
|
||||
xvld U0, S1, 0x00
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
|
||||
blt ZERO, I, .L_1I1
|
||||
|
||||
.L_1N3:
|
||||
andi I, N, 0x02
|
||||
beq I, ZERO, .L_1N1
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S1, 0x08
|
||||
|
||||
fst.d F0, P2, 0x00
|
||||
fst.d F1, P2, 0x08
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d P2, P2, 0x10
|
||||
|
||||
.L_1N1:
|
||||
andi I, N, 0x01
|
||||
beq I, ZERO, .L_M0
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
|
||||
fst.d F0, P3, 0x00
|
||||
|
||||
.L_M0:
|
||||
LDARG $r23, $sp, 0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
|
@@ -39,11 +39,19 @@ IZAMINKERNEL = izamax.S
|
|||
endif
|
||||
|
||||
ifndef ISMINKERNEL
|
||||
ISMINKERNEL = iamax.S
|
||||
ISMINKERNEL = imax.S
|
||||
endif
|
||||
|
||||
ifndef IDMINKERNEL
|
||||
IDMINKERNEL = iamax.S
|
||||
IDMINKERNEL = imax.S
|
||||
endif
|
||||
|
||||
ifndef ISMAXKERNEL
|
||||
ISMAXKERNEL = imax.S
|
||||
endif
|
||||
|
||||
ifndef IDMAXKERNEL
|
||||
IDMAXKERNEL = imax.S
|
||||
endif
|
||||
|
||||
ifndef SNRM2KERNEL
|
||||
|
|
|
@@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
|
||||
NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
|
||||
NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads);
|
||||
ptr = (FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
sumf += (*ptr);
|
||||
|
|
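The casts changed in this hunk and in the similar ones below hand the per-thread worker to the level-1 threading helper as a function pointer instead of squeezing it through void *. ISO C does not define conversions between function pointers and object pointers, whereas converting between two function-pointer types is fine as long as the pointer is cast back to its real type before the call, so the (int (*)(void)) form is the cleaner way to pass an "any function" argument, presumably matching the helper's declared parameter type. A small self-contained illustration (dispatch and worker are invented names for this example, not OpenBLAS symbols):

#include <stdio.h>

typedef int (*generic_fn)(void);           /* "any function" parameter type */

static int worker(void *ctx)               /* the real callback signature   */
{
    printf("worker got %s\n", (const char *)ctx);
    return 0;
}

static void dispatch(generic_fn fn_arg, void *ctx)
{
    /* Cast back to the true type before calling: well-defined in ISO C. */
    int (*fn)(void *) = (int (*)(void *))fn_arg;
    fn(ctx);
}

int main(void)
{
    /* (void *)worker would rely on a function-to-object-pointer conversion
       that ISO C leaves undefined; (int (*)(void))worker stays within
       function-pointer space, mirroring the change in the hunk above. */
    dispatch((int (*)(void))worker, "hello");
    return 0;
}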
|
@@ -114,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#else
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#endif
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads);
|
||||
ptr = (FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
sumf += (*ptr);
|
||||
|
|
|
@@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
#endif
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||
x, inc_x, y, inc_y, result, 0,
|
||||
( void *)dot_thread_function, nthreads);
|
||||
(int (*)(void)) dot_thread_function, nthreads);
|
||||
|
||||
ptr = (RETURN_TYPE *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
|
|
|
@@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
|
||||
#endif
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads);
|
||||
}
|
||||
#else
|
||||
rot_compute(n, x, inc_x, y, inc_y, c, s);
|
||||
|
|
|
@@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#else
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#endif
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads);
|
||||
ptr = (FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
sumf += (*ptr);
|
||||
|
|
|
@@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
|
||||
#endif
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads);
|
||||
}
|
||||
#else
|
||||
rot_compute(n, x, inc_x, y, inc_y, c, s);
|
||||
|
|
|
@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
|
||||
NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
|
||||
NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads);
|
||||
ptr = (FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
sumf += (*ptr);
|
||||
|
|
|
@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||
x, inc_x, y, inc_y, result, 0,
|
||||
( void *)zdot_thread_function, nthreads);
|
||||
(int (*)(void))zdot_thread_function, nthreads);
|
||||
|
||||
ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
|
|
|
@ -97,8 +97,6 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup complexGEcomputational
*
*> \par Further Details:

@ -127,10 +125,9 @@
* =====================================================================
SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO )
*
* -- LAPACK computational routine (version 3.7.0) --
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, LDT, M, N

@ -157,10 +154,10 @@
* Test the input arguments
*
INFO = 0
IF( M.LT.0 ) THEN
INFO = -1
ELSE IF( N.LT.0 ) THEN
IF( N.LT.0 ) THEN
INFO = -2
ELSE IF( M.LT.N ) THEN
INFO = -1
ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
INFO = -4
ELSE IF( LDT.LT.MAX( 1, N ) ) THEN
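The revised check drops the stand-alone test on M: N < 0 still yields INFO = -2, and INFO = -1 is now reported when M < N (which also covers any negative M once N is non-negative), reflecting the routine's requirement that M >= N. The same change is applied to the D, S and Z variants below. As an aside, a C rendering of the new validation order might look like the following (illustrative only; the INFO = -6 branch for LDT follows from the argument order but is not visible in the hunk):

```c
/* Returns 0 on success or the negated position of the first bad argument,
 * mirroring the Fortran check above. Not the LAPACK source. */
static int geqrt2_check_args(int m, int n, int lda, int ldt)
{
    int max1m = (m > 1) ? m : 1;
    int max1n = (n > 1) ? n : 1;

    if (n < 0)            return -2;  /* N must be non-negative */
    else if (m < n)       return -1;  /* requires M >= N        */
    else if (lda < max1m) return -4;  /* leading dimension of A */
    else if (ldt < max1n) return -6;  /* leading dimension of T */
    return 0;
}
```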
@ -97,8 +97,6 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup doubleGEcomputational
*
*> \par Further Details:

@ -127,10 +125,9 @@
* =====================================================================
SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO )
*
* -- LAPACK computational routine (version 3.7.0) --
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, LDT, M, N

@ -157,10 +154,10 @@
* Test the input arguments
*
INFO = 0
IF( M.LT.0 ) THEN
INFO = -1
ELSE IF( N.LT.0 ) THEN
IF( N.LT.0 ) THEN
INFO = -2
ELSE IF( M.LT.N ) THEN
INFO = -1
ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
INFO = -4
ELSE IF( LDT.LT.MAX( 1, N ) ) THEN

@ -97,8 +97,6 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup realGEcomputational
*
*> \par Further Details:

@ -127,10 +125,9 @@
* =====================================================================
SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO )
*
* -- LAPACK computational routine (version 3.7.0) --
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, LDT, M, N

@ -157,10 +154,10 @@
* Test the input arguments
*
INFO = 0
IF( M.LT.0 ) THEN
INFO = -1
ELSE IF( N.LT.0 ) THEN
IF( N.LT.0 ) THEN
INFO = -2
ELSE IF( M.LT.N ) THEN
INFO = -1
ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
INFO = -4
ELSE IF( LDT.LT.MAX( 1, N ) ) THEN

@ -97,8 +97,6 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \date December 2016
*
*> \ingroup complex16GEcomputational
*
*> \par Further Details:

@ -127,10 +125,9 @@
* =====================================================================
SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO )
*
* -- LAPACK computational routine (version 3.7.0) --
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
* December 2016
*
* .. Scalar Arguments ..
INTEGER INFO, LDA, LDT, M, N

@ -157,10 +154,10 @@
* Test the input arguments
*
INFO = 0
IF( M.LT.0 ) THEN
INFO = -1
ELSE IF( N.LT.0 ) THEN
IF( N.LT.0 ) THEN
INFO = -2
ELSE IF( M.LT.N ) THEN
INFO = -1
ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
INFO = -4
ELSE IF( LDT.LT.MAX( 1, N ) ) THEN
@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \
endif

.PHONY: all
.NOTPARALLEL:
all: $(TMGLIB)

ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \

@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,

blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads);

is += bk;
}
@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a--;
k1 --;

#ifndef MINUS
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#ifdef MINUS
ipiv -= (k2 - k1 - 1) * incx;
#endif

if (n <= 0) return 0;
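The MINUS branch previously offset the pivot pointer by (k2 - 1) * incx; with ipiv += k1 now applied unconditionally, the reverse traversal starts (k2 - k1 - 1) strides past the entry for row k1 + 1, so the pivot indices line up for increments other than one (cf. the ?LASWP entry in the changelog). A toy sketch of that addressing, not the kernel itself:

```c
#include <stdio.h>

/* Position a pointer so a reverse walk with stride `incx` visits the
 * pivot entries for rows k1+1..k2. Names and layout are illustrative. */
static void walk_pivots_reverse(const int *ipiv, int k1, int k2, int incx)
{
    /* entry for row k1+1 lives at ipiv[k1 * incx]; the last entry of the
     * range is (k2 - k1 - 1) strides further on */
    const int *p = ipiv + k1 * incx + (k2 - k1 - 1) * incx;

    for (int row = k2; row > k1; row--) {
        printf("row %d swaps with row %d\n", row, *p);
        p -= incx;             /* step backwards through the range */
    }
}

int main(void)
{
    /* pivots for rows 1..6 stored with stride 2 (every other slot used) */
    int ipiv[12] = {3,0, 5,0, 4,0, 6,0, 5,0, 6,0};
    walk_pivots_reverse(ipiv, 2, 6, 2);   /* rows 3..6, reverse order */
    return 0;
}
```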
@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a--;
k1 --;

#ifndef MINUS
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#ifdef MINUS
ipiv -= (k2 - k1 - 1) * incx;
#endif

if (n <= 0) return 0;

@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a--;
k1 --;

#ifndef MINUS
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#ifdef MINUS
ipiv -= (k2 - k1 - 1) * incx;
#endif

if (n <= 0) return 0;

@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
a--;
k1 --;

#ifndef MINUS
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#ifdef MINUS
ipiv -= (k2 - k1 - 1) * incx;
#endif

if (n <= 0) return 0;

@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
lda *= 2;
k1 --;

#ifndef MINUS
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#ifdef MINUS
ipiv -= (k2 - k1 - 1) * incx;
#endif

if (n <= 0) return 0;

@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4,
lda *= 2;
k1 --;

#ifndef MINUS
ipiv += k1;
#else
ipiv -= (k2 - 1) * incx;
#ifdef MINUS
ipiv -= (k2 - k1 - 1) * incx;
#endif

if (n <= 0) return 0;