commit
02699226d0
|
@ -6,7 +6,7 @@
|
|||
INCLUDED = 1
|
||||
|
||||
ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
|
@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
|
|||
ifndef TARGET_CORE
|
||||
include $(TOPDIR)/Makefile.conf
|
||||
else
|
||||
HAVE_NEON=
|
||||
HAVE_VFP=
|
||||
HAVE_VFPV3=
|
||||
HAVE_VFPV4=
|
||||
HAVE_MMX=
|
||||
HAVE_SSE=
|
||||
HAVE_SSE2=
|
||||
HAVE_SSE3=
|
||||
HAVE_SSSE3=
|
||||
HAVE_SSE4_1=
|
||||
HAVE_SSE4_2=
|
||||
HAVE_SSE4A=
|
||||
HAVE_SSE5=
|
||||
HAVE_AVX=
|
||||
HAVE_AVX2=
|
||||
HAVE_FMA3=
|
||||
include $(TOPDIR)/Makefile_kernel.conf
|
||||
endif
|
||||
|
||||
|
@ -1522,6 +1538,8 @@ export HAVE_SSE4_2
|
|||
export HAVE_SSE4A
|
||||
export HAVE_SSE5
|
||||
export HAVE_AVX
|
||||
export HAVE_AVX2
|
||||
export HAVE_FMA3
|
||||
export HAVE_VFP
|
||||
export HAVE_VFPV3
|
||||
export HAVE_VFPV4
|
||||
|
|
|
@ -9,9 +9,9 @@ endif
|
|||
endif
|
||||
|
||||
ifdef HAVE_SSE3
|
||||
ifndef DYNAMIC_ARCH
|
||||
CCOMMON_OPT += -msse3
|
||||
FCOMMON_OPT += -msse3
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CCOMMON_OPT += -mssse3
|
||||
FCOMMON_OPT += -mssse3
|
||||
|
@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1
|
|||
CCOMMON_OPT += -msse4.1
|
||||
FCOMMON_OPT += -msse4.1
|
||||
endif
|
||||
ifdef HAVE_AVX
|
||||
CCOMMON_OPT += -mavx
|
||||
FCOMMON_OPT += -mavx
|
||||
endif
|
||||
ifdef HAVE_AVX2
|
||||
CCOMMON_OPT += -mavx2
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
ifdef HAVE_FMA3
|
||||
CCOMMON_OPT += -mfma
|
||||
FCOMMON_OPT += -mfma
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
|
@ -66,8 +76,7 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifdef HAVE_AVX2
|
||||
ifndef NO_AVX2
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
|
@ -96,7 +105,6 @@ endif
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL "SKYLAKEX")
|
||||
if (${CORE} STREQUAL SKYLAKEX)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
|
@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL "COOPERLAKE")
|
||||
if (${CORE} STREQUAL COOPERLAKE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
|
|
|
@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "BARCELONA")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_SSE3\n")
|
||||
elseif ("${TCORE}" STREQUAL "STEAMROLLER")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_SSE3\n")
|
||||
elseif ("${TCORE}" STREQUAL "EXCAVATOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_SSE3\n")
|
||||
elseif ("${TCORE}" STREQUAL "NEHALEM")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_SSE3\n")
|
||||
elseif ("${TCORE}" STREQUAL "PRESCOTT")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_SSE3\n")
|
||||
elseif ("${TCORE}" STREQUAL "SANDYBRIDGE")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_AVX\n")
|
||||
elseif ("${TCORE}" STREQUAL "HASWELL")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_AVX2\n")
|
||||
elseif ("${TCORE}" STREQUAL "ZEN")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_AVX2\n")
|
||||
elseif ("${TCORE}" STREQUAL "SKYLAKEX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_AVX512\n")
|
||||
elseif ("${TCORE}" STREQUAL "COOPERLAKE")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define HAVE_AVX512\n")
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
|
@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING)
|
|||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
endif ()
|
||||
endif ()
|
||||
unset (HAVE_AVX2)
|
||||
unset (HAVE_AVX)
|
||||
unset (HAVE_FMA3)
|
||||
unset (HAVE_MMX)
|
||||
unset (HAVE_SSE)
|
||||
unset (HAVE_SSE2)
|
||||
unset (HAVE_SSE3)
|
||||
unset (HAVE_SSSE3)
|
||||
unset (HAVE_SSE4A)
|
||||
unset (HAVE_SSE4_1)
|
||||
unset (HAVE_SSE4_2)
|
||||
unset (HAVE_NEON)
|
||||
unset (HAVE_VFP)
|
||||
unset (HAVE_VFPV3)
|
||||
unset (HAVE_VFPV4)
|
||||
message(STATUS "Running getarch")
|
||||
|
||||
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
|
||||
|
|
|
@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE3)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSSE3)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE4_1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (DEFINED TARGET)
|
||||
message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --")
|
||||
message(STATUS "Targeting the ${TARGET} architecture.")
|
||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||
endif ()
|
||||
|
@ -211,6 +146,63 @@ else()
|
|||
endif ()
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_AVX)
|
||||
if (NOT NO_AVX)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_AVX2)
|
||||
if (NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if (NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE3)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSSE3)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
|
||||
endif()
|
||||
if (DEFINED HAVE_SSE4_1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
endif ()
|
||||
|
|
|
@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
|
|||
break;
|
||||
}
|
||||
|
||||
mode |= BLAS_LEGACY;
|
||||
if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY;
|
||||
|
||||
for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
|
||||
|
||||
|
|
|
@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
|
||||
routine = queue -> routine;
|
||||
|
||||
if (!(queue -> mode & BLAS_LEGACY)) {
|
||||
if (queue -> mode & BLAS_LEGACY) {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||
} else
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
||||
queue -> sa, queue -> sb, 0);
|
||||
} else {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||
}
|
||||
|
||||
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
|
||||
|
||||
|
|
12
getarch.c
12
getarch.c
|
@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#endif
|
||||
|
@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#else
|
||||
|
@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
|
||||
#define LIBNAME "skylakex"
|
||||
#define CORENAME "SKYLAKEX"
|
||||
#endif
|
||||
|
@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3"
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#else
|
||||
|
@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
|
||||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
|
||||
#define LIBNAME "cooperlake"
|
||||
#define CORENAME "COOPERLAKE"
|
||||
#endif
|
||||
|
@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
|
||||
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
|
||||
"-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
|
||||
"-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "zen"
|
||||
#define CORENAME "ZEN"
|
||||
#endif
|
||||
|
|
|
@ -5,13 +5,6 @@ endif
|
|||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
ifdef HAVE_SSE3
|
||||
CFLAGS += -msse3
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
override CFLAGS += -fno-integrated-as
|
||||
|
@ -38,12 +31,6 @@ ifdef NO_AVX2
|
|||
endif
|
||||
|
||||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3))
|
||||
override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON))
|
||||
override CFLAGS += -msse -msse2
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
|
|
|
@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c
|
|||
CCOPYKERNEL = ccopy_power10.c
|
||||
ZCOPYKERNEL = zcopy_power10.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
SDOTKERNEL = sdot_power10.c
|
||||
DDOTKERNEL = ddot_power10.c
|
||||
DSDOTKERNEL = sdot_power10.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
else
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static double ddot_kernel_8 (long n, double *x, double *y)
|
||||
{
|
||||
double dot;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
"dcbt 0, %3 \n\t"
|
||||
|
||||
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmaddadp 32, 40, 48 \n\t"
|
||||
"xvmaddadp 33, 41, 49 \n\t"
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"xvmaddadp 34, 42, 50 \n\t"
|
||||
"xvmaddadp 35, 43, 51 \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"xvmaddadp 36, 44, 52 \n\t"
|
||||
"xvmaddadp 37, 45, 53 \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"xvmaddadp 38, 46, 54 \n\t"
|
||||
"xvmaddadp 39, 47, 55 \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvmaddadp 32, 40, 48 \n\t"
|
||||
"xvmaddadp 33, 41, 49 \n\t"
|
||||
"xvmaddadp 34, 42, 50 \n\t"
|
||||
"xvmaddadp 35, 43, 51 \n\t"
|
||||
"xvmaddadp 36, 44, 52 \n\t"
|
||||
"xvmaddadp 37, 45, 53 \n\t"
|
||||
"xvmaddadp 38, 46, 54 \n\t"
|
||||
"xvmaddadp 39, 47, 55 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 33 \n\t"
|
||||
"xvadddp 34, 34, 35 \n\t"
|
||||
"xvadddp 36, 36, 37 \n\t"
|
||||
"xvadddp 38, 38, 39 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 34 \n\t"
|
||||
"xvadddp 36, 36, 38 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 36 \n\t"
|
||||
|
||||
XXSWAPD_S(33,32)
|
||||
|
||||
"xsadddp %x0, 32, 33 \n"
|
||||
|
||||
"#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
|
||||
:
|
||||
"=d" (dot), // 0
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x),
|
||||
"m" (*y)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
|
||||
);
|
||||
|
||||
return dot;
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ddot_microk_power10.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
||||
static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
FLOAT dot = 0.0;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
dot += y[i] * x[i]
|
||||
+ y[i+1] * x[i+1]
|
||||
+ y[i+2] * x[i+2]
|
||||
+ y[i+3] * x[i+3]
|
||||
+ y[i+4] * x[i+4]
|
||||
+ y[i+5] * x[i+5]
|
||||
+ y[i+6] * x[i+6]
|
||||
+ y[i+7] * x[i+7] ;
|
||||
|
||||
i+=8 ;
|
||||
|
||||
}
|
||||
return dot;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
|
||||
FLOAT dot = 0.0 ;
|
||||
|
||||
if ( n <= 0 ) return(dot);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
|
||||
if ( n1 )
|
||||
dot = ddot_kernel_8(n1, x, y);
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
dot += y[i] * x[i] ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(dot);
|
||||
|
||||
|
||||
}
|
||||
|
||||
FLOAT temp1 = 0.0;
|
||||
FLOAT temp2 = 0.0;
|
||||
|
||||
BLASLONG n1 = n & -4;
|
||||
|
||||
while(i < n1)
|
||||
{
|
||||
|
||||
FLOAT m1 = y[iy] * x[ix] ;
|
||||
FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ;
|
||||
|
||||
FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ;
|
||||
FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ;
|
||||
|
||||
ix += inc_x*4 ;
|
||||
iy += inc_y*4 ;
|
||||
|
||||
temp1 += m1+m3;
|
||||
temp2 += m2+m4;
|
||||
|
||||
i+=4 ;
|
||||
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
temp1 += y[iy] * x[ix] ;
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
dot = temp1 + temp2;
|
||||
return(dot);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static float sdot_kernel_16 (long n, float *x, float *y)
|
||||
{
|
||||
float dot;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
"dcbt 0, %3 \n\t"
|
||||
|
||||
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmaddasp 32, 40, 48 \n\t"
|
||||
"xvmaddasp 33, 41, 49 \n\t"
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"xvmaddasp 34, 42, 50 \n\t"
|
||||
"xvmaddasp 35, 43, 51 \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"xvmaddasp 36, 44, 52 \n\t"
|
||||
"xvmaddasp 37, 45, 53 \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"xvmaddasp 38, 46, 54 \n\t"
|
||||
"xvmaddasp 39, 47, 55 \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvmaddasp 32, 40, 48 \n\t"
|
||||
"xvmaddasp 33, 41, 49 \n\t"
|
||||
"xvmaddasp 34, 42, 50 \n\t"
|
||||
"xvmaddasp 35, 43, 51 \n\t"
|
||||
"xvmaddasp 36, 44, 52 \n\t"
|
||||
"xvmaddasp 37, 45, 53 \n\t"
|
||||
"xvmaddasp 38, 46, 54 \n\t"
|
||||
"xvmaddasp 39, 47, 55 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
"xvaddsp 34, 34, 35 \n\t"
|
||||
"xvaddsp 36, 36, 37 \n\t"
|
||||
"xvaddsp 38, 38, 39 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 34 \n\t"
|
||||
"xvaddsp 36, 36, 38 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 36 \n\t"
|
||||
|
||||
"xxsldwi 33, 32, 32, 2 \n\t"
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
|
||||
"xxsldwi 33, 32, 32, 1 \n\t"
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
|
||||
"xscvspdp %x0, 32 \n"
|
||||
|
||||
"#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
|
||||
:
|
||||
"=f" (dot), // 0
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x),
|
||||
"m" (*y)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
|
||||
);
|
||||
|
||||
return dot;
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "sdot_microk_power10.c"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_KERNEL_16
|
||||
|
||||
static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
BLASLONG register i = 0;
|
||||
FLOAT dot = 0.0;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
dot += y[i] * x[i]
|
||||
+ y[i+1] * x[i+1]
|
||||
+ y[i+2] * x[i+2]
|
||||
+ y[i+3] * x[i+3]
|
||||
+ y[i+4] * x[i+4]
|
||||
+ y[i+5] * x[i+5]
|
||||
+ y[i+6] * x[i+6]
|
||||
+ y[i+7] * x[i+7] ;
|
||||
|
||||
i+=8 ;
|
||||
|
||||
}
|
||||
return dot;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0,iy=0;
|
||||
double dot = 0.0 ;
|
||||
|
||||
#if defined (DSDOT)
|
||||
double mydot = 0.0;
|
||||
FLOAT asmdot = 0.0;
|
||||
#else
|
||||
FLOAT mydot=0.0;
|
||||
#endif
|
||||
BLASLONG n1;
|
||||
|
||||
if ( n <= 0 ) return(dot);
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
n1 = n & (BLASLONG)(-32);
|
||||
|
||||
if ( n1 )
|
||||
#if defined(DSDOT)
|
||||
{
|
||||
FLOAT *x1=x;
|
||||
FLOAT *y1=y;
|
||||
BLASLONG n2 = 32;
|
||||
while (i<n1) {
|
||||
asmdot = sdot_kernel_16(n2, x1, y1);
|
||||
mydot += (double)asmdot;
|
||||
asmdot=0.;
|
||||
x1+=32;
|
||||
y1+=32;
|
||||
i+=32;
|
||||
}
|
||||
}
|
||||
#else
|
||||
mydot = sdot_kernel_16(n1, x, y);
|
||||
#endif
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
#if defined(DSDOT)
|
||||
dot += (double)y[i] * (double)x[i] ;
|
||||
#else
|
||||
dot += y[i] * x[i] ;
|
||||
#endif
|
||||
i++ ;
|
||||
|
||||
}
|
||||
|
||||
dot+=mydot;
|
||||
return(dot);
|
||||
|
||||
|
||||
}
|
||||
|
||||
n1 = n & (BLASLONG)(-2);
|
||||
|
||||
while(i < n1)
|
||||
{
|
||||
#if defined (DSDOT)
|
||||
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x];
|
||||
#else
|
||||
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x];
|
||||
#endif
|
||||
ix += inc_x*2 ;
|
||||
iy += inc_y*2 ;
|
||||
i+=2 ;
|
||||
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
#if defined (DSDOT)
|
||||
dot += (double)y[iy] * (double)x[ix] ;
|
||||
#else
|
||||
dot += y[iy] * x[ix] ;
|
||||
#endif
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
}
|
||||
return(dot);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c
|
|||
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
|
|
|
@ -0,0 +1,139 @@
|
|||
#include "common.h"
|
||||
|
||||
#if defined(SKYLAKEX)
|
||||
#include "drot_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL)
|
||||
#include "drot_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_DROT_KERNEL
|
||||
|
||||
static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
FLOAT f0, f1, f2, f3;
|
||||
FLOAT x0, x1, x2, x3;
|
||||
FLOAT g0, g1, g2, g3;
|
||||
FLOAT y0, y1, y2, y3;
|
||||
|
||||
FLOAT* xp = x;
|
||||
FLOAT* yp = y;
|
||||
|
||||
BLASLONG n1 = n & (~7);
|
||||
|
||||
while (i < n1) {
|
||||
x0 = xp[0];
|
||||
y0 = yp[0];
|
||||
x1 = xp[1];
|
||||
y1 = yp[1];
|
||||
x2 = xp[2];
|
||||
y2 = yp[2];
|
||||
x3 = xp[3];
|
||||
y3 = yp[3];
|
||||
|
||||
f0 = c*x0 + s*y0;
|
||||
g0 = c*y0 - s*x0;
|
||||
f1 = c*x1 + s*y1;
|
||||
g1 = c*y1 - s*x1;
|
||||
f2 = c*x2 + s*y2;
|
||||
g2 = c*y2 - s*x2;
|
||||
f3 = c*x3 + s*y3;
|
||||
g3 = c*y3 - s*x3;
|
||||
|
||||
xp[0] = f0;
|
||||
yp[0] = g0;
|
||||
xp[1] = f1;
|
||||
yp[1] = g1;
|
||||
xp[2] = f2;
|
||||
yp[2] = g2;
|
||||
xp[3] = f3;
|
||||
yp[3] = g3;
|
||||
|
||||
xp += 4;
|
||||
yp += 4;
|
||||
i += 4;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
FLOAT temp = c*x[i] + s*y[i];
|
||||
y[i] = c*y[i] - s*x[i];
|
||||
x[i] = temp;
|
||||
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
|
||||
FLOAT temp;
|
||||
|
||||
if (n <= 0)
|
||||
return;
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
drot_kernel(n, x, y, c, s);
|
||||
}
|
||||
else {
|
||||
while (i < n) {
|
||||
temp = c * x[ix] + s * y[iy];
|
||||
y[iy] = c * y[iy] - s * x[ix];
|
||||
x[ix] = temp;
|
||||
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
#if defined(SMP)
|
||||
static int rot_thread_function(blas_arg_t *args)
|
||||
{
|
||||
|
||||
rot_compute(args->m,
|
||||
args->a, args->lda,
|
||||
args->b, args->ldb,
|
||||
((FLOAT *)args->alpha)[0],
|
||||
((FLOAT *)args->alpha)[1]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
#if defined(SMP)
|
||||
int nthreads;
|
||||
FLOAT alpha[2]={c, s};
|
||||
FLOAT dummy_c;
|
||||
#endif
|
||||
|
||||
#if defined(SMP)
|
||||
if (inc_x == 0 || inc_y == 0 || n <= 100000) {
|
||||
nthreads = 1;
|
||||
}
|
||||
else {
|
||||
nthreads = num_cpu_avail(1);
|
||||
}
|
||||
|
||||
if (nthreads == 1) {
|
||||
rot_compute(n, x, inc_x, y, inc_y, c, s);
|
||||
}
|
||||
else {
|
||||
#if defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
|
||||
#endif
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
|
||||
}
|
||||
#else
|
||||
rot_compute(n, x, inc_x, y, inc_y, c, s);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
|
||||
|
||||
#define HAVE_DROT_KERNEL 1
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
BLASLONG tail_index_4 = n&(~3);
|
||||
BLASLONG tail_index_16 = n&(~15);
|
||||
|
||||
__m256d c_256, s_256;
|
||||
if (n >= 4) {
|
||||
c_256 = _mm256_set1_pd(c);
|
||||
s_256 = _mm256_set1_pd(s);
|
||||
}
|
||||
|
||||
__m256d x0, x1, x2, x3;
|
||||
__m256d y0, y1, y2, y3;
|
||||
__m256d t0, t1, t2, t3;
|
||||
|
||||
for (i = 0; i < tail_index_16; i += 16) {
|
||||
x0 = _mm256_loadu_pd(&x[i + 0]);
|
||||
x1 = _mm256_loadu_pd(&x[i + 4]);
|
||||
x2 = _mm256_loadu_pd(&x[i + 8]);
|
||||
x3 = _mm256_loadu_pd(&x[i +12]);
|
||||
y0 = _mm256_loadu_pd(&y[i + 0]);
|
||||
y1 = _mm256_loadu_pd(&y[i + 4]);
|
||||
y2 = _mm256_loadu_pd(&y[i + 8]);
|
||||
y3 = _mm256_loadu_pd(&y[i +12]);
|
||||
|
||||
t0 = _mm256_mul_pd(s_256, y0);
|
||||
t1 = _mm256_mul_pd(s_256, y1);
|
||||
t2 = _mm256_mul_pd(s_256, y2);
|
||||
t3 = _mm256_mul_pd(s_256, y3);
|
||||
|
||||
t0 = _mm256_fmadd_pd(c_256, x0, t0);
|
||||
t1 = _mm256_fmadd_pd(c_256, x1, t1);
|
||||
t2 = _mm256_fmadd_pd(c_256, x2, t2);
|
||||
t3 = _mm256_fmadd_pd(c_256, x3, t3);
|
||||
|
||||
_mm256_storeu_pd(&x[i + 0], t0);
|
||||
_mm256_storeu_pd(&x[i + 4], t1);
|
||||
_mm256_storeu_pd(&x[i + 8], t2);
|
||||
_mm256_storeu_pd(&x[i +12], t3);
|
||||
|
||||
t0 = _mm256_mul_pd(s_256, x0);
|
||||
t1 = _mm256_mul_pd(s_256, x1);
|
||||
t2 = _mm256_mul_pd(s_256, x2);
|
||||
t3 = _mm256_mul_pd(s_256, x3);
|
||||
|
||||
t0 = _mm256_fmsub_pd(c_256, y0, t0);
|
||||
t1 = _mm256_fmsub_pd(c_256, y1, t1);
|
||||
t2 = _mm256_fmsub_pd(c_256, y2, t2);
|
||||
t3 = _mm256_fmsub_pd(c_256, y3, t3);
|
||||
|
||||
_mm256_storeu_pd(&y[i + 0], t0);
|
||||
_mm256_storeu_pd(&y[i + 4], t1);
|
||||
_mm256_storeu_pd(&y[i + 8], t2);
|
||||
_mm256_storeu_pd(&y[i +12], t3);
|
||||
|
||||
}
|
||||
|
||||
for (i = tail_index_16; i < tail_index_4; i += 4) {
|
||||
x0 = _mm256_loadu_pd(&x[i]);
|
||||
y0 = _mm256_loadu_pd(&y[i]);
|
||||
|
||||
t0 = _mm256_mul_pd(s_256, y0);
|
||||
t0 = _mm256_fmadd_pd(c_256, x0, t0);
|
||||
_mm256_storeu_pd(&x[i], t0);
|
||||
|
||||
t0 = _mm256_mul_pd(s_256, x0);
|
||||
t0 = _mm256_fmsub_pd(c_256, y0, t0);
|
||||
_mm256_storeu_pd(&y[i], t0);
|
||||
}
|
||||
|
||||
for (i = tail_index_4; i < n; ++i) {
|
||||
FLOAT temp = c * x[i] + s * y[i];
|
||||
y[i] = c * y[i] - s * x[i];
|
||||
x[i] = temp;
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,94 @@
|
|||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
|
||||
|
||||
#define HAVE_DROT_KERNEL 1
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n1 = n;
|
||||
|
||||
BLASLONG tail_index_8 = 0;
|
||||
BLASLONG tail_index_32 = 0;
|
||||
|
||||
__m512d c_512 = _mm512_set1_pd(c);
|
||||
__m512d s_512 = _mm512_set1_pd(s);
|
||||
|
||||
tail_index_8 = n1 & (~7);
|
||||
tail_index_32 = n1 & (~31);
|
||||
|
||||
|
||||
__m512d x0, x1, x2, x3;
|
||||
__m512d y0, y1, y2, y3;
|
||||
__m512d t0, t1, t2, t3;
|
||||
|
||||
for (i = 0; i < tail_index_32; i += 32) {
|
||||
x0 = _mm512_loadu_pd(&x[i + 0]);
|
||||
x1 = _mm512_loadu_pd(&x[i + 8]);
|
||||
x2 = _mm512_loadu_pd(&x[i +16]);
|
||||
x3 = _mm512_loadu_pd(&x[i +24]);
|
||||
y0 = _mm512_loadu_pd(&y[i + 0]);
|
||||
y1 = _mm512_loadu_pd(&y[i + 8]);
|
||||
y2 = _mm512_loadu_pd(&y[i +16]);
|
||||
y3 = _mm512_loadu_pd(&y[i +24]);
|
||||
|
||||
t0 = _mm512_mul_pd(s_512, y0);
|
||||
t1 = _mm512_mul_pd(s_512, y1);
|
||||
t2 = _mm512_mul_pd(s_512, y2);
|
||||
t3 = _mm512_mul_pd(s_512, y3);
|
||||
|
||||
t0 = _mm512_fmadd_pd(c_512, x0, t0);
|
||||
t1 = _mm512_fmadd_pd(c_512, x1, t1);
|
||||
t2 = _mm512_fmadd_pd(c_512, x2, t2);
|
||||
t3 = _mm512_fmadd_pd(c_512, x3, t3);
|
||||
|
||||
_mm512_storeu_pd(&x[i + 0], t0);
|
||||
_mm512_storeu_pd(&x[i + 8], t1);
|
||||
_mm512_storeu_pd(&x[i +16], t2);
|
||||
_mm512_storeu_pd(&x[i +24], t3);
|
||||
|
||||
t0 = _mm512_mul_pd(s_512, x0);
|
||||
t1 = _mm512_mul_pd(s_512, x1);
|
||||
t2 = _mm512_mul_pd(s_512, x2);
|
||||
t3 = _mm512_mul_pd(s_512, x3);
|
||||
|
||||
t0 = _mm512_fmsub_pd(c_512, y0, t0);
|
||||
t1 = _mm512_fmsub_pd(c_512, y1, t1);
|
||||
t2 = _mm512_fmsub_pd(c_512, y2, t2);
|
||||
t3 = _mm512_fmsub_pd(c_512, y3, t3);
|
||||
|
||||
_mm512_storeu_pd(&y[i + 0], t0);
|
||||
_mm512_storeu_pd(&y[i + 8], t1);
|
||||
_mm512_storeu_pd(&y[i +16], t2);
|
||||
_mm512_storeu_pd(&y[i +24], t3);
|
||||
}
|
||||
|
||||
for (i = tail_index_32; i < tail_index_8; i += 8) {
|
||||
x0 = _mm512_loadu_pd(&x[i]);
|
||||
y0 = _mm512_loadu_pd(&y[i]);
|
||||
|
||||
t0 = _mm512_mul_pd(s_512, y0);
|
||||
t0 = _mm512_fmadd_pd(c_512, x0, t0);
|
||||
_mm512_storeu_pd(&x[i], t0);
|
||||
|
||||
t0 = _mm512_mul_pd(s_512, x0);
|
||||
t0 = _mm512_fmsub_pd(c_512, y0, t0);
|
||||
_mm512_storeu_pd(&y[i], t0);
|
||||
}
|
||||
|
||||
if ((n1&7) > 0) {
|
||||
unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7)));
|
||||
__m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]);
|
||||
__m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]);
|
||||
__m512d temp = _mm512_mul_pd(s_512, tail_y);
|
||||
temp = _mm512_fmadd_pd(c_512, tail_x, temp);
|
||||
_mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp);
|
||||
temp = _mm512_mul_pd(s_512, tail_x);
|
||||
temp = _mm512_fmsub_pd(c_512, tail_y, temp);
|
||||
_mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp);
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,139 @@
|
|||
#include "common.h"
|
||||
|
||||
#if defined(SKYLAKEX)
|
||||
#include "srot_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL)
|
||||
#include "srot_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_SROT_KERNEL
|
||||
|
||||
static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
FLOAT f0, f1, f2, f3;
|
||||
FLOAT x0, x1, x2, x3;
|
||||
FLOAT g0, g1, g2, g3;
|
||||
FLOAT y0, y1, y2, y3;
|
||||
|
||||
FLOAT* xp = x;
|
||||
FLOAT* yp = y;
|
||||
|
||||
BLASLONG n1 = n & (~7);
|
||||
|
||||
while (i < n1) {
|
||||
x0 = xp[0];
|
||||
y0 = yp[0];
|
||||
x1 = xp[1];
|
||||
y1 = yp[1];
|
||||
x2 = xp[2];
|
||||
y2 = yp[2];
|
||||
x3 = xp[3];
|
||||
y3 = yp[3];
|
||||
|
||||
f0 = c*x0 + s*y0;
|
||||
g0 = c*y0 - s*x0;
|
||||
f1 = c*x1 + s*y1;
|
||||
g1 = c*y1 - s*x1;
|
||||
f2 = c*x2 + s*y2;
|
||||
g2 = c*y2 - s*x2;
|
||||
f3 = c*x3 + s*y3;
|
||||
g3 = c*y3 - s*x3;
|
||||
|
||||
xp[0] = f0;
|
||||
yp[0] = g0;
|
||||
xp[1] = f1;
|
||||
yp[1] = g1;
|
||||
xp[2] = f2;
|
||||
yp[2] = g2;
|
||||
xp[3] = f3;
|
||||
yp[3] = g3;
|
||||
|
||||
xp += 4;
|
||||
yp += 4;
|
||||
i += 4;
|
||||
}
|
||||
|
||||
while (i < n) {
|
||||
FLOAT temp = c*x[i] + s*y[i];
|
||||
y[i] = c*y[i] - s*x[i];
|
||||
x[i] = temp;
|
||||
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
|
||||
FLOAT temp;
|
||||
|
||||
if (n <= 0)
|
||||
return;
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
srot_kernel(n, x, y, c, s);
|
||||
}
|
||||
else {
|
||||
while (i < n) {
|
||||
temp = c * x[ix] + s * y[iy];
|
||||
y[iy] = c * y[iy] - s * x[ix];
|
||||
x[ix] = temp;
|
||||
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
#if defined(SMP)
|
||||
static int rot_thread_function(blas_arg_t *args)
|
||||
{
|
||||
|
||||
rot_compute(args->m,
|
||||
args->a, args->lda,
|
||||
args->b, args->ldb,
|
||||
((float *)args->alpha)[0],
|
||||
((float *)args->alpha)[1]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
|
||||
{
|
||||
#if defined(SMP)
|
||||
int nthreads;
|
||||
FLOAT alpha[2]={c, s};
|
||||
FLOAT dummy_c;
|
||||
#endif
|
||||
|
||||
#if defined(SMP)
|
||||
if (inc_x == 0 || inc_y == 0 || n <= 100000) {
|
||||
nthreads = 1;
|
||||
}
|
||||
else {
|
||||
nthreads = num_cpu_avail(1);
|
||||
}
|
||||
|
||||
if (nthreads == 1) {
|
||||
rot_compute(n, x, inc_x, y, inc_y, c, s);
|
||||
}
|
||||
else {
|
||||
#if defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
|
||||
#endif
|
||||
blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
|
||||
}
|
||||
#else
|
||||
rot_compute(n, x, inc_x, y, inc_y, c, s);
|
||||
#endif
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
|
||||
|
||||
#define HAVE_SROT_KERNEL 1
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
BLASLONG tail_index_8 = n&(~7);
|
||||
BLASLONG tail_index_32 = n&(~31);
|
||||
|
||||
__m256 c_256, s_256;
|
||||
if (n >= 8) {
|
||||
c_256 = _mm256_set1_ps(c);
|
||||
s_256 = _mm256_set1_ps(s);
|
||||
}
|
||||
|
||||
__m256 x0, x1, x2, x3;
|
||||
__m256 y0, y1, y2, y3;
|
||||
__m256 t0, t1, t2, t3;
|
||||
|
||||
for (i = 0; i < tail_index_32; i += 32) {
|
||||
x0 = _mm256_loadu_ps(&x[i + 0]);
|
||||
x1 = _mm256_loadu_ps(&x[i + 8]);
|
||||
x2 = _mm256_loadu_ps(&x[i +16]);
|
||||
x3 = _mm256_loadu_ps(&x[i +24]);
|
||||
y0 = _mm256_loadu_ps(&y[i + 0]);
|
||||
y1 = _mm256_loadu_ps(&y[i + 8]);
|
||||
y2 = _mm256_loadu_ps(&y[i +16]);
|
||||
y3 = _mm256_loadu_ps(&y[i +24]);
|
||||
|
||||
t0 = _mm256_mul_ps(s_256, y0);
|
||||
t1 = _mm256_mul_ps(s_256, y1);
|
||||
t2 = _mm256_mul_ps(s_256, y2);
|
||||
t3 = _mm256_mul_ps(s_256, y3);
|
||||
|
||||
t0 = _mm256_fmadd_ps(c_256, x0, t0);
|
||||
t1 = _mm256_fmadd_ps(c_256, x1, t1);
|
||||
t2 = _mm256_fmadd_ps(c_256, x2, t2);
|
||||
t3 = _mm256_fmadd_ps(c_256, x3, t3);
|
||||
|
||||
_mm256_storeu_ps(&x[i + 0], t0);
|
||||
_mm256_storeu_ps(&x[i + 8], t1);
|
||||
_mm256_storeu_ps(&x[i +16], t2);
|
||||
_mm256_storeu_ps(&x[i +24], t3);
|
||||
|
||||
t0 = _mm256_mul_ps(s_256, x0);
|
||||
t1 = _mm256_mul_ps(s_256, x1);
|
||||
t2 = _mm256_mul_ps(s_256, x2);
|
||||
t3 = _mm256_mul_ps(s_256, x3);
|
||||
|
||||
t0 = _mm256_fmsub_ps(c_256, y0, t0);
|
||||
t1 = _mm256_fmsub_ps(c_256, y1, t1);
|
||||
t2 = _mm256_fmsub_ps(c_256, y2, t2);
|
||||
t3 = _mm256_fmsub_ps(c_256, y3, t3);
|
||||
|
||||
_mm256_storeu_ps(&y[i + 0], t0);
|
||||
_mm256_storeu_ps(&y[i + 8], t1);
|
||||
_mm256_storeu_ps(&y[i +16], t2);
|
||||
_mm256_storeu_ps(&y[i +24], t3);
|
||||
|
||||
}
|
||||
|
||||
for (i = tail_index_32; i < tail_index_8; i += 8) {
|
||||
x0 = _mm256_loadu_ps(&x[i]);
|
||||
y0 = _mm256_loadu_ps(&y[i]);
|
||||
|
||||
t0 = _mm256_mul_ps(s_256, y0);
|
||||
t0 = _mm256_fmadd_ps(c_256, x0, t0);
|
||||
_mm256_storeu_ps(&x[i], t0);
|
||||
|
||||
t0 = _mm256_mul_ps(s_256, x0);
|
||||
t0 = _mm256_fmsub_ps(c_256, y0, t0);
|
||||
_mm256_storeu_ps(&y[i], t0);
|
||||
}
|
||||
|
||||
for (i = tail_index_8; i < n; ++i) {
|
||||
FLOAT temp = c * x[i] + s * y[i];
|
||||
y[i] = c * y[i] - s * x[i];
|
||||
x[i] = temp;
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,91 @@
|
|||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
|
||||
|
||||
#define HAVE_SROT_KERNEL 1
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
__m512 c_512, s_512;
|
||||
c_512 = _mm512_set1_ps(c);
|
||||
s_512 = _mm512_set1_ps(s);
|
||||
|
||||
BLASLONG tail_index_16 = n&(~15);
|
||||
BLASLONG tail_index_64 = n&(~63);
|
||||
|
||||
|
||||
__m512 x0, x1, x2, x3;
|
||||
__m512 y0, y1, y2, y3;
|
||||
__m512 t0, t1, t2, t3;
|
||||
|
||||
for (i = 0; i < tail_index_64; i += 64) {
|
||||
x0 = _mm512_loadu_ps(&x[i + 0]);
|
||||
x1 = _mm512_loadu_ps(&x[i +16]);
|
||||
x2 = _mm512_loadu_ps(&x[i +32]);
|
||||
x3 = _mm512_loadu_ps(&x[i +48]);
|
||||
y0 = _mm512_loadu_ps(&y[i + 0]);
|
||||
y1 = _mm512_loadu_ps(&y[i +16]);
|
||||
y2 = _mm512_loadu_ps(&y[i +32]);
|
||||
y3 = _mm512_loadu_ps(&y[i +48]);
|
||||
|
||||
t0 = _mm512_mul_ps(s_512, y0);
|
||||
t1 = _mm512_mul_ps(s_512, y1);
|
||||
t2 = _mm512_mul_ps(s_512, y2);
|
||||
t3 = _mm512_mul_ps(s_512, y3);
|
||||
|
||||
t0 = _mm512_fmadd_ps(c_512, x0, t0);
|
||||
t1 = _mm512_fmadd_ps(c_512, x1, t1);
|
||||
t2 = _mm512_fmadd_ps(c_512, x2, t2);
|
||||
t3 = _mm512_fmadd_ps(c_512, x3, t3);
|
||||
|
||||
_mm512_storeu_ps(&x[i + 0], t0);
|
||||
_mm512_storeu_ps(&x[i +16], t1);
|
||||
_mm512_storeu_ps(&x[i +32], t2);
|
||||
_mm512_storeu_ps(&x[i +48], t3);
|
||||
|
||||
t0 = _mm512_mul_ps(s_512, x0);
|
||||
t1 = _mm512_mul_ps(s_512, x1);
|
||||
t2 = _mm512_mul_ps(s_512, x2);
|
||||
t3 = _mm512_mul_ps(s_512, x3);
|
||||
|
||||
t0 = _mm512_fmsub_ps(c_512, y0, t0);
|
||||
t1 = _mm512_fmsub_ps(c_512, y1, t1);
|
||||
t2 = _mm512_fmsub_ps(c_512, y2, t2);
|
||||
t3 = _mm512_fmsub_ps(c_512, y3, t3);
|
||||
|
||||
_mm512_storeu_ps(&y[i + 0], t0);
|
||||
_mm512_storeu_ps(&y[i +16], t1);
|
||||
_mm512_storeu_ps(&y[i +32], t2);
|
||||
_mm512_storeu_ps(&y[i +48], t3);
|
||||
}
|
||||
|
||||
for (i = tail_index_64; i < tail_index_16; i += 16) {
|
||||
x0 = _mm512_loadu_ps(&x[i]);
|
||||
y0 = _mm512_loadu_ps(&y[i]);
|
||||
|
||||
t0 = _mm512_mul_ps(s_512, y0);
|
||||
t0 = _mm512_fmadd_ps(c_512, x0, t0);
|
||||
_mm512_storeu_ps(&x[i], t0);
|
||||
|
||||
t0 = _mm512_mul_ps(s_512, x0);
|
||||
t0 = _mm512_fmsub_ps(c_512, y0, t0);
|
||||
_mm512_storeu_ps(&y[i], t0);
|
||||
}
|
||||
|
||||
|
||||
if ((n & 15) > 0) {
|
||||
uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15)));
|
||||
__m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]);
|
||||
__m512 tail_y = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]);
|
||||
__m512 temp = _mm512_mul_ps(s_512, tail_y);
|
||||
temp = _mm512_fmadd_ps(c_512, tail_x, temp);
|
||||
_mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp);
|
||||
temp = _mm512_mul_ps(s_512, tail_x);
|
||||
temp = _mm512_fmsub_ps(c_512, tail_y, temp);
|
||||
_mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp);
|
||||
}
|
||||
}
|
||||
#endif
|
Loading…
Reference in New Issue