Merge pull request #111 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-11-13 09:14:23 +01:00 committed by GitHub
commit 02699226d0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 1313 additions and 130 deletions

View File

@ -6,7 +6,7 @@
INCLUDED = 1 INCLUDED = 1
ifndef TOPDIR ifndef TOPDIR
TOPDIR = . TOPDIR = .
endif endif
# If ARCH is not set, we use the host system's architecture for getarch compile options. # If ARCH is not set, we use the host system's architecture for getarch compile options.
@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
ifndef TARGET_CORE ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf include $(TOPDIR)/Makefile.conf
else else
# Start from empty CPU feature flags when TARGET_CORE is set; the
# Makefile_kernel.conf included right after this list is presumably what
# re-defines the ones that apply to that core (TODO confirm).
HAVE_NEON=
HAVE_VFP=
HAVE_VFPV3=
HAVE_VFPV4=
HAVE_MMX=
HAVE_SSE=
HAVE_SSE2=
HAVE_SSE3=
HAVE_SSSE3=
HAVE_SSE4_1=
HAVE_SSE4_2=
HAVE_SSE4A=
HAVE_SSE5=
HAVE_AVX=
HAVE_AVX2=
HAVE_FMA3=
include $(TOPDIR)/Makefile_kernel.conf include $(TOPDIR)/Makefile_kernel.conf
endif endif
@ -1522,6 +1538,8 @@ export HAVE_SSE4_2
export HAVE_SSE4A export HAVE_SSE4A
export HAVE_SSE5 export HAVE_SSE5
export HAVE_AVX export HAVE_AVX
export HAVE_AVX2
export HAVE_FMA3
export HAVE_VFP export HAVE_VFP
export HAVE_VFPV3 export HAVE_VFPV3
export HAVE_VFPV4 export HAVE_VFPV4

View File

@ -9,9 +9,9 @@ endif
endif endif
ifdef HAVE_SSE3 ifdef HAVE_SSE3
ifndef DYNAMIC_ARCH
CCOMMON_OPT += -msse3 CCOMMON_OPT += -msse3
FCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3
endif
ifdef HAVE_SSSE3 ifdef HAVE_SSSE3
CCOMMON_OPT += -mssse3 CCOMMON_OPT += -mssse3
FCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3
@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1
CCOMMON_OPT += -msse4.1 CCOMMON_OPT += -msse4.1
FCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1
endif endif
ifdef HAVE_AVX
CCOMMON_OPT += -mavx
FCOMMON_OPT += -mavx
endif endif
ifdef HAVE_AVX2
CCOMMON_OPT += -mavx2
FCOMMON_OPT += -mavx2
endif
ifdef HAVE_FMA3
CCOMMON_OPT += -mfma
FCOMMON_OPT += -mfma
endif endif
ifeq ($(CORE), SKYLAKEX) ifeq ($(CORE), SKYLAKEX)
@ -66,8 +76,7 @@ endif
endif endif
endif endif
ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) ifdef HAVE_AVX2
ifndef DYNAMIC_ARCH
ifndef NO_AVX2 ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0 # AVX2 support was added in 4.7.0
@ -96,7 +105,6 @@ endif
endif endif
endif endif
endif endif
endif

View File

@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
endif () endif ()
endif () endif ()
if (${CORE} STREQUAL "SKYLAKEX") if (${CORE} STREQUAL SKYLAKEX)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512) if (NOT NO_AVX512)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX")
endif () endif ()
endif () endif ()
if (${CORE} STREQUAL "COOPERLAKE") if (${CORE} STREQUAL COOPERLAKE)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512) if (NOT NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)

View File

@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(CGEMM3M_UNROLL_N 4) set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "BARCELONA")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_SSE3\n")
elseif ("${TCORE}" STREQUAL "STEAMROLLER")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_SSE3\n")
elseif ("${TCORE}" STREQUAL "EXCAVATOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_SSE3\n")
elseif ("${TCORE}" STREQUAL "NEHALEM")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_SSE3\n")
elseif ("${TCORE}" STREQUAL "PRESCOTT")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_SSE3\n")
elseif ("${TCORE}" STREQUAL "SANDYBRIDGE")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_AVX\n")
elseif ("${TCORE}" STREQUAL "HASWELL")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_AVX2\n")
elseif ("${TCORE}" STREQUAL "ZEN")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_AVX2\n")
elseif ("${TCORE}" STREQUAL "SKYLAKEX")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_AVX512\n")
elseif ("${TCORE}" STREQUAL "COOPERLAKE")
file(APPEND ${TARGET_CONF_TEMP}
"#define HAVE_AVX512\n")
elseif ("${TCORE}" STREQUAL "ARMV7") elseif ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_SIZE\t65536\n"
@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING)
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif () endif ()
endif () endif ()
# Drop every SIMD feature variable before the getarch step that follows, so
# that no HAVE_* value set earlier in the configure run survives into it.
foreach (simd_feature
         AVX2 AVX FMA3
         MMX SSE SSE2 SSE3 SSSE3 SSE4A SSE4_1 SSE4_2
         NEON VFP VFPV3 VFPV4)
  unset (HAVE_${simd_feature})
endforeach ()
message(STATUS "Running getarch") message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way # use the cmake binary w/ the -E param to run a shell command in a cross-platform way

View File

@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
endif () endif ()
endif () endif ()
if (DEFINED TARGET)
if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512)
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
# endif()
endif()
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2")
endif()
endif()
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
endif()
if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2")
endif()
if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx")
endif()
if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (DEFINED HAVE_SSE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
endif()
if (DEFINED HAVE_SSE2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
endif()
if (DEFINED HAVE_SSE3)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (DEFINED HAVE_SSSE3)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
endif()
if (DEFINED HAVE_SSE4_1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
endif()
endif()
if (DEFINED TARGET) if (DEFINED TARGET)
message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --")
message(STATUS "Targeting the ${TARGET} architecture.") message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}") set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif () endif ()
@ -211,6 +146,63 @@ else()
endif () endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
# Translate the requested TARGET and the HAVE_* CPU feature variables into
# compiler flags appended to KERNEL_DEFINITIONS. This runs right after
# prebuild.cmake has been included (presumably the place where the HAVE_*
# variables are produced - confirm).
if (DEFINED TARGET)
  if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
    # NOTE: the compiler family is not checked here; whatever -dumpversion
    # reports is compared against 10.1, the first GCC accepting -march=cooperlake.
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
                    OUTPUT_VARIABLE GCC_VERSION
                    OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Quoted so that an empty result (failed query) evaluates to false instead
    # of turning the if() into a syntax error.
    if ("${GCC_VERSION}" VERSION_GREATER 10.1 OR "${GCC_VERSION}" VERSION_EQUAL 10.1)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
    else()
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
    endif()
  endif()

  if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
  endif()

  if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
    if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
                      OUTPUT_VARIABLE GCC_VERSION
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
      # -mavx2 is only passed to GCC 4.7 or newer.
      if ("${GCC_VERSION}" VERSION_GREATER 4.7 OR "${GCC_VERSION}" VERSION_EQUAL 4.7)
        set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
      endif()
    # CMake reports clang as "Clang" / "AppleClang"; the former case-sensitive
    # comparison against "CLANG" could never match.
    elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
    endif()
  endif()

  # Feature-driven flags. A flag may repeat one already added by the
  # TARGET-specific branches above; the repetition is harmless to the compiler.
  if (DEFINED HAVE_AVX)
    if (NOT NO_AVX)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx")
    endif()
  endif()
  if (DEFINED HAVE_AVX2)
    if (NOT NO_AVX2)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
    endif()
  endif()
  if (DEFINED HAVE_FMA3)
    # NOTE(review): -mfma is suppressed by NO_AVX2; no separate FMA3 switch is
    # visible in this block.
    if (NOT NO_AVX2)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
    endif()
  endif()
  if (DEFINED HAVE_SSE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
  endif()
  if (DEFINED HAVE_SSE2)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
  endif()
  if (DEFINED HAVE_SSE3)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
  endif()
  if (DEFINED HAVE_SSSE3)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
  endif()
  if (DEFINED HAVE_SSE4_1)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
  endif()
endif()
if (DEFINED BINARY) if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.") message(STATUS "Compiling a ${BINARY}-bit binary.")
endif () endif ()

View File

@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
break; break;
} }
mode |= BLAS_LEGACY; if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY;
for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);

View File

@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
routine = queue -> routine; routine = queue -> routine;
if (!(queue -> mode & BLAS_LEGACY)) { if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else
if (queue -> mode & BLAS_PTHREAD) {
void (*pthreadcompat)(void *) = queue -> routine;
(pthreadcompat)(queue -> args);
} else
(routine)(queue -> args, queue -> range_m, queue -> range_n, (routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0); queue -> sa, queue -> sb, 0);
} else {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
}
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);

View File

@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3" "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell" #define LIBNAME "haswell"
#define CORENAME "HASWELL" #define CORENAME "HASWELL"
#endif #endif
@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3" "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell" #define LIBNAME "haswell"
#define CORENAME "HASWELL" #define CORENAME "HASWELL"
#else #else
@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
#define LIBNAME "skylakex" #define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX" #define CORENAME "SKYLAKEX"
#endif #endif
@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3" "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell" #define LIBNAME "haswell"
#define CORENAME "HASWELL" #define CORENAME "HASWELL"
#else #else
@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
#define LIBNAME "cooperlake" #define LIBNAME "cooperlake"
#define CORENAME "COOPERLAKE" #define CORENAME "COOPERLAKE"
#endif #endif
@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA3 -DFMA3" "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "zen" #define LIBNAME "zen"
#define CORENAME "ZEN" #define CORENAME "ZEN"
#endif #endif

View File

@ -5,13 +5,6 @@ endif
TOPDIR = .. TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
ifdef HAVE_SSE3
CFLAGS += -msse3
endif
ifdef HAVE_SSSE3
CFLAGS += -mssse3
endif
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
ifeq ($(C_COMPILER), CLANG) ifeq ($(C_COMPILER), CLANG)
override CFLAGS += -fno-integrated-as override CFLAGS += -fno-integrated-as
@ -38,12 +31,6 @@ ifdef NO_AVX2
endif endif
ifdef TARGET_CORE ifdef TARGET_CORE
ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3))
override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1
endif
ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON))
override CFLAGS += -msse -msse2
endif
ifeq ($(TARGET_CORE), COOPERLAKE) ifeq ($(TARGET_CORE), COOPERLAKE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCVERSIONGTEQ10), 1)

View File

@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c
CCOPYKERNEL = ccopy_power10.c CCOPYKERNEL = ccopy_power10.c
ZCOPYKERNEL = zcopy_power10.c ZCOPYKERNEL = zcopy_power10.c
# #
SDOTKERNEL = sdot.c SDOTKERNEL = sdot_power10.c
DDOTKERNEL = ddot.c DDOTKERNEL = ddot_power10.c
DSDOTKERNEL = sdot.c DSDOTKERNEL = sdot_power10.c
ifneq ($(GCCVERSIONGTEQ9),1) ifneq ($(GCCVERSIONGTEQ9),1)
CDOTKERNEL = cdot_power9.S CDOTKERNEL = cdot_power9.S
else else

View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * POWER10 inline-assembly kernel for the unit-stride double-precision dot
 * product: returns the sum of x[i]*y[i] for i in [0, n).
 *
 * n must be a positive multiple of 16: the code unconditionally loads the
 * first 16 elements of each vector and then consumes 16 elements (128 bytes)
 * per step. The caller in ddot_power10.c passes n & -16 and skips the call
 * when that is zero.
 *
 * Operands: %0 = dot (result), %1 = n (remaining count), %2 = x, %3 = y.
 * x and y are advanced inside the asm; only the local copies change.
 */
static double ddot_kernel_8 (long n, double *x, double *y)
{
double dot;
__asm__
(
// Cache-touch hints for the start of both input vectors.
"dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t"
// Clear the eight accumulator registers vs32-vs39.
"xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
// Preload 128 bytes of x into vs40-vs47 and 128 bytes of y into vs48-vs55
// (each lxvp fills a pair of registers).
"lxvp 40, 0(%2) \n\t"
"lxvp 42, 32(%2) \n\t"
"lxvp 44, 64(%2) \n\t"
"lxvp 46, 96(%2) \n\t"
"lxvp 48, 0(%3) \n\t"
"lxvp 50, 32(%3) \n\t"
"lxvp 52, 64(%3) \n\t"
"lxvp 54, 96(%3) \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
// n -= 16 and set CR0; if nothing is left beyond the preloaded group,
// skip the loop and go straight to the final accumulate.
"addic. %1, %1, -16 \n\t"
"ble two%= \n\t"
".align 5 \n"
// Main loop: accumulate the previously loaded group while the loads for the
// next group are issued in between the multiply-adds.
"one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t"
"xvmaddadp 33, 41, 49 \n\t"
"lxvp 40, 0(%2) \n\t"
"lxvp 48, 0(%3) \n\t"
"xvmaddadp 34, 42, 50 \n\t"
"xvmaddadp 35, 43, 51 \n\t"
"lxvp 42, 32(%2) \n\t"
"lxvp 50, 32(%3) \n\t"
"xvmaddadp 36, 44, 52 \n\t"
"xvmaddadp 37, 45, 53 \n\t"
"lxvp 44, 64(%2) \n\t"
"lxvp 52, 64(%3) \n\t"
"xvmaddadp 38, 46, 54 \n\t"
"xvmaddadp 39, 47, 55 \n\t"
"lxvp 46, 96(%2) \n\t"
"lxvp 54, 96(%3) \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt one%= \n"
// Tail: accumulate the last loaded group (no further loads).
"two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t"
"xvmaddadp 33, 41, 49 \n\t"
"xvmaddadp 34, 42, 50 \n\t"
"xvmaddadp 35, 43, 51 \n\t"
"xvmaddadp 36, 44, 52 \n\t"
"xvmaddadp 37, 45, 53 \n\t"
"xvmaddadp 38, 46, 54 \n\t"
"xvmaddadp 39, 47, 55 \n\t"
// Pairwise reduction of the eight accumulators into vs32.
"xvadddp 32, 32, 33 \n\t"
"xvadddp 34, 34, 35 \n\t"
"xvadddp 36, 36, 37 \n\t"
"xvadddp 38, 38, 39 \n\t"
"xvadddp 32, 32, 34 \n\t"
"xvadddp 36, 36, 38 \n\t"
"xvadddp 32, 32, 36 \n\t"
// XXSWAPD_S is a macro defined outside this file; presumably it places the
// doubleword-swapped vs32 into vs33 so the two lanes can be added as scalars
// (TODO confirm against its definition).
XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n"
// Assembler comment only: records the operand mapping in the generated asm.
"#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
:
"=d" (dot), // 0
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
// Memory inputs (%4, %5) so the compiler sees *x and *y as read by the asm.
"m" (*x),
"m" (*y)
:
// CR0 is written by addic.; vs32-vs55 are used as accumulators and load targets.
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
);
return dot;
}

130
kernel/power/ddot_power10.c Normal file
View File

@ -0,0 +1,130 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "ddot_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_8
/*
 * Portable C stand-in used when no assembly micro-kernel defined
 * HAVE_KERNEL_8. Returns the dot product of the first n elements of the
 * contiguous vectors x and y, consuming eight elements per step; n is
 * expected to be a multiple of 8 (the caller passes a multiple of 16).
 */
static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT acc = 0.0;
	BLASLONG pos;

	for (pos = 0; pos < n; pos += 8)
	{
		const FLOAT *xs = x + pos;
		const FLOAT *ys = y + pos;

		acc += ys[0] * xs[0]
		     + ys[1] * xs[1]
		     + ys[2] * xs[2]
		     + ys[3] * xs[3]
		     + ys[4] * xs[4]
		     + ys[5] * xs[5]
		     + ys[6] * xs[6]
		     + ys[7] * xs[7];
	}

	return acc;
}
#endif
/*
 * Dot product of two strided vectors.
 *
 *   n      number of elements; n <= 0 yields 0.0
 *   x, y   input vectors
 *   inc_x  stride between consecutive elements of x
 *   inc_y  stride between consecutive elements of y
 *
 * Unit-stride input is handed to ddot_kernel_8 in multiples of 16 elements,
 * with the remainder added one element at a time. Any other stride uses a
 * four-way unrolled loop that keeps two partial sums.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;
	BLASLONG xpos = 0, ypos = 0;
	BLASLONG blocked;
	FLOAT result = 0.0;
	FLOAT sum_a = 0.0;
	FLOAT sum_b = 0.0;

	if (n <= 0)
		return (result);

	if (inc_x == 1 && inc_y == 1)
	{
		/* Largest multiple of 16 not exceeding n goes through the kernel. */
		blocked = n & -16;
		if (blocked)
			result = ddot_kernel_8(blocked, x, y);

		for (k = blocked; k < n; k++)
			result += y[k] * x[k];

		return (result);
	}

	/* Strided case: four products per step, folded into two partial sums. */
	blocked = n & -4;
	for (k = 0; k < blocked; k += 4)
	{
		FLOAT p0 = y[ypos] * x[xpos];
		FLOAT p1 = y[ypos + inc_y] * x[xpos + inc_x];
		FLOAT p2 = y[ypos + 2*inc_y] * x[xpos + 2*inc_x];
		FLOAT p3 = y[ypos + 3*inc_y] * x[xpos + 3*inc_x];

		xpos += inc_x*4;
		ypos += inc_y*4;

		sum_a += p0 + p2;
		sum_b += p1 + p3;
	}

	/* Leftover elements (fewer than four). */
	for (; k < n; k++)
	{
		sum_a += y[ypos] * x[xpos];
		xpos += inc_x;
		ypos += inc_y;
	}

	result = sum_a + sum_b;
	return (result);
}

View File

@ -0,0 +1,135 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
/*
 * POWER10 inline-assembly kernel for the unit-stride single-precision dot
 * product: returns the sum of x[i]*y[i] for i in [0, n), accumulated in
 * single precision.
 *
 * n must be a positive multiple of 32: the code unconditionally loads the
 * first 32 elements of each vector and then consumes 32 elements (128 bytes)
 * per step. The caller in sdot_power10.c passes n & -32 (or fixed chunks of
 * 32 for DSDOT).
 *
 * Operands: %0 = dot (result), %1 = n (remaining count), %2 = x, %3 = y.
 * x and y are advanced inside the asm; only the local copies change.
 */
static float sdot_kernel_16 (long n, float *x, float *y)
{
float dot;
__asm__
(
// Cache-touch hints for the start of both input vectors.
"dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t"
// Clear the eight accumulator registers vs32-vs39.
"xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
"xxlxor 34, 34, 34 \n\t"
"xxlxor 35, 35, 35 \n\t"
"xxlxor 36, 36, 36 \n\t"
"xxlxor 37, 37, 37 \n\t"
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
// Preload 128 bytes of x into vs40-vs47 and 128 bytes of y into vs48-vs55
// (each lxvp fills a pair of registers).
"lxvp 40, 0(%2) \n\t"
"lxvp 42, 32(%2) \n\t"
"lxvp 44, 64(%2) \n\t"
"lxvp 46, 96(%2) \n\t"
"lxvp 48, 0(%3) \n\t"
"lxvp 50, 32(%3) \n\t"
"lxvp 52, 64(%3) \n\t"
"lxvp 54, 96(%3) \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
// n -= 32 and set CR0; if nothing is left beyond the preloaded group,
// skip the loop and go straight to the final accumulate.
"addic. %1, %1, -32 \n\t"
"ble two%= \n\t"
".align 5 \n"
// Main loop: accumulate the previously loaded group while the loads for the
// next group are issued in between the multiply-adds.
"one%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
"lxvp 40, 0(%2) \n\t"
"lxvp 48, 0(%3) \n\t"
"xvmaddasp 34, 42, 50 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
"lxvp 42, 32(%2) \n\t"
"lxvp 50, 32(%3) \n\t"
"xvmaddasp 36, 44, 52 \n\t"
"xvmaddasp 37, 45, 53 \n\t"
"lxvp 44, 64(%2) \n\t"
"lxvp 52, 64(%3) \n\t"
"xvmaddasp 38, 46, 54 \n\t"
"xvmaddasp 39, 47, 55 \n\t"
"lxvp 46, 96(%2) \n\t"
"lxvp 54, 96(%3) \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
// Tail: accumulate the last loaded group (no further loads).
"two%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
"xvmaddasp 36, 44, 52 \n\t"
"xvmaddasp 37, 45, 53 \n\t"
"xvmaddasp 38, 46, 54 \n\t"
"xvmaddasp 39, 47, 55 \n\t"
// Pairwise reduction of the eight accumulators into vs32.
"xvaddsp 32, 32, 33 \n\t"
"xvaddsp 34, 34, 35 \n\t"
"xvaddsp 36, 36, 37 \n\t"
"xvaddsp 38, 38, 39 \n\t"
"xvaddsp 32, 32, 34 \n\t"
"xvaddsp 36, 36, 38 \n\t"
"xvaddsp 32, 32, 36 \n\t"
// Horizontal sum of the four lanes of vs32: add a copy shifted by two words,
// then a copy shifted by one word.
"xxsldwi 33, 32, 32, 2 \n\t"
"xvaddsp 32, 32, 33 \n\t"
"xxsldwi 33, 32, 32, 1 \n\t"
"xvaddsp 32, 32, 33 \n\t"
// Convert the single-precision sum to the scalar format of the output register.
"xscvspdp %x0, 32 \n"
// Assembler comment only: records the operand mapping in the generated asm.
"#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
:
"=f" (dot), // 0
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
// Memory inputs (%4, %5) so the compiler sees *x and *y as read by the asm.
"m" (*x),
"m" (*y)
:
// CR0 is written by addic.; vs32-vs55 are used as accumulators and load targets.
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
);
return dot;
}

154
kernel/power/sdot_power10.c Normal file
View File

@ -0,0 +1,154 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "sdot_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_16
/*
 * Portable C stand-in used when no assembly micro-kernel defined
 * HAVE_KERNEL_16. Returns the dot product of the first n elements of the
 * contiguous vectors x and y, consuming eight elements per step; n is
 * expected to be a multiple of 8 (the caller passes multiples of 32).
 */
static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
	FLOAT acc = 0.0;
	BLASLONG pos;

	for (pos = 0; pos < n; pos += 8)
	{
		const FLOAT *xs = x + pos;
		const FLOAT *ys = y + pos;

		acc += ys[0] * xs[0]
		     + ys[1] * xs[1]
		     + ys[2] * xs[2]
		     + ys[3] * xs[3]
		     + ys[4] * xs[4]
		     + ys[5] * xs[5]
		     + ys[6] * xs[6]
		     + ys[7] * xs[7];
	}

	return acc;
}
#endif
/*
 * Single-precision dot product of two strided vectors. When DSDOT is defined
 * the same source builds the variant that returns double and forms the
 * scalar products in double precision.
 *
 *   n      number of elements; n <= 0 yields 0.0
 *   x, y   input vectors
 *   inc_x  stride between consecutive elements of x
 *   inc_y  stride between consecutive elements of y
 *
 * Unit-stride input goes through sdot_kernel_16 for the leading multiple of
 * 32 elements: in one call for the plain build, in chunks of 32 whose float
 * results are summed in double for DSDOT. Remaining elements and all
 * non-unit strides are handled by scalar loops accumulating into a double.
 */
#if defined (DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
	BLASLONG pos = 0;
	BLASLONG xoff = 0, yoff = 0;
	BLASLONG limit;
	double total = 0.0;		/* contributions of the scalar loops */
#if defined (DSDOT)
	double kernel_sum = 0.0;	/* sum of per-chunk kernel results */
#else
	FLOAT kernel_sum = 0.0;		/* result of the single kernel call */
#endif

	if (n <= 0)
		return (total);

	if (inc_x == 1 && inc_y == 1)
	{
		limit = n & (BLASLONG)(-32);

#if defined(DSDOT)
		{
			const BLASLONG chunk_len = 32;

			/* Keep each single-precision partial sum short, then widen it. */
			for (; pos < limit; pos += 32)
			{
				FLOAT chunk = sdot_kernel_16(chunk_len, x + pos, y + pos);
				kernel_sum += (double)chunk;
			}
		}
#else
		if (limit)
			kernel_sum = sdot_kernel_16(limit, x, y);
#endif

		/* Elements past the last full block of 32. */
		for (pos = limit; pos < n; pos++)
		{
#if defined(DSDOT)
			total += (double)y[pos] * (double)x[pos] ;
#else
			total += y[pos] * x[pos] ;
#endif
		}

		total += kernel_sum;
		return (total);
	}

	/* Strided case: two elements per step, then at most one leftover. */
	limit = n & (BLASLONG)(-2);
	for (; pos < limit; pos += 2)
	{
#if defined (DSDOT)
		total += (double)y[yoff] * (double)x[xoff] + (double)y[yoff+inc_y] * (double)x[xoff+inc_x];
#else
		total += y[yoff] * x[xoff] + y[yoff+inc_y] * x[xoff+inc_x];
#endif
		xoff += inc_x*2 ;
		yoff += inc_y*2 ;
	}

	for (; pos < n; pos++)
	{
#if defined (DSDOT)
		total += (double)y[yoff] * (double)x[xoff] ;
#else
		total += y[yoff] * x[xoff] ;
#endif
		xoff += inc_x ;
		yoff += inc_y ;
	}

	return (total);
}

View File

@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c
SASUMKERNEL = sasum.c SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c DASUMKERNEL = dasum.c
SROTKERNEL = srot.c
DROTKERNEL = drot.c

139
kernel/x86_64/drot.c Normal file
View File

@ -0,0 +1,139 @@
#include "common.h"
#if defined(SKYLAKEX)
#include "drot_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "drot_microk_haswell-2.c"
#endif
#ifndef HAVE_DROT_KERNEL
/*
 * Portable C fallback for the unit-stride plane rotation, used when no
 * architecture-specific kernel has defined HAVE_DROT_KERNEL.
 *
 * For every k in [0, n):
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      (using the old x[k])
 *
 * n     number of elements
 * x, y  vectors, updated in place with unit stride
 * c, s  rotation coefficients
 */
static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    /* Round n down to a multiple of the unroll factor. The loop below
       handles 4 elements per pass, so the bound must be a multiple of 4;
       rounding to 8 would push up to 7 elements into the scalar tail. */
    const BLASLONG n4 = n & ~(BLASLONG)3;
    BLASLONG i;

    for (i = 0; i < n4; i += 4) {
        /* Load the whole group before any store so each result is computed
           from the original values. */
        const FLOAT x0 = x[i],     y0 = y[i];
        const FLOAT x1 = x[i + 1], y1 = y[i + 1];
        const FLOAT x2 = x[i + 2], y2 = y[i + 2];
        const FLOAT x3 = x[i + 3], y3 = y[i + 3];

        const FLOAT f0 = c*x0 + s*y0;
        const FLOAT g0 = c*y0 - s*x0;
        const FLOAT f1 = c*x1 + s*y1;
        const FLOAT g1 = c*y1 - s*x1;
        const FLOAT f2 = c*x2 + s*y2;
        const FLOAT g2 = c*y2 - s*x2;
        const FLOAT f3 = c*x3 + s*y3;
        const FLOAT g3 = c*y3 - s*x3;

        x[i]     = f0;
        y[i]     = g0;
        x[i + 1] = f1;
        y[i + 1] = g1;
        x[i + 2] = f2;
        y[i + 2] = g2;
        x[i + 3] = f3;
        y[i + 3] = g3;
    }

    /* Scalar tail: at most three leftover elements. */
    for (; i < n; i++) {
        const FLOAT temp = c*x[i] + s*y[i];
        y[i] = c*y[i] - s*x[i];
        x[i] = temp;
    }
}
#endif
/*
 * Apply the plane rotation (c, s) to n element pairs taken from x and y.
 *
 * Unit strides on both vectors are forwarded to drot_kernel; any other
 * stride combination is handled by the scalar loop below.
 *
 * n             number of element pairs; nothing happens for n <= 0
 * x, inc_x      first vector and its element stride (updated in place)
 * y, inc_y      second vector and its element stride (updated in place)
 * c, s          rotation coefficients
 */
static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k, px, py;

    if (n <= 0)
        return;

    if (inc_x == 1 && inc_y == 1) {
        drot_kernel(n, x, y, c, s);
        return;
    }

    for (k = 0, px = 0, py = 0; k < n; k++, px += inc_x, py += inc_y) {
        /* Keep the new x value aside: y must be computed from the old x. */
        const FLOAT rotated_x = c * x[px] + s * y[py];
        y[py] = c * y[py] - s * x[px];
        x[px] = rotated_x;
    }
}
#if defined(SMP)
/*
 * Worker entry point handed to blas_level1_thread.
 *
 * Unpacks one work item and forwards it to rot_compute: m is the element
 * count, a/lda and b/ldb are the two vectors with their strides, and alpha
 * points at the two-element {c, s} array packed by CNAME in this file.
 * Always returns 0.
 */
static int rot_thread_function(blas_arg_t *args)
{
    const FLOAT *cs = (FLOAT *)args->alpha;

    rot_compute(args->m, args->a, args->lda, args->b, args->ldb, cs[0], cs[1]);
    return 0;
}
extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
/*
 * Plane rotation entry point: for each of the n element pairs,
 *     x <- c*x + s*y,   y <- c*y - s*x.
 *
 * In SMP builds the work is spread over several threads through
 * blas_level1_thread when n exceeds 100000 and neither stride is zero;
 * otherwise, and in non-SMP builds, rot_compute runs on the calling thread.
 *
 * n             number of element pairs
 * x, inc_x      first vector and its element stride (updated in place)
 * y, inc_y      second vector and its element stride (updated in place)
 * c, s          rotation coefficients
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
#if defined(SMP)
    int nthreads;
    /* c and s travel to the workers through the alpha slot. */
    FLOAT alpha[2] = {c, s};
    /* Placeholder for the c operand; rot_thread_function never reads it. */
    FLOAT dummy_c;

    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
        nthreads = 1;
    }
    else {
        nthreads = num_cpu_avail(1);
    }

    if (nthreads == 1) {
        rot_compute(n, x, inc_x, y, inc_y, c, s);
    }
    else {
#if defined(DOUBLE)
        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
#else
        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
#endif
        /* Cast straight to the parameter's function-pointer type. Going
           through void * relies on an object-to-function pointer conversion
           that ISO C does not define. */
        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)())rot_thread_function, nthreads);
    }
#else
    rot_compute(n, x, inc_x, y, inc_y, c, s);
#endif
    return 0;
}

View File

@ -0,0 +1,87 @@
/* This kernel uses only AVX2 and FMA intrinsics, so it is enabled whenever
   the compiler is generating AVX2+FMA code and is recent enough (GCC > 6 or
   clang >= 9). It previously tested __AVX512CD__, which is not defined when
   targeting Haswell, so the kernel was never built for its intended CPU. */
#if defined(__AVX2__) && defined(__FMA__) && (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_DROT_KERNEL 1
#include <immintrin.h>
#include <stdint.h>
/*
 * AVX2/FMA plane rotation for unit-stride double-precision vectors.
 *
 * For every k in [0, n):
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      (using the old x[k])
 *
 * Elements are processed 16 at a time, then 4 at a time, then one by one.
 */
static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG tail_index_4 = n&(~3);
    BLASLONG tail_index_16 = n&(~15);
    /* Broadcast unconditionally so the vectors are never left
       uninitialised, matching the SkylakeX kernel. */
    const __m256d c_256 = _mm256_set1_pd(c);
    const __m256d s_256 = _mm256_set1_pd(s);
    __m256d x0, x1, x2, x3;
    __m256d y0, y1, y2, y3;

    /* Main loop: four 4-wide vectors (16 doubles) per iteration. */
    for (i = 0; i < tail_index_16; i += 16) {
        x0 = _mm256_loadu_pd(&x[i + 0]);
        x1 = _mm256_loadu_pd(&x[i + 4]);
        x2 = _mm256_loadu_pd(&x[i + 8]);
        x3 = _mm256_loadu_pd(&x[i +12]);
        y0 = _mm256_loadu_pd(&y[i + 0]);
        y1 = _mm256_loadu_pd(&y[i + 4]);
        y2 = _mm256_loadu_pd(&y[i + 8]);
        y3 = _mm256_loadu_pd(&y[i +12]);

        /* x <- c*x + (s*y) */
        _mm256_storeu_pd(&x[i + 0], _mm256_fmadd_pd(c_256, x0, _mm256_mul_pd(s_256, y0)));
        _mm256_storeu_pd(&x[i + 4], _mm256_fmadd_pd(c_256, x1, _mm256_mul_pd(s_256, y1)));
        _mm256_storeu_pd(&x[i + 8], _mm256_fmadd_pd(c_256, x2, _mm256_mul_pd(s_256, y2)));
        _mm256_storeu_pd(&x[i +12], _mm256_fmadd_pd(c_256, x3, _mm256_mul_pd(s_256, y3)));

        /* y <- c*y - (s*x), from the x values loaded above */
        _mm256_storeu_pd(&y[i + 0], _mm256_fmsub_pd(c_256, y0, _mm256_mul_pd(s_256, x0)));
        _mm256_storeu_pd(&y[i + 4], _mm256_fmsub_pd(c_256, y1, _mm256_mul_pd(s_256, x1)));
        _mm256_storeu_pd(&y[i + 8], _mm256_fmsub_pd(c_256, y2, _mm256_mul_pd(s_256, x2)));
        _mm256_storeu_pd(&y[i +12], _mm256_fmsub_pd(c_256, y3, _mm256_mul_pd(s_256, x3)));
    }

    /* Remaining full vectors, one at a time. */
    for (i = tail_index_16; i < tail_index_4; i += 4) {
        x0 = _mm256_loadu_pd(&x[i]);
        y0 = _mm256_loadu_pd(&y[i]);
        _mm256_storeu_pd(&x[i], _mm256_fmadd_pd(c_256, x0, _mm256_mul_pd(s_256, y0)));
        _mm256_storeu_pd(&y[i], _mm256_fmsub_pd(c_256, y0, _mm256_mul_pd(s_256, x0)));
    }

    /* Scalar tail: fewer than four elements left. */
    for (i = tail_index_4; i < n; ++i) {
        FLOAT temp = c * x[i] + s * y[i];
        y[i] = c * y[i] - s * x[i];
        x[i] = temp;
    }
}
#endif

View File

@ -0,0 +1,94 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_DROT_KERNEL 1
#include <immintrin.h>
#include <stdint.h>
/*
 * AVX-512 plane rotation for unit-stride double-precision vectors.
 *
 * For every k in [0, n):
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      (using the old x[k])
 *
 * Elements are processed 32 at a time, then 8 at a time; the final n & 7
 * elements are handled with one masked load/store pair.
 */
static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    const __m512d vc = _mm512_set1_pd(c);
    const __m512d vs = _mm512_set1_pd(s);
    const BLASLONG end8  = n & (~7);
    const BLASLONG end32 = n & (~31);
    const BLASLONG rem   = n & 7;
    BLASLONG k;

    /* Main loop: four 8-wide vectors (32 doubles) per iteration. */
    for (k = 0; k < end32; k += 32) {
        const __m512d xa = _mm512_loadu_pd(&x[k + 0]);
        const __m512d xb = _mm512_loadu_pd(&x[k + 8]);
        const __m512d xc = _mm512_loadu_pd(&x[k +16]);
        const __m512d xd = _mm512_loadu_pd(&x[k +24]);
        const __m512d ya = _mm512_loadu_pd(&y[k + 0]);
        const __m512d yb = _mm512_loadu_pd(&y[k + 8]);
        const __m512d yc = _mm512_loadu_pd(&y[k +16]);
        const __m512d yd = _mm512_loadu_pd(&y[k +24]);

        /* x <- c*x + (s*y) */
        _mm512_storeu_pd(&x[k + 0], _mm512_fmadd_pd(vc, xa, _mm512_mul_pd(vs, ya)));
        _mm512_storeu_pd(&x[k + 8], _mm512_fmadd_pd(vc, xb, _mm512_mul_pd(vs, yb)));
        _mm512_storeu_pd(&x[k +16], _mm512_fmadd_pd(vc, xc, _mm512_mul_pd(vs, yc)));
        _mm512_storeu_pd(&x[k +24], _mm512_fmadd_pd(vc, xd, _mm512_mul_pd(vs, yd)));

        /* y <- c*y - (s*x), from the x values loaded above */
        _mm512_storeu_pd(&y[k + 0], _mm512_fmsub_pd(vc, ya, _mm512_mul_pd(vs, xa)));
        _mm512_storeu_pd(&y[k + 8], _mm512_fmsub_pd(vc, yb, _mm512_mul_pd(vs, xb)));
        _mm512_storeu_pd(&y[k +16], _mm512_fmsub_pd(vc, yc, _mm512_mul_pd(vs, xc)));
        _mm512_storeu_pd(&y[k +24], _mm512_fmsub_pd(vc, yd, _mm512_mul_pd(vs, xd)));
    }

    /* Remaining full vectors, one at a time. */
    for (k = end32; k < end8; k += 8) {
        const __m512d xv = _mm512_loadu_pd(&x[k]);
        const __m512d yv = _mm512_loadu_pd(&y[k]);
        _mm512_storeu_pd(&x[k], _mm512_fmadd_pd(vc, xv, _mm512_mul_pd(vs, yv)));
        _mm512_storeu_pd(&y[k], _mm512_fmsub_pd(vc, yv, _mm512_mul_pd(vs, xv)));
    }

    /* Partial last vector: enable only the low `rem` lanes so nothing past
       the end of x or y is read or written. */
    if (rem > 0) {
        const __mmask8 lanes = (__mmask8)(((unsigned char) 0xff) >> (8 - rem));
        const __m512d xt = _mm512_maskz_loadu_pd(lanes, &x[end8]);
        const __m512d yt = _mm512_maskz_loadu_pd(lanes, &y[end8]);
        _mm512_mask_storeu_pd(&x[end8], lanes, _mm512_fmadd_pd(vc, xt, _mm512_mul_pd(vs, yt)));
        _mm512_mask_storeu_pd(&y[end8], lanes, _mm512_fmsub_pd(vc, yt, _mm512_mul_pd(vs, xt)));
    }
}
#endif

139
kernel/x86_64/srot.c Normal file
View File

@ -0,0 +1,139 @@
#include "common.h"
#if defined(SKYLAKEX)
#include "srot_microk_skylakex-2.c"
#elif defined(HASWELL)
#include "srot_microk_haswell-2.c"
#endif
#ifndef HAVE_SROT_KERNEL
/*
 * Portable C fallback for the unit-stride plane rotation, used when no
 * architecture-specific kernel has defined HAVE_SROT_KERNEL.
 *
 * For every k in [0, n):
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      (using the old x[k])
 *
 * n     number of elements
 * x, y  vectors, updated in place with unit stride
 * c, s  rotation coefficients
 */
static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    /* Round n down to a multiple of the unroll factor. The loop below
       handles 4 elements per pass, so the bound must be a multiple of 4;
       rounding to 8 would push up to 7 elements into the scalar tail. */
    const BLASLONG n4 = n & ~(BLASLONG)3;
    BLASLONG i;

    for (i = 0; i < n4; i += 4) {
        /* Load the whole group before any store so each result is computed
           from the original values. */
        const FLOAT x0 = x[i],     y0 = y[i];
        const FLOAT x1 = x[i + 1], y1 = y[i + 1];
        const FLOAT x2 = x[i + 2], y2 = y[i + 2];
        const FLOAT x3 = x[i + 3], y3 = y[i + 3];

        const FLOAT f0 = c*x0 + s*y0;
        const FLOAT g0 = c*y0 - s*x0;
        const FLOAT f1 = c*x1 + s*y1;
        const FLOAT g1 = c*y1 - s*x1;
        const FLOAT f2 = c*x2 + s*y2;
        const FLOAT g2 = c*y2 - s*x2;
        const FLOAT f3 = c*x3 + s*y3;
        const FLOAT g3 = c*y3 - s*x3;

        x[i]     = f0;
        y[i]     = g0;
        x[i + 1] = f1;
        y[i + 1] = g1;
        x[i + 2] = f2;
        y[i + 2] = g2;
        x[i + 3] = f3;
        y[i + 3] = g3;
    }

    /* Scalar tail: at most three leftover elements. */
    for (; i < n; i++) {
        const FLOAT temp = c*x[i] + s*y[i];
        y[i] = c*y[i] - s*x[i];
        x[i] = temp;
    }
}
#endif
/*
 * Apply the plane rotation (c, s) to n element pairs taken from x and y.
 *
 * Unit strides on both vectors are forwarded to srot_kernel; any other
 * stride combination is handled by the scalar loop below.
 *
 * n             number of element pairs; nothing happens for n <= 0
 * x, inc_x      first vector and its element stride (updated in place)
 * y, inc_y      second vector and its element stride (updated in place)
 * c, s          rotation coefficients
 */
static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k, px, py;

    if (n <= 0)
        return;

    if (inc_x == 1 && inc_y == 1) {
        srot_kernel(n, x, y, c, s);
        return;
    }

    for (k = 0, px = 0, py = 0; k < n; k++, px += inc_x, py += inc_y) {
        /* Keep the new x value aside: y must be computed from the old x. */
        const FLOAT rotated_x = c * x[px] + s * y[py];
        y[py] = c * y[py] - s * x[px];
        x[px] = rotated_x;
    }
}
#if defined(SMP)
/*
 * Worker entry point handed to blas_level1_thread.
 *
 * Unpacks one work item and forwards it to rot_compute: m is the element
 * count, a/lda and b/ldb are the two vectors with their strides, and alpha
 * points at the two-element {c, s} array packed by CNAME in this file.
 * Always returns 0.
 */
static int rot_thread_function(blas_arg_t *args)
{
    /* CNAME declares the array as FLOAT alpha[2], so read it back through
       FLOAT * rather than a hard-coded float *. The access type then always
       matches the stored type, as in the drot counterpart. */
    const FLOAT *cs = (FLOAT *)args->alpha;

    rot_compute(args->m,
                args->a, args->lda,
                args->b, args->ldb,
                cs[0],
                cs[1]);
    return 0;
}
extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
/*
 * Plane rotation entry point: for each of the n element pairs,
 *     x <- c*x + s*y,   y <- c*y - s*x.
 *
 * In SMP builds the work is spread over several threads through
 * blas_level1_thread when n exceeds 100000 and neither stride is zero;
 * otherwise, and in non-SMP builds, rot_compute runs on the calling thread.
 *
 * n             number of element pairs
 * x, inc_x      first vector and its element stride (updated in place)
 * y, inc_y      second vector and its element stride (updated in place)
 * c, s          rotation coefficients
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
#if defined(SMP)
    int nthreads;
    /* c and s travel to the workers through the alpha slot. */
    FLOAT alpha[2] = {c, s};
    /* Placeholder for the c operand; rot_thread_function never reads it. */
    FLOAT dummy_c;

    if (inc_x == 0 || inc_y == 0 || n <= 100000) {
        nthreads = 1;
    }
    else {
        nthreads = num_cpu_avail(1);
    }

    if (nthreads == 1) {
        rot_compute(n, x, inc_x, y, inc_y, c, s);
    }
    else {
#if defined(DOUBLE)
        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
#else
        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
#endif
        /* Cast straight to the parameter's function-pointer type. Going
           through void * relies on an object-to-function pointer conversion
           that ISO C does not define. */
        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)())rot_thread_function, nthreads);
    }
#else
    rot_compute(n, x, inc_x, y, inc_y, c, s);
#endif
    return 0;
}

View File

@ -0,0 +1,87 @@
/* This kernel uses only AVX2 and FMA intrinsics, so it is enabled whenever
   the compiler is generating AVX2+FMA code and is recent enough (GCC > 6 or
   clang >= 9). It previously tested __AVX512CD__, which is not defined when
   targeting Haswell, so the kernel was never built for its intended CPU. */
#if defined(__AVX2__) && defined(__FMA__) && (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_SROT_KERNEL 1
#include <immintrin.h>
#include <stdint.h>
/*
 * AVX2/FMA plane rotation for unit-stride single-precision vectors.
 *
 * For every k in [0, n):
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      (using the old x[k])
 *
 * Elements are processed 32 at a time, then 8 at a time, then one by one.
 */
static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    BLASLONG i = 0;
    BLASLONG tail_index_8 = n&(~7);
    BLASLONG tail_index_32 = n&(~31);
    /* Broadcast unconditionally so the vectors are never left
       uninitialised, matching the SkylakeX kernel. */
    const __m256 c_256 = _mm256_set1_ps(c);
    const __m256 s_256 = _mm256_set1_ps(s);
    __m256 x0, x1, x2, x3;
    __m256 y0, y1, y2, y3;

    /* Main loop: four 8-wide vectors (32 floats) per iteration. */
    for (i = 0; i < tail_index_32; i += 32) {
        x0 = _mm256_loadu_ps(&x[i + 0]);
        x1 = _mm256_loadu_ps(&x[i + 8]);
        x2 = _mm256_loadu_ps(&x[i +16]);
        x3 = _mm256_loadu_ps(&x[i +24]);
        y0 = _mm256_loadu_ps(&y[i + 0]);
        y1 = _mm256_loadu_ps(&y[i + 8]);
        y2 = _mm256_loadu_ps(&y[i +16]);
        y3 = _mm256_loadu_ps(&y[i +24]);

        /* x <- c*x + (s*y) */
        _mm256_storeu_ps(&x[i + 0], _mm256_fmadd_ps(c_256, x0, _mm256_mul_ps(s_256, y0)));
        _mm256_storeu_ps(&x[i + 8], _mm256_fmadd_ps(c_256, x1, _mm256_mul_ps(s_256, y1)));
        _mm256_storeu_ps(&x[i +16], _mm256_fmadd_ps(c_256, x2, _mm256_mul_ps(s_256, y2)));
        _mm256_storeu_ps(&x[i +24], _mm256_fmadd_ps(c_256, x3, _mm256_mul_ps(s_256, y3)));

        /* y <- c*y - (s*x), from the x values loaded above */
        _mm256_storeu_ps(&y[i + 0], _mm256_fmsub_ps(c_256, y0, _mm256_mul_ps(s_256, x0)));
        _mm256_storeu_ps(&y[i + 8], _mm256_fmsub_ps(c_256, y1, _mm256_mul_ps(s_256, x1)));
        _mm256_storeu_ps(&y[i +16], _mm256_fmsub_ps(c_256, y2, _mm256_mul_ps(s_256, x2)));
        _mm256_storeu_ps(&y[i +24], _mm256_fmsub_ps(c_256, y3, _mm256_mul_ps(s_256, x3)));
    }

    /* Remaining full vectors, one at a time. */
    for (i = tail_index_32; i < tail_index_8; i += 8) {
        x0 = _mm256_loadu_ps(&x[i]);
        y0 = _mm256_loadu_ps(&y[i]);
        _mm256_storeu_ps(&x[i], _mm256_fmadd_ps(c_256, x0, _mm256_mul_ps(s_256, y0)));
        _mm256_storeu_ps(&y[i], _mm256_fmsub_ps(c_256, y0, _mm256_mul_ps(s_256, x0)));
    }

    /* Scalar tail: fewer than eight elements left. */
    for (i = tail_index_8; i < n; ++i) {
        FLOAT temp = c * x[i] + s * y[i];
        y[i] = c * y[i] - s * x[i];
        x[i] = temp;
    }
}
#endif

View File

@ -0,0 +1,91 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_SROT_KERNEL 1
#include <immintrin.h>
#include <stdint.h>
/*
 * AVX-512 plane rotation for unit-stride single-precision vectors.
 *
 * For every k in [0, n):
 *     x[k] <- c*x[k] + s*y[k]
 *     y[k] <- c*y[k] - s*x[k]      (using the old x[k])
 *
 * Elements are processed 64 at a time, then 16 at a time; the final n & 15
 * elements are handled with one masked load/store pair.
 */
static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
    const __m512 vc = _mm512_set1_ps(c);
    const __m512 vs = _mm512_set1_ps(s);
    const BLASLONG end16 = n & (~15);
    const BLASLONG end64 = n & (~63);
    const BLASLONG rem   = n & 15;
    BLASLONG k;

    /* Main loop: four 16-wide vectors (64 floats) per iteration. */
    for (k = 0; k < end64; k += 64) {
        const __m512 xa = _mm512_loadu_ps(&x[k + 0]);
        const __m512 xb = _mm512_loadu_ps(&x[k +16]);
        const __m512 xc = _mm512_loadu_ps(&x[k +32]);
        const __m512 xd = _mm512_loadu_ps(&x[k +48]);
        const __m512 ya = _mm512_loadu_ps(&y[k + 0]);
        const __m512 yb = _mm512_loadu_ps(&y[k +16]);
        const __m512 yc = _mm512_loadu_ps(&y[k +32]);
        const __m512 yd = _mm512_loadu_ps(&y[k +48]);

        /* x <- c*x + (s*y) */
        _mm512_storeu_ps(&x[k + 0], _mm512_fmadd_ps(vc, xa, _mm512_mul_ps(vs, ya)));
        _mm512_storeu_ps(&x[k +16], _mm512_fmadd_ps(vc, xb, _mm512_mul_ps(vs, yb)));
        _mm512_storeu_ps(&x[k +32], _mm512_fmadd_ps(vc, xc, _mm512_mul_ps(vs, yc)));
        _mm512_storeu_ps(&x[k +48], _mm512_fmadd_ps(vc, xd, _mm512_mul_ps(vs, yd)));

        /* y <- c*y - (s*x), from the x values loaded above */
        _mm512_storeu_ps(&y[k + 0], _mm512_fmsub_ps(vc, ya, _mm512_mul_ps(vs, xa)));
        _mm512_storeu_ps(&y[k +16], _mm512_fmsub_ps(vc, yb, _mm512_mul_ps(vs, xb)));
        _mm512_storeu_ps(&y[k +32], _mm512_fmsub_ps(vc, yc, _mm512_mul_ps(vs, xc)));
        _mm512_storeu_ps(&y[k +48], _mm512_fmsub_ps(vc, yd, _mm512_mul_ps(vs, xd)));
    }

    /* Remaining full vectors, one at a time. */
    for (k = end64; k < end16; k += 16) {
        const __m512 xv = _mm512_loadu_ps(&x[k]);
        const __m512 yv = _mm512_loadu_ps(&y[k]);
        _mm512_storeu_ps(&x[k], _mm512_fmadd_ps(vc, xv, _mm512_mul_ps(vs, yv)));
        _mm512_storeu_ps(&y[k], _mm512_fmsub_ps(vc, yv, _mm512_mul_ps(vs, xv)));
    }

    /* Partial last vector: enable only the low `rem` lanes so nothing past
       the end of x or y is read or written. */
    if (rem > 0) {
        const __mmask16 lanes = (__mmask16)(((uint16_t) 0xffff) >> (16 - rem));
        const __m512 xt = _mm512_maskz_loadu_ps(lanes, &x[end16]);
        const __m512 yt = _mm512_maskz_loadu_ps(lanes, &y[end16]);
        _mm512_mask_storeu_ps(&x[end16], lanes, _mm512_fmadd_ps(vc, xt, _mm512_mul_ps(vs, yt)));
        _mm512_mask_storeu_ps(&y[end16], lanes, _mm512_fmsub_ps(vc, yt, _mm512_mul_ps(vs, xt)));
    }
}
#endif