diff --git a/CMakeLists.txt b/CMakeLists.txt index 50da721cd..74db77135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 7.dev) +set(OpenBLAS_PATCH_VERSION 8.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -211,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..f160a4e13 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,46 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.7 +11-Aug 2019 + +common: + * having the gmake special variables TARGET_ARCH or TARGET_MACH + defined no longer causes build failures in ctest or utest + * defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer + has the same effect as setting them to 1 + * a new test program was added to allow checking the library for + thread safety + * a new option USE_LOCKING was added to ensure thread safety when + OpenBLAS itself is built without multithreading but will be + called from multiple threads. + * a build failure on Linux with glibc versions earlier than 2.5 + was fixed + * a runtime error with CPU enumeration (and NO_AFFINITY not set) + on glibc 2.6 was fixed + * NO_AFFINITY was added to the CMAKE options (and defaults to being + active on Linux, as in the gmake builds) + +x86_64: + * the build-time logic for detection of AVX512 availability in + the processor and compiler was fixed + * gmake builds on OSX now set the internal name of the library to + libopenblas.0.dylib (consistent with CMAKE) + * the Haswell DGEMM kernel received a significant speedup through + improved prefetch and load instructions + * performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly + increased by avoiding vpermpd instructions + * the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled + to fix remaining errors in DGEMM, DSYMM and DTRMM + +## POWER: + * added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970 + * added optimized kernels for POWER9 SGEMM and STRMM + +## ARMV7: + * fixed the softfp implementations of xAMAX and IxAMAX + * removed the predefined -march= flags on both ARMV5 and ARMV6 as + they were appropriate for only a subset of platforms + ==================================================================== Version 0.3.6 29-Apr-2019 diff --git a/Makefile b/Makefile index 07b08439e..60f189ef2 100644 --- a/Makefile +++ b/Makefile @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/Makefile.rule b/Makefile.rule index a299588e0..c0941e488 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.7.dev +VERSION = 0.3.8.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index 16791bcc2..a54282f6c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -142,9 +142,9 @@ endif endif -# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) -ifneq ($(C_COMPILER), PGI) +ifeq ($(findstring pgcc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif endif @@ -267,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif # @@ -1124,8 +1125,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. + - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. @@ -64,3 +78,4 @@ test_script: - echo Running Test - cd utest - openblas_utest + diff --git a/cmake/arch.cmake b/cmake/arch.cmake index b4547b7c9..5a7434551 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -81,7 +81,8 @@ if (DYNAMIC_ARCH) endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling diff --git a/cmake/system.cmake b/cmake/system.cmake index 7f3696286..4f8011603 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -66,10 +66,17 @@ if (DEFINED TARGET) endif () # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64) +if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI") set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") endif () +# On x86 no AVX support is available +if (X86 OR X86_64) +if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4")) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512") +endif () +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") @@ -148,7 +155,9 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") - +if (DEFINED BINARY) + message(STATUS "Compiling a ${BINARY}-bit binary.") +endif () if (NOT DEFINED NEED_PIC) set(NEED_PIC 1) endif () @@ -165,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") +else () +set(NO_LAPACK 1) +set(NO_LAPACKE 1) endif () if (BINARY64) @@ -190,9 +202,14 @@ if (NEED_PIC) endif () if (DYNAMIC_ARCH) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") - if (DYNAMIC_OLDER) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + if (X86 OR X86_64 OR ARM64 OR PPC) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") + endif () + else () + unset (DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing") endif () endif () diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..8c954bf21 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,22 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 10: // Goldmont Plus + return CPUTYPE_NEHALEM; + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX diff --git a/dynamic.c b/dynamic.c new file mode 100644 index 000000000..aa2b87621 --- /dev/null +++ b/dynamic.c @@ -0,0 +1,897 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef _MSC_VER +#define strncasecmp _strnicmp +#define strcasecmp _stricmp +#endif + +#ifdef ARCH_X86 +#define EXTERN extern +#else +#define EXTERN +#endif + +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_PRESCOTT; + +#ifdef DYN_ATHLON +extern gotoblas_t gotoblas_ATHLON; +#else +#define gotoblas_ATHLON gotoblas_PRESCOTT +#endif +#ifdef DYN_KATMAI +extern gotoblas_t gotoblas_KATMAI; +#else +#define gotoblas_KATMAI gotoblas_PRESCOTT +#endif +#ifdef DYN_BANIAS +extern gotoblas_t gotoblas_BANIAS; +#else +#define gotoblas_BANIAS gotoblas_PRESCOTT +#endif +#ifdef DYN_COPPERMINE +extern gotoblas_t gotoblas_COPPERMINE; +#else +#define gotoblas_COPPERMINE gotoblas_PRESCOTT +#endif +#ifdef DYN_NORTHWOOD +extern gotoblas_t gotoblas_NORTHWOOD; +#else +#define gotoblas_NORTHWOOD gotoblas_PRESCOTT +#endif +#ifdef DYN_CORE2 +extern gotoblas_t gotoblas_CORE2; +#else +#define gotoblas_CORE2 gotoblas_PRESCOTT +#endif +#ifdef DYN_NEHALEM +extern gotoblas_t gotoblas_NEHALEM; +#else +#define gotoblas_NEHALEM gotoblas_PRESCOTT +#endif +#ifdef DYN_BARCELONA +extern gotoblas_t gotoblas_BARCELONA; +#elif defined(DYN_NEHALEM) +#define gotoblas_BARCELONA gotoblas_NEHALEM +#else +#define gotoblas_BARCELONA gotoblas_PRESCOTT +#endif +#ifdef DYN_ATOM +extern gotoblas_t gotoblas_ATOM; +elif defined(DYN_NEHALEM) +#define gotoblas_ATOM gotoblas_NEHALEM +#else +#define gotoblas_ATOM gotoblas_PRESCOTT +#endif +#ifdef DYN_NANO +extern gotoblas_t gotoblas_NANO; +#else +#define gotoblas_NANO gotoblas_PRESCOTT +#endif +#ifdef DYN_PENRYN +extern gotoblas_t gotoblas_PENRYN; +#else +#define gotoblas_PENRYN gotoblas_PRESCOTT +#endif +#ifdef DYN_DUNNINGTON +extern gotoblas_t gotoblas_DUNNINGTON; +#else +#define gotoblas_DUNNINGTON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON +extern gotoblas_t gotoblas_OPTERON; +#else +#define gotoblas_OPTERON gotoblas_PRESCOTT +#endif +#ifdef DYN_OPTERON_SSE3 +extern gotoblas_t gotoblas_OPTERON_SSE3; +#else +#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT +#endif +#ifdef DYN_BOBCAT +extern gotoblas_t gotoblas_BOBCAT; +#elif defined(DYN_NEHALEM) +#define gotoblas_BOBCAT gotoblas_NEHALEM +#else +#define gotoblas_BOBCAT gotoblas_PRESCOTT +#endif +#ifdef DYN_SANDYBRIDGE +extern gotoblas_t gotoblas_SANDYBRIDGE; +#elif defined(DYN_NEHALEM) +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#else +#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT +#endif +#ifdef DYN_BULLDOZER +extern gotoblas_t gotoblas_BULLDOZER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_BULLDOZER gotoblas_NEHALEM +#else +#define gotoblas_BULLDOZER gotoblas_PRESCOTT +#endif +#ifdef DYN_PILEDRIVER +extern gotoblas_t gotoblas_PILEDRIVER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_PILEDRIVER gotoblas_NEHALEM +#else +#define gotoblas_PILEDRIVER gotoblas_PRESCOTT +#endif +#ifdef DYN_STEAMROLLER +extern gotoblas_t gotoblas_STEAMROLLER; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_STEAMROLLER gotoblas_NEHALEM +#else +#define gotoblas_STEAMROLLER gotoblas_PRESCOTT +#endif +#ifdef DYN_EXCAVATOR +extern gotoblas_t gotoblas_EXCAVATOR; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_EXCAVATOR gotoblas_NEHALEM +#else +#define gotoblas_EXCAVATOR gotoblas_PRESCOTT +#endif +#ifdef DYN_HASWELL +extern gotoblas_t gotoblas_HASWELL; +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_HASWELL gotoblas_NEHALEM +#else +#define gotoblas_HASWELL gotoblas_PRESCOTT +#endif +#ifdef DYN_ZEN +extern gotoblas_t gotoblas_ZEN; +#elif defined(DYN_HASWELL) +#define gotoblas_ZEN gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_ZEN gotoblas_NEHALEM +#else +#define gotoblas_ZEN gotoblas_PRESCOTT +#endif +#ifdef DYN_SKYLAKEX +extern gotoblas_t gotoblas_SKYLAKEX; +#elif defined(DYN_HASWELL) +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SKYLAKEX gotoblas_NEHALEM +#else +#define gotoblas_SKYLAKEX gotoblas_PRESCOTT +#endif + + +#else // not DYNAMIC_LIST +EXTERN gotoblas_t gotoblas_KATMAI; +EXTERN gotoblas_t gotoblas_COPPERMINE; +EXTERN gotoblas_t gotoblas_NORTHWOOD; +EXTERN gotoblas_t gotoblas_BANIAS; +EXTERN gotoblas_t gotoblas_ATHLON; + +extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_BARCELONA; +#ifdef DYNAMIC_OLDER +extern gotoblas_t gotoblas_ATOM; +extern gotoblas_t gotoblas_NANO; +extern gotoblas_t gotoblas_PENRYN; +extern gotoblas_t gotoblas_DUNNINGTON; +extern gotoblas_t gotoblas_OPTERON; +extern gotoblas_t gotoblas_OPTERON_SSE3; +extern gotoblas_t gotoblas_BOBCAT; +#else +#define gotoblas_ATOM gotoblas_NEHALEM +#define gotoblas_NANO gotoblas_NEHALEM +#define gotoblas_PENRYN gotoblas_CORE2 +#define gotoblas_DUNNINGTON gotoblas_CORE2 +#define gotoblas_OPTERON gotoblas_CORE2 +#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 +#define gotoblas_BOBCAT gotoblas_CORE2 +#endif + +#ifndef NO_AVX +extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; +extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_STEAMROLLER; +extern gotoblas_t gotoblas_EXCAVATOR; +#ifdef NO_AVX2 +#define gotoblas_HASWELL gotoblas_SANDYBRIDGE +#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#define gotoblas_ZEN gotoblas_SANDYBRIDGE +#else +extern gotoblas_t gotoblas_HASWELL; +extern gotoblas_t gotoblas_ZEN; +#ifndef NO_AVX512 +extern gotoblas_t gotoblas_SKYLAKEX; +#else +#define gotoblas_SKYLAKEX gotoblas_HASWELL +#endif +#endif +#else +//Use NEHALEM kernels for sandy bridge +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_HASWELL gotoblas_NEHALEM +#define gotoblas_SKYLAKEX gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA +#define gotoblas_PILEDRIVER gotoblas_BARCELONA +#define gotoblas_STEAMROLLER gotoblas_BARCELONA +#define gotoblas_EXCAVATOR gotoblas_BARCELONA +#define gotoblas_ZEN gotoblas_BARCELONA +#endif + +#endif // DYNAMIC_LIST + +#define VENDOR_INTEL 1 +#define VENDOR_AMD 2 +#define VENDOR_CENTAUR 3 +#define VENDOR_HYGON 4 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#ifndef NO_AVX +static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv + __asm__ __volatile__ + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} +#endif + +int support_avx(){ +#ifndef NO_AVX + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +#else + return 0; +#endif +} + +int support_avx2(){ +#ifndef NO_AVX2 + int eax, ebx, ecx=0, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 0) + ret=1; //OS supports AVX2 + return ret; +#else + return 0; +#endif +} + +int support_avx512(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx()) + return 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + if((ebx & (1<<7)) != 1){ + ret=0; //OS does not even support AVX2 + } + if((ebx & (1<<31)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 0xe0) == 0xe0) + ret=1; //OS supports AVX512VL + } + return ret; +#else + return 0; +#endif +} + +extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" +#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" +#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" +#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" + +static int get_vendor(void){ + int eax, ebx, ecx, edx; + + union + { + char vchar[16]; + int vint[4]; + } vendor; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(&vendor.vint[0]) = ebx; + *(&vendor.vint[1]) = edx; + *(&vendor.vint[2]) = ecx; + + vendor.vchar[12] = '\0'; + + if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +static gotoblas_t *get_coretype(void){ + + int eax, ebx, ecx, edx; + int family, exfamily, model, vendor, exmodel; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + family = BITMASK(eax, 8, 0x0f); + exfamily = BITMASK(eax, 20, 0xff); + model = BITMASK(eax, 4, 0x0f); + exmodel = BITMASK(eax, 16, 0x0f); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x6: + switch (exmodel) { + case 0: + if (model <= 0x7) return &gotoblas_KATMAI; + if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; + if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; + if (model == 14) return &gotoblas_BANIAS; + if (model == 15) return &gotoblas_CORE2; + return NULL; + + case 1: + if (model == 6) return &gotoblas_CORE2; + if (model == 7) return &gotoblas_PENRYN; + if (model == 13) return &gotoblas_DUNNINGTON; + if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; + if (model == 12) return &gotoblas_ATOM; + return NULL; + + case 2: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + if (model == 5) return &gotoblas_NEHALEM; + + //Intel Xeon Processor 5600 (Westmere-EP) + //Xeon Processor E7 (Westmere-EX) + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; + + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + //Intel Core i7-3000 / Xeon E5 + if (model == 10 || model == 13) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + case 3: + //Intel Sandy Bridge 22nm (Ivy Bridge?) + if (model == 10 || model == 14) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Haswell + if (model == 12 || model == 15) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Broadwell + if (model == 13) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 7) return &gotoblas_ATOM; //Bay Trail + return NULL; + case 4: + //Intel Haswell + if (model == 5 || model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Broadwell + if (model == 7 || model == 15) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Skylake + if (model == 14) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Braswell / Avoton + if (model == 12 || model == 13) { + return &gotoblas_NEHALEM; + } + return NULL; + case 5: + //Intel Broadwell + if (model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 5) { + // Intel Skylake X + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + //Intel Skylake + if (model == 14) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Intel Phi Knights Landing + if (model == 7) { + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + //Apollo Lake or Denverton + if (model == 12 || model == 15) { + return &gotoblas_NEHALEM; + } + return NULL; + case 6: + if (model == 6) { + // Cannon Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + case 7: + if (model == 10) // Goldmont plus + return &gotoblas_NEHALEM; + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + case 9: + case 8: + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; + } + case 0xf: + if (model <= 0x2) return &gotoblas_NORTHWOOD; + return &gotoblas_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ + if (family <= 0xe) { + // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if ( (eax & 0xffff) >= 0x01) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + return NULL; + } + else + return NULL; + + return &gotoblas_ATHLON; + } + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) { + if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; + else return &gotoblas_OPTERON; + } else if (exfamily == 5) { + return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + if(model == 1){ + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 2 || model == 3){ + //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 5){ + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if(model == 0 || model == 8){ + if (exmodel == 1) { + //AMD Trinity + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 3) { + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 6) { + if(support_avx()) + return &gotoblas_EXCAVATOR; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + + } + } + } else if (exfamily == 8) { + if (model == 1 || model == 8) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + } + } else if (exfamily == 9) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else { + return &gotoblas_BARCELONA; + } + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return &gotoblas_NANO; + } + } + + return NULL; +} + +static char *corename[] = { + "Unknown", + "Katmai", + "Coppermine", + "Northwood", + "Prescott", + "Banias", + "Atom", + "Core2", + "Penryn", + "Dunnington", + "Nehalem", + "Athlon", + "Opteron", + "Opteron_SSE3", + "Barcelona", + "Nano", + "Sandybridge", + "Bobcat", + "Bulldozer", + "Piledriver", + "Haswell", + "Steamroller", + "Excavator", + "Zen", + "SkylakeX" +}; + +char *gotoblas_corename(void) { + + if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; + if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; + if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; + if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; + if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; + if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_CORE2) return corename[ 7]; + if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; + if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_NEHALEM) return corename[10]; + if (gotoblas == &gotoblas_ATHLON) return corename[11]; + if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; + if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_BARCELONA) return corename[14]; + if (gotoblas == &gotoblas_NANO) return corename[15]; + if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; + if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; + if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_HASWELL) return corename[20]; + if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; + if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; + if (gotoblas == &gotoblas_ZEN) return corename[23]; + if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; + return corename[0]; +} + + +static gotoblas_t *force_coretype(char *coretype){ + + int i ; + int found = -1; + char message[128]; + //char mname[20]; + + for ( i=1 ; i <= 24; i++) + { + if (!strncasecmp(coretype,corename[i],20)) + { + found = i; + break; + } + } + if (found < 0) + { + //strncpy(mname,coretype,20); + snprintf(message, 128, "Core not found: %s\n",coretype); + openblas_warning(1, message); + return(NULL); + } + + switch (found) + { + case 24: return (&gotoblas_SKYLAKEX); + case 23: return (&gotoblas_ZEN); + case 22: return (&gotoblas_EXCAVATOR); + case 21: return (&gotoblas_STEAMROLLER); + case 20: return (&gotoblas_HASWELL); + case 19: return (&gotoblas_PILEDRIVER); + case 18: return (&gotoblas_BULLDOZER); + case 17: return (&gotoblas_BOBCAT); + case 16: return (&gotoblas_SANDYBRIDGE); + case 15: return (&gotoblas_NANO); + case 14: return (&gotoblas_BARCELONA); + case 13: return (&gotoblas_OPTERON); + case 12: return (&gotoblas_OPTERON_SSE3); + case 11: return (&gotoblas_ATHLON); + case 10: return (&gotoblas_NEHALEM); + case 9: return (&gotoblas_DUNNINGTON); + case 8: return (&gotoblas_PENRYN); + case 7: return (&gotoblas_CORE2); + case 6: return (&gotoblas_ATOM); + case 5: return (&gotoblas_BANIAS); + case 4: return (&gotoblas_PRESCOTT); + case 3: return (&gotoblas_NORTHWOOD); + case 2: return (&gotoblas_COPPERMINE); + case 1: return (&gotoblas_KATMAI); + } + return(NULL); + +} + + + + +void gotoblas_dynamic_init(void) { + + char coremsg[128]; + char coren[22]; + char *p; + + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + +#ifdef ARCH_X86 + if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; +#else + if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ + if (sizeof(void*) == 8) { + if (gotoblas == &gotoblas_KATMAI || + gotoblas == &gotoblas_COPPERMINE || + gotoblas == &gotoblas_NORTHWOOD || + gotoblas == &gotoblas_BANIAS || + gotoblas == &gotoblas_ATHLON) + gotoblas = &gotoblas_PRESCOTT; + } +#endif + + if (gotoblas && gotoblas -> init) { + strncpy(coren,gotoblas_corename(),20); + sprintf(coremsg, "Core: %s\n",coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + + gotoblas = NULL; + +} diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S index 3ac7a7314..2a2ab2bb5 100644 --- a/kernel/x86/lsame.S +++ b/kernel/x86/lsame.S @@ -56,13 +56,13 @@ #ifndef HAVE_CMOV movl %eax, %ecx subl $32, %ecx - jle .L1 + jl .L1 movl %ecx, %eax .L1: movl %edx, %ecx subl $32, %ecx - jle .L2 + jl .L2 movl %ecx, %edx .L2: subl %eax, %edx diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c84b599ce..19e32ef2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -106,7 +106,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define A_PR1 512 -#define B_PR1 512 +#define B_PR1 160 +#define BROADCASTKERNEL /******************************************************************************************* * Macro definitions @@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) @@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -267,43 +347,83 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 + prefetcht0 BUFFER1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - + prefetcht0 64 + BUFFER1 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - +#if B_PR1 > 32 + prefetcht0 128 + BUFFER1 +#endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 > 96 + prefetcht0 192 + BUFFER1 +#endif - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif +#if B_PR1 > 160 + prefetcht0 256 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 +#if B_PR1 > 224 + prefetcht0 320 + BUFFER1 +#endif +#ifndef BROADCASTKERNEL + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + +#if B_PR1 > 288 + prefetcht0 384 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif +#if B_PR1 > 352 + prefetcht0 448 + BUFFER1 +#endif leaq (CO1, LDC, 2), %rax +#if B_PR1 > 416 + prefetcht0 512 + BUFFER1 +#endif #if !defined(TRMMKERNEL) @@ -319,29 +439,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -360,29 +488,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp @@ -401,10 +537,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -683,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -705,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -726,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -747,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -766,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -799,23 +1014,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -834,29 +1058,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -875,10 +1107,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -1082,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1098,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1128,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1165,23 +1476,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -1617,6 +1937,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) +.endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) @@ -1784,12 +2112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - + .L12_12a: - + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -1844,6 +2181,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + decq I # i -- jne .L12_11 ALIGN_4 @@ -1851,6 +2205,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + + /* recover the original value of pointer B after prefetch */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + .L12_20: // Test rest of M @@ -2068,10 +2433,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12 .L13_12a: - + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2081,7 +2455,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - .L13_13: test $1, %rax @@ -2126,6 +2499,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + decq I # i -- jne .L13_11 ALIGN_4 @@ -2133,6 +2523,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + /* recover the original value of pointer B */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + .L13_20: // Test rest of M diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index ef12569c8..48f855b0e 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + + + +static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result) { BLASLONG i; BLASLONG ix,iy; FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; - + if ( n <= 0 ) { -// CREAL(result) = 0.0 ; -// CIMAG(result) = 0.0 ; - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); - return(result); + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); + *result=res; + return; } @@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA } #if !defined(CONJ) - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); -// CREAL(result) = dot[0] - dot[1]; -// CIMAG(result) = dot[2] + dot[3]; + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else - OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); -// CREAL(result) = dot[0] + dot[1]; -// CIMAG(result) = dot[2] - dot[3]; - + OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif - - return(result); - + *result=res; + return; } +#if defined(SMP) +static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, +BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + zdot_compute(n, x, inc_x, y, inc_y, (void *)result); + return 0; +} +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + OPENBLAS_COMPLEX_FLOAT zdot; + CREAL(zdot) = 0.0; + CIMAG(zdot) = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + zdot_compute(n, x, inc_x, y, inc_y, &zdot); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + OPENBLAS_COMPLEX_FLOAT *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)zdot_thread_function, nthreads); + + ptr = (OPENBLAS_COMPLEX_FLOAT *)result; + for (i = 0; i < nthreads; i++) { + CREAL(zdot) = CREAL(zdot) + CREAL(*ptr); + CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr); + ptr = (void *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + zdot_compute(n, x, inc_x, y, inc_y, &zdot); +#endif + + return zdot; +} diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 4e647cadc..1e3051a8f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,9 +38,14 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + ) +if (NOT NO_CBLAS AND NOT NO_LAPACKE) +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} test_kernel_regress.c ) endif() +endif() set(OpenBLAS_utest_bin openblas_utest) add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..8c7e6b9f8 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all @@ -13,8 +16,12 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +ifneq ($(NO_CBLAS), 1) +ifneq ($(NO_LAPACKE), 1) OBJS += test_kernel_regress.o endif +endif +endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch